git.blender.org/blender.git
Diffstat (limited to 'intern/cycles')
-rw-r--r-- intern/cycles/CMakeLists.txt | 45
-rw-r--r-- intern/cycles/app/CMakeLists.txt | 22
-rw-r--r-- intern/cycles/app/cycles_cubin_cc.cpp | 33
-rw-r--r-- intern/cycles/app/cycles_server.cpp | 2
-rw-r--r-- intern/cycles/app/cycles_standalone.cpp | 5
-rw-r--r-- intern/cycles/app/cycles_xml.cpp | 16
-rw-r--r-- intern/cycles/app/io_export_cycles_xml.py | 2
-rw-r--r-- intern/cycles/blender/CMakeLists.txt | 38
-rw-r--r-- intern/cycles/blender/addon/__init__.py | 21
-rw-r--r-- intern/cycles/blender/addon/engine.py | 156
-rw-r--r-- intern/cycles/blender/addon/operators.py | 41
-rw-r--r-- intern/cycles/blender/addon/osl.py | 1
-rw-r--r-- intern/cycles/blender/addon/properties.py | 496
-rw-r--r-- intern/cycles/blender/addon/ui.py | 778
-rw-r--r-- intern/cycles/blender/addon/version_update.py | 206
-rw-r--r-- intern/cycles/blender/blender_camera.cpp | 78
-rw-r--r-- intern/cycles/blender/blender_curves.cpp | 1018
-rw-r--r-- intern/cycles/blender/blender_device.cpp | 28
-rw-r--r-- intern/cycles/blender/blender_device.h | 2
-rw-r--r-- intern/cycles/blender/blender_geometry.cpp | 178
-rw-r--r-- intern/cycles/blender/blender_id_map.h | 299
-rw-r--r-- intern/cycles/blender/blender_image.cpp | 220
-rw-r--r-- intern/cycles/blender/blender_image.h | 61
-rw-r--r-- intern/cycles/blender/blender_light.cpp | 212
-rw-r--r-- intern/cycles/blender/blender_mesh.cpp | 425
-rw-r--r-- intern/cycles/blender/blender_object.cpp | 333
-rw-r--r-- intern/cycles/blender/blender_object_cull.cpp | 1
-rw-r--r-- intern/cycles/blender/blender_particles.cpp | 6
-rw-r--r-- intern/cycles/blender/blender_python.cpp | 101
-rw-r--r-- intern/cycles/blender/blender_session.cpp | 676
-rw-r--r-- intern/cycles/blender/blender_session.h | 42
-rw-r--r-- intern/cycles/blender/blender_shader.cpp | 353
-rw-r--r-- intern/cycles/blender/blender_sync.cpp | 405
-rw-r--r-- intern/cycles/blender/blender_sync.h | 118
-rw-r--r-- intern/cycles/blender/blender_texture.h | 2
-rw-r--r-- intern/cycles/blender/blender_util.h | 318
-rw-r--r-- intern/cycles/blender/blender_viewport.cpp | 88
-rw-r--r-- intern/cycles/blender/blender_viewport.h | 56
-rw-r--r-- intern/cycles/blender/blender_volume.cpp | 387
-rw-r--r-- intern/cycles/bvh/CMakeLists.txt | 13
-rw-r--r-- intern/cycles/bvh/bvh.cpp | 183
-rw-r--r-- intern/cycles/bvh/bvh.h | 18
-rw-r--r-- intern/cycles/bvh/bvh2.cpp | 5
-rw-r--r-- intern/cycles/bvh/bvh2.h | 4
-rw-r--r-- intern/cycles/bvh/bvh4.cpp | 445
-rw-r--r-- intern/cycles/bvh/bvh4.h | 86
-rw-r--r-- intern/cycles/bvh/bvh8.cpp | 539
-rw-r--r-- intern/cycles/bvh/bvh8.h | 97
-rw-r--r-- intern/cycles/bvh/bvh_build.cpp | 257
-rw-r--r-- intern/cycles/bvh/bvh_build.h | 21
-rw-r--r-- intern/cycles/bvh/bvh_embree.cpp | 450
-rw-r--r-- intern/cycles/bvh/bvh_embree.h | 15
-rw-r--r-- intern/cycles/bvh/bvh_optix.cpp | 230
-rw-r--r-- intern/cycles/bvh/bvh_optix.h | 58
-rw-r--r-- intern/cycles/bvh/bvh_params.h | 7
-rw-r--r-- intern/cycles/bvh/bvh_sort.cpp | 15
-rw-r--r-- intern/cycles/bvh/bvh_split.cpp | 51
-rw-r--r-- intern/cycles/bvh/bvh_split.h | 16
-rw-r--r-- intern/cycles/bvh/bvh_unaligned.cpp | 20
-rw-r--r-- intern/cycles/cmake/external_libs.cmake | 4
-rw-r--r-- intern/cycles/cmake/macros.cmake | 60
-rw-r--r-- intern/cycles/device/CMakeLists.txt | 37
-rw-r--r-- intern/cycles/device/cuda/device_cuda.h | 269
-rw-r--r-- intern/cycles/device/cuda/device_cuda_impl.cpp | 2683
-rw-r--r-- intern/cycles/device/device.cpp | 129
-rw-r--r-- intern/cycles/device/device.h | 63
-rw-r--r-- intern/cycles/device/device_cpu.cpp | 567
-rw-r--r-- intern/cycles/device/device_cuda.cpp | 2573
-rw-r--r-- intern/cycles/device/device_denoising.cpp | 76
-rw-r--r-- intern/cycles/device/device_denoising.h | 10
-rw-r--r-- intern/cycles/device/device_intern.h | 10
-rw-r--r-- intern/cycles/device/device_memory.cpp | 107
-rw-r--r-- intern/cycles/device/device_memory.h | 55
-rw-r--r-- intern/cycles/device/device_multi.cpp | 473
-rw-r--r-- intern/cycles/device/device_network.cpp | 4
-rw-r--r-- intern/cycles/device/device_network.h | 6
-rw-r--r-- intern/cycles/device/device_opencl.cpp | 10
-rw-r--r-- intern/cycles/device/device_optix.cpp | 1770
-rw-r--r-- intern/cycles/device/device_split_kernel.cpp | 75
-rw-r--r-- intern/cycles/device/device_split_kernel.h | 10
-rw-r--r-- intern/cycles/device/device_task.cpp | 63
-rw-r--r-- intern/cycles/device/device_task.h | 96
-rw-r--r-- intern/cycles/device/opencl/device_opencl.h (renamed from intern/cycles/device/opencl/opencl.h) | 38
-rw-r--r-- intern/cycles/device/opencl/device_opencl_impl.cpp (renamed from intern/cycles/device/opencl/opencl_split.cpp) | 278
-rw-r--r-- intern/cycles/device/opencl/memory_manager.cpp | 7
-rw-r--r-- intern/cycles/device/opencl/memory_manager.h | 2
-rw-r--r-- intern/cycles/device/opencl/opencl_util.cpp | 80
-rw-r--r-- intern/cycles/graph/CMakeLists.txt | 2
-rw-r--r-- intern/cycles/graph/node.cpp | 20
-rw-r--r-- intern/cycles/graph/node.h | 5
-rw-r--r-- intern/cycles/graph/node_type.cpp | 11
-rw-r--r-- intern/cycles/graph/node_type.h | 16
-rw-r--r-- intern/cycles/graph/node_xml.cpp | 4
-rw-r--r-- intern/cycles/kernel/CMakeLists.txt | 201
-rw-r--r-- intern/cycles/kernel/bvh/bvh.h | 508
-rw-r--r-- intern/cycles/kernel/bvh/bvh_embree.h | 27
-rw-r--r-- intern/cycles/kernel/bvh/bvh_local.h | 63
-rw-r--r-- intern/cycles/kernel/bvh/bvh_nodes.h | 451
-rw-r--r-- intern/cycles/kernel/bvh/bvh_shadow_all.h | 152
-rw-r--r-- intern/cycles/kernel/bvh/bvh_traversal.h | 264
-rw-r--r-- intern/cycles/kernel/bvh/bvh_types.h | 8
-rw-r--r-- intern/cycles/kernel/bvh/bvh_volume.h | 108
-rw-r--r-- intern/cycles/kernel/bvh/bvh_volume_all.h | 136
-rw-r--r-- intern/cycles/kernel/bvh/obvh_local.h | 398
-rw-r--r-- intern/cycles/kernel/bvh/obvh_nodes.h | 591
-rw-r--r-- intern/cycles/kernel/bvh/obvh_shadow_all.h | 670
-rw-r--r-- intern/cycles/kernel/bvh/obvh_traversal.h | 620
-rw-r--r-- intern/cycles/kernel/bvh/obvh_volume.h | 480
-rw-r--r-- intern/cycles/kernel/bvh/obvh_volume_all.h | 551
-rw-r--r-- intern/cycles/kernel/bvh/qbvh_local.h | 291
-rw-r--r-- intern/cycles/kernel/bvh/qbvh_nodes.h | 516
-rw-r--r-- intern/cycles/kernel/bvh/qbvh_shadow_all.h | 459
-rw-r--r-- intern/cycles/kernel/bvh/qbvh_traversal.h | 483
-rw-r--r-- intern/cycles/kernel/bvh/qbvh_volume.h | 367
-rw-r--r-- intern/cycles/kernel/bvh/qbvh_volume_all.h | 444
-rw-r--r-- intern/cycles/kernel/closure/bsdf.h | 133
-rw-r--r-- intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h | 30
-rw-r--r-- intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h | 2
-rw-r--r-- intern/cycles/kernel/closure/bsdf_diffuse.h | 2
-rw-r--r-- intern/cycles/kernel/closure/bsdf_diffuse_ramp.h | 2
-rw-r--r-- intern/cycles/kernel/closure/bsdf_hair.h | 6
-rw-r--r-- intern/cycles/kernel/closure/bsdf_hair_principled.h | 40
-rw-r--r-- intern/cycles/kernel/closure/bsdf_microfacet.h | 99
-rw-r--r-- intern/cycles/kernel/closure/bsdf_microfacet_multi.h | 73
-rw-r--r-- intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h | 23
-rw-r--r-- intern/cycles/kernel/closure/bsdf_oren_nayar.h | 2
-rw-r--r-- intern/cycles/kernel/closure/bsdf_phong_ramp.h | 2
-rw-r--r-- intern/cycles/kernel/closure/bsdf_principled_diffuse.h | 3
-rw-r--r-- intern/cycles/kernel/closure/bsdf_principled_sheen.h | 28
-rw-r--r-- intern/cycles/kernel/closure/bsdf_toon.h | 2
-rw-r--r-- intern/cycles/kernel/closure/bsdf_util.h | 16
-rw-r--r-- intern/cycles/kernel/closure/bssrdf.h | 12
-rw-r--r-- intern/cycles/kernel/closure/volume.h | 3
-rw-r--r-- intern/cycles/kernel/filter/filter_features.h | 5
-rw-r--r-- intern/cycles/kernel/filter/filter_features_sse.h | 5
-rw-r--r-- intern/cycles/kernel/filter/filter_nlm_cpu.h | 3
-rw-r--r-- intern/cycles/kernel/filter/filter_prefilter.h | 74
-rw-r--r-- intern/cycles/kernel/filter/filter_reconstruction.h | 6
-rw-r--r-- intern/cycles/kernel/filter/filter_transform.h | 10
-rw-r--r-- intern/cycles/kernel/filter/filter_transform_gpu.h | 8
-rw-r--r-- intern/cycles/kernel/filter/filter_transform_sse.h | 8
-rw-r--r-- intern/cycles/kernel/geom/geom.h | 2
-rw-r--r-- intern/cycles/kernel/geom/geom_attribute.h | 10
-rw-r--r-- intern/cycles/kernel/geom/geom_curve.h | 121
-rw-r--r-- intern/cycles/kernel/geom/geom_curve_intersect.h | 1442
-rw-r--r-- intern/cycles/kernel/geom/geom_motion_curve.h | 114
-rw-r--r-- intern/cycles/kernel/geom/geom_motion_triangle_intersect.h | 23
-rw-r--r-- intern/cycles/kernel/geom/geom_object.h | 118
-rw-r--r-- intern/cycles/kernel/geom/geom_patch.h | 14
-rw-r--r-- intern/cycles/kernel/geom/geom_primitive.h | 26
-rw-r--r-- intern/cycles/kernel/geom/geom_subd_triangle.h | 156
-rw-r--r-- intern/cycles/kernel/geom/geom_triangle.h | 88
-rw-r--r-- intern/cycles/kernel/geom/geom_triangle_intersect.h | 439
-rw-r--r-- intern/cycles/kernel/geom/geom_volume.h | 13
-rw-r--r-- intern/cycles/kernel/kernel.h | 4
-rw-r--r-- intern/cycles/kernel/kernel_accumulate.h | 222
-rw-r--r-- intern/cycles/kernel/kernel_adaptive_sampling.h | 239
-rw-r--r-- intern/cycles/kernel/kernel_bake.h | 188
-rw-r--r-- intern/cycles/kernel/kernel_camera.h | 30
-rw-r--r-- intern/cycles/kernel/kernel_compat_cpu.h | 6
-rw-r--r-- intern/cycles/kernel/kernel_compat_cuda.h | 10
-rw-r--r-- intern/cycles/kernel/kernel_compat_opencl.h | 10
-rw-r--r-- intern/cycles/kernel/kernel_compat_optix.h | 92
-rw-r--r-- intern/cycles/kernel/kernel_emission.h | 93
-rw-r--r-- intern/cycles/kernel/kernel_film.h | 85
-rw-r--r-- intern/cycles/kernel/kernel_globals.h | 43
-rw-r--r-- intern/cycles/kernel/kernel_id_passes.h | 32
-rw-r--r-- intern/cycles/kernel/kernel_jitter.h | 67
-rw-r--r-- intern/cycles/kernel/kernel_light.h | 547
-rw-r--r-- intern/cycles/kernel/kernel_light_background.h | 448
-rw-r--r-- intern/cycles/kernel/kernel_light_common.h | 159
-rw-r--r-- intern/cycles/kernel/kernel_montecarlo.h | 48
-rw-r--r-- intern/cycles/kernel/kernel_passes.h | 182
-rw-r--r-- intern/cycles/kernel/kernel_path.h | 134
-rw-r--r-- intern/cycles/kernel/kernel_path_branched.h | 46
-rw-r--r-- intern/cycles/kernel/kernel_path_state.h | 10
-rw-r--r-- intern/cycles/kernel/kernel_path_surface.h | 17
-rw-r--r-- intern/cycles/kernel/kernel_path_volume.h | 267
-rw-r--r-- intern/cycles/kernel/kernel_queues.h | 2
-rw-r--r-- intern/cycles/kernel/kernel_random.h | 49
-rw-r--r-- intern/cycles/kernel/kernel_shader.h | 130
-rw-r--r-- intern/cycles/kernel/kernel_shadow.h | 77
-rw-r--r-- intern/cycles/kernel/kernel_subsurface.h | 31
-rw-r--r-- intern/cycles/kernel/kernel_textures.h | 3
-rw-r--r-- intern/cycles/kernel/kernel_types.h | 283
-rw-r--r-- intern/cycles/kernel/kernel_volume.h | 86
-rw-r--r-- intern/cycles/kernel/kernel_work_stealing.h | 84
-rw-r--r-- intern/cycles/kernel/kernel_write_passes.h | 95
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel.cpp | 12
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 7
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h | 36
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 29
-rw-r--r-- intern/cycles/kernel/kernels/cuda/filter.cu | 84
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel.cu | 81
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel_config.h | 3
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h | 19
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel_split.cu | 8
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl | 23
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl | 23
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl | 23
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl | 23
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h | 16
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl | 4
-rw-r--r-- intern/cycles/kernel/kernels/optix/kernel_optix.cu | 329
-rw-r--r-- intern/cycles/kernel/osl/CMakeLists.txt | 14
-rw-r--r-- intern/cycles/kernel/osl/background.cpp | 2
-rw-r--r-- intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp | 2
-rw-r--r-- intern/cycles/kernel/osl/bsdf_phong_ramp.cpp | 2
-rw-r--r-- intern/cycles/kernel/osl/emissive.cpp | 2
-rw-r--r-- intern/cycles/kernel/osl/osl_bssrdf.cpp | 2
-rw-r--r-- intern/cycles/kernel/osl/osl_closures.cpp | 193
-rw-r--r-- intern/cycles/kernel/osl/osl_closures.h | 6
-rw-r--r-- intern/cycles/kernel/osl/osl_globals.h | 11
-rw-r--r-- intern/cycles/kernel/osl/osl_services.cpp | 525
-rw-r--r-- intern/cycles/kernel/osl/osl_services.h | 71
-rw-r--r-- intern/cycles/kernel/osl/osl_shader.cpp | 7
-rw-r--r-- intern/cycles/kernel/shaders/CMakeLists.txt | 19
-rw-r--r-- intern/cycles/kernel/shaders/node_absorption_volume.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_add_closure.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_ambient_occlusion.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl | 3
-rw-r--r-- intern/cycles/kernel/shaders/node_attribute.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_background.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_bevel.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_blackbody.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_brick_texture.osl | 3
-rw-r--r-- intern/cycles/kernel/shaders/node_brightness.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_bump.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_camera.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_checker_texture.osl | 3
-rw-r--r-- intern/cycles/kernel/shaders/node_clamp.osl | 26
-rw-r--r-- intern/cycles/kernel/shaders/node_combine_hsv.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_combine_rgb.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_combine_xyz.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_color.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_float.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_int.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_normal.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_point.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_string.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_convert_from_vector.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_diffuse_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_displacement.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_emission.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_environment_texture.osl | 14
-rw-r--r-- intern/cycles/kernel/shaders/node_fresnel.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_gamma.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_geometry.osl | 7
-rw-r--r-- intern/cycles/kernel/shaders/node_glass_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_glossy_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_gradient_texture.osl | 5
-rw-r--r-- intern/cycles/kernel/shaders/node_hair_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_hair_info.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_hash.h | 81
-rw-r--r-- intern/cycles/kernel/shaders/node_holdout.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_hsv.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_ies_light.osl | 9
-rw-r--r-- intern/cycles/kernel/shaders/node_image_texture.osl | 75
-rw-r--r-- intern/cycles/kernel/shaders/node_invert.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_layer_weight.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_light_falloff.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_light_path.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_magic_texture.osl | 3
-rw-r--r-- intern/cycles/kernel/shaders/node_map_range.osl | 58
-rw-r--r-- intern/cycles/kernel/shaders/node_mapping.osl | 63
-rw-r--r-- intern/cycles/kernel/shaders/node_math.h | 110
-rw-r--r-- intern/cycles/kernel/shaders/node_math.osl | 139
-rw-r--r-- intern/cycles/kernel/shaders/node_mix.osl | 6
-rw-r--r-- intern/cycles/kernel/shaders/node_mix_closure.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_musgrave_texture.osl | 746
-rw-r--r-- intern/cycles/kernel/shaders/node_noise.h | 202
-rw-r--r-- intern/cycles/kernel/shaders/node_noise_texture.osl | 134
-rw-r--r-- intern/cycles/kernel/shaders/node_normal.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_normal_map.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_object_info.osl | 4
-rw-r--r-- intern/cycles/kernel/shaders/node_output_displacement.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_output_surface.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_output_volume.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_particle_info.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_principled_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_principled_volume.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_refraction_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_rgb_curves.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_rgb_ramp.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_rgb_to_bw.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_scatter_volume.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_separate_hsv.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_separate_rgb.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_separate_xyz.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_set_normal.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_sky_texture.osl | 155
-rw-r--r-- intern/cycles/kernel/shaders/node_subsurface_scattering.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_tangent.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_texture.h | 165
-rw-r--r-- intern/cycles/kernel/shaders/node_texture_coordinate.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_toon_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_translucent_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_transparent_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_uv_map.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_value.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_vector_curves.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_vector_displacement.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_vector_math.osl | 76
-rw-r--r-- intern/cycles/kernel/shaders/node_vector_rotate.osl | 49
-rw-r--r-- intern/cycles/kernel/shaders/node_vector_transform.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_velvet_bsdf.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_vertex_color.osl | 50
-rw-r--r-- intern/cycles/kernel/shaders/node_voronoi_texture.osl | 1079
-rw-r--r-- intern/cycles/kernel/shaders/node_voxel_texture.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_wave_texture.osl | 76
-rw-r--r-- intern/cycles/kernel/shaders/node_wavelength.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/node_white_noise_texture.osl | 49
-rw-r--r-- intern/cycles/kernel/shaders/node_wireframe.osl | 2
-rw-r--r-- intern/cycles/kernel/shaders/oslutil.h | 101
-rw-r--r-- intern/cycles/kernel/shaders/stdcycles.h | 150
-rw-r--r-- intern/cycles/kernel/shaders/stdosl.h | 853
-rw-r--r-- intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h | 44
-rw-r--r-- intern/cycles/kernel/split/kernel_adaptive_filter_x.h | 30
-rw-r--r-- intern/cycles/kernel/split/kernel_adaptive_filter_y.h | 29
-rw-r--r-- intern/cycles/kernel/split/kernel_adaptive_stopping.h | 37
-rw-r--r-- intern/cycles/kernel/split/kernel_branched.h | 2
-rw-r--r-- intern/cycles/kernel/split/kernel_buffer_update.h | 6
-rw-r--r-- intern/cycles/kernel/split/kernel_data_init.h | 8
-rw-r--r-- intern/cycles/kernel/split/kernel_direct_lighting.h | 1
-rw-r--r-- intern/cycles/kernel/split/kernel_do_volume.h | 8
-rw-r--r-- intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 8
-rw-r--r-- intern/cycles/kernel/split/kernel_indirect_background.h | 4
-rw-r--r-- intern/cycles/kernel/split/kernel_next_iteration_setup.h | 6
-rw-r--r-- intern/cycles/kernel/split/kernel_path_init.h | 3
-rw-r--r-- intern/cycles/kernel/split/kernel_shader_eval.h | 4
-rw-r--r-- intern/cycles/kernel/split/kernel_shadow_blocked_dl.h | 2
-rw-r--r-- intern/cycles/kernel/split/kernel_split_common.h | 2
-rw-r--r-- intern/cycles/kernel/split/kernel_split_data.h | 1
-rw-r--r-- intern/cycles/kernel/split/kernel_split_data_types.h | 11
-rw-r--r-- intern/cycles/kernel/svm/svm.h | 163
-rw-r--r-- intern/cycles/kernel/svm/svm_ao.h | 40
-rw-r--r-- intern/cycles/kernel/svm/svm_aov.h | 49
-rw-r--r-- intern/cycles/kernel/svm/svm_attribute.h | 57
-rw-r--r-- intern/cycles/kernel/svm/svm_bevel.h | 16
-rw-r--r-- intern/cycles/kernel/svm/svm_brick.h | 30
-rw-r--r-- intern/cycles/kernel/svm/svm_brightness.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_checker.h | 6
-rw-r--r-- intern/cycles/kernel/svm/svm_clamp.h | 46
-rw-r--r-- intern/cycles/kernel/svm/svm_closure.h | 228
-rw-r--r-- intern/cycles/kernel/svm/svm_color_util.h | 14
-rw-r--r-- intern/cycles/kernel/svm/svm_displace.h | 8
-rw-r--r-- intern/cycles/kernel/svm/svm_fractal_noise.h | 135
-rw-r--r-- intern/cycles/kernel/svm/svm_fresnel.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_geometry.h | 8
-rw-r--r-- intern/cycles/kernel/svm/svm_gradient.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_hsv.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_ies.h | 22
-rw-r--r-- intern/cycles/kernel/svm/svm_image.h | 90
-rw-r--r-- intern/cycles/kernel/svm/svm_light_path.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_magic.h | 6
-rw-r--r-- intern/cycles/kernel/svm/svm_map_range.h | 88
-rw-r--r-- intern/cycles/kernel/svm/svm_mapping.h | 28
-rw-r--r-- intern/cycles/kernel/svm/svm_mapping_util.h | 39
-rw-r--r-- intern/cycles/kernel/svm/svm_math.h | 61
-rw-r--r-- intern/cycles/kernel/svm/svm_math_util.h | 256
-rw-r--r-- intern/cycles/kernel/svm/svm_musgrave.h | 806
-rw-r--r-- intern/cycles/kernel/svm/svm_noise.h | 857
-rw-r--r-- intern/cycles/kernel/svm/svm_noisetex.h | 207
-rw-r--r-- intern/cycles/kernel/svm/svm_ramp.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_sky.h | 304
-rw-r--r-- intern/cycles/kernel/svm/svm_tex_coord.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_texture.h | 63
-rw-r--r-- intern/cycles/kernel/svm/svm_types.h | 276
-rw-r--r-- intern/cycles/kernel/svm/svm_vector_rotate.h | 78
-rw-r--r-- intern/cycles/kernel/svm/svm_vector_transform.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_vertex_color.h | 92
-rw-r--r-- intern/cycles/kernel/svm/svm_voronoi.h | 1237
-rw-r--r-- intern/cycles/kernel/svm/svm_voxel.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_wave.h | 105
-rw-r--r-- intern/cycles/kernel/svm/svm_white_noise.h | 81
-rw-r--r-- intern/cycles/kernel/svm/svm_wireframe.h | 2
-rw-r--r-- intern/cycles/render/CMakeLists.txt | 42
-rw-r--r-- intern/cycles/render/attribute.cpp | 255
-rw-r--r-- intern/cycles/render/attribute.h | 64
-rw-r--r-- intern/cycles/render/background.cpp | 21
-rw-r--r-- intern/cycles/render/background.h | 4
-rw-r--r-- intern/cycles/render/bake.cpp | 309
-rw-r--r-- intern/cycles/render/bake.h | 52
-rw-r--r-- intern/cycles/render/buffers.cpp | 114
-rw-r--r-- intern/cycles/render/buffers.h | 58
-rw-r--r-- intern/cycles/render/camera.cpp | 66
-rw-r--r-- intern/cycles/render/colorspace.cpp | 395
-rw-r--r-- intern/cycles/render/colorspace.h | 66
-rw-r--r-- intern/cycles/render/constant_fold.cpp | 72
-rw-r--r-- intern/cycles/render/constant_fold.h | 7
-rw-r--r-- intern/cycles/render/coverage.cpp | 7
-rw-r--r-- intern/cycles/render/coverage.h | 15
-rw-r--r-- intern/cycles/render/curves.cpp | 119
-rw-r--r-- intern/cycles/render/curves.h | 65
-rw-r--r-- intern/cycles/render/denoising.cpp | 99
-rw-r--r-- intern/cycles/render/denoising.h | 21
-rw-r--r-- intern/cycles/render/film.cpp | 185
-rw-r--r-- intern/cycles/render/film.h | 8
-rw-r--r-- intern/cycles/render/geometry.cpp | 1473
-rw-r--r-- intern/cycles/render/geometry.h | 205
-rw-r--r-- intern/cycles/render/graph.cpp | 92
-rw-r--r-- intern/cycles/render/graph.h | 20
-rw-r--r-- intern/cycles/render/hair.cpp | 488
-rw-r--r-- intern/cycles/render/hair.h | 152
-rw-r--r-- intern/cycles/render/image.cpp | 961
-rw-r--r-- intern/cycles/render/image.h | 222
-rw-r--r-- intern/cycles/render/image_oiio.cpp | 236
-rw-r--r-- intern/cycles/render/image_oiio.h | 48
-rw-r--r-- intern/cycles/render/image_sky.cpp | 94
-rw-r--r-- intern/cycles/render/image_sky.h | 49
-rw-r--r-- intern/cycles/render/image_vdb.cpp | 188
-rw-r--r-- intern/cycles/render/image_vdb.h | 56
-rw-r--r-- intern/cycles/render/integrator.cpp | 73
-rw-r--r-- intern/cycles/render/integrator.h | 7
-rw-r--r-- intern/cycles/render/jitter.cpp | 287
-rw-r--r-- intern/cycles/render/jitter.h | 29
-rw-r--r-- intern/cycles/render/light.cpp | 259
-rw-r--r-- intern/cycles/render/light.h | 18
-rw-r--r-- intern/cycles/render/merge.cpp | 2
-rw-r--r-- intern/cycles/render/mesh.cpp | 1866
-rw-r--r-- intern/cycles/render/mesh.h | 237
-rw-r--r-- intern/cycles/render/mesh_displace.cpp | 81
-rw-r--r-- intern/cycles/render/mesh_subdivision.cpp | 147
-rw-r--r-- intern/cycles/render/mesh_volume.cpp | 112
-rw-r--r-- intern/cycles/render/nodes.cpp | 1752
-rw-r--r-- intern/cycles/render/nodes.h | 252
-rw-r--r-- intern/cycles/render/object.cpp | 438
-rw-r--r-- intern/cycles/render/object.h | 14
-rw-r--r-- intern/cycles/render/osl.cpp | 145
-rw-r--r-- intern/cycles/render/osl.h | 34
-rw-r--r-- intern/cycles/render/particles.cpp | 2
-rw-r--r-- intern/cycles/render/scene.cpp | 132
-rw-r--r-- intern/cycles/render/scene.h | 27
-rw-r--r-- intern/cycles/render/session.cpp | 442
-rw-r--r-- intern/cycles/render/session.h | 52
-rw-r--r-- intern/cycles/render/shader.cpp | 104
-rw-r--r-- intern/cycles/render/shader.h | 20
-rw-r--r-- intern/cycles/render/sobol.cpp | 58
-rw-r--r-- intern/cycles/render/stats.h | 2
-rw-r--r-- intern/cycles/render/svm.cpp | 211
-rw-r--r-- intern/cycles/render/svm.h | 19
-rw-r--r-- intern/cycles/render/tables.cpp | 2
-rw-r--r-- intern/cycles/render/tables.h | 1
-rw-r--r-- intern/cycles/render/tile.cpp | 171
-rw-r--r-- intern/cycles/render/tile.h | 12
-rw-r--r-- intern/cycles/subd/CMakeLists.txt | 1
-rw-r--r-- intern/cycles/subd/subd_dice.cpp | 263
-rw-r--r-- intern/cycles/subd/subd_dice.h | 75
-rw-r--r-- intern/cycles/subd/subd_patch.h | 11
-rw-r--r-- intern/cycles/subd/subd_split.cpp | 739
-rw-r--r-- intern/cycles/subd/subd_split.h | 38
-rw-r--r-- intern/cycles/subd/subd_subpatch.h | 219
-rw-r--r-- intern/cycles/test/CMakeLists.txt | 32
-rw-r--r-- intern/cycles/test/render_graph_finalize_test.cpp | 62
-rw-r--r-- intern/cycles/test/util_avxf_avx2_test.cpp | 21
-rw-r--r-- intern/cycles/test/util_avxf_avx_test.cpp | 21
-rw-r--r-- intern/cycles/test/util_avxf_test.h | 222
-rw-r--r-- intern/cycles/util/CMakeLists.txt | 25
-rw-r--r-- intern/cycles/util/util_algorithm.h | 1
-rw-r--r-- intern/cycles/util/util_aligned_malloc.cpp | 8
-rw-r--r-- intern/cycles/util/util_aligned_malloc.h | 15
-rw-r--r-- intern/cycles/util/util_array.h | 13
-rw-r--r-- intern/cycles/util/util_atomic.h | 2
-rw-r--r-- intern/cycles/util/util_avxb.h | 28
-rw-r--r-- intern/cycles/util/util_avxf.h | 68
-rw-r--r-- intern/cycles/util/util_avxi.h | 745
-rw-r--r-- intern/cycles/util/util_boundbox.h | 2
-rw-r--r-- intern/cycles/util/util_color.h | 41
-rw-r--r-- intern/cycles/util/util_debug.cpp | 34
-rw-r--r-- intern/cycles/util/util_debug.h | 28
-rw-r--r-- intern/cycles/util/util_defines.h | 8
-rw-r--r-- intern/cycles/util/util_deque.h | 28
-rw-r--r-- intern/cycles/util/util_disjoint_set.h | 75
-rw-r--r-- intern/cycles/util/util_guarded_allocator.h | 1
-rw-r--r-- intern/cycles/util/util_half.h | 5
-rw-r--r-- intern/cycles/util/util_hash.h | 357
-rw-r--r-- intern/cycles/util/util_ies.cpp | 36
-rw-r--r-- intern/cycles/util/util_ies.h | 6
-rw-r--r-- intern/cycles/util/util_image.h | 1
-rw-r--r-- intern/cycles/util/util_logging.cpp | 24
-rw-r--r-- intern/cycles/util/util_logging.h | 1
-rw-r--r-- intern/cycles/util/util_map.h | 8
-rw-r--r-- intern/cycles/util/util_math.h | 150
-rw-r--r-- intern/cycles/util/util_math_fast.h | 20
-rw-r--r-- intern/cycles/util/util_math_float2.h | 31
-rw-r--r-- intern/cycles/util/util_math_float3.h | 74
-rw-r--r-- intern/cycles/util/util_math_float4.h | 47
-rw-r--r-- intern/cycles/util/util_math_intersect.h | 2
-rw-r--r-- intern/cycles/util/util_math_matrix.h | 40
-rw-r--r-- intern/cycles/util/util_md5.cpp | 2
-rw-r--r-- intern/cycles/util/util_openimagedenoise.h | 39
-rw-r--r-- intern/cycles/util/util_param.h | 5
-rw-r--r-- intern/cycles/util/util_path.cpp | 4
-rw-r--r-- intern/cycles/util/util_profiling.cpp | 6
-rw-r--r-- intern/cycles/util/util_profiling.h | 1
-rw-r--r-- intern/cycles/util/util_progress.h | 7
-rw-r--r-- intern/cycles/util/util_semaphore.h | 61
-rw-r--r-- intern/cycles/util/util_simd.h | 34
-rw-r--r-- intern/cycles/util/util_sky_model.cpp | 349
-rw-r--r-- intern/cycles/util/util_sky_model.h | 429
-rw-r--r-- intern/cycles/util/util_sky_model_data.h | 3847
-rw-r--r-- intern/cycles/util/util_ssef.h | 28
-rw-r--r-- intern/cycles/util/util_ssei.h | 9
-rw-r--r-- intern/cycles/util/util_static_assert.h | 32
-rw-r--r-- intern/cycles/util/util_string.h | 2
-rw-r--r-- intern/cycles/util/util_system.cpp | 6
-rw-r--r-- intern/cycles/util/util_task.cpp | 432
-rw-r--r-- intern/cycles/util/util_task.h | 102
-rw-r--r-- intern/cycles/util/util_tbb.h | 44
-rw-r--r-- intern/cycles/util/util_texture.h | 19
-rw-r--r-- intern/cycles/util/util_thread.h | 81
-rw-r--r-- intern/cycles/util/util_transform.cpp | 40
-rw-r--r-- intern/cycles/util/util_transform.h | 63
-rw-r--r-- intern/cycles/util/util_types.h | 8
-rw-r--r-- intern/cycles/util/util_types_float8.h | 52
-rw-r--r-- intern/cycles/util/util_types_float8_impl.h | 52
-rw-r--r-- intern/cycles/util/util_vector.h | 2
-rw-r--r-- intern/cycles/util/util_version.h | 2
-rw-r--r-- intern/cycles/util/util_view.cpp | 2
-rw-r--r-- intern/cycles/util/util_windows.cpp | 54
-rw-r--r-- intern/cycles/util/util_windows.h | 6
522 files changed, 36750 insertions, 32158 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 87f88f7ed34..e5a5e9773d3 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -177,12 +177,11 @@ if(CXX_HAS_AVX2)
add_definitions(-DWITH_KERNEL_AVX2)
endif()
-if(WITH_CYCLES_OSL)
- if(WIN32 AND MSVC)
- set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
- elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
- endif()
+# LLVM and OSL need to build without RTTI
+if(WIN32 AND MSVC)
+ set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
+ set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
endif()
# Definitions and Includes
@@ -207,9 +206,9 @@ endif()
if(WITH_CYCLES_OSL)
add_definitions(-DWITH_OSL)
- #osl 1.9.x
+ # osl 1.9.x
add_definitions(-DOSL_STATIC_BUILD)
- #pre 1.9
+ # pre 1.9
add_definitions(-DOSL_STATIC_LIBRARY)
include_directories(
SYSTEM
@@ -217,6 +216,21 @@ if(WITH_CYCLES_OSL)
)
endif()
+if(WITH_CYCLES_DEVICE_OPTIX)
+ find_package(OptiX)
+
+ if(OPTIX_FOUND)
+ add_definitions(-DWITH_OPTIX)
+ include_directories(
+ SYSTEM
+ ${OPTIX_INCLUDE_DIR}
+ )
+ else()
+ message(STATUS "OptiX not found, disabling it from Cycles")
+ set(WITH_CYCLES_DEVICE_OPTIX OFF)
+ endif()
+endif()
+
if(WITH_CYCLES_EMBREE)
add_definitions(-DWITH_EMBREE)
add_definitions(-DEMBREE_STATIC_LIB)
@@ -272,6 +286,7 @@ include_directories(
${OPENEXR_INCLUDE_DIR}
${OPENEXR_INCLUDE_DIRS}
${PUGIXML_INCLUDE_DIR}
+ ${TBB_INCLUDE_DIRS}
)
if(CYCLES_STANDALONE_REPOSITORY)
@@ -299,9 +314,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
set(MAX_MSVC 1910)
elseif(${CUDA_VERSION} EQUAL "9.1")
set(MAX_MSVC 1911)
- elseif(${CUDA_VERSION} EQUAL "10.0")
- set(MAX_MSVC 1999)
- elseif(${CUDA_VERSION} EQUAL "10.1")
+ elseif(${CUDA_VERSION} LESS "11.0")
set(MAX_MSVC 1999)
endif()
if(NOT MSVC_VERSION LESS ${MAX_MSVC} OR CMAKE_C_COMPILER_ID MATCHES "Clang")
@@ -318,7 +331,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
endif()
# NVRTC gives wrong rendering result in CUDA 10.0, so we must use NVCC.
-if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_CUBIN_COMPILER)
+if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_CUBIN_COMPILER AND NOT WITH_CYCLES_CUBIN_COMPILER_OVERRRIDE)
if(NOT (${CUDA_VERSION} VERSION_LESS 10.0))
message(STATUS "cycles_cubin_cc not supported for CUDA 10.0+, using nvcc instead.")
set(WITH_CYCLES_CUBIN_COMPILER OFF)
@@ -336,14 +349,6 @@ if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
-if(WITH_OPENCOLORIO)
- add_definitions(-DWITH_OCIO)
- include_directories(
- SYSTEM
- ${OPENCOLORIO_INCLUDE_DIRS}
- )
-endif()
-
if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
add_subdirectory(app)
endif()
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 36e3e179be5..a2b0ed03925 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -22,7 +22,6 @@ set(LIBRARIES
${ZLIB_LIBRARIES}
${TIFF_LIBRARY}
${PTHREADS_LIBRARIES}
- extern_clew
)
if(WITH_CUDA_DYNLOAD)
@@ -36,7 +35,7 @@ if(WITH_CYCLES_OSL)
endif()
if(NOT CYCLES_STANDALONE_REPOSITORY)
- list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc bf_intern_numaapi)
+ list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc bf_intern_numaapi bf_intern_sky)
endif()
if(WITH_CYCLES_LOGGING)
@@ -52,14 +51,17 @@ endif()
# Common configuration.
-link_directories(${OPENIMAGEIO_LIBPATH}
- ${BOOST_LIBPATH}
- ${PNG_LIBPATH}
- ${JPEG_LIBPATH}
- ${ZLIB_LIBPATH}
- ${TIFF_LIBPATH}
- ${OPENEXR_LIBPATH}
- ${OPENJPEG_LIBPATH})
+link_directories(
+ ${OPENIMAGEIO_LIBPATH}
+ ${BOOST_LIBPATH}
+ ${PNG_LIBPATH}
+ ${JPEG_LIBPATH}
+ ${ZLIB_LIBPATH}
+ ${TIFF_LIBPATH}
+ ${OPENEXR_LIBPATH}
+ ${OPENJPEG_LIBPATH}
+ ${OPENVDB_LIBPATH}
+)
if(WITH_OPENCOLORIO)
link_directories(${OPENCOLORIO_LIBPATH})
diff --git a/intern/cycles/app/cycles_cubin_cc.cpp b/intern/cycles/app/cycles_cubin_cc.cpp
index 774c18f4219..7631cb9bed5 100644
--- a/intern/cycles/app/cycles_cubin_cc.cpp
+++ b/intern/cycles/app/cycles_cubin_cc.cpp
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include <stdio.h>
#include <stdint.h>
+#include <stdio.h>
#include <string>
#include <vector>
@@ -43,7 +43,8 @@ template<typename T> std::string to_string(const T &n)
class CompilationSettings {
public:
- CompilationSettings() : target_arch(0), bits(64), verbose(false), fast_math(false)
+ CompilationSettings()
+ : target_arch(0), bits(64), verbose(false), fast_math(false), ptx_only(false)
{
}
@@ -57,12 +58,13 @@ class CompilationSettings {
int bits;
bool verbose;
bool fast_math;
+ bool ptx_only;
};
static bool compile_cuda(CompilationSettings &settings)
{
- const char *headers[] = {"stdlib.h", "float.h", "math.h", "stdio.h"};
- const char *header_content[] = {"\n", "\n", "\n", "\n"};
+ const char *headers[] = {"stdlib.h", "float.h", "math.h", "stdio.h", "stddef.h"};
+ const char *header_content[] = {"\n", "\n", "\n", "\n", "\n"};
printf("Building %s\n", settings.input_file.c_str());
@@ -83,6 +85,8 @@ static bool compile_cuda(CompilationSettings &settings)
options.push_back("-D__KERNEL_CUDA_VERSION__=" + std::to_string(cuewNvrtcVersion()));
options.push_back("-arch=compute_" + std::to_string(settings.target_arch));
options.push_back("--device-as-default-execution-space");
+ options.push_back("-DCYCLES_CUBIN_CC");
+ options.push_back("--std=c++11");
if (settings.fast_math)
options.push_back("--use_fast_math");
@@ -134,10 +138,14 @@ static bool compile_cuda(CompilationSettings &settings)
fprintf(stderr, "Error: nvrtcGetPTX failed (%d)\n\n", (int)result);
return false;
}
-
- /* Write a file in the temp folder with the ptx code. */
- settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" +
- OIIO::Filesystem::unique_path();
+ if (settings.ptx_only) {
+ settings.ptx_file = settings.output_file;
+ }
+ else {
+ /* Write a file in the temp folder with the ptx code. */
+ settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" +
+ OIIO::Filesystem::unique_path();
+ }
FILE *f = fopen(settings.ptx_file.c_str(), "wb");
fwrite(&ptx_code[0], 1, ptx_size, f);
fclose(f);
@@ -249,6 +257,9 @@ static bool parse_parameters(int argc, const char **argv, CompilationSettings &s
"-D %L",
&settings.defines,
"Add additional defines",
+ "-ptx",
+ &settings.ptx_only,
+ "emit PTX code",
"-v",
&settings.verbose,
"Use verbose logging",
@@ -303,8 +314,10 @@ int main(int argc, const char **argv)
exit(EXIT_FAILURE);
}
- if (!link_ptxas(settings)) {
- exit(EXIT_FAILURE);
+ if (!settings.ptx_only) {
+ if (!link_ptxas(settings)) {
+ exit(EXIT_FAILURE);
+ }
}
return 0;
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index c5a4c9b375b..1ad70a376ed 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -20,11 +20,11 @@
#include "util/util_args.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
#include "util/util_path.h"
#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_task.h"
-#include "util/util_logging.h"
using namespace ccl;
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index d2d112e8d7e..f057ce7a2f0 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -16,16 +16,17 @@
#include <stdio.h>
+#include "device/device.h"
#include "render/buffers.h"
#include "render/camera.h"
-#include "device/device.h"
+#include "render/integrator.h"
#include "render/scene.h"
#include "render/session.h"
-#include "render/integrator.h"
#include "util/util_args.h"
#include "util/util_foreach.h"
#include "util/util_function.h"
+#include "util/util_image.h"
#include "util/util_logging.h"
#include "util/util_path.h"
#include "util/util_progress.h"
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 9caf7068d3d..aec00f845f3 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -16,9 +16,9 @@
#include <stdio.h>
-#include <sstream>
#include <algorithm>
#include <iterator>
+#include <sstream>
#include "graph/node_xml.h"
@@ -32,8 +32,8 @@
#include "render/nodes.h"
#include "render/object.h"
#include "render/osl.h"
-#include "render/shader.h"
#include "render/scene.h"
+#include "render/shader.h"
#include "subd/subd_patch.h"
#include "subd/subd_split.h"
@@ -292,7 +292,7 @@ static void xml_read_shader_graph(XMLReadState &state, Shader *shader, xml_node
filepath = path_join(state.base, filepath);
}
- snode = ((OSLShaderManager *)manager)->osl_node(filepath);
+ snode = OSLShaderManager::osl_node(manager, filepath);
if (!snode) {
fprintf(stderr, "Failed to create OSL node from \"%s\".\n", filepath.c_str());
@@ -326,6 +326,10 @@ static void xml_read_shader_graph(XMLReadState &state, Shader *shader, xml_node
fprintf(stderr, "Node type \"%s\" is not a shader node.\n", node_type->name.c_str());
continue;
}
+ else if (node_type->create == NULL) {
+ fprintf(stderr, "Can't create abstract node type \"%s\".\n", node_type->name.c_str());
+ continue;
+ }
snode = (ShaderNode *)node_type->create(node_type);
}
@@ -376,11 +380,11 @@ static Mesh *xml_add_mesh(Scene *scene, const Transform &tfm)
{
/* create mesh */
Mesh *mesh = new Mesh();
- scene->meshes.push_back(mesh);
+ scene->geometry.push_back(mesh);
/* create object*/
Object *object = new Object();
- object->mesh = mesh;
+ object->geometry = mesh;
object->tfm = tfm;
scene->objects.push_back(object);
@@ -495,7 +499,7 @@ static void xml_read_mesh(const XMLReadState &state, xml_node node)
float3 *fdata = attr->data_float3();
#if 0
- if(subdivide_uvs) {
+ if (subdivide_uvs) {
attr->flags |= ATTR_SUBDIVIDED;
}
#endif
diff --git a/intern/cycles/app/io_export_cycles_xml.py b/intern/cycles/app/io_export_cycles_xml.py
index a1b42f72f7c..d2c6dc493e8 100644
--- a/intern/cycles/app/io_export_cycles_xml.py
+++ b/intern/cycles/app/io_export_cycles_xml.py
@@ -64,7 +64,7 @@ class RenderButtonsPanel():
bl_context = "render"
@classmethod
- def poll(self, context):
+ def poll(cls, context):
return context.engine == 'CYCLES'
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 7354b1e615e..2316800e21e 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -18,6 +18,9 @@ set(INC_SYS
set(SRC
blender_camera.cpp
blender_device.cpp
+ blender_image.cpp
+ blender_geometry.cpp
+ blender_light.cpp
blender_mesh.cpp
blender_object.cpp
blender_object_cull.cpp
@@ -29,13 +32,19 @@ set(SRC
blender_shader.cpp
blender_sync.cpp
blender_texture.cpp
+ blender_viewport.cpp
+ blender_volume.cpp
CCL_api.h
+ blender_device.h
+ blender_id_map.h
+ blender_image.h
blender_object_cull.h
blender_sync.h
blender_session.h
blender_texture.h
blender_util.h
+ blender_viewport.h
)
set(LIB
@@ -46,11 +55,15 @@ set(LIB
cycles_render
cycles_subd
cycles_util
+
+ ${PYTHON_LINKFLAGS}
+ ${PYTHON_LIBRARIES}
)
if(WITH_CYCLES_LOGGING)
list(APPEND LIB
- extern_glog
+ ${GLOG_LIBRARIES}
+ ${GFLAGS_LIBRARIES}
)
endif()
@@ -68,13 +81,34 @@ set(ADDON_FILES
add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_DEVICE_OPENCL)
- add_definitions(-DWITH_OPENCL)
+ add_definitions(-DWITH_OPENCL)
endif()
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
+if(WITH_MOD_FLUID)
+ add_definitions(-DWITH_FLUID)
+endif()
+
+if(WITH_OPENVDB)
+ add_definitions(-DWITH_OPENVDB ${OPENVDB_DEFINITIONS})
+ list(APPEND INC_SYS
+ ${OPENVDB_INCLUDE_DIRS}
+ )
+ list(APPEND LIB
+ ${OPENVDB_LIBRARIES}
+ )
+endif()
+
+if(WITH_OPENIMAGEDENOISE)
+ add_definitions(-DWITH_OPENIMAGEDENOISE)
+ list(APPEND INC_SYS
+ ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+ )
+endif()
+
blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
# avoid link failure with clang 3.4 debug
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 93a1271b4b4..3ab352e52a2 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -20,10 +20,9 @@ bl_info = {
"name": "Cycles Render Engine",
"author": "",
"blender": (2, 80, 0),
- "location": "Info header, render engine menu",
- "description": "Cycles Render Engine integration",
+ "description": "Cycles renderer integration",
"warning": "",
- "wiki_url": "https://docs.blender.org/manual/en/dev/render/cycles/",
+ "doc_url": "https://docs.blender.org/manual/en/latest/render/cycles/",
"tracker_url": "",
"support": 'OFFICIAL',
"category": "Render"}
@@ -55,7 +54,7 @@ from . import (
class CyclesRender(bpy.types.RenderEngine):
bl_idname = 'CYCLES'
bl_label = "Cycles"
- bl_use_shading_nodes = True
+ bl_use_eevee_viewport = True
bl_use_preview = True
bl_use_exclude_layers = True
bl_use_save_buffers = True
@@ -83,20 +82,20 @@ class CyclesRender(bpy.types.RenderEngine):
def render(self, depsgraph):
engine.render(self, depsgraph)
- def bake(self, depsgraph, obj, pass_type, pass_filter, object_id, pixel_array, num_pixels, depth, result):
- engine.bake(self, depsgraph, obj, pass_type, pass_filter, object_id, pixel_array, num_pixels, depth, result)
+ def bake(self, depsgraph, obj, pass_type, pass_filter, width, height):
+ engine.bake(self, depsgraph, obj, pass_type, pass_filter, width, height)
# viewport render
- def view_update(self, context):
+ def view_update(self, context, depsgraph):
if not self.session:
engine.create(self, context.blend_data,
context.region, context.space_data, context.region_data)
- engine.reset(self, context.blend_data, context.depsgraph)
- engine.sync(self, context.depsgraph, context.blend_data)
+ engine.reset(self, context.blend_data, depsgraph)
+ engine.sync(self, depsgraph, context.blend_data)
- def view_draw(self, context):
- engine.draw(self, context.depsgraph, context.region, context.space_data, context.region_data)
+ def view_draw(self, context, depsgraph):
+ engine.draw(self, depsgraph, context.region, context.space_data, context.region_data)
def update_script_node(self, node):
if engine.with_osl():
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index b8bc74f9e35..67e448db859 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -33,7 +33,7 @@ def _is_using_buggy_driver():
# in the version string, but those cards do not quite work and
# causing crashes.
return True
- regex = re.compile(".*Compatibility Profile Context ([0-9]+(\.[0-9]+)+)$")
+ regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$")
if not regex.match(version):
# Skip cards like FireGL
return False
@@ -139,15 +139,19 @@ def create(engine, data, region=None, v3d=None, rv3d=None, preview_osl=False):
data = data.as_pointer()
prefs = bpy.context.preferences.as_pointer()
+ screen = 0
if region:
+ screen = region.id_data.as_pointer()
region = region.as_pointer()
if v3d:
+ screen = screen or v3d.id_data.as_pointer()
v3d = v3d.as_pointer()
if rv3d:
+ screen = screen or rv3d.id_data.as_pointer()
rv3d = rv3d.as_pointer()
engine.session = _cycles.create(
- engine.as_pointer(), prefs, data, region, v3d, rv3d, preview_osl)
+ engine.as_pointer(), prefs, data, screen, region, v3d, rv3d, preview_osl)
def free(engine):
@@ -164,18 +168,19 @@ def render(engine, depsgraph):
_cycles.render(engine.session, depsgraph.as_pointer())
-def bake(engine, depsgraph, obj, pass_type, pass_filter, object_id, pixel_array, num_pixels, depth, result):
+def bake(engine, depsgraph, obj, pass_type, pass_filter, width, height):
import _cycles
session = getattr(engine, "session", None)
if session is not None:
- _cycles.bake(engine.session, depsgraph.as_pointer(), obj.as_pointer(), pass_type, pass_filter, object_id, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
+ _cycles.bake(engine.session, depsgraph.as_pointer(), obj.as_pointer(), pass_type, pass_filter, width, height)
def reset(engine, data, depsgraph):
import _cycles
import bpy
- if bpy.app.debug_value == 256:
+ prefs = bpy.context.preferences
+ if prefs.experimental.use_cycles_debug and prefs.view.show_developer_ui:
_cycles.debug_flags_update(depsgraph.scene.as_pointer())
else:
_cycles.debug_flags_reset()
@@ -219,65 +224,96 @@ def system_info():
import _cycles
return _cycles.system_info()
-
-def register_passes(engine, scene, srl):
- engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR')
-
- if srl.use_pass_z: engine.register_pass(scene, srl, "Depth", 1, "Z", 'VALUE')
- if srl.use_pass_mist: engine.register_pass(scene, srl, "Mist", 1, "Z", 'VALUE')
- if srl.use_pass_normal: engine.register_pass(scene, srl, "Normal", 3, "XYZ", 'VECTOR')
- if srl.use_pass_vector: engine.register_pass(scene, srl, "Vector", 4, "XYZW", 'VECTOR')
- if srl.use_pass_uv: engine.register_pass(scene, srl, "UV", 3, "UVA", 'VECTOR')
- if srl.use_pass_object_index: engine.register_pass(scene, srl, "IndexOB", 1, "X", 'VALUE')
- if srl.use_pass_material_index: engine.register_pass(scene, srl, "IndexMA", 1, "X", 'VALUE')
- if srl.use_pass_shadow: engine.register_pass(scene, srl, "Shadow", 3, "RGB", 'COLOR')
- if srl.use_pass_ambient_occlusion: engine.register_pass(scene, srl, "AO", 3, "RGB", 'COLOR')
- if srl.use_pass_diffuse_direct: engine.register_pass(scene, srl, "DiffDir", 3, "RGB", 'COLOR')
- if srl.use_pass_diffuse_indirect: engine.register_pass(scene, srl, "DiffInd", 3, "RGB", 'COLOR')
- if srl.use_pass_diffuse_color: engine.register_pass(scene, srl, "DiffCol", 3, "RGB", 'COLOR')
- if srl.use_pass_glossy_direct: engine.register_pass(scene, srl, "GlossDir", 3, "RGB", 'COLOR')
- if srl.use_pass_glossy_indirect: engine.register_pass(scene, srl, "GlossInd", 3, "RGB", 'COLOR')
- if srl.use_pass_glossy_color: engine.register_pass(scene, srl, "GlossCol", 3, "RGB", 'COLOR')
- if srl.use_pass_transmission_direct: engine.register_pass(scene, srl, "TransDir", 3, "RGB", 'COLOR')
- if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd", 3, "RGB", 'COLOR')
- if srl.use_pass_transmission_color: engine.register_pass(scene, srl, "TransCol", 3, "RGB", 'COLOR')
- if srl.use_pass_subsurface_direct: engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB", 'COLOR')
- if srl.use_pass_subsurface_indirect: engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB", 'COLOR')
- if srl.use_pass_subsurface_color: engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB", 'COLOR')
- if srl.use_pass_emit: engine.register_pass(scene, srl, "Emit", 3, "RGB", 'COLOR')
- if srl.use_pass_environment: engine.register_pass(scene, srl, "Env", 3, "RGB", 'COLOR')
-
+def list_render_passes(scene, srl):
+ # Builtin Blender passes.
+ yield ("Combined", "RGBA", 'COLOR')
+
+ if srl.use_pass_z: yield ("Depth", "Z", 'VALUE')
+ if srl.use_pass_mist: yield ("Mist", "Z", 'VALUE')
+ if srl.use_pass_normal: yield ("Normal", "XYZ", 'VECTOR')
+ if srl.use_pass_vector: yield ("Vector", "XYZW", 'VECTOR')
+ if srl.use_pass_uv: yield ("UV", "UVA", 'VECTOR')
+ if srl.use_pass_object_index: yield ("IndexOB", "X", 'VALUE')
+ if srl.use_pass_material_index: yield ("IndexMA", "X", 'VALUE')
+ if srl.use_pass_shadow: yield ("Shadow", "RGB", 'COLOR')
+ if srl.use_pass_ambient_occlusion: yield ("AO", "RGB", 'COLOR')
+ if srl.use_pass_diffuse_direct: yield ("DiffDir", "RGB", 'COLOR')
+ if srl.use_pass_diffuse_indirect: yield ("DiffInd", "RGB", 'COLOR')
+ if srl.use_pass_diffuse_color: yield ("DiffCol", "RGB", 'COLOR')
+ if srl.use_pass_glossy_direct: yield ("GlossDir", "RGB", 'COLOR')
+ if srl.use_pass_glossy_indirect: yield ("GlossInd", "RGB", 'COLOR')
+ if srl.use_pass_glossy_color: yield ("GlossCol", "RGB", 'COLOR')
+ if srl.use_pass_transmission_direct: yield ("TransDir", "RGB", 'COLOR')
+ if srl.use_pass_transmission_indirect: yield ("TransInd", "RGB", 'COLOR')
+ if srl.use_pass_transmission_color: yield ("TransCol", "RGB", 'COLOR')
+ if srl.use_pass_emit: yield ("Emit", "RGB", 'COLOR')
+ if srl.use_pass_environment: yield ("Env", "RGB", 'COLOR')
+
+ # Cycles specific passes.
crl = srl.cycles
- if crl.pass_debug_render_time: engine.register_pass(scene, srl, "Debug Render Time", 1, "X", 'VALUE')
- if crl.pass_debug_bvh_traversed_nodes: engine.register_pass(scene, srl, "Debug BVH Traversed Nodes", 1, "X", 'VALUE')
- if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
- if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
- if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
- if crl.use_pass_volume_direct: engine.register_pass(scene, srl, "VolumeDir", 3, "RGB", 'COLOR')
- if crl.use_pass_volume_indirect: engine.register_pass(scene, srl, "VolumeInd", 3, "RGB", 'COLOR')
-
+ if crl.pass_debug_render_time: yield ("Debug Render Time", "X", 'VALUE')
+ if crl.pass_debug_bvh_traversed_nodes: yield ("Debug BVH Traversed Nodes", "X", 'VALUE')
+ if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X", 'VALUE')
+ if crl.pass_debug_bvh_intersections: yield ("Debug BVH Intersections", "X", 'VALUE')
+ if crl.pass_debug_ray_bounces: yield ("Debug Ray Bounces", "X", 'VALUE')
+ if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE')
+ if crl.use_pass_volume_direct: yield ("VolumeDir", "RGB", 'COLOR')
+ if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR')
+
+ # Cryptomatte passes.
+ crypto_depth = (crl.pass_crypto_depth + 1) // 2
if crl.use_pass_crypto_object:
- for i in range(0, crl.pass_crypto_depth, 2):
- engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
+ for i in range(0, crypto_depth):
+ yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR')
if crl.use_pass_crypto_material:
- for i in range(0, crl.pass_crypto_depth, 2):
- engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
+ for i in range(0, crypto_depth):
+ yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR')
if srl.cycles.use_pass_crypto_asset:
- for i in range(0, srl.cycles.pass_crypto_depth, 2):
- engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
+ for i in range(0, crypto_depth):
+ yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
+ # Denoising passes.
if crl.use_denoising or crl.denoising_store_passes:
- engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR')
+ yield ("Noisy Image", "RGBA", 'COLOR')
if crl.denoising_store_passes:
- engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR')
- engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR')
- engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE')
- engine.register_pass(scene, srl, "Denoising Shadowing", 1, "X", 'VALUE')
- engine.register_pass(scene, srl, "Denoising Variance", 3, "RGB", 'COLOR')
- engine.register_pass(scene, srl, "Denoising Intensity", 1, "X", 'VALUE')
- clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
- "denoising_glossy_direct", "denoising_glossy_indirect",
- "denoising_transmission_direct", "denoising_transmission_indirect",
- "denoising_subsurface_direct", "denoising_subsurface_indirect")
- if any(getattr(crl, option) for option in clean_options):
- engine.register_pass(scene, srl, "Denoising Clean", 3, "RGB", 'COLOR')
+ yield ("Denoising Normal", "XYZ", 'VECTOR')
+ yield ("Denoising Albedo", "RGB", 'COLOR')
+ yield ("Denoising Depth", "Z", 'VALUE')
+
+ if scene.cycles.denoiser == 'NLM':
+ yield ("Denoising Shadowing", "X", 'VALUE')
+ yield ("Denoising Variance", "RGB", 'COLOR')
+ yield ("Denoising Intensity", "X", 'VALUE')
+
+ clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
+ "denoising_glossy_direct", "denoising_glossy_indirect",
+ "denoising_transmission_direct", "denoising_transmission_indirect")
+ if any(getattr(crl, option) for option in clean_options):
+ yield ("Denoising Clean", "RGB", 'COLOR')
+
+ # Custom AOV passes.
+ for aov in crl.aovs:
+ if aov.type == 'VALUE':
+ yield (aov.name, "X", 'VALUE')
+ else:
+ yield (aov.name, "RGBA", 'COLOR')
+
+def register_passes(engine, scene, view_layer):
+ # Detect duplicate render pass names, first one wins.
+ listed = set()
+ for name, channelids, channeltype in list_render_passes(scene, view_layer):
+ if name not in listed:
+ engine.register_pass(scene, view_layer, name, len(channelids), channelids, channeltype)
+ listed.add(name)
+
+def detect_conflicting_passes(scene, view_layer):
+ # Detect conflicting render pass names for UI.
+ counter = {}
+ for name, _, _ in list_render_passes(scene, view_layer):
+ counter[name] = counter.get(name, 0) + 1
+
+ for aov in view_layer.cycles.aovs:
+ if counter[aov.name] > 1:
+ aov.conflict = "Conflicts with another render pass with the same name"
+ else:
+ aov.conflict = ""
diff --git a/intern/cycles/blender/addon/operators.py b/intern/cycles/blender/addon/operators.py
index 63c61c4799e..3c8e79eaba5 100644
--- a/intern/cycles/blender/addon/operators.py
+++ b/intern/cycles/blender/addon/operators.py
@@ -20,6 +20,8 @@ import bpy
from bpy.types import Operator
from bpy.props import StringProperty
+from bpy.app.translations import pgettext_tip as tip_
+
class CYCLES_OT_use_shading_nodes(Operator):
"""Enable nodes on a material, world or light"""
@@ -42,6 +44,36 @@ class CYCLES_OT_use_shading_nodes(Operator):
return {'FINISHED'}
+class CYCLES_OT_add_aov(bpy.types.Operator):
+ """Add an AOV pass"""
+ bl_idname="cycles.add_aov"
+ bl_label="Add AOV"
+
+ def execute(self, context):
+ view_layer = context.view_layer
+ cycles_view_layer = view_layer.cycles
+
+ cycles_view_layer.aovs.add()
+
+ view_layer.update_render_passes()
+ return {'FINISHED'}
+
+
+class CYCLES_OT_remove_aov(bpy.types.Operator):
+ """Remove an AOV pass"""
+ bl_idname="cycles.remove_aov"
+ bl_label="Remove AOV"
+
+ def execute(self, context):
+ view_layer = context.view_layer
+ cycles_view_layer = view_layer.cycles
+
+ cycles_view_layer.aovs.remove(cycles_view_layer.active_aov)
+
+ view_layer.update_render_passes()
+ return {'FINISHED'}
+
+
class CYCLES_OT_denoise_animation(Operator):
"Denoise rendered animation sequence using current scene and view " \
"layer settings. Requires denoising data passes and output to " \
@@ -98,7 +130,8 @@ class CYCLES_OT_denoise_animation(Operator):
if not os.path.isfile(filepath):
scene.render.filepath = original_filepath
- self.report({'ERROR'}, f"Frame '{filepath}' not found, animation must be complete.")
+ err_msg = tip_("Frame '%s' not found, animation must be complete") % filepath
+ self.report({'ERROR'}, err_msg)
return {'CANCELLED'}
scene.render.filepath = out_filepath
@@ -120,12 +153,12 @@ class CYCLES_OT_denoise_animation(Operator):
self.report({'ERROR'}, str(e))
return {'FINISHED'}
- self.report({'INFO'}, "Denoising completed.")
+ self.report({'INFO'}, "Denoising completed")
return {'FINISHED'}
class CYCLES_OT_merge_images(Operator):
- "Combine OpenEXR multilayer images rendered with different sample" \
+ "Combine OpenEXR multilayer images rendered with different sample " \
"ranges into one image with reduced noise"
bl_idname = "cycles.merge_images"
bl_label = "Merge Images"
@@ -164,6 +197,8 @@ class CYCLES_OT_merge_images(Operator):
classes = (
CYCLES_OT_use_shading_nodes,
+ CYCLES_OT_add_aov,
+ CYCLES_OT_remove_aov,
CYCLES_OT_denoise_animation,
CYCLES_OT_merge_images
)
diff --git a/intern/cycles/blender/addon/osl.py b/intern/cycles/blender/addon/osl.py
index dd92ce642d4..4c6e7952491 100644
--- a/intern/cycles/blender/addon/osl.py
+++ b/intern/cycles/blender/addon/osl.py
@@ -85,6 +85,7 @@ def update_script_node(node, report):
# write text datablock contents to temporary file
osl_file = tempfile.NamedTemporaryFile(mode='w', suffix=".osl", delete=False)
osl_file.write(script.as_string())
+ osl_file.write("\n")
osl_file.close()
ok, oso_path = osl_compile(osl_file.name, report)
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index db9e8bb46b3..d764f469eb7 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -19,6 +19,7 @@
import bpy
from bpy.props import (
BoolProperty,
+ CollectionProperty,
EnumProperty,
FloatProperty,
IntProperty,
@@ -31,6 +32,7 @@ from math import pi
# enums
import _cycles
+from . import engine
enum_devices = (
('CPU', "CPU", "Use CPU for rendering"),
@@ -53,8 +55,7 @@ enum_displacement_methods = (
enum_bvh_layouts = (
('BVH2', "BVH2", "", 1),
- ('BVH4', "BVH4", "", 2),
- ('BVH8', "BVH8", "", 4),
+ ('EMBREE', "Embree", "", 4),
)
enum_bvh_types = (
@@ -68,11 +69,6 @@ enum_filter_types = (
('BLACKMAN_HARRIS', "Blackman-Harris", "Blackman-Harris filter"),
)
-enum_aperture_types = (
- ('RADIUS', "Radius", "Directly change the size of the aperture"),
- ('FSTOP', "F-stop", "Change the size of the aperture by f-stop"),
-)
-
enum_panorama_types = (
('EQUIRECTANGULAR', "Equirectangular", "Render the scene with a spherical camera, also known as Lat Long panorama"),
('FISHEYE_EQUIDISTANT', "Fisheye Equidistant", "Ideal for fulldomes, ignore the sensor dimensions"),
@@ -81,20 +77,9 @@ enum_panorama_types = (
('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"),
)
-enum_curve_primitives = (
- ('TRIANGLES', "Triangles", "Create triangle geometry around strands"),
- ('LINE_SEGMENTS', "Line Segments", "Use line segment primitives"),
- ('CURVE_SEGMENTS', "Curve Segments", "Use segmented cardinal curve primitives"),
-)
-
-enum_triangle_curves = (
- ('CAMERA_TRIANGLES', "Planes", "Create individual triangles forming planes that face camera"),
- ('TESSELLATED_TRIANGLES', "Tessellated", "Create mesh surrounding each strand"),
-)
-
enum_curve_shape = (
- ('RIBBONS', "Ribbons", "Ignore thickness of each strand"),
- ('THICK', "Thick", "Use thickness of strand when rendering"),
+ ('RIBBONS', "Rounded Ribbons", "Render hair as flat ribbon with rounded normals, for fast rendering"),
+ ('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"),
)
enum_tile_order = (
@@ -115,6 +100,7 @@ enum_use_layer_samples = (
enum_sampling_pattern = (
('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
+ ('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"),
)
enum_integrator = (
@@ -142,6 +128,7 @@ enum_world_mis = (
enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
+ ('OPTIX', "OptiX", "OptiX", 3),
('OPENCL', "OpenCL", "OpenCL", 2)
)
@@ -156,6 +143,88 @@ enum_texture_limit = (
('8192', "8192", "Limit texture size to 8192 pixels", 7),
)
+enum_view3d_shading_render_pass = (
+ ('', "General", ""),
+
+ ('COMBINED', "Combined", "Show the Combined Render pass", 1),
+ ('EMISSION', "Emission", "Show the Emission render pass", 33),
+ ('BACKGROUND', "Background", "Show the Background render pass", 34),
+ ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass", 35),
+
+ ('', "Light", ""),
+
+ ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass", 38),
+ ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass", 39),
+ ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass", 40),
+
+ ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass", 41),
+ ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass", 42),
+ ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass", 43),
+
+ ('', "", ""),
+
+ ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass", 44),
+ ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45),
+ ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46),
+
+ ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50),
+ ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51),
+
+ ('', "Data", ""),
+
+ ('NORMAL', "Normal", "Show the Normal render pass", 3),
+ ('UV', "UV", "Show the UV render pass", 4),
+ ('MIST', "Mist", "Show the Mist render pass", 32),
+)
+
+enum_aov_types = (
+ ('VALUE', "Value", "Write a Value pass", 0),
+ ('COLOR', "Color", "Write a Color pass", 1),
+)
+
+
+def enum_openimagedenoise_denoiser(self, context):
+ if _cycles.with_openimagedenoise:
+ return [('OPENIMAGEDENOISE', "OpenImageDenoise", "Use Intel OpenImageDenoise AI denoiser running on the CPU", 4)]
+ return []
+
+def enum_optix_denoiser(self, context):
+ if not context or bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX')):
+ return [('OPTIX', "OptiX", "Use the OptiX AI denoiser with GPU acceleration, only available on NVIDIA GPUs", 2)]
+ return []
+
+def enum_preview_denoiser(self, context):
+ optix_items = enum_optix_denoiser(self, context)
+ oidn_items = enum_openimagedenoise_denoiser(self, context)
+
+ if len(optix_items) or len(oidn_items):
+ items = [('AUTO', "Automatic", "Use the fastest available denoiser for viewport rendering (OptiX if available, OpenImageDenoise otherwise)", 0)]
+ else:
+ items = [('AUTO', "None", "Blender was compiled without a viewport denoiser", 0)]
+
+ items += optix_items
+ items += oidn_items
+ return items
+
+def enum_denoiser(self, context):
+ items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)]
+ items += enum_optix_denoiser(self, context)
+ items += enum_openimagedenoise_denoiser(self, context)
+ return items
+
+enum_denoising_input_passes = (
+ ('RGB', "Color", "Use only color as input", 1),
+ ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2),
+ ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3),
+)
+
+
+def update_render_passes(self, context):
+ scene = context.scene
+ view_layer = context.view_layer
+ view_layer.update_render_passes()
+ engine.detect_conflicting_passes(scene, view_layer)
+
class CyclesRenderSettings(bpy.types.PropertyGroup):
@@ -183,6 +252,39 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default='PATH',
)
+ preview_pause: BoolProperty(
+ name="Pause Preview",
+ description="Pause all viewport preview renders",
+ default=False,
+ )
+
+ use_denoising: BoolProperty(
+ name="Use Denoising",
+ description="Denoise the rendered image",
+ default=False,
+ )
+ use_preview_denoising: BoolProperty(
+ name="Use Viewport Denoising",
+ description="Denoise the image in the 3D viewport",
+ default=False,
+ )
+
+ denoiser: EnumProperty(
+ name="Denoiser",
+ description="Denoise the image with the selected denoiser. "
+ "For denoising the image after rendering, denoising data render passes "
+ "also adapt to the selected denoiser",
+ items=enum_denoiser,
+ default=1,
+ update=update_render_passes,
+ )
+ preview_denoiser: EnumProperty(
+ name="Viewport Denoiser",
+ description="Denoise the image after each preview update with the selected denoiser",
+ items=enum_preview_denoiser,
+ default=0,
+ )
+
use_square_samples: BoolProperty(
name="Square Samples",
description="Square sampling values for easier artist control",
@@ -209,16 +311,11 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=128,
)
preview_samples: IntProperty(
- name="Preview Samples",
+ name="Viewport Samples",
description="Number of samples to render in the viewport, unlimited if 0",
min=0, max=(1 << 24),
default=32,
)
- preview_pause: BoolProperty(
- name="Pause Preview",
- description="Pause all viewport preview renders",
- default=False,
- )
aa_samples: IntProperty(
name="AA Samples",
description="Number of antialiasing samples to render for each pixel",
@@ -231,6 +328,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=0, max=2097151,
default=32,
)
+
diffuse_samples: IntProperty(
name="Diffuse Samples",
description="Number of diffuse bounce samples to render for each AA sample",
@@ -261,14 +359,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=1, max=1024,
default=1,
)
-
subsurface_samples: IntProperty(
name="Subsurface Samples",
description="Number of subsurface scattering samples to render for each AA sample",
min=1, max=1024,
default=1,
)
-
volume_samples: IntProperty(
name="Volume Samples",
description="Number of volume scattering samples to render for each AA sample",
@@ -309,6 +405,41 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=0.01,
)
+ use_adaptive_sampling: BoolProperty(
+ name="Use Adaptive Sampling",
+ description="Automatically reduce the number of samples per pixel based on estimated noise level",
+ default=False,
+ )
+
+ adaptive_threshold: FloatProperty(
+ name="Adaptive Sampling Threshold",
+ description="Noise level step to stop sampling at, lower values reduce noise the cost of render time. Zero for automatic setting based on number of AA samples",
+ min=0.0, max=1.0,
+ default=0.0,
+ precision=4,
+ )
+ adaptive_min_samples: IntProperty(
+ name="Adaptive Min Samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples",
+ min=0, max=4096,
+ default=0,
+ )
+
+ min_light_bounces: IntProperty(
+ name="Min Light Bounces",
+ description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
+ "but can also be less efficient for more complex geometry like hair and volumes",
+ min=0, max=1024,
+ default=0,
+ )
+ min_transparent_bounces: IntProperty(
+ name="Min Transparent Bounces",
+ description="Minimum number of transparent bounces. Setting this higher reduces noise in the first bounces, "
+ "but can also be less efficient for more complex geometry like hair and volumes",
+ min=0, max=1024,
+ default=0,
+ )
+
caustics_reflective: BoolProperty(
name="Reflective Caustics",
description="Use reflective caustics, resulting in a brighter image (more noise but added realism)",
@@ -368,13 +499,20 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=8,
)
- volume_step_size: FloatProperty(
- name="Step Size",
- description="Distance between volume shader samples when rendering the volume "
- "(lower values give more accurate and detailed results, but also increased render time)",
- default=0.1,
- min=0.0000001, max=100000.0, soft_min=0.01, soft_max=1.0, precision=4,
- unit='LENGTH'
+ volume_step_rate: FloatProperty(
+ name="Step Rate",
+ description="Globally adjust detail for volume rendering, on top of automatically estimated step size. "
+ "Higher values reduce render time, lower values render with more detail",
+ default=1.0,
+ min=0.01, max=100.0, soft_min=0.1, soft_max=10.0, precision=2
+ )
+
+ volume_preview_step_rate: FloatProperty(
+ name="Step Rate",
+ description="Globally adjust detail for volume rendering, on top of automatically estimated step size. "
+ "Higher values reduce render time, lower values render with more detail",
+ default=1.0,
+ min=0.01, max=100.0, soft_min=0.1, soft_max=10.0, precision=2
)
volume_max_steps: IntProperty(
@@ -393,7 +531,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
subtype='PIXEL'
)
preview_dicing_rate: FloatProperty(
- name="Preview Dicing Rate",
+ name="Viewport Dicing Rate",
description="Size of a micropolygon in pixels during preview render",
min=0.1, max=1000.0, soft_min=0.5,
default=8.0,
@@ -430,11 +568,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=0.0, max=10.0,
default=1.0,
)
- film_transparent: BoolProperty(
- name="Transparent",
- description="World background is transparent, for compositing the render over another background",
- default=False,
- )
film_transparent_glass: BoolProperty(
name="Transparent Glass",
description="Render transmissive surfaces as transparent, for compositing glass over another background",
@@ -519,6 +652,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=64,
subtype='PIXEL'
)
+ preview_denoising_start_sample: IntProperty(
+ name="Start Denoising",
+ description="Sample to start denoising the preview at",
+ min=0, max=(1 << 24),
+ default=1,
+ )
debug_reset_timeout: FloatProperty(
name="Reset timeout",
@@ -545,11 +684,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
items=enum_bvh_types,
default='DYNAMIC_BVH',
)
- use_bvh_embree: BoolProperty(
- name="Use Embree",
- description="Use Embree as ray accelerator",
- default=False,
- )
debug_use_spatial_splits: BoolProperty(
name="Use Spatial Splits",
description="Use BVH spatial splits: longer builder time, faster render",
@@ -598,7 +732,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
('DIFFUSE', "Diffuse", ""),
('GLOSSY', "Glossy", ""),
('TRANSMISSION', "Transmission", ""),
- ('SUBSURFACE', "Subsurface", ""),
),
)
@@ -703,13 +836,16 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
debug_bvh_layout: EnumProperty(
name="BVH Layout",
items=enum_bvh_layouts,
- default='BVH8',
+ default='EMBREE',
)
debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)
debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False)
debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
+ debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
+ debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)
+
debug_opencl_kernel_type: EnumProperty(
name="OpenCL Kernel Type",
default='DEFAULT',
@@ -760,49 +896,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
class CyclesCameraSettings(bpy.types.PropertyGroup):
- aperture_type: EnumProperty(
- name="Aperture Type",
- description="Use f-stop number or aperture radius",
- items=enum_aperture_types,
- default='RADIUS',
- )
- aperture_fstop: FloatProperty(
- name="Aperture f-stop",
- description="F-stop ratio (lower numbers give more defocus, higher numbers give a sharper image)",
- min=0.0, soft_min=0.1, soft_max=64.0,
- default=5.6,
- step=10,
- precision=1,
- )
- aperture_size: FloatProperty(
- name="Aperture Size",
- description="Radius of the aperture for depth of field (higher values give more defocus)",
- min=0.0, soft_max=10.0,
- default=0.0,
- step=1,
- precision=4,
- subtype='DISTANCE',
- )
- aperture_blades: IntProperty(
- name="Aperture Blades",
- description="Number of blades in aperture for polygonal bokeh (at least 3)",
- min=0, max=100,
- default=0,
- )
- aperture_rotation: FloatProperty(
- name="Aperture Rotation",
- description="Rotation of blades in aperture",
- soft_min=-pi, soft_max=pi,
- subtype='ANGLE',
- default=0,
- )
- aperture_ratio: FloatProperty(
- name="Aperture Ratio",
- description="Distortion to simulate anamorphic lens bokeh",
- min=0.01, soft_min=1.0, soft_max=2.0,
- default=1.0,
- precision=4,
- )
panorama_type: EnumProperty(
name="Panorama Type",
description="Distortion to use for the calculation",
@@ -899,6 +992,14 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
default='LINEAR',
)
+ volume_step_rate: FloatProperty(
+ name="Step Rate",
+ description="Scale the distance between volume shader samples when rendering the volume "
+ "(lower values give more accurate and detailed results, but also increased render time)",
+ default=1.0,
+ min=0.001, max=1000.0, soft_min=0.1, soft_max=10.0, precision=4
+ )
+
displacement_method: EnumProperty(
name="Displacement Method",
description="Method to use for the displacement",
@@ -967,7 +1068,7 @@ class CyclesLightSettings(bpy.types.PropertyGroup):
class CyclesWorldSettings(bpy.types.PropertyGroup):
sampling_method: EnumProperty(
- name="Sampling method",
+ name="Sampling Method",
description="How to sample the background light",
items=enum_world_mis,
default='AUTOMATIC',
@@ -1009,6 +1110,13 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
items=enum_volume_interpolation,
default='LINEAR',
)
+ volume_step_size: FloatProperty(
+ name="Step Size",
+ description="Distance between volume shader samples when rendering the volume "
+ "(lower values give more accurate and detailed results, but also increased render time)",
+ default=1.0,
+ min=0.0000001, max=100000.0, soft_min=0.1, soft_max=100.0, precision=4
+ )
@classmethod
def register(cls):
@@ -1119,7 +1227,7 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
motion_steps: IntProperty(
name="Motion Steps",
description="Control accuracy of motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))",
- min=1, soft_max=8,
+ min=1, max=7,
default=1,
)
@@ -1148,6 +1256,13 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
default=1.0,
)
+ shadow_terminator_offset: FloatProperty(
+ name="Shadow Terminator Offset",
+ description="Push the shadow terminator towards the light to hide artifacts on low poly geometry",
+ min=0.0, max=1.0,
+ default=0.0,
+ )
+
is_shadow_catcher: BoolProperty(
name="Shadow Catcher",
description="Only render shadows on this object, for compositing renders into real footage",
@@ -1177,53 +1292,17 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
- primitive: EnumProperty(
- name="Primitive",
- description="Type of primitive used for hair rendering",
- items=enum_curve_primitives,
- default='LINE_SEGMENTS',
- )
shape: EnumProperty(
name="Shape",
description="Form of hair",
items=enum_curve_shape,
- default='THICK',
- )
- cull_backfacing: BoolProperty(
- name="Cull Back-faces",
- description="Do not test the back-face of each strand",
- default=True,
- )
- use_curves: BoolProperty(
- name="Use Cycles Hair Rendering",
- description="Activate Cycles hair rendering for particle system",
- default=True,
- )
- resolution: IntProperty(
- name="Resolution",
- description="Resolution of generated mesh",
- min=3, max=64,
- default=3,
- )
- minimum_width: FloatProperty(
- name="Minimal width",
- description="Minimal pixel width for strands (0 - deactivated)",
- min=0.0, max=100.0,
- default=0.0,
- subtype='PIXEL'
- )
- maximum_width: FloatProperty(
- name="Maximal width",
- description="Maximum extension that strand radius can be increased by",
- min=0.0, max=100.0,
- default=0.1,
- subtype='PIXEL'
+ default='RIBBONS',
)
subdivisions: IntProperty(
name="Subdivisions",
description="Number of subdivisions used in Cardinal curve intersection (power of 2)",
min=0, max=24,
- default=4,
+ default=2,
)
@classmethod
@@ -1239,10 +1318,25 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
del bpy.types.Scene.cycles_curves
-def update_render_passes(self, context):
- view_layer = context.view_layer
- view_layer.update_render_passes()
-
+class CyclesAOVPass(bpy.types.PropertyGroup):
+ name: StringProperty(
+ name="Name",
+ description="Name of the pass, to use in the AOV Output shader node",
+ update=update_render_passes,
+ default="AOV"
+ )
+ type: EnumProperty(
+ name="Type",
+ description="Pass data type",
+ update=update_render_passes,
+ items=enum_aov_types,
+ default='COLOR'
+ )
+ conflict: StringProperty(
+ name="Conflict",
+ description="If there is a conflict with another render passes, message explaining why",
+ default=""
+ )
class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
@@ -1276,6 +1370,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
default=False,
update=update_render_passes,
)
+ pass_debug_sample_count: BoolProperty(
+ name="Debug Sample Count",
+ description="Number of samples/camera rays per pixel",
+ default=False,
+ update=update_render_passes,
+ )
use_pass_volume_direct: BoolProperty(
name="Volume Direct",
description="Deliver direct volumetric scattering pass",
@@ -1292,7 +1392,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
- default=False,
+ default=True,
update=update_render_passes,
)
denoising_diffuse_direct: BoolProperty(
@@ -1325,16 +1425,6 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
description="Denoise the indirect transmission lighting",
default=True,
)
- denoising_subsurface_direct: BoolProperty(
- name="Subsurface Direct",
- description="Denoise the direct subsurface lighting",
- default=True,
- )
- denoising_subsurface_indirect: BoolProperty(
- name="Subsurface Indirect",
- description="Denoise the indirect subsurface lighting",
- default=True,
- )
denoising_strength: FloatProperty(
name="Denoising Strength",
description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
@@ -1355,13 +1445,13 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
subtype="PIXEL",
)
denoising_relative_pca: BoolProperty(
- name="Relative filter",
+ name="Relative Filter",
description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
default=False,
)
denoising_store_passes: BoolProperty(
- name="Store denoising passes",
- description="Store the denoising feature passes and the noisy image",
+ name="Store Denoising Passes",
+ description="Store the denoising feature passes and the noisy image. The passes adapt to the denoiser selected for rendering",
default=False,
update=update_render_passes,
)
@@ -1371,6 +1461,21 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
min=0, max=7,
default=0,
)
+
+ denoising_optix_input_passes: EnumProperty(
+ name="Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO',
+ )
+
+ denoising_openimagedenoise_input_passes: EnumProperty(
+ name="Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO_NORMAL',
+ )
+
use_pass_crypto_object: BoolProperty(
name="Cryptomatte Object",
description="Render cryptomatte object pass, for isolating objects in compositing",
@@ -1402,6 +1507,15 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
update=update_render_passes,
)
+ aovs: CollectionProperty(
+ type=CyclesAOVPass,
+ description="Custom render passes that can be output by shader nodes",
+ )
+ active_aov: IntProperty(
+ default=0,
+ min=0
+ )
+
@classmethod
def register(cls):
bpy.types.ViewLayer.cycles = PointerProperty(
@@ -1427,10 +1541,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_opencl = _cycles.get_device_types()
+ has_cuda, has_optix, has_opencl = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
+ if has_optix:
+ list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
if has_opencl:
list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2))
return list
@@ -1443,6 +1559,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
devices: bpy.props.CollectionProperty(type=CyclesDeviceSettings)
+ peer_memory: BoolProperty(
+ name="Distribute memory across devices",
+ description="Make more room for large scenes to fit by distributing memory across interconnected devices (e.g. via NVLink) rather than duplicating it",
+ default=False,
+ )
+
def find_existing_device_entry(self, device):
for device_entry in self.devices:
if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1451,7 +1573,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPENCL', 'CPU'}:
+ if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1466,8 +1588,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
# Update name in case it changed
entry.name = device[0]
- # Gets all devices types by default.
- def get_devices(self, compute_device_type=''):
+ # Gets all devices for a given compute device type.
+ def get_devices_for_type(self, compute_device_type):
import _cycles
# Layout of the device tuples: (Name, Type, Persistent ID)
device_list = _cycles.available_devices(compute_device_type)
@@ -1476,20 +1598,25 @@ class CyclesPreferences(bpy.types.AddonPreferences):
# hold pointers to a resized array.
self.update_device_entries(device_list)
# Sort entries into lists
- cuda_devices = []
- opencl_devices = []
+ devices = []
cpu_devices = []
for device in device_list:
entry = self.find_existing_device_entry(device)
- if entry.type == 'CUDA':
- cuda_devices.append(entry)
- elif entry.type == 'OPENCL':
- opencl_devices.append(entry)
+ if entry.type == compute_device_type:
+ devices.append(entry)
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
- cuda_devices.extend(cpu_devices)
- opencl_devices.extend(cpu_devices)
+ if compute_device_type in ('CUDA', 'OPENCL'):
+ devices.extend(cpu_devices)
+ return devices
+
+ # For backwards compatibility, only returns CUDA and OpenCL but still
+ # refreshes all devices.
+ def get_devices(self, compute_device_type=''):
+ cuda_devices = self.get_devices_for_type('CUDA')
+ self.get_devices_for_type('OPTIX')
+ opencl_devices = self.get_devices_for_type('OPENCL')
return cuda_devices, opencl_devices
def get_num_gpu_devices(self):
@@ -1517,27 +1644,53 @@ class CyclesPreferences(bpy.types.AddonPreferences):
break
if not found_device:
- box.label(text="No compatible GPUs found", icon='INFO')
+ col = box.column(align=True)
+ col.label(text="No compatible GPUs found for path tracing", icon='INFO')
+ col.label(text="Cycles will render on the CPU", icon='BLANK1')
return
for device in devices:
box.prop(device, "use", text=device.name)
+ if device_type == 'OPTIX':
+ col = box.column(align=True)
+ col.label(text="OptiX support is experimental", icon='INFO')
+ col.label(text="Not all Cycles features are supported yet", icon='BLANK1')
+
+
def draw_impl(self, layout, context):
row = layout.row()
row.prop(self, "compute_device_type", expand=True)
- cuda_devices, opencl_devices = self.get_devices(self.compute_device_type)
+ if self.compute_device_type == 'NONE':
+ return
row = layout.row()
- if self.compute_device_type == 'CUDA':
- self._draw_devices(row, 'CUDA', cuda_devices)
- elif self.compute_device_type == 'OPENCL':
- self._draw_devices(row, 'OPENCL', opencl_devices)
+ devices = self.get_devices_for_type(self.compute_device_type)
+ self._draw_devices(row, self.compute_device_type, devices)
+
+ import _cycles
+ has_peer_memory = 0
+ for device in _cycles.available_devices(self.compute_device_type):
+ if device[3] and self.find_existing_device_entry(device).use:
+ has_peer_memory += 1
+ if has_peer_memory > 1:
+ row = layout.row()
+ row.use_property_split = True
+ row.prop(self, "peer_memory")
def draw(self, context):
self.draw_impl(self.layout, context)
+class CyclesView3DShadingSettings(bpy.types.PropertyGroup):
+ render_pass: EnumProperty(
+ name="Render Pass",
+ description="Render pass to show in the 3D Viewport",
+ items=enum_view3d_shading_render_pass,
+ default='COMBINED',
+ )
+
+
def register():
bpy.utils.register_class(CyclesRenderSettings)
bpy.utils.register_class(CyclesCameraSettings)
@@ -1550,7 +1703,14 @@ def register():
bpy.utils.register_class(CyclesCurveRenderSettings)
bpy.utils.register_class(CyclesDeviceSettings)
bpy.utils.register_class(CyclesPreferences)
+ bpy.utils.register_class(CyclesAOVPass)
bpy.utils.register_class(CyclesRenderLayerSettings)
+ bpy.utils.register_class(CyclesView3DShadingSettings)
+
+ bpy.types.View3DShading.cycles = bpy.props.PointerProperty(
+ name="Cycles Settings",
+ type=CyclesView3DShadingSettings,
+ )
def unregister():
@@ -1565,4 +1725,6 @@ def unregister():
bpy.utils.unregister_class(CyclesCurveRenderSettings)
bpy.utils.unregister_class(CyclesDeviceSettings)
bpy.utils.unregister_class(CyclesPreferences)
+ bpy.utils.unregister_class(CyclesAOVPass)
bpy.utils.unregister_class(CyclesRenderLayerSettings)
+ bpy.utils.unregister_class(CyclesView3DShadingSettings)
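
A brief sketch of the reworked device query API defined above, assuming the Cycles add-on is registered under the "cycles" key in the user preferences; get_devices_for_type() is the new per-type entry point and get_devices() is the backwards-compatible wrapper from this patch:

    import bpy

    prefs = bpy.context.preferences.addons["cycles"].preferences  # assumes the add-on key is "cycles"
    optix_devices = prefs.get_devices_for_type('OPTIX')           # per-type query added in this patch
    cuda_devices, opencl_devices = prefs.get_devices()            # legacy signature; still refreshes OptiX entries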
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 303f1ddf1b2..39cb1477c33 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -22,6 +22,8 @@ from bl_ui.utils import PresetPanel
from bpy.types import Panel
+from bl_ui.properties_grease_pencil_common import GreasePencilSimplifyPanel
+
class CYCLES_PT_sampling_presets(PresetPanel, Panel):
bl_label = "Sampling Presets"
@@ -57,7 +59,7 @@ def node_panel(cls):
node_cls.bl_space_type = 'NODE_EDITOR'
node_cls.bl_region_type = 'UI'
- node_cls.bl_category = "Node"
+ node_cls.bl_category = "Options"
if hasattr(node_cls, 'bl_parent_id'):
node_cls.bl_parent_id = 'NODE_' + node_cls.bl_parent_id
@@ -86,10 +88,16 @@ def use_cuda(context):
return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
+def use_optix(context):
+ cscene = context.scene.cycles
+
+ return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
+
+
def use_branched_path(context):
cscene = context.scene.cycles
- return (cscene.progressive == 'BRANCHED_PATH')
+ return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context))
def use_sample_all_lights(context):
@@ -166,29 +174,29 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
- layout.prop(cscene, "progressive")
+ if not use_optix(context):
+ layout.prop(cscene, "progressive")
- if cscene.progressive == 'PATH' or use_branched_path(context) is False:
+ if not use_branched_path(context):
col = layout.column(align=True)
col.prop(cscene, "samples", text="Render")
col.prop(cscene, "preview_samples", text="Viewport")
-
- draw_samples_info(layout, context)
else:
col = layout.column(align=True)
col.prop(cscene, "aa_samples", text="Render")
col.prop(cscene, "preview_aa_samples", text="Viewport")
+ if not use_branched_path(context):
+ draw_samples_info(layout, context)
+
class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
bl_label = "Sub Samples"
bl_parent_id = "CYCLES_RENDER_PT_sampling"
@classmethod
- def poll(self, context):
- scene = context.scene
- cscene = scene.cycles
- return cscene.progressive != 'PATH' and use_branched_path(context)
+ def poll(cls, context):
+ return use_branched_path(context)
def draw(self, context):
layout = self.layout
@@ -213,6 +221,70 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
draw_samples_info(layout, context)
+class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel):
+ bl_label = "Adaptive Sampling"
+ bl_parent_id = "CYCLES_RENDER_PT_sampling"
+ bl_options = {'DEFAULT_CLOSED'}
+
+ def draw_header(self, context):
+ layout = self.layout
+ scene = context.scene
+ cscene = scene.cycles
+
+ layout.prop(cscene, "use_adaptive_sampling", text="")
+
+ def draw(self, context):
+ layout = self.layout
+ layout.use_property_split = True
+ layout.use_property_decorate = False
+
+ scene = context.scene
+ cscene = scene.cycles
+
+ layout.active = cscene.use_adaptive_sampling
+
+ col = layout.column(align=True)
+ col.prop(cscene, "adaptive_threshold", text="Noise Threshold")
+ col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+
+
+class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
+ bl_label = "Denoising"
+ bl_parent_id = "CYCLES_RENDER_PT_sampling"
+ bl_options = {'DEFAULT_CLOSED'}
+
+ def draw(self, context):
+ layout = self.layout
+ layout.use_property_split = True
+ layout.use_property_decorate = False
+
+ scene = context.scene
+ cscene = scene.cycles
+
+ heading = layout.column(align=True, heading="Render")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_denoising", text="")
+ sub = row.row()
+
+ sub.active = cscene.use_denoising
+ for view_layer in scene.view_layers:
+ if view_layer.cycles.denoising_store_passes:
+ sub.active = True
+
+ sub.prop(cscene, "denoiser", text="")
+
+ heading = layout.column(align=False, heading="Viewport")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_preview_denoising", text="")
+ sub = row.row()
+ sub.active = cscene.use_preview_denoising
+ sub.prop(cscene, "preview_denoiser", text="")
+
+ sub = heading.row(align=True)
+ sub.active = cscene.use_preview_denoising
+ sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
+
+
class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
bl_label = "Advanced"
bl_parent_id = "CYCLES_RENDER_PT_sampling"
@@ -230,13 +302,17 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
row.prop(cscene, "seed")
row.prop(cscene, "use_animated_seed", text="", icon='TIME')
- layout.prop(cscene, "sampling_pattern", text="Pattern")
+ col = layout.column(align=True)
+ col.active = not cscene.use_adaptive_sampling
+ col.prop(cscene, "sampling_pattern", text="Pattern")
layout.prop(cscene, "use_square_samples")
layout.separator()
col = layout.column(align=True)
+ col.prop(cscene, "min_light_bounces")
+ col.prop(cscene, "min_transparent_bounces")
col.prop(cscene, "light_sampling_threshold", text="Light Threshold")
if cscene.progressive != 'PATH' and use_branched_path(context):
@@ -264,7 +340,7 @@ class CYCLES_RENDER_PT_sampling_total(CyclesButtonsPanel, Panel):
bl_parent_id = "CYCLES_RENDER_PT_sampling"
@classmethod
- def poll(self, context):
+ def poll(cls, context):
scene = context.scene
cscene = scene.cycles
@@ -320,7 +396,7 @@ class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
bl_options = {'DEFAULT_CLOSED'}
@classmethod
- def poll(self, context):
+ def poll(cls, context):
return (context.scene.render.engine == 'CYCLES') and (context.scene.cycles.feature_set == 'EXPERIMENTAL')
def draw(self, context):
@@ -334,7 +410,7 @@ class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column(align=True)
sub.prop(cscene, "dicing_rate", text="Dicing Rate Render")
- sub.prop(cscene, "preview_dicing_rate", text="Preview")
+ sub.prop(cscene, "preview_dicing_rate", text="Viewport")
col.separator()
@@ -348,13 +424,6 @@ class CYCLES_RENDER_PT_hair(CyclesButtonsPanel, Panel):
bl_label = "Hair"
bl_options = {'DEFAULT_CLOSED'}
- def draw_header(self, context):
- layout = self.layout
- scene = context.scene
- ccscene = scene.cycles_curves
-
- layout.prop(ccscene, "use_curves", text="")
-
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -363,20 +432,10 @@ class CYCLES_RENDER_PT_hair(CyclesButtonsPanel, Panel):
scene = context.scene
ccscene = scene.cycles_curves
- layout.active = ccscene.use_curves
-
col = layout.column()
- col.prop(ccscene, "minimum_width", text="Min Pixels")
- col.prop(ccscene, "maximum_width", text="Max Extension")
col.prop(ccscene, "shape", text="Shape")
- if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'):
- col.prop(ccscene, "cull_backfacing", text="Cull back-faces")
- col.prop(ccscene, "primitive", text="Primitive")
-
- if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK':
- col.prop(ccscene, "resolution", text="Resolution")
- elif ccscene.primitive == 'CURVE_SEGMENTS':
- col.prop(ccscene, "subdivisions", text="Curve subdivisions")
+ if ccscene.shape == 'RIBBONS':
+ col.prop(ccscene, "subdivisions", text="Curve Subdivisions")
class CYCLES_RENDER_PT_volumes(CyclesButtonsPanel, Panel):
@@ -391,9 +450,11 @@ class CYCLES_RENDER_PT_volumes(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- col = layout.column()
- col.prop(cscene, "volume_step_size", text="Step Size")
- col.prop(cscene, "volume_max_steps", text="Max Steps")
+ col = layout.column(align=True)
+ col.prop(cscene, "volume_step_rate", text="Step Rate Render")
+ col.prop(cscene, "volume_preview_step_rate", text="Viewport")
+
+ layout.prop(cscene, "volume_max_steps", text="Max Steps")
class CYCLES_RENDER_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -461,8 +522,9 @@ class CYCLES_RENDER_PT_light_paths_caustics(CyclesButtonsPanel, Panel):
col = layout.column()
col.prop(cscene, "blur_glossy")
- col.prop(cscene, "caustics_reflective")
- col.prop(cscene, "caustics_refractive")
+ col = layout.column(heading="Caustics", align=True)
+ col.prop(cscene, "caustics_reflective", text="Reflective")
+ col.prop(cscene, "caustics_refractive", text="Refractive")
class CYCLES_RENDER_PT_motion_blur(CyclesButtonsPanel, Panel):
@@ -538,31 +600,32 @@ class CYCLES_RENDER_PT_film(CyclesButtonsPanel, Panel):
class CYCLES_RENDER_PT_film_transparency(CyclesButtonsPanel, Panel):
- bl_label = "Transparency"
+ bl_label = "Transparent"
bl_parent_id = "CYCLES_RENDER_PT_film"
def draw_header(self, context):
layout = self.layout
scene = context.scene
- cscene = scene.cycles
+ rd = scene.render
- layout.prop(cscene, "film_transparent", text="")
+ layout.prop(rd, "film_transparent", text="")
def draw(self, context):
layout = self.layout
layout.use_property_split = True
layout.use_property_decorate = False
scene = context.scene
+ rd = scene.render
cscene = scene.cycles
- layout.active = cscene.film_transparent
+ layout.active = rd.film_transparent
col = layout.column()
col.prop(cscene, "film_transparent_glass", text="Transparent Glass")
sub = col.column()
- sub.active = cscene.film_transparent and cscene.film_transparent_glass
+ sub.active = rd.film_transparent and cscene.film_transparent_glass
sub.prop(cscene, "film_transparent_roughness", text="Roughness Threshold")
@@ -633,9 +696,6 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
sub = col.column()
sub.active = not rd.use_save_buffers
- for view_layer in scene.view_layers:
- if view_layer.cycles.use_denoising:
- sub.active = False
sub.prop(cscene, "use_progressive_refine")
@@ -655,16 +715,20 @@ class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Pa
col = layout.column()
- if _cycles.with_embree:
- row = col.row()
- row.active = use_cpu(context)
- row.prop(cscene, "use_bvh_embree")
+ use_embree = False
+ if use_cpu(context):
+ use_embree = _cycles.with_embree
+ if not use_embree:
+ sub = col.column(align=True)
+ sub.label(text="Cycles built without Embree support")
+ sub.label(text="CPU raytracing performance will be poor")
+
col.prop(cscene, "debug_use_spatial_splits")
sub = col.column()
- sub.active = not cscene.use_bvh_embree or not _cycles.with_embree
+ sub.active = not use_embree
sub.prop(cscene, "debug_use_hair_bvh")
sub = col.column()
- sub.active = not cscene.debug_use_spatial_splits and not cscene.use_bvh_embree
+ sub.active = not cscene.debug_use_spatial_splits and not use_embree
sub.prop(cscene, "debug_bvh_time_steps")
@@ -720,20 +784,12 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
rd = scene.render
view_layer = context.view_layer
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
-
- col = flow.column()
+ col = layout.column(heading="Include")
col.prop(view_layer, "use_sky", text="Environment")
- col = flow.column()
col.prop(view_layer, "use_ao", text="Ambient Occlusion")
- col = flow.column()
col.prop(view_layer, "use_solid", text="Surfaces")
- col = flow.column()
col.prop(view_layer, "use_strand", text="Hair")
- if with_freestyle:
- col = flow.column()
- col.prop(view_layer, "use_freestyle", text="Freestyle")
- col.active = rd.use_freestyle
+ col.prop(view_layer, "use_volumes", text="Volumes")
class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel):
@@ -755,7 +811,6 @@ class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel):
class CYCLES_RENDER_PT_passes(CyclesButtonsPanel, Panel):
bl_label = "Passes"
bl_context = "view_layer"
- bl_options = {'DEFAULT_CLOSED'}
def draw(self, context):
pass
@@ -776,34 +831,27 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
view_layer = context.view_layer
cycles_view_layer = view_layer.cycles
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
- col = flow.column()
+ col = layout.column(heading="Include", align=True)
col.prop(view_layer, "use_pass_combined")
- col = flow.column()
col.prop(view_layer, "use_pass_z")
- col = flow.column()
col.prop(view_layer, "use_pass_mist")
- col = flow.column()
col.prop(view_layer, "use_pass_normal")
- col = flow.column()
- col.prop(view_layer, "use_pass_vector")
- col.active = not rd.use_motion_blur
- col = flow.column()
+ sub = col.column()
+ sub.active = not rd.use_motion_blur
+ sub.prop(view_layer, "use_pass_vector")
col.prop(view_layer, "use_pass_uv")
- col = flow.column()
+
+ col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data")
+
+ col = layout.column(heading="Indexes", align=True)
col.prop(view_layer, "use_pass_object_index")
- col = flow.column()
col.prop(view_layer, "use_pass_material_index")
- layout.separator()
-
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
- col = flow.column()
- col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data")
- col = flow.column()
+ col = layout.column(heading="Debug", align=True)
col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time")
+ col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
+
- layout.separator()
layout.prop(view_layer, "pass_alpha_threshold")
@@ -821,46 +869,26 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
view_layer = context.view_layer
cycles_view_layer = view_layer.cycles
- split = layout.split(factor=0.35)
- split.use_property_split = False
- split.label(text="Diffuse")
- row = split.row(align=True)
- row.prop(view_layer, "use_pass_diffuse_direct", text="Direct", toggle=True)
- row.prop(view_layer, "use_pass_diffuse_indirect", text="Indirect", toggle=True)
- row.prop(view_layer, "use_pass_diffuse_color", text="Color", toggle=True)
-
- split = layout.split(factor=0.35)
- split.use_property_split = False
- split.label(text="Glossy")
- row = split.row(align=True)
- row.prop(view_layer, "use_pass_glossy_direct", text="Direct", toggle=True)
- row.prop(view_layer, "use_pass_glossy_indirect", text="Indirect", toggle=True)
- row.prop(view_layer, "use_pass_glossy_color", text="Color", toggle=True)
-
- split = layout.split(factor=0.35)
- split.use_property_split = False
- split.label(text="Transmission")
- row = split.row(align=True)
- row.prop(view_layer, "use_pass_transmission_direct", text="Direct", toggle=True)
- row.prop(view_layer, "use_pass_transmission_indirect", text="Indirect", toggle=True)
- row.prop(view_layer, "use_pass_transmission_color", text="Color", toggle=True)
-
- split = layout.split(factor=0.35)
- split.use_property_split = False
- split.label(text="Subsurface")
- row = split.row(align=True)
- row.prop(view_layer, "use_pass_subsurface_direct", text="Direct", toggle=True)
- row.prop(view_layer, "use_pass_subsurface_indirect", text="Indirect", toggle=True)
- row.prop(view_layer, "use_pass_subsurface_color", text="Color", toggle=True)
-
- split = layout.split(factor=0.35)
- split.use_property_split = False
- split.label(text="Volume")
- row = split.row(align=True)
- row.prop(cycles_view_layer, "use_pass_volume_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "use_pass_volume_indirect", text="Indirect", toggle=True)
+ col = layout.column(heading="Diffuse", align=True)
+ col.prop(view_layer, "use_pass_diffuse_direct", text="Direct")
+ col.prop(view_layer, "use_pass_diffuse_indirect", text="Indirect")
+ col.prop(view_layer, "use_pass_diffuse_color", text="Color")
- col = layout.column(align=True)
+ col = layout.column(heading="Glossy", align=True)
+ col.prop(view_layer, "use_pass_glossy_direct", text="Direct")
+ col.prop(view_layer, "use_pass_glossy_indirect", text="Indirect")
+ col.prop(view_layer, "use_pass_glossy_color", text="Color")
+
+ col = layout.column(heading="Transmission", align=True)
+ col.prop(view_layer, "use_pass_transmission_direct", text="Direct")
+ col.prop(view_layer, "use_pass_transmission_indirect", text="Indirect")
+ col.prop(view_layer, "use_pass_transmission_color", text="Color")
+
+ col = layout.column(heading="Volume", align=True)
+ col.prop(cycles_view_layer, "use_pass_volume_direct", text="Direct")
+ col.prop(cycles_view_layer, "use_pass_volume_indirect", text="Indirect")
+
+ col = layout.column(heading="Other", align=True)
col.prop(view_layer, "use_pass_emit", text="Emission")
col.prop(view_layer, "use_pass_environment")
col.prop(view_layer, "use_pass_shadow")
@@ -881,11 +909,10 @@ class CYCLES_RENDER_PT_passes_crypto(CyclesButtonsPanel, Panel):
cycles_view_layer = context.view_layer.cycles
- row = layout.row(align=True)
- row.use_property_split = False
- row.prop(cycles_view_layer, "use_pass_crypto_object", text="Object", toggle=True)
- row.prop(cycles_view_layer, "use_pass_crypto_material", text="Material", toggle=True)
- row.prop(cycles_view_layer, "use_pass_crypto_asset", text="Asset", toggle=True)
+ col = layout.column(heading="Include", align=True)
+ col.prop(cycles_view_layer, "use_pass_crypto_object", text="Object")
+ col.prop(cycles_view_layer, "use_pass_crypto_material", text="Material")
+ col.prop(cycles_view_layer, "use_pass_crypto_asset", text="Asset")
layout.prop(cycles_view_layer, "pass_crypto_depth", text="Levels")
@@ -917,17 +944,58 @@ class CYCLES_RENDER_PT_passes_debug(CyclesButtonsPanel, Panel):
layout.prop(cycles_view_layer, "pass_debug_ray_bounces")
+class CYCLES_RENDER_UL_aov(bpy.types.UIList):
+ def draw_item(self, context, layout, data, item, icon, active_data, active_propname):
+ row = layout.row()
+ split = row.split(factor=0.65)
+ icon = 'ERROR' if item.conflict else 'NONE'
+ split.row().prop(item, "name", text="", icon=icon, emboss=False)
+ split.row().prop(item, "type", text="", emboss=False)
+
+
+class CYCLES_RENDER_PT_passes_aov(CyclesButtonsPanel, Panel):
+ bl_label = "Shader AOV"
+ bl_context = "view_layer"
+ bl_parent_id = "CYCLES_RENDER_PT_passes"
+
+ def draw(self, context):
+ layout = self.layout
+ layout.use_property_split = True
+ layout.use_property_decorate = False
+
+ cycles_view_layer = context.view_layer.cycles
+
+ row = layout.row()
+ col = row.column()
+ col.template_list("CYCLES_RENDER_UL_aov", "aovs", cycles_view_layer, "aovs", cycles_view_layer, "active_aov", rows=2)
+
+ col = row.column()
+ sub = col.column(align=True)
+ sub.operator("cycles.add_aov", icon='ADD', text="")
+ sub.operator("cycles.remove_aov", icon='REMOVE', text="")
+
+ if cycles_view_layer.active_aov < len(cycles_view_layer.aovs):
+ active_aov = cycles_view_layer.aovs[cycles_view_layer.active_aov]
+ if active_aov.conflict:
+ layout.label(text=active_aov.conflict, icon='ERROR')
+
+
class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
bl_label = "Denoising"
bl_context = "view_layer"
bl_options = {'DEFAULT_CLOSED'}
+ @classmethod
+ def poll(cls, context):
+ cscene = context.scene.cycles
+ return CyclesButtonsPanel.poll(context) and cscene.use_denoising
+
def draw_header(self, context):
scene = context.scene
view_layer = context.view_layer
cycles_view_layer = view_layer.cycles
- layout = self.layout
+ layout = self.layout
layout.prop(cycles_view_layer, "use_denoising", text="")
def draw(self, context):
@@ -938,66 +1006,43 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
scene = context.scene
view_layer = context.view_layer
cycles_view_layer = view_layer.cycles
+ denoiser = scene.cycles.denoiser
- split = layout.split()
- split.active = cycles_view_layer.use_denoising
+ layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising
- layout = layout.column(align=True)
- layout.prop(cycles_view_layer, "denoising_radius", text="Radius")
- layout.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
- layout.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
- layout.prop(cycles_view_layer, "denoising_relative_pca")
+ col = layout.column()
- layout.separator()
+ if denoiser == 'OPTIX':
+ col.prop(cycles_view_layer, "denoising_optix_input_passes")
+ return
+ elif denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cycles_view_layer, "denoising_openimagedenoise_input_passes")
+ return
- split = layout.split(factor=0.5)
- split.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
+ col.prop(cycles_view_layer, "denoising_radius", text="Radius")
- col = split.column()
- col.alignment = 'RIGHT'
- col.label(text="Diffuse")
+ col = layout.column()
+ col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
+ col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
+ col.prop(cycles_view_layer, "denoising_relative_pca")
- row = split.row(align=True)
- row.use_property_split = False
- row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True)
+ layout.separator()
- split = layout.split(factor=0.5)
- split.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
+ col = layout.column()
+ col.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
- col = split.column()
- col.alignment = 'RIGHT'
- col.label(text="Glossy")
+ row = col.row(heading="Diffuse", align=True)
+ row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True)
+ row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True)
- row = split.row(align=True)
- row.use_property_split = False
+ row = col.row(heading="Glossy", align=True)
row.prop(cycles_view_layer, "denoising_glossy_direct", text="Direct", toggle=True)
row.prop(cycles_view_layer, "denoising_glossy_indirect", text="Indirect", toggle=True)
- split = layout.split(factor=0.5)
- split.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
-
- col = split.column()
- col.alignment = 'RIGHT'
- col.label(text="Transmission")
-
- row = split.row(align=True)
- row.use_property_split = False
+ row = col.row(heading="Transmission", align=True)
row.prop(cycles_view_layer, "denoising_transmission_direct", text="Direct", toggle=True)
row.prop(cycles_view_layer, "denoising_transmission_indirect", text="Indirect", toggle=True)
- split = layout.split(factor=0.5)
- split.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
-
- col = split.column()
- col.alignment = 'RIGHT'
- col.label(text="Subsurface")
-
- row = split.row(align=True)
- row.use_property_split = False
- row.prop(cycles_view_layer, "denoising_subsurface_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_subsurface_indirect", text="Indirect", toggle=True)
-
class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
@@ -1011,7 +1056,7 @@ class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
rd = context.scene.render
- col = layout.column(align=True)
+ col = layout.column(align=True, heading="Pipeline")
col.prop(rd, "use_compositing")
col.prop(rd, "use_sequencer")
@@ -1026,20 +1071,27 @@ class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel):
def poll(cls, context):
return context.camera and CyclesButtonsPanel.poll(context)
+ def draw_header(self, context):
+ cam = context.camera
+ dof = cam.dof
+ self.layout.prop(dof, "use_dof", text="")
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
cam = context.camera
+ dof = cam.dof
+ layout.active = dof.use_dof
split = layout.split()
col = split.column()
- col.prop(cam, "dof_object", text="Focus Object")
+ col.prop(dof, "focus_object", text="Focus Object")
sub = col.row()
- sub.active = cam.dof_object is None
- sub.prop(cam, "dof_distance", text="Distance")
+ sub.active = dof.focus_object is None
+ sub.prop(dof, "focus_distance", text="Distance")
class CYCLES_CAMERA_PT_dof_aperture(CyclesButtonsPanel, Panel):
@@ -1053,44 +1105,17 @@ class CYCLES_CAMERA_PT_dof_aperture(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
layout.use_property_split = True
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
cam = context.camera
- ccam = cam.cycles
-
- col = flow.column()
- col.prop(ccam, "aperture_type")
- if ccam.aperture_type == 'RADIUS':
- col.prop(ccam, "aperture_size", text="Size")
- elif ccam.aperture_type == 'FSTOP':
- col.prop(ccam, "aperture_fstop", text="Number")
- col.separator()
-
- col = flow.column()
- col.prop(ccam, "aperture_blades", text="Blades")
- col.prop(ccam, "aperture_rotation", text="Rotation")
- col.prop(ccam, "aperture_ratio", text="Ratio")
-
-
-class CYCLES_CAMERA_PT_dof_viewport(CyclesButtonsPanel, Panel):
- bl_label = "Viewport"
- bl_parent_id = "CYCLES_CAMERA_PT_dof"
-
- @classmethod
- def poll(cls, context):
- return context.camera and CyclesButtonsPanel.poll(context)
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
+ dof = cam.dof
+ layout.active = dof.use_dof
flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
- cam = context.camera
- dof_options = cam.gpu_dof
-
- sub = flow.column(align=True)
- sub.prop(dof_options, "fstop")
- sub.prop(dof_options, "blades")
+ col = flow.column()
+ col.prop(dof, "aperture_fstop")
+ col.prop(dof, "aperture_blades")
+ col.prop(dof, "aperture_rotation")
+ col.prop(dof, "aperture_ratio")
class CYCLES_PT_context_material(CyclesButtonsPanel, Panel):
@@ -1187,6 +1212,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
rd = context.scene.render
# scene = context.scene
@@ -1196,33 +1222,78 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
layout.active = (rd.use_motion_blur and cob.use_motion_blur)
- row = layout.row()
+ col = layout.column()
+ col.prop(cob, "motion_steps", text="Steps")
if ob.type != 'CAMERA':
- row.prop(cob, "use_deform_motion", text="Deformation")
- row.prop(cob, "motion_steps", text="Steps")
+ col.prop(cob, "use_deform_motion", text="Deformation")
+
+
+def has_geometry_visibility(ob):
+ return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT'}) or
+ (ob.instance_type == 'COLLECTION' and ob.instance_collection))
-class CYCLES_OBJECT_PT_cycles_settings(CyclesButtonsPanel, Panel):
- bl_label = "Cycles Settings"
+class CYCLES_OBJECT_PT_shading(CyclesButtonsPanel, Panel):
+ bl_label = "Shading"
bl_context = "object"
bl_options = {'DEFAULT_CLOSED'}
@classmethod
def poll(cls, context):
+ return CyclesButtonsPanel.poll(context) and (context.object)
+
+ def draw(self, context):
+ layout = self.layout
+ layout.use_property_split = True
+
+ flow = layout.grid_flow(row_major=False, columns=0, even_columns=True, even_rows=False, align=False)
ob = context.object
- return (CyclesButtonsPanel.poll(context) and
- ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT'}) or
- (ob.instance_type == 'COLLECTION' and ob.instance_collection)))
+ cob = ob.cycles
+
+ if has_geometry_visibility(ob):
+ col = flow.column()
+ col.prop(cob, "shadow_terminator_offset")
+
+
+class CYCLES_OBJECT_PT_visibility(CyclesButtonsPanel, Panel):
+ bl_label = "Visibility"
+ bl_context = "object"
+ bl_options = {'DEFAULT_CLOSED'}
+
+ @classmethod
+ def poll(cls, context):
+ return CyclesButtonsPanel.poll(context) and (context.object)
def draw(self, context):
- pass
+ layout = self.layout
+ layout.use_property_split = True
+
+ ob = context.object
+ layout.prop(ob, "hide_select", text="Selectable", invert_checkbox=True, toggle=False)
-class CYCLES_OBJECT_PT_cycles_settings_ray_visibility(CyclesButtonsPanel, Panel):
+ col = layout.column(heading="Show in")
+ col.prop(ob, "hide_viewport", text="Viewports", invert_checkbox=True, toggle=False)
+ col.prop(ob, "hide_render", text="Renders", invert_checkbox=True, toggle=False)
+
+ if has_geometry_visibility(ob):
+ cob = ob.cycles
+ col = layout.column(heading="Mask")
+ col.prop(cob, "is_shadow_catcher")
+ col.prop(cob, "is_holdout")
+
+
+class CYCLES_OBJECT_PT_visibility_ray_visibility(CyclesButtonsPanel, Panel):
bl_label = "Ray Visibility"
- bl_parent_id = "CYCLES_OBJECT_PT_cycles_settings"
+ bl_parent_id = "CYCLES_OBJECT_PT_visibility"
bl_context = "object"
+ @classmethod
+ def poll(cls, context):
+ ob = context.object
+ return CyclesButtonsPanel.poll(context) and has_geometry_visibility(ob)
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -1233,38 +1304,27 @@ class CYCLES_OBJECT_PT_cycles_settings_ray_visibility(CyclesButtonsPanel, Panel)
cob = ob.cycles
visibility = ob.cycles_visibility
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
-
- col = flow.column()
+ col = layout.column()
col.prop(visibility, "camera")
- col = flow.column()
col.prop(visibility, "diffuse")
- col = flow.column()
col.prop(visibility, "glossy")
- col = flow.column()
col.prop(visibility, "transmission")
- col = flow.column()
col.prop(visibility, "scatter")
if ob.type != 'LIGHT':
- col = flow.column()
- col.prop(visibility, "shadow")
-
- layout.separator()
-
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
-
- col = flow.column()
- col.prop(cob, "is_shadow_catcher")
- col = flow.column()
- col.prop(cob, "is_holdout")
+ sub = col.column()
+ sub.prop(visibility, "shadow")
-class CYCLES_OBJECT_PT_cycles_settings_performance(CyclesButtonsPanel, Panel):
- bl_label = "Performance"
- bl_parent_id = "CYCLES_OBJECT_PT_cycles_settings"
+class CYCLES_OBJECT_PT_visibility_culling(CyclesButtonsPanel, Panel):
+ bl_label = "Culling"
+ bl_parent_id = "CYCLES_OBJECT_PT_visibility"
bl_context = "object"
+ @classmethod
+ def poll(cls, context):
+ ob = context.object
+ return CyclesButtonsPanel.poll(context) and has_geometry_visibility(ob)
def draw(self, context):
layout = self.layout
@@ -1276,15 +1336,13 @@ class CYCLES_OBJECT_PT_cycles_settings_performance(CyclesButtonsPanel, Panel):
ob = context.object
cob = ob.cycles
- flow = layout.grid_flow(row_major=True, columns=0, even_columns=True, even_rows=False, align=False)
-
- col = flow.column()
- col.active = scene.render.use_simplify and cscene.use_camera_cull
- col.prop(cob, "use_camera_cull")
+ row = layout.row()
+ row.active = scene.render.use_simplify and cscene.use_camera_cull
+ row.prop(cob, "use_camera_cull")
- col = flow.column()
- col.active = scene.render.use_simplify and cscene.use_distance_cull
- col.prop(cob, "use_distance_cull")
+ row = layout.row()
+ row.active = scene.render.use_simplify and cscene.use_distance_cull
+ row.prop(cob, "use_distance_cull")
def panel_node_draw(layout, id_data, output_type, input_name):
@@ -1341,8 +1399,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel):
light = context.light
clamp = light.cycles
- layout.use_property_decorate = False
-
if self.bl_space_type == 'PROPERTIES':
layout.row().prop(light, "type", expand=True)
layout.use_property_split = True
@@ -1352,8 +1408,14 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel):
col = layout.column()
- if light.type in {'POINT', 'SUN', 'SPOT'}:
- col.prop(light, "shadow_soft_size", text="Size")
+ col.prop(light, "color")
+ col.prop(light, "energy")
+ col.separator()
+
+ if light.type in {'POINT', 'SPOT'}:
+ col.prop(light, "shadow_soft_size", text="Radius")
+ elif light.type == 'SUN':
+ col.prop(light, "angle")
elif light.type == 'AREA':
col.prop(light, "shape", text="Shape")
sub = col.column(align=True)
@@ -1394,9 +1456,10 @@ class CYCLES_LIGHT_PT_nodes(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+
light = context.light
- if not panel_node_draw(layout, light, 'OUTPUT_LIGHT', 'Surface'):
- layout.prop(light, "color")
+ panel_node_draw(layout, light, 'OUTPUT_LIGHT', 'Surface')
class CYCLES_LIGHT_PT_spot(CyclesButtonsPanel, Panel):
@@ -1444,6 +1507,8 @@ class CYCLES_WORLD_PT_surface(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+
world = context.world
if not panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Surface'):
@@ -1463,6 +1528,8 @@ class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+
world = context.world
panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
@@ -1533,17 +1600,18 @@ class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+ layout.use_property_decorate = False
world = context.world
visibility = world.cycles_visibility
- flow = layout.column_flow()
-
- flow.prop(visibility, "camera")
- flow.prop(visibility, "diffuse")
- flow.prop(visibility, "glossy")
- flow.prop(visibility, "transmission")
- flow.prop(visibility, "scatter")
+ col = layout.column()
+ col.prop(visibility, "camera")
+ col.prop(visibility, "diffuse")
+ col.prop(visibility, "glossy")
+ col.prop(visibility, "transmission")
+ col.prop(visibility, "scatter")
class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel):
@@ -1619,6 +1687,9 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel):
sub.prop(cworld, "volume_sampling", text="Sampling")
col.prop(cworld, "volume_interpolation", text="Interpolation")
col.prop(cworld, "homogeneous_volume", text="Homogeneous")
+ sub = col.column()
+ sub.active = not cworld.homogeneous_volume
+ sub.prop(cworld, "volume_step_size")
class CYCLES_MATERIAL_PT_preview(CyclesButtonsPanel, Panel):
@@ -1647,6 +1718,8 @@ class CYCLES_MATERIAL_PT_surface(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+
mat = context.material
if not panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Surface'):
layout.prop(mat, "diffuse_color")
@@ -1665,6 +1738,8 @@ class CYCLES_MATERIAL_PT_volume(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+
mat = context.material
# cmat = mat.cycles
@@ -1683,6 +1758,8 @@ class CYCLES_MATERIAL_PT_displacement(CyclesButtonsPanel, Panel):
def draw(self, context):
layout = self.layout
+ layout.use_property_split = True
+
mat = context.material
panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Displacement')
@@ -1725,7 +1802,7 @@ class CYCLES_MATERIAL_PT_settings_surface(CyclesButtonsPanel, Panel):
col = layout.column()
col.prop(cmat, "sample_as_light", text="Multiple Importance")
col.prop(cmat, "use_transparent_shadow")
- col.prop(cmat, "displacement_method", text="Displacement Method")
+ col.prop(cmat, "displacement_method", text="Displacement")
def draw(self, context):
self.draw_shared(self, context.material)
@@ -1750,6 +1827,9 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel):
sub.prop(cmat, "volume_sampling", text="Sampling")
col.prop(cmat, "volume_interpolation", text="Interpolation")
col.prop(cmat, "homogeneous_volume", text="Homogeneous")
+ sub = col.column()
+ sub.active = not cmat.homogeneous_volume
+ sub.prop(cmat, "volume_step_rate")
def draw(self, context):
self.draw_shared(self, context, context.material)
@@ -1761,6 +1841,10 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
bl_options = {'DEFAULT_CLOSED'}
COMPAT_ENGINES = {'CYCLES'}
+ @classmethod
+ def poll(cls, context):
+ return CyclesButtonsPanel.poll(context) and not use_optix(context)
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -1793,7 +1877,7 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
cscene = scene.cycles
rd = scene.render
if rd.use_bake_multires == False and cscene.bake_type in {
- 'NORMAL', 'COMBINED', 'DIFFUSE', 'GLOSSY', 'TRANSMISSION', 'SUBSURFACE'}:
+ 'NORMAL', 'COMBINED', 'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
return True
def draw(self, context):
@@ -1817,27 +1901,24 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
sub.prop(cbk, "normal_b", text="B")
elif cscene.bake_type == 'COMBINED':
- row = col.row(align=True)
- row.use_property_split = False
- row.prop(cbk, "use_pass_direct", toggle=True)
- row.prop(cbk, "use_pass_indirect", toggle=True)
- flow = col.grid_flow(row_major=False, columns=0, even_columns=False, even_rows=False, align=True)
+ col = layout.column(heading="Lighting", align=True)
+ col.prop(cbk, "use_pass_direct")
+ col.prop(cbk, "use_pass_indirect")
- flow.active = cbk.use_pass_direct or cbk.use_pass_indirect
- flow.prop(cbk, "use_pass_diffuse")
- flow.prop(cbk, "use_pass_glossy")
- flow.prop(cbk, "use_pass_transmission")
- flow.prop(cbk, "use_pass_subsurface")
- flow.prop(cbk, "use_pass_ambient_occlusion")
- flow.prop(cbk, "use_pass_emit")
+ col = layout.column(heading="Contributions", align=True)
+ col.active = cbk.use_pass_direct or cbk.use_pass_indirect
+ col.prop(cbk, "use_pass_diffuse")
+ col.prop(cbk, "use_pass_glossy")
+ col.prop(cbk, "use_pass_transmission")
+ col.prop(cbk, "use_pass_ambient_occlusion")
+ col.prop(cbk, "use_pass_emit")
- elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION', 'SUBSURFACE'}:
- row = col.row(align=True)
- row.use_property_split = False
- row.prop(cbk, "use_pass_direct", toggle=True)
- row.prop(cbk, "use_pass_indirect", toggle=True)
- row.prop(cbk, "use_pass_color", toggle=True)
+ elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
+ col = layout.column(heading="Contributions", align=True)
+ col.prop(cbk, "use_pass_direct")
+ col.prop(cbk, "use_pass_indirect")
+ col.prop(cbk, "use_pass_color")
class CYCLES_RENDER_PT_bake_selected_to_active(CyclesButtonsPanel, Panel):
@@ -1873,10 +1954,15 @@ class CYCLES_RENDER_PT_bake_selected_to_active(CyclesButtonsPanel, Panel):
col.prop(cbk, "use_cage", text="Cage")
if cbk.use_cage:
- col.prop(cbk, "cage_extrusion", text="Extrusion")
- col.prop(cbk, "cage_object", text="Cage Object")
+ col.prop(cbk, "cage_object")
+ col = layout.column()
+ col.prop(cbk, "cage_extrusion")
+ col.active = cbk.cage_object is None
else:
- col.prop(cbk, "cage_extrusion", text="Ray Distance")
+ col.prop(cbk, "cage_extrusion", text="Extrusion")
+
+ col = layout.column()
+ col.prop(cbk, "max_ray_distance")
class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel):
@@ -1900,7 +1986,7 @@ class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel):
layout.prop(rd, "use_bake_clear", text="Clear Image")
if rd.bake_type == 'DISPLACEMENT':
- col.prop(rd, "use_bake_lores_mesh")
+ layout.prop(rd, "use_bake_lores_mesh")
else:
layout.prop(cbk, "margin")
@@ -1915,7 +2001,10 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
@classmethod
def poll(cls, context):
- return CyclesButtonsPanel.poll(context) and bpy.app.debug_value == 256
+ prefs = bpy.context.preferences
+ return (CyclesButtonsPanel.poll(context)
+ and prefs.experimental.use_cycles_debug
+ and prefs.view.show_developer_ui)
def draw(self, context):
layout = self.layout
@@ -1945,7 +2034,14 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
col.separator()
col = layout.column()
- col.label(text='OpenCL Flags:')
+ col.label(text="OptiX Flags:")
+ col.prop(cscene, "debug_optix_cuda_streams")
+ col.prop(cscene, "debug_optix_curves_api")
+
+ col.separator()
+
+ col = layout.column()
+ col.label(text="OpenCL Flags:")
col.prop(cscene, "debug_opencl_device_type", text="Device")
col.prop(cscene, "debug_use_opencl_debug", text="Debug")
col.prop(cscene, "debug_opencl_mem_limit")
@@ -1992,7 +2088,7 @@ class CYCLES_RENDER_PT_simplify_viewport(CyclesButtonsPanel, Panel):
col.prop(rd, "simplify_child_particles", text="Child Particles")
col.prop(cscene, "texture_limit", text="Texture Limit")
col.prop(cscene, "ao_bounces", text="AO Bounces")
- col.prop(rd, "use_simplify_smoke_highres")
+
class CYCLES_RENDER_PT_simplify_render(CyclesButtonsPanel, Panel):
bl_label = "Render"
@@ -2037,19 +2133,83 @@ class CYCLES_RENDER_PT_simplify_culling(CyclesButtonsPanel, Panel):
layout.active = rd.use_simplify
- col = layout.column()
- col.prop(cscene, "use_camera_cull")
- sub = col.column()
+ row = layout.row(heading="Camera Culling")
+ row.prop(cscene, "use_camera_cull", text="")
+ sub = row.column()
sub.active = cscene.use_camera_cull
- sub.prop(cscene, "camera_cull_margin")
+ sub.prop(cscene, "camera_cull_margin", text="")
- col = layout.column()
- col.prop(cscene, "use_distance_cull")
- sub = col.column()
+ row = layout.row(heading="Distance Culling")
+ row.prop(cscene, "use_distance_cull", text="")
+ sub = row.column()
sub.active = cscene.use_distance_cull
- sub.prop(cscene, "distance_cull_margin", text="Distance")
+ sub.prop(cscene, "distance_cull_margin", text="")
+
+
+class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
+ bl_space_type = 'VIEW_3D'
+ bl_region_type = 'HEADER'
+ bl_label = "Render Pass"
+ bl_parent_id = 'VIEW3D_PT_shading'
+ COMPAT_ENGINES = {'CYCLES'}
+
+ @classmethod
+ def poll(cls, context):
+ return (context.engine in cls.COMPAT_ENGINES
+ and context.space_data.shading.type == 'RENDERED')
+
+ def draw(self, context):
+ shading = context.space_data.shading
+
+ layout = self.layout
+ layout.prop(shading.cycles, "render_pass", text="")
+class CYCLES_VIEW3D_PT_shading_lighting(Panel):
+ bl_space_type = 'VIEW_3D'
+ bl_region_type = 'HEADER'
+ bl_label = "Lighting"
+ bl_parent_id = 'VIEW3D_PT_shading'
+ COMPAT_ENGINES = {'CYCLES'}
+
+ @classmethod
+ def poll(cls, context):
+ return (context.engine in cls.COMPAT_ENGINES
+ and context.space_data.shading.type == 'RENDERED')
+
+ def draw(self, context):
+ layout = self.layout
+ col = layout.column()
+ split = col.split(factor=0.9)
+
+ shading = context.space_data.shading
+ col.prop(shading, "use_scene_lights_render")
+ col.prop(shading, "use_scene_world_render")
+
+ if not shading.use_scene_world_render:
+ col = layout.column()
+ split = col.split(factor=0.9)
+
+ col = split.column()
+ sub = col.row()
+ sub.scale_y = 0.6
+ sub.template_icon_view(shading, "studio_light", scale_popup=3)
+
+ col = split.column()
+ col.operator("preferences.studiolight_show", emboss=False, text="", icon='PREFERENCES')
+
+ split = layout.split(factor=0.9)
+ col = split.column()
+ col.prop(shading, "studiolight_rotate_z", text="Rotation")
+ col.prop(shading, "studiolight_intensity")
+ col.prop(shading, "studiolight_background_alpha")
+
+class CYCLES_VIEW3D_PT_simplify_greasepencil(CyclesButtonsPanel, Panel, GreasePencilSimplifyPanel):
+ bl_label = "Grease Pencil"
+ bl_parent_id = "CYCLES_RENDER_PT_simplify"
+ COMPAT_ENGINES = {'CYCLES'}
+ bl_options = {'DEFAULT_CLOSED'}
+
def draw_device(self, context):
scene = context.scene
layout = self.layout
@@ -2063,8 +2223,6 @@ def draw_device(self, context):
col = layout.column()
col.prop(cscene, "feature_set")
- scene = context.scene
-
col = layout.column()
col.active = show_device_active(context)
col.prop(cscene, "device")
@@ -2083,7 +2241,7 @@ def draw_pause(self, context):
if view.shading.type == 'RENDERED':
cscene = scene.cycles
- layout.prop(cscene, "preview_pause", icon='PAUSE', text="")
+ layout.prop(cscene, "preview_pause", icon='PLAY' if cscene.preview_pause else 'PAUSE', text="")
def get_panels():
@@ -2098,6 +2256,7 @@ def get_panels():
'MATERIAL_PT_preview',
'NODE_DATA_PT_light',
'NODE_DATA_PT_spot',
+ 'OBJECT_PT_visibility',
'VIEWLAYER_PT_filter',
'VIEWLAYER_PT_layer_passes',
'RENDER_PT_post_processing',
@@ -2118,6 +2277,8 @@ classes = (
CYCLES_PT_integrator_presets,
CYCLES_RENDER_PT_sampling,
CYCLES_RENDER_PT_sampling_sub_samples,
+ CYCLES_RENDER_PT_sampling_adaptive,
+ CYCLES_RENDER_PT_sampling_denoising,
CYCLES_RENDER_PT_sampling_advanced,
CYCLES_RENDER_PT_light_paths,
CYCLES_RENDER_PT_light_paths_max_bounces,
@@ -2130,6 +2291,9 @@ classes = (
CYCLES_RENDER_PT_simplify_viewport,
CYCLES_RENDER_PT_simplify_render,
CYCLES_RENDER_PT_simplify_culling,
+ CYCLES_VIEW3D_PT_simplify_greasepencil,
+ CYCLES_VIEW3D_PT_shading_lighting,
+ CYCLES_VIEW3D_PT_shading_render_pass,
CYCLES_RENDER_PT_motion_blur,
CYCLES_RENDER_PT_motion_blur_curve,
CYCLES_RENDER_PT_film,
@@ -2141,23 +2305,25 @@ classes = (
CYCLES_RENDER_PT_performance_acceleration_structure,
CYCLES_RENDER_PT_performance_final_render,
CYCLES_RENDER_PT_performance_viewport,
- CYCLES_RENDER_PT_filter,
- CYCLES_RENDER_PT_override,
CYCLES_RENDER_PT_passes,
CYCLES_RENDER_PT_passes_data,
CYCLES_RENDER_PT_passes_light,
CYCLES_RENDER_PT_passes_crypto,
CYCLES_RENDER_PT_passes_debug,
+ CYCLES_RENDER_UL_aov,
+ CYCLES_RENDER_PT_passes_aov,
+ CYCLES_RENDER_PT_filter,
+ CYCLES_RENDER_PT_override,
CYCLES_RENDER_PT_denoising,
CYCLES_PT_post_processing,
CYCLES_CAMERA_PT_dof,
CYCLES_CAMERA_PT_dof_aperture,
- CYCLES_CAMERA_PT_dof_viewport,
CYCLES_PT_context_material,
CYCLES_OBJECT_PT_motion_blur,
- CYCLES_OBJECT_PT_cycles_settings,
- CYCLES_OBJECT_PT_cycles_settings_ray_visibility,
- CYCLES_OBJECT_PT_cycles_settings_performance,
+ CYCLES_OBJECT_PT_shading,
+ CYCLES_OBJECT_PT_visibility,
+ CYCLES_OBJECT_PT_visibility_ray_visibility,
+ CYCLES_OBJECT_PT_visibility_culling,
CYCLES_LIGHT_PT_preview,
CYCLES_LIGHT_PT_light,
CYCLES_LIGHT_PT_nodes,
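For orientation, the classes tuple being reordered and extended here (it continues past the end of this hunk) is what the add-on's register()/unregister() functions iterate over; those functions sit outside the hunks shown, so the following is only a sketch of the usual Blender pattern they follow, assuming the tuple name used above:

def register():
    from bpy.utils import register_class
    # Register every UI class declared in the tuple, in order.
    for cls in classes:
        register_class(cls)


def unregister():
    from bpy.utils import unregister_class
    # Remove them again, in reverse order.
    for cls in reversed(classes):
        unregister_class(cls)
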
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 6f005727b95..49f23f4ba30 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -22,140 +22,6 @@ import math
from bpy.app.handlers import persistent
-def foreach_cycles_nodetree_group(nodetree, traversed):
- for node in nodetree.nodes:
- if node.bl_idname == 'ShaderNodeGroup':
- group = node.node_tree
- if group and group not in traversed:
- traversed.add(group)
- yield group, group.library
- yield from foreach_cycles_nodetree_group(group, traversed)
-
-
-def foreach_cycles_nodetree():
- traversed = set()
-
- for material in bpy.data.materials:
- nodetree = material.node_tree
- if nodetree:
- yield nodetree, material.library
- yield from foreach_cycles_nodetree_group(nodetree, traversed)
-
- for world in bpy.data.worlds:
- nodetree = world.node_tree
- if nodetree:
- yield nodetree, world.library
- foreach_cycles_nodetree_group(nodetree, traversed)
-
- for light in bpy.data.lights:
- nodetree = light.node_tree
- if nodetree:
- yield nodetree, light.library
- foreach_cycles_nodetree_group(nodetree, traversed)
-
-
-def displacement_node_insert(nodetree):
- # Gather links to replace
- displacement_links = []
- for link in nodetree.links:
- if (
- link.to_node.bl_idname == 'ShaderNodeOutputMaterial' and
- link.from_node.bl_idname != 'ShaderNodeDisplacement' and
- link.to_socket.identifier == 'Displacement'
- ):
- displacement_links.append(link)
-
- # Replace links with displacement node
- for link in displacement_links:
- from_node = link.from_node
- from_socket = link.from_socket
- to_node = link.to_node
- to_socket = link.to_socket
-
- nodetree.links.remove(link)
-
- node = nodetree.nodes.new(type='ShaderNodeDisplacement')
- node.location[0] = 0.5 * (from_node.location[0] + to_node.location[0])
- node.location[1] = 0.5 * (from_node.location[1] + to_node.location[1])
- node.inputs['Scale'].default_value = 0.1
- node.inputs['Midlevel'].default_value = 0.0
-
- nodetree.links.new(from_socket, node.inputs['Height'])
- nodetree.links.new(node.outputs['Displacement'], to_socket)
-
-
-def displacement_principled_nodes(node):
- if node.bl_idname == 'ShaderNodeDisplacement':
- if node.space != 'WORLD':
- node.space = 'OBJECT'
- if node.bl_idname == 'ShaderNodeBsdfPrincipled':
- if node.subsurface_method != 'RANDOM_WALK':
- node.subsurface_method = 'BURLEY'
-
-
-def square_roughness_node_insert(nodetree):
- roughness_node_types = {
- 'ShaderNodeBsdfAnisotropic',
- 'ShaderNodeBsdfGlass',
- 'ShaderNodeBsdfGlossy',
- 'ShaderNodeBsdfRefraction'}
-
- # Update default values
- for node in nodetree.nodes:
- if node.bl_idname in roughness_node_types:
- roughness_input = node.inputs['Roughness']
- roughness_input.default_value = math.sqrt(max(roughness_input.default_value, 0.0))
-
- # Gather roughness links to replace
- roughness_links = []
- for link in nodetree.links:
- if link.to_node.bl_idname in roughness_node_types and \
- link.to_socket.identifier == 'Roughness':
- roughness_links.append(link)
-
- # Replace links with sqrt node
- for link in roughness_links:
- from_node = link.from_node
- from_socket = link.from_socket
- to_node = link.to_node
- to_socket = link.to_socket
-
- nodetree.links.remove(link)
-
- node = nodetree.nodes.new(type='ShaderNodeMath')
- node.operation = 'POWER'
- node.location[0] = 0.5 * (from_node.location[0] + to_node.location[0])
- node.location[1] = 0.5 * (from_node.location[1] + to_node.location[1])
-
- nodetree.links.new(from_socket, node.inputs[0])
- node.inputs[1].default_value = 0.5
- nodetree.links.new(node.outputs['Value'], to_socket)
-
-
-def mapping_node_order_flip(node):
- """
- Flip euler order of mapping shader node
- """
- if node.bl_idname == 'ShaderNodeMapping':
- rot = node.rotation.copy()
- rot.order = 'ZYX'
- quat = rot.to_quaternion()
- node.rotation = quat.to_euler('XYZ')
-
-
-def vector_curve_node_remap(node):
- """
- Remap values of vector curve node from normalized to absolute values
- """
- if node.bl_idname == 'ShaderNodeVectorCurve':
- node.mapping.use_clip = False
- for curve in node.mapping.curves:
- for point in curve.points:
- point.location.x = (point.location.x * 2.0) - 1.0
- point.location.y = (point.location.y - 0.5) * 2.0
- node.mapping.update()
-
-
def custom_bake_remap(scene):
"""
Remap bake types into the new types and set the flags accordingly
@@ -176,10 +42,7 @@ def custom_bake_remap(scene):
'GLOSSY_COLOR',
'TRANSMISSION_DIRECT',
'TRANSMISSION_INDIRECT',
- 'TRANSMISSION_COLOR',
- 'SUBSURFACE_DIRECT',
- 'SUBSURFACE_INDIRECT',
- 'SUBSURFACE_COLOR')
+ 'TRANSMISSION_COLOR')
diffuse_direct_idx = bake_lookup.index('DIFFUSE_DIRECT')
@@ -213,28 +76,6 @@ def custom_bake_remap(scene):
scene.render.bake.use_pass_indirect = False
-def ambient_occlusion_node_relink(nodetree):
- for node in nodetree.nodes:
- if node.bl_idname == 'ShaderNodeAmbientOcclusion':
- node.samples = 1
- node.only_local = False
- node.inputs['Distance'].default_value = 0.0
-
- # Gather links to replace
- ao_links = []
- for link in nodetree.links:
- if link.from_node.bl_idname == 'ShaderNodeAmbientOcclusion':
- ao_links.append(link)
-
- # Replace links
- for link in ao_links:
- from_node = link.from_node
- to_socket = link.to_socket
-
- nodetree.links.remove(link)
- nodetree.links.new(from_node.outputs['Color'], to_socket)
-
-
@persistent
def do_versions(self):
if bpy.context.preferences.version <= (2, 78, 1):
@@ -411,48 +252,3 @@ def do_versions(self):
cmat = mat.cycles
if not cmat.is_property_set("displacement_method"):
cmat.displacement_method = 'DISPLACEMENT'
-
- # Nodes
- for nodetree, library in foreach_cycles_nodetree():
- if library not in libraries:
- continue
-
- # Euler order was ZYX in previous versions.
- if version <= (2, 73, 4):
- for node in nodetree.nodes:
- mapping_node_order_flip(node)
-
- if version <= (2, 76, 5):
- for node in nodetree.nodes:
- vector_curve_node_remap(node)
-
- if version <= (2, 79, 1) or \
- (version >= (2, 80, 0) and version <= (2, 80, 3)):
- displacement_node_insert(nodetree)
-
- if version <= (2, 79, 2):
- for node in nodetree.nodes:
- displacement_principled_nodes(node)
-
- if version <= (2, 79, 3) or \
- (version >= (2, 80, 0) and version <= (2, 80, 4)):
- # Switch to squared roughness convention
- square_roughness_node_insert(nodetree)
-
- if version <= (2, 79, 4):
- ambient_occlusion_node_relink(nodetree)
-
- # Particles
- for part in bpy.data.particles:
- if part.library not in libraries:
- continue
-
- # Copy cycles hair settings to internal settings
- if version <= (2, 80, 15):
- cpart = part.get("cycles", None)
- if cpart:
- part.shape = cpart.get("shape", 0.0)
- part.root_radius = cpart.get("root_width", 1.0)
- part.tip_radius = cpart.get("tip_width", 0.0)
- part.radius_scale = cpart.get("radius_scale", 0.01)
- part.use_close_tip = cpart.get("use_closetip", True)
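The do_versions() handler trimmed in this file keys each fix-up on the version the .blend file (or linked library) was saved with, compared as a plain tuple; a minimal sketch of that gating pattern, with a hypothetical fix-up, would be:

import bpy
from bpy.app.handlers import persistent


@persistent
def do_versions_sketch(_arg):
    # bpy.data.version is the (major, minor, patch) Blender version the
    # current file was saved with, so tuple comparison reads as "older than".
    version = bpy.data.version
    if version <= (2, 79, 4):
        # Hypothetical fix-up that only applies to files saved with
        # Blender 2.79.4 or earlier.
        pass

# The real handler is hooked up elsewhere in the add-on, along the lines of:
# bpy.app.handlers.version_update.append(do_versions_sketch)
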
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index b3bfaa992a9..592a69585de 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -91,16 +91,31 @@ static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings &b_rende
{
memset((void *)bcam, 0, sizeof(BlenderCamera));
+ bcam->nearclip = 1e-5f;
+ bcam->farclip = 1e5f;
+
bcam->type = CAMERA_PERSPECTIVE;
+ bcam->ortho_scale = 1.0f;
+
+ bcam->lens = 50.0f;
+ bcam->shuttertime = 1.0f;
+
+ bcam->rolling_shutter_type = Camera::ROLLING_SHUTTER_NONE;
+ bcam->rolling_shutter_duration = 0.1f;
+
+ bcam->aperturesize = 0.0f;
+ bcam->apertureblades = 0;
+ bcam->aperturerotation = 0.0f;
+ bcam->focaldistance = 10.0f;
+
bcam->zoom = 1.0f;
bcam->pixelaspect = make_float2(1.0f, 1.0f);
+ bcam->aperture_ratio = 1.0f;
+
bcam->sensor_width = 36.0f;
bcam->sensor_height = 24.0f;
bcam->sensor_fit = BlenderCamera::AUTO;
- bcam->shuttertime = 1.0f;
bcam->motion_position = Camera::MOTION_POSITION_CENTER;
- bcam->rolling_shutter_type = Camera::ROLLING_SHUTTER_NONE;
- bcam->rolling_shutter_duration = 0.1f;
bcam->border.right = 1.0f;
bcam->border.top = 1.0f;
bcam->pano_viewplane.right = 1.0f;
@@ -108,6 +123,7 @@ static void blender_camera_init(BlenderCamera *bcam, BL::RenderSettings &b_rende
bcam->viewport_camera_border.right = 1.0f;
bcam->viewport_camera_border.top = 1.0f;
bcam->offscreen_dicing_scale = 1.0f;
+ bcam->matrix = transform_identity();
/* render resolution */
bcam->full_width = render_resolution_x(b_render);
@@ -119,10 +135,10 @@ static float blender_camera_focal_distance(BL::RenderEngine &b_engine,
BL::Camera &b_camera,
BlenderCamera *bcam)
{
- BL::Object b_dof_object = b_camera.dof_object();
+ BL::Object b_dof_object = b_camera.dof().focus_object();
if (!b_dof_object)
- return b_camera.dof_distance();
+ return b_camera.dof().focus_distance();
/* for dof object, return distance along camera Z direction */
BL::Array<float, 16> b_ob_matrix;
@@ -191,26 +207,30 @@ static void blender_camera_from_object(BlenderCamera *bcam,
bcam->lens = b_camera.lens();
- /* allow f/stop number to change aperture_size but still
- * give manual control over aperture radius */
- int aperture_type = get_enum(ccamera, "aperture_type");
-
- if (aperture_type == 1) {
- float fstop = RNA_float_get(&ccamera, "aperture_fstop");
+ if (b_camera.dof().use_dof()) {
+ /* allow f/stop number to change aperture_size but still
+ * give manual control over aperture radius */
+ float fstop = b_camera.dof().aperture_fstop();
fstop = max(fstop, 1e-5f);
if (bcam->type == CAMERA_ORTHOGRAPHIC)
bcam->aperturesize = 1.0f / (2.0f * fstop);
else
bcam->aperturesize = (bcam->lens * 1e-3f) / (2.0f * fstop);
- }
- else
- bcam->aperturesize = RNA_float_get(&ccamera, "aperture_size");
- bcam->apertureblades = RNA_int_get(&ccamera, "aperture_blades");
- bcam->aperturerotation = RNA_float_get(&ccamera, "aperture_rotation");
- bcam->focaldistance = blender_camera_focal_distance(b_engine, b_ob, b_camera, bcam);
- bcam->aperture_ratio = RNA_float_get(&ccamera, "aperture_ratio");
+ bcam->apertureblades = b_camera.dof().aperture_blades();
+ bcam->aperturerotation = b_camera.dof().aperture_rotation();
+ bcam->focaldistance = blender_camera_focal_distance(b_engine, b_ob, b_camera, bcam);
+ bcam->aperture_ratio = b_camera.dof().aperture_ratio();
+ }
+ else {
+ /* DOF is turned off for the camera. */
+ bcam->aperturesize = 0.0f;
+ bcam->apertureblades = 0;
+ bcam->aperturerotation = 0.0f;
+ bcam->focaldistance = 0.0f;
+ bcam->aperture_ratio = 1.0f;
+ }
bcam->shift.x = b_engine.camera_shift_x(b_ob, bcam->use_spherical_stereo);
bcam->shift.y = b_camera.shift_y();
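As a quick sanity check of the f-stop branch added in the hunk above (numbers are illustrative only), the aperture radius works out as follows:

# Worked example of the aperture-size math in blender_camera_from_object().
lens_mm = 50.0            # b_camera.lens()
fstop = max(2.8, 1e-5)    # b_camera.dof().aperture_fstop(), clamped

# Perspective camera: radius = focal length (in meters) / (2 * f-number).
aperture_persp = (lens_mm * 1e-3) / (2.0 * fstop)   # ~0.0089
# Orthographic camera: the focal length drops out.
aperture_ortho = 1.0 / (2.0 * fstop)                # ~0.1786

print(aperture_persp, aperture_ortho)
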
@@ -689,6 +709,10 @@ static void blender_camera_from_view(BlenderCamera *bcam,
/* 3d view transform */
bcam->matrix = transform_inverse(get_transform(b_rv3d.view_matrix()));
+
+ /* dimensions */
+ bcam->full_width = width;
+ bcam->full_height = height;
}
static void blender_camera_view_subset(BL::RenderEngine &b_engine,
@@ -705,22 +729,26 @@ static void blender_camera_view_subset(BL::RenderEngine &b_engine,
BoundBox2D cam, view;
float view_aspect, cam_aspect, sensor_size;
- /* get viewport viewplane */
+ /* Get viewport viewplane. */
BlenderCamera view_bcam;
blender_camera_init(&view_bcam, b_render);
blender_camera_from_view(&view_bcam, b_engine, b_scene, b_v3d, b_rv3d, width, height, true);
blender_camera_viewplane(&view_bcam, width, height, &view, &view_aspect, &sensor_size);
- /* get camera viewplane */
+ /* Get camera viewplane. */
BlenderCamera cam_bcam;
blender_camera_init(&cam_bcam, b_render);
blender_camera_from_object(&cam_bcam, b_engine, b_ob, true);
+ /* Camera border is affected by aspect, viewport is not. */
+ cam_bcam.pixelaspect.x = b_render.pixel_aspect_x();
+ cam_bcam.pixelaspect.y = b_render.pixel_aspect_y();
+
blender_camera_viewplane(
&cam_bcam, cam_bcam.full_width, cam_bcam.full_height, &cam, &cam_aspect, &sensor_size);
- /* return */
+ /* Return */
*view_box = view * (1.0f / view_aspect);
*cam_box = cam * (1.0f / cam_aspect);
}
@@ -848,7 +876,8 @@ BufferParams BlenderSync::get_buffer_params(BL::RenderSettings &b_render,
BL::RegionView3D &b_rv3d,
Camera *cam,
int width,
- int height)
+ int height,
+ const bool use_denoiser)
{
BufferParams params;
bool use_border = false;
@@ -879,6 +908,11 @@ BufferParams BlenderSync::get_buffer_params(BL::RenderSettings &b_render,
params.height = height;
}
+ PassType display_pass = update_viewport_display_passes(b_v3d, params.passes);
+
+ /* Can only denoise the combined image pass */
+ params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser;
+
return params;
}
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index d0375ceb79c..82c99631a89 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -17,7 +17,7 @@
#include "render/attribute.h"
#include "render/camera.h"
#include "render/curves.h"
-#include "render/mesh.h"
+#include "render/hair.h"
#include "render/object.h"
#include "render/scene.h"
@@ -38,27 +38,6 @@ ParticleCurveData::~ParticleCurveData()
{
}
-static void interp_weights(float t, float data[4])
-{
- /* Cardinal curve interpolation */
- float t2 = t * t;
- float t3 = t2 * t;
- float fc = 0.71f;
-
- data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t;
- data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f;
- data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t;
- data[3] = fc * t3 - fc * t2;
-}
-
-static void curveinterp_v3_v3v3v3v3(
- float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4])
-{
- p->x = v1->x * w[0] + v2->x * w[1] + v3->x * w[2] + v4->x * w[3];
- p->y = v1->y * w[0] + v2->y * w[1] + v3->y * w[2] + v4->y * w[3];
- p->z = v1->z * w[0] + v2->z * w[1] + v3->z * w[2] + v4->z * w[3];
-}
-
static float shaperadius(float shape, float root, float tip, float time)
{
assert(time >= 0.0f);
@@ -76,43 +55,13 @@ static float shaperadius(float shape, float root, float tip, float time)
/* curve functions */
-static void InterpolateKeySegments(
- int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData)
-{
- float3 ckey_loc1 = CData->curvekey_co[key];
- float3 ckey_loc2 = ckey_loc1;
- float3 ckey_loc3 = CData->curvekey_co[key + 1];
- float3 ckey_loc4 = ckey_loc3;
-
- if (key > CData->curve_firstkey[curve])
- ckey_loc1 = CData->curvekey_co[key - 1];
-
- if (key < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2)
- ckey_loc4 = CData->curvekey_co[key + 2];
-
- float time1 = CData->curvekey_time[key] / CData->curve_length[curve];
- float time2 = CData->curvekey_time[key + 1] / CData->curve_length[curve];
-
- float dfra = (time2 - time1) / (float)segno;
-
- if (time)
- *time = (dfra * seg) + time1;
-
- float t[4];
-
- interp_weights((float)seg / (float)segno, t);
-
- if (keyloc)
- curveinterp_v3_v3v3v3v3(keyloc, &ckey_loc1, &ckey_loc2, &ckey_loc3, &ckey_loc4, t);
-}
-
static bool ObtainCacheParticleData(
- Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
+ Hair *hair, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
{
int curvenum = 0;
int keyno = 0;
- if (!(mesh && b_mesh && b_ob && CData))
+ if (!(hair && b_mesh && b_ob && CData))
return false;
Transform tfm = get_transform(b_ob->matrix_world());
@@ -128,7 +77,7 @@ static bool ObtainCacheParticleData(
if ((b_part.render_type() == BL::ParticleSettings::render_type_PATH) &&
(b_part.type() == BL::ParticleSettings::type_HAIR)) {
- int shader = clamp(b_part.material() - 1, 0, mesh->used_shaders.size() - 1);
+ int shader = clamp(b_part.material() - 1, 0, hair->used_shaders.size() - 1);
int display_step = background ? b_part.render_step() : b_part.display_step();
int totparts = b_psys.particles.length();
int totchild = background ? b_psys.child_particles.length() :
@@ -173,19 +122,20 @@ static bool ObtainCacheParticleData(
CData->curve_firstkey.push_back_slow(keyno);
float curve_length = 0.0f;
- float3 pcKey;
+ float3 prev_co_world = make_float3(0.0f, 0.0f, 0.0f);
+ float3 prev_co_object = make_float3(0.0f, 0.0f, 0.0f);
for (int step_no = 0; step_no < ren_step; step_no++) {
- float nco[3];
- b_psys.co_hair(*b_ob, pa_no, step_no, nco);
- float3 cKey = make_float3(nco[0], nco[1], nco[2]);
- cKey = transform_point(&itfm, cKey);
+ float3 co_world = prev_co_world;
+ b_psys.co_hair(*b_ob, pa_no, step_no, &co_world.x);
+ float3 co_object = transform_point(&itfm, co_world);
if (step_no > 0) {
- const float step_length = len(cKey - pcKey);
+ const float step_length = len(co_object - prev_co_object);
curve_length += step_length;
}
- CData->curvekey_co.push_back_slow(cKey);
+ CData->curvekey_co.push_back_slow(co_object);
CData->curvekey_time.push_back_slow(curve_length);
- pcKey = cKey;
+ prev_co_object = co_object;
+ prev_co_world = co_world;
keynum++;
}
keyno += keynum;
@@ -201,14 +151,14 @@ static bool ObtainCacheParticleData(
return true;
}
-static bool ObtainCacheParticleUV(Mesh *mesh,
+static bool ObtainCacheParticleUV(Hair *hair,
BL::Mesh *b_mesh,
BL::Object *b_ob,
ParticleCurveData *CData,
bool background,
int uv_num)
{
- if (!(mesh && b_mesh && b_ob && CData))
+ if (!(hair && b_mesh && b_ob && CData))
return false;
CData->curve_uv.clear();
@@ -264,14 +214,14 @@ static bool ObtainCacheParticleUV(Mesh *mesh,
return true;
}
-static bool ObtainCacheParticleVcol(Mesh *mesh,
+static bool ObtainCacheParticleVcol(Hair *hair,
BL::Mesh *b_mesh,
BL::Object *b_ob,
ParticleCurveData *CData,
bool background,
int vcol_num)
{
- if (!(mesh && b_mesh && b_ob && CData))
+ if (!(hair && b_mesh && b_ob && CData))
return false;
CData->curve_vcol.clear();
@@ -312,7 +262,7 @@ static bool ObtainCacheParticleVcol(Mesh *mesh,
BL::Mesh::vertex_colors_iterator l;
b_mesh->vertex_colors.begin(l);
- float3 vcol = make_float3(0.0f, 0.0f, 0.0f);
+ float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
if (b_mesh->vertex_colors.length())
b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x);
CData->curve_vcol.push_back_slow(vcol);
@@ -327,287 +277,21 @@ static bool ObtainCacheParticleVcol(Mesh *mesh,
return true;
}
-static void ExportCurveTrianglePlanes(Mesh *mesh,
- ParticleCurveData *CData,
- float3 RotCam,
- bool is_ortho)
-{
- int vertexno = mesh->verts.size();
- int vertexindex = vertexno;
- int numverts = 0, numtris = 0;
-
- /* compute and reserve size of arrays */
- for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
- for (int curve = CData->psys_firstcurve[sys];
- curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
- curve++) {
- numverts += 2 + (CData->curve_keynum[curve] - 1) * 2;
- numtris += (CData->curve_keynum[curve] - 1) * 2;
- }
- }
-
- mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);
-
- /* actually export */
- for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
- for (int curve = CData->psys_firstcurve[sys];
- curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
- curve++) {
- float3 xbasis;
- float3 v1;
- float time = 0.0f;
- float3 ickey_loc = CData->curvekey_co[CData->curve_firstkey[curve]];
- float radius = shaperadius(
- CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], 0.0f);
- v1 = CData->curvekey_co[CData->curve_firstkey[curve] + 1] -
- CData->curvekey_co[CData->curve_firstkey[curve]];
- if (is_ortho)
- xbasis = normalize(cross(RotCam, v1));
- else
- xbasis = normalize(cross(RotCam - ickey_loc, v1));
- float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
- float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
- mesh->add_vertex(ickey_loc_shfl);
- mesh->add_vertex(ickey_loc_shfr);
- vertexindex += 2;
-
- for (int curvekey = CData->curve_firstkey[curve] + 1;
- curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve];
- curvekey++) {
- ickey_loc = CData->curvekey_co[curvekey];
-
- if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)
- v1 = CData->curvekey_co[curvekey] -
- CData->curvekey_co[max(curvekey - 1, CData->curve_firstkey[curve])];
- else
- v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey - 1];
-
- time = CData->curvekey_time[curvekey] / CData->curve_length[curve];
- radius = shaperadius(
- CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], time);
-
- if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)
- radius = shaperadius(CData->psys_shape[sys],
- CData->psys_rootradius[sys],
- CData->psys_tipradius[sys],
- 0.95f);
-
- if (CData->psys_closetip[sys] &&
- (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1))
- radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f);
-
- if (is_ortho)
- xbasis = normalize(cross(RotCam, v1));
- else
- xbasis = normalize(cross(RotCam - ickey_loc, v1));
- float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
- float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
- mesh->add_vertex(ickey_loc_shfl);
- mesh->add_vertex(ickey_loc_shfr);
- mesh->add_triangle(
- vertexindex - 2, vertexindex, vertexindex - 1, CData->psys_shader[sys], true);
- mesh->add_triangle(
- vertexindex + 1, vertexindex - 1, vertexindex, CData->psys_shader[sys], true);
- vertexindex += 2;
- }
- }
- }
-
- mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
- mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
- mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
- mesh->add_face_normals();
- mesh->add_vertex_normals();
- mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
-
- /* texture coords still needed */
-}
-
-static void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution)
-{
- int vertexno = mesh->verts.size();
- int vertexindex = vertexno;
- int numverts = 0, numtris = 0;
-
- /* compute and reserve size of arrays */
- for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
- for (int curve = CData->psys_firstcurve[sys];
- curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
- curve++) {
- numverts += (CData->curve_keynum[curve] - 1) * resolution + resolution;
- numtris += (CData->curve_keynum[curve] - 1) * 2 * resolution;
- }
- }
-
- mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);
-
- /* actually export */
- for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
- for (int curve = CData->psys_firstcurve[sys];
- curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
- curve++) {
- float3 firstxbasis = cross(make_float3(1.0f, 0.0f, 0.0f),
- CData->curvekey_co[CData->curve_firstkey[curve] + 1] -
- CData->curvekey_co[CData->curve_firstkey[curve]]);
- if (!is_zero(firstxbasis))
- firstxbasis = normalize(firstxbasis);
- else
- firstxbasis = normalize(cross(make_float3(0.0f, 1.0f, 0.0f),
- CData->curvekey_co[CData->curve_firstkey[curve] + 1] -
- CData->curvekey_co[CData->curve_firstkey[curve]]));
-
- for (int curvekey = CData->curve_firstkey[curve];
- curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
- curvekey++) {
- float3 xbasis = firstxbasis;
- float3 v1;
- float3 v2;
-
- if (curvekey == CData->curve_firstkey[curve]) {
- v1 = CData->curvekey_co[min(
- curvekey + 2, CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)] -
- CData->curvekey_co[curvekey + 1];
- v2 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
- }
- else if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) {
- v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
- v2 = CData->curvekey_co[curvekey - 1] -
- CData->curvekey_co[max(curvekey - 2, CData->curve_firstkey[curve])];
- }
- else {
- v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
- v2 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
- }
-
- xbasis = cross(v1, v2);
-
- if (len_squared(xbasis) >= 0.05f * len_squared(v1) * len_squared(v2)) {
- firstxbasis = normalize(xbasis);
- break;
- }
- }
-
- for (int curvekey = CData->curve_firstkey[curve];
- curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
- curvekey++) {
- int subv = 1;
- float3 xbasis;
- float3 ybasis;
- float3 v1;
- float3 v2;
-
- if (curvekey == CData->curve_firstkey[curve]) {
- subv = 0;
- v1 = CData->curvekey_co[min(
- curvekey + 2, CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)] -
- CData->curvekey_co[curvekey + 1];
- v2 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
- }
- else if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) {
- v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
- v2 = CData->curvekey_co[curvekey - 1] -
- CData->curvekey_co[max(curvekey - 2, CData->curve_firstkey[curve])];
- }
- else {
- v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey];
- v2 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1];
- }
-
- xbasis = cross(v1, v2);
-
- if (len_squared(xbasis) >= 0.05f * len_squared(v1) * len_squared(v2)) {
- xbasis = normalize(xbasis);
- firstxbasis = xbasis;
- }
- else
- xbasis = firstxbasis;
-
- ybasis = normalize(cross(xbasis, v2));
-
- for (; subv <= 1; subv++) {
- float3 ickey_loc = make_float3(0.0f, 0.0f, 0.0f);
- float time = 0.0f;
-
- InterpolateKeySegments(subv, 1, curvekey, curve, &ickey_loc, &time, CData);
-
- float radius = shaperadius(CData->psys_shape[sys],
- CData->psys_rootradius[sys],
- CData->psys_tipradius[sys],
- time);
-
- if ((curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2) &&
- (subv == 1))
- radius = shaperadius(CData->psys_shape[sys],
- CData->psys_rootradius[sys],
- CData->psys_tipradius[sys],
- 0.95f);
-
- if (CData->psys_closetip[sys] && (subv == 1) &&
- (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2))
- radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f);
-
- float angle = M_2PI_F / (float)resolution;
- for (int section = 0; section < resolution; section++) {
- float3 ickey_loc_shf = ickey_loc + radius * (cosf(angle * section) * xbasis +
- sinf(angle * section) * ybasis);
- mesh->add_vertex(ickey_loc_shf);
- }
-
- if (subv != 0) {
- for (int section = 0; section < resolution - 1; section++) {
- mesh->add_triangle(vertexindex - resolution + section,
- vertexindex + section,
- vertexindex - resolution + section + 1,
- CData->psys_shader[sys],
- true);
- mesh->add_triangle(vertexindex + section + 1,
- vertexindex - resolution + section + 1,
- vertexindex + section,
- CData->psys_shader[sys],
- true);
- }
- mesh->add_triangle(vertexindex - 1,
- vertexindex + resolution - 1,
- vertexindex - resolution,
- CData->psys_shader[sys],
- true);
- mesh->add_triangle(vertexindex,
- vertexindex - resolution,
- vertexindex + resolution - 1,
- CData->psys_shader[sys],
- true);
- }
- vertexindex += resolution;
- }
- }
- }
- }
-
- mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
- mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
- mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
- mesh->add_face_normals();
- mesh->add_vertex_normals();
- mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
-
- /* texture coords still needed */
-}
-
-static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
+static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CData)
{
int num_keys = 0;
int num_curves = 0;
- if (mesh->num_curves())
+ if (hair->num_curves())
return;
Attribute *attr_intercept = NULL;
Attribute *attr_random = NULL;
- if (mesh->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
- attr_intercept = mesh->curve_attributes.add(ATTR_STD_CURVE_INTERCEPT);
- if (mesh->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
- attr_random = mesh->curve_attributes.add(ATTR_STD_CURVE_RANDOM);
+ if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
+ attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
+ if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
+ attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
/* compute and reserve size of arrays */
for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
@@ -620,10 +304,10 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
}
if (num_curves > 0) {
- VLOG(1) << "Exporting curve segments for mesh " << mesh->name;
+ VLOG(1) << "Exporting curve segments for mesh " << hair->name;
}
- mesh->reserve_curves(mesh->num_curves() + num_curves, mesh->curve_keys.size() + num_keys);
+ hair->reserve_curves(hair->num_curves() + num_curves, hair->curve_keys.size() + num_keys);
num_keys = 0;
num_curves = 0;
@@ -648,7 +332,7 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
(curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)) {
radius = 0.0f;
}
- mesh->add_curve_key(ickey_loc, radius);
+ hair->add_curve_key(ickey_loc, radius);
if (attr_intercept)
attr_intercept->add(time);
@@ -656,19 +340,19 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
}
if (attr_random != NULL) {
- attr_random->add(hash_int_01(num_curves));
+ attr_random->add(hash_uint2_to_float(num_curves, 0));
}
- mesh->add_curve(num_keys, CData->psys_shader[sys]);
+ hair->add_curve(num_keys, CData->psys_shader[sys]);
num_keys += num_curve_keys;
num_curves++;
}
}
/* check allocation */
- if ((mesh->curve_keys.size() != num_keys) || (mesh->num_curves() != num_curves)) {
+ if ((hair->curve_keys.size() != num_keys) || (hair->num_curves() != num_curves)) {
VLOG(1) << "Allocation failed, clearing data";
- mesh->clear();
+ hair->clear();
}
}
@@ -712,24 +396,58 @@ static float4 LerpCurveSegmentMotionCV(ParticleCurveData *CData, int sys, int cu
return lerp(mP, mP2, remainder);
}
-static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int motion_step)
+static void export_hair_motion_validate_attribute(Hair *hair,
+ int motion_step,
+ int num_motion_keys,
+ bool have_motion)
+{
+ Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ const int num_keys = hair->curve_keys.size();
+
+ if (num_motion_keys != num_keys || !have_motion) {
+ /* No motion or hair "topology" changed, remove attributes again. */
+ if (num_motion_keys != num_keys) {
+ VLOG(1) << "Hair topology changed, removing attribute.";
+ }
+ else {
+ VLOG(1) << "No motion, removing attribute.";
+ }
+ hair->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
+ }
+ else if (motion_step > 0) {
+ VLOG(1) << "Filling in new motion vertex position for motion_step " << motion_step;
+
+ /* Motion, fill up previous steps that we might have skipped because
+ * they had no motion, but we need them anyway now. */
+ for (int step = 0; step < motion_step; step++) {
+ float4 *mP = attr_mP->data_float4() + step * num_keys;
+
+ for (int key = 0; key < num_keys; key++) {
+ mP[key] = float3_to_float4(hair->curve_keys[key]);
+ mP[key].w = hair->curve_radius[key];
+ }
+ }
+ }
+}
+
+static void ExportCurveSegmentsMotion(Hair *hair, ParticleCurveData *CData, int motion_step)
{
- VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name << ", motion step "
+ VLOG(1) << "Exporting curve motion segments for hair " << hair->name << ", motion step "
<< motion_step;
/* find attribute */
- Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
bool new_attribute = false;
/* add new attribute if it doesn't exist already */
if (!attr_mP) {
VLOG(1) << "Creating new motion vertex position attribute";
- attr_mP = mesh->curve_attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+ attr_mP = hair->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
new_attribute = true;
}
/* export motion vectors for curve keys */
- size_t numkeys = mesh->curve_keys.size();
+ size_t numkeys = hair->curve_keys.size();
float4 *mP = attr_mP->data_float4() + motion_step * numkeys;
bool have_motion = false;
int i = 0;
@@ -740,24 +458,24 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
curve++) {
/* Curve lengths may not match! Curves can be clipped. */
- int curve_key_end = (num_curves + 1 < (int)mesh->curve_first_key.size() ?
- mesh->curve_first_key[num_curves + 1] :
- (int)mesh->curve_keys.size());
- const int num_center_curve_keys = curve_key_end - mesh->curve_first_key[num_curves];
+ int curve_key_end = (num_curves + 1 < (int)hair->curve_first_key.size() ?
+ hair->curve_first_key[num_curves + 1] :
+ (int)hair->curve_keys.size());
+ const int num_center_curve_keys = curve_key_end - hair->curve_first_key[num_curves];
const int is_num_keys_different = CData->curve_keynum[curve] - num_center_curve_keys;
if (!is_num_keys_different) {
for (int curvekey = CData->curve_firstkey[curve];
curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve];
curvekey++) {
- if (i < mesh->curve_keys.size()) {
+ if (i < hair->curve_keys.size()) {
mP[i] = CurveSegmentMotionCV(CData, sys, curve, curvekey);
if (!have_motion) {
/* unlike mesh coordinates, these tend to be slightly different
* between frames due to particle transforms into/out of object
* space, so we use an epsilon to detect actual changes */
- float4 curve_key = float3_to_float4(mesh->curve_keys[i]);
- curve_key.w = mesh->curve_radius[i];
+ float4 curve_key = float3_to_float4(hair->curve_keys[i]);
+ curve_key.w = hair->curve_radius[i];
if (len_squared(mP[i] - curve_key) > 1e-5f * 1e-5f)
have_motion = true;
}
@@ -766,7 +484,7 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
}
}
else {
- /* Number of keys has changed. Genereate an interpolated version
+ /* Number of keys has changed. Generate an interpolated version
* to preserve motion blur. */
const float step_size = num_center_curve_keys > 1 ? 1.0f / (num_center_curve_keys - 1) :
0.0f;
@@ -781,276 +499,69 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
}
}
- /* in case of new attribute, we verify if there really was any motion */
+ /* In case of new attribute, we verify if there really was any motion. */
if (new_attribute) {
- if (i != numkeys || !have_motion) {
- /* No motion or hair "topology" changed, remove attributes again. */
- if (i != numkeys) {
- VLOG(1) << "Hair topology changed, removing attribute.";
- }
- else {
- VLOG(1) << "No motion, removing attribute.";
- }
- mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
- }
- else if (motion_step > 0) {
- VLOG(1) << "Filling in new motion vertex position for motion_step " << motion_step;
- /* motion, fill up previous steps that we might have skipped because
- * they had no motion, but we need them anyway now */
- for (int step = 0; step < motion_step; step++) {
- float4 *mP = attr_mP->data_float4() + step * numkeys;
-
- for (int key = 0; key < numkeys; key++) {
- mP[key] = float3_to_float4(mesh->curve_keys[key]);
- mP[key].w = mesh->curve_radius[key];
- }
- }
- }
- }
-}
-
-static void ExportCurveTriangleUV(ParticleCurveData *CData,
- int vert_offset,
- int resol,
- float2 *uvdata)
-{
- if (uvdata == NULL)
- return;
- int vertexindex = vert_offset;
-
- for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
- for (int curve = CData->psys_firstcurve[sys];
- curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
- curve++) {
- for (int curvekey = CData->curve_firstkey[curve];
- curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
- curvekey++) {
- for (int section = 0; section < resol; section++) {
- uvdata[vertexindex] = CData->curve_uv[curve];
- vertexindex++;
- uvdata[vertexindex] = CData->curve_uv[curve];
- vertexindex++;
- uvdata[vertexindex] = CData->curve_uv[curve];
- vertexindex++;
- uvdata[vertexindex] = CData->curve_uv[curve];
- vertexindex++;
- uvdata[vertexindex] = CData->curve_uv[curve];
- vertexindex++;
- uvdata[vertexindex] = CData->curve_uv[curve];
- vertexindex++;
- }
- }
- }
- }
-}
-
-static void ExportCurveTriangleVcol(ParticleCurveData *CData,
- int vert_offset,
- int resol,
- uchar4 *cdata)
-{
- if (cdata == NULL)
- return;
-
- int vertexindex = vert_offset;
-
- for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
- for (int curve = CData->psys_firstcurve[sys];
- curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
- curve++) {
- for (int curvekey = CData->curve_firstkey[curve];
- curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1;
- curvekey++) {
- for (int section = 0; section < resol; section++) {
- /* Encode vertex color using the sRGB curve. */
- cdata[vertexindex] = color_float_to_byte(
- color_srgb_to_linear_v3(CData->curve_vcol[curve]));
- vertexindex++;
- cdata[vertexindex] = color_float_to_byte(
- color_srgb_to_linear_v3(CData->curve_vcol[curve]));
- vertexindex++;
- cdata[vertexindex] = color_float_to_byte(
- color_srgb_to_linear_v3(CData->curve_vcol[curve]));
- vertexindex++;
- cdata[vertexindex] = color_float_to_byte(
- color_srgb_to_linear_v3(CData->curve_vcol[curve]));
- vertexindex++;
- cdata[vertexindex] = color_float_to_byte(
- color_srgb_to_linear_v3(CData->curve_vcol[curve]));
- vertexindex++;
- cdata[vertexindex] = color_float_to_byte(
- color_srgb_to_linear_v3(CData->curve_vcol[curve]));
- vertexindex++;
- }
- }
- }
+ export_hair_motion_validate_attribute(hair, motion_step, i, have_motion);
}
}
/* Hair Curve Sync */
-void BlenderSync::sync_curve_settings()
+bool BlenderSync::object_has_particle_hair(BL::Object b_ob)
{
- PointerRNA csscene = RNA_pointer_get(&b_scene.ptr, "cycles_curves");
-
- CurveSystemManager *curve_system_manager = scene->curve_system_manager;
- CurveSystemManager prev_curve_system_manager = *curve_system_manager;
-
- curve_system_manager->use_curves = get_boolean(csscene, "use_curves");
- curve_system_manager->minimum_width = get_float(csscene, "minimum_width");
- curve_system_manager->maximum_width = get_float(csscene, "maximum_width");
-
- curve_system_manager->primitive = (CurvePrimitiveType)get_enum(
- csscene, "primitive", CURVE_NUM_PRIMITIVE_TYPES, CURVE_LINE_SEGMENTS);
- curve_system_manager->curve_shape = (CurveShapeType)get_enum(
- csscene, "shape", CURVE_NUM_SHAPE_TYPES, CURVE_THICK);
- curve_system_manager->resolution = get_int(csscene, "resolution");
- curve_system_manager->subdivisions = get_int(csscene, "subdivisions");
- curve_system_manager->use_backfacing = !get_boolean(csscene, "cull_backfacing");
-
- /* Triangles */
- if (curve_system_manager->primitive == CURVE_TRIANGLES) {
- /* camera facing planes */
- if (curve_system_manager->curve_shape == CURVE_RIBBON) {
- curve_system_manager->triangle_method = CURVE_CAMERA_TRIANGLES;
- curve_system_manager->resolution = 1;
- }
- else if (curve_system_manager->curve_shape == CURVE_THICK) {
- curve_system_manager->triangle_method = CURVE_TESSELATED_TRIANGLES;
- }
- }
- /* Line Segments */
- else if (curve_system_manager->primitive == CURVE_LINE_SEGMENTS) {
- if (curve_system_manager->curve_shape == CURVE_RIBBON) {
- /* tangent shading */
- curve_system_manager->line_method = CURVE_UNCORRECTED;
- curve_system_manager->use_encasing = true;
- curve_system_manager->use_backfacing = false;
- curve_system_manager->use_tangent_normal_geometry = true;
- }
- else if (curve_system_manager->curve_shape == CURVE_THICK) {
- curve_system_manager->line_method = CURVE_ACCURATE;
- curve_system_manager->use_encasing = false;
- curve_system_manager->use_tangent_normal_geometry = false;
- }
- }
- /* Curve Segments */
- else if (curve_system_manager->primitive == CURVE_SEGMENTS) {
- if (curve_system_manager->curve_shape == CURVE_RIBBON) {
- curve_system_manager->primitive = CURVE_RIBBONS;
- curve_system_manager->use_backfacing = false;
- }
- }
+ /* Test if the object has a particle modifier with hair. */
+ BL::Object::modifiers_iterator b_mod;
+ for (b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
+ if ((b_mod->type() == b_mod->type_PARTICLE_SYSTEM) &&
+ (preview ? b_mod->show_viewport() : b_mod->show_render())) {
+ BL::ParticleSystemModifier psmd((const PointerRNA)b_mod->ptr);
+ BL::ParticleSystem b_psys((const PointerRNA)psmd.particle_system().ptr);
+ BL::ParticleSettings b_part((const PointerRNA)b_psys.settings().ptr);
- if (curve_system_manager->modified_mesh(prev_curve_system_manager)) {
- BL::BlendData::objects_iterator b_ob;
-
- for (b_data.objects.begin(b_ob); b_ob != b_data.objects.end(); ++b_ob) {
- if (object_is_mesh(*b_ob)) {
- BL::Object::particle_systems_iterator b_psys;
- for (b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end();
- ++b_psys) {
- if ((b_psys->settings().render_type() == BL::ParticleSettings::render_type_PATH) &&
- (b_psys->settings().type() == BL::ParticleSettings::type_HAIR)) {
- BL::ID key = BKE_object_is_modified(*b_ob) ? *b_ob : b_ob->data();
- mesh_map.set_recalc(key);
- object_map.set_recalc(*b_ob);
- }
- }
+ if ((b_part.render_type() == BL::ParticleSettings::render_type_PATH) &&
+ (b_part.type() == BL::ParticleSettings::type_HAIR)) {
+ return true;
}
}
}
- if (curve_system_manager->modified(prev_curve_system_manager))
- curve_system_manager->tag_update(scene);
+ return false;
}
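The object_has_particle_hair() helper introduced above only walks the modifier stack; a rough Python-side equivalent of the same test (a sketch for illustration, not code from this patch) would be:

def object_has_particle_hair(ob, preview):
    # True when the object carries an enabled Particle System modifier whose
    # settings describe hair rendered as paths, mirroring the C++ check.
    for mod in ob.modifiers:
        if mod.type != 'PARTICLE_SYSTEM':
            continue
        if not (mod.show_viewport if preview else mod.show_render):
            continue
        settings = mod.particle_system.settings
        if settings.type == 'HAIR' and settings.render_type == 'PATH':
            return True
    return False
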
-void BlenderSync::sync_curves(
- Mesh *mesh, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step)
+/* Old particle hair. */
+void BlenderSync::sync_particle_hair(
+ Hair *hair, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step)
{
- if (!motion) {
- /* Clear stored curve data */
- mesh->curve_keys.clear();
- mesh->curve_radius.clear();
- mesh->curve_first_key.clear();
- mesh->curve_shader.clear();
- mesh->curve_attributes.clear();
- }
-
/* obtain general settings */
- const bool use_curves = scene->curve_system_manager->use_curves;
-
- if (!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) {
- if (!motion)
- mesh->compute_bounds();
+ if (b_ob.mode() == b_ob.mode_PARTICLE_EDIT || b_ob.mode() == b_ob.mode_EDIT) {
return;
}
- const int primitive = scene->curve_system_manager->primitive;
- const int triangle_method = scene->curve_system_manager->triangle_method;
- const int resolution = scene->curve_system_manager->resolution;
- const size_t vert_num = mesh->verts.size();
- const size_t tri_num = mesh->num_triangles();
- int used_res = 1;
-
 /* extract particle hair data - should be combined with connecting to mesh later. */
ParticleCurveData CData;
- ObtainCacheParticleData(mesh, &b_mesh, &b_ob, &CData, !preview);
-
- /* add hair geometry to mesh */
- if (primitive == CURVE_TRIANGLES) {
- if (triangle_method == CURVE_CAMERA_TRIANGLES) {
- /* obtain camera parameters */
- float3 RotCam;
- Camera *camera = scene->camera;
- Transform &ctfm = camera->matrix;
- if (camera->type == CAMERA_ORTHOGRAPHIC) {
- RotCam = -make_float3(ctfm.x.z, ctfm.y.z, ctfm.z.z);
- }
- else {
- Transform tfm = get_transform(b_ob.matrix_world());
- Transform itfm = transform_quick_inverse(tfm);
- RotCam = transform_point(&itfm, make_float3(ctfm.x.w, ctfm.y.w, ctfm.z.w));
- }
- bool is_ortho = camera->type == CAMERA_ORTHOGRAPHIC;
- ExportCurveTrianglePlanes(mesh, &CData, RotCam, is_ortho);
- }
- else {
- ExportCurveTriangleGeometry(mesh, &CData, resolution);
- used_res = resolution;
- }
- }
- else {
- if (motion)
- ExportCurveSegmentsMotion(mesh, &CData, motion_step);
- else
- ExportCurveSegments(scene, mesh, &CData);
- }
+ ObtainCacheParticleData(hair, &b_mesh, &b_ob, &CData, !preview);
+
+ /* add hair geometry */
+ if (motion)
+ ExportCurveSegmentsMotion(hair, &CData, motion_step);
+ else
+ ExportCurveSegments(scene, hair, &CData);
/* generated coordinates from first key. we should ideally get this from
* blender to handle deforming objects */
if (!motion) {
- if (mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
+ if (hair->need_attribute(scene, ATTR_STD_GENERATED)) {
float3 loc, size;
mesh_texture_space(b_mesh, loc, size);
- if (primitive == CURVE_TRIANGLES) {
- Attribute *attr_generated = mesh->attributes.add(ATTR_STD_GENERATED);
- float3 *generated = attr_generated->data_float3();
+ Attribute *attr_generated = hair->attributes.add(ATTR_STD_GENERATED);
+ float3 *generated = attr_generated->data_float3();
- for (size_t i = vert_num; i < mesh->verts.size(); i++)
- generated[i] = mesh->verts[i] * size - loc;
- }
- else {
- Attribute *attr_generated = mesh->curve_attributes.add(ATTR_STD_GENERATED);
- float3 *generated = attr_generated->data_float3();
-
- for (size_t i = 0; i < mesh->num_curves(); i++) {
- float3 co = mesh->curve_keys[mesh->get_curve(i).first_key];
- generated[i] = co * size - loc;
- }
+ for (size_t i = 0; i < hair->num_curves(); i++) {
+ float3 co = hair->curve_keys[hair->get_curve(i).first_key];
+ generated[i] = co * size - loc;
}
}
}
@@ -1061,32 +572,22 @@ void BlenderSync::sync_curves(
int vcol_num = 0;
for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l, vcol_num++) {
- if (!mesh->need_attribute(scene, ustring(l->name().c_str())))
+ if (!hair->need_attribute(scene, ustring(l->name().c_str())))
continue;
- ObtainCacheParticleVcol(mesh, &b_mesh, &b_ob, &CData, !preview, vcol_num);
+ ObtainCacheParticleVcol(hair, &b_mesh, &b_ob, &CData, !preview, vcol_num);
- if (primitive == CURVE_TRIANGLES) {
- Attribute *attr_vcol = mesh->attributes.add(
- ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
+ Attribute *attr_vcol = hair->attributes.add(
+ ustring(l->name().c_str()), TypeRGBA, ATTR_ELEMENT_CURVE);
- uchar4 *cdata = attr_vcol->data_uchar4();
+ float4 *fdata = attr_vcol->data_float4();
- ExportCurveTriangleVcol(&CData, tri_num * 3, used_res, cdata);
- }
- else {
- Attribute *attr_vcol = mesh->curve_attributes.add(
- ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CURVE);
-
- float3 *fdata = attr_vcol->data_float3();
-
- if (fdata) {
- size_t i = 0;
+ if (fdata) {
+ size_t i = 0;
- /* Encode vertex color using the sRGB curve. */
- for (size_t curve = 0; curve < CData.curve_vcol.size(); curve++) {
- fdata[i++] = color_srgb_to_linear_v3(CData.curve_vcol[curve]);
- }
+ /* Encode vertex color using the sRGB curve. */
+ for (size_t curve = 0; curve < CData.curve_vcol.size(); curve++) {
+ fdata[i++] = color_srgb_to_linear_v4(CData.curve_vcol[curve]);
}
}
}
@@ -1103,42 +604,279 @@ void BlenderSync::sync_curves(
ustring name = ustring(l->name().c_str());
/* UV map */
- if (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
+ if (hair->need_attribute(scene, name) || hair->need_attribute(scene, std)) {
Attribute *attr_uv;
- ObtainCacheParticleUV(mesh, &b_mesh, &b_ob, &CData, !preview, uv_num);
+ ObtainCacheParticleUV(hair, &b_mesh, &b_ob, &CData, !preview, uv_num);
- if (primitive == CURVE_TRIANGLES) {
- if (active_render)
- attr_uv = mesh->attributes.add(std, name);
- else
- attr_uv = mesh->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CORNER);
+ if (active_render)
+ attr_uv = hair->attributes.add(std, name);
+ else
+ attr_uv = hair->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
- float2 *uv = attr_uv->data_float2();
+ float2 *uv = attr_uv->data_float2();
- ExportCurveTriangleUV(&CData, tri_num * 3, used_res, uv);
+ if (uv) {
+ size_t i = 0;
+
+ for (size_t curve = 0; curve < CData.curve_uv.size(); curve++) {
+ uv[i++] = CData.curve_uv[curve];
+ }
}
- else {
- if (active_render)
- attr_uv = mesh->curve_attributes.add(std, name);
- else
- attr_uv = mesh->curve_attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
+ }
+ }
+ }
+}
- float2 *uv = attr_uv->data_float2();
+static float4 hair_point_as_float4(BL::HairPoint b_point)
+{
+ float4 mP = float3_to_float4(get_float3(b_point.co()));
+ mP.w = b_point.radius();
+ return mP;
+}
- if (uv) {
- size_t i = 0;
+static float4 interpolate_hair_points(BL::Hair b_hair,
+ const int first_point_index,
+ const int num_points,
+ const float step)
+{
+ const float curve_t = step * (num_points - 1);
+ const int point_a = clamp((int)curve_t, 0, num_points - 1);
+ const int point_b = min(point_a + 1, num_points - 1);
+ const float t = curve_t - (float)point_a;
+ return lerp(hair_point_as_float4(b_hair.points[first_point_index + point_a]),
+ hair_point_as_float4(b_hair.points[first_point_index + point_b]),
+ t);
+}
- for (size_t curve = 0; curve < CData.curve_uv.size(); curve++) {
- uv[i++] = CData.curve_uv[curve];
- }
+static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
+{
+ /* TODO: optimize so we can straight memcpy arrays from Blender? */
+
+ /* Add requested attributes. */
+ Attribute *attr_intercept = NULL;
+ Attribute *attr_random = NULL;
+
+ if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) {
+ attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
+ }
+ if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) {
+ attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
+ }
+
+ /* Reserve memory. */
+ const int num_keys = b_hair.points.length();
+ const int num_curves = b_hair.curves.length();
+
+ if (num_curves > 0) {
+ VLOG(1) << "Exporting curve segments for hair " << hair->name;
+ }
+
+ hair->reserve_curves(num_curves, num_keys);
+
+ /* Export curves and points. */
+ vector<float> points_length;
+
+ BL::Hair::curves_iterator b_curve_iter;
+ for (b_hair.curves.begin(b_curve_iter); b_curve_iter != b_hair.curves.end(); ++b_curve_iter) {
+ BL::HairCurve b_curve = *b_curve_iter;
+ const int first_point_index = b_curve.first_point_index();
+ const int num_points = b_curve.num_points();
+
+ float3 prev_co = make_float3(0.0f, 0.0f, 0.0f);
+ float length = 0.0f;
+ if (attr_intercept) {
+ points_length.clear();
+ points_length.reserve(num_points);
+ }
+
+ /* Position and radius. */
+ for (int i = 0; i < num_points; i++) {
+ BL::HairPoint b_point = b_hair.points[first_point_index + i];
+
+ const float3 co = get_float3(b_point.co());
+ const float radius = b_point.radius();
+ hair->add_curve_key(co, radius);
+
+ if (attr_intercept) {
+ if (i > 0) {
+ length += len(co - prev_co);
+ points_length.push_back(length);
+ }
+ prev_co = co;
+ }
+ }
+
+ /* Normalized 0..1 attribute along curve. */
+ if (attr_intercept) {
+ for (int i = 0; i < num_points; i++) {
+ attr_intercept->add((length == 0.0f) ? 0.0f : points_length[i] / length);
+ }
+ }
+
+ /* Random number per curve. */
+ if (attr_random != NULL) {
+ attr_random->add(hash_uint2_to_float(b_curve.index(), 0));
+ }
+
+ /* Curve. */
+ const int shader_index = 0;
+ hair->add_curve(first_point_index, shader_index);
+ }
+}
+
+static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_step)
+{
+ VLOG(1) << "Exporting curve motion segments for hair " << hair->name << ", motion step "
+ << motion_step;
+
+ /* Find or add attribute. */
+ Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ bool new_attribute = false;
+
+ if (!attr_mP) {
+ VLOG(1) << "Creating new motion vertex position attribute";
+ attr_mP = hair->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+ new_attribute = true;
+ }
+
+ /* Export motion keys. */
+ const int num_keys = hair->curve_keys.size();
+ float4 *mP = attr_mP->data_float4() + motion_step * num_keys;
+ bool have_motion = false;
+ int num_motion_keys = 0;
+ int curve_index = 0;
+
+ BL::Hair::curves_iterator b_curve_iter;
+ for (b_hair.curves.begin(b_curve_iter); b_curve_iter != b_hair.curves.end(); ++b_curve_iter) {
+ BL::HairCurve b_curve = *b_curve_iter;
+ const int first_point_index = b_curve.first_point_index();
+ const int num_points = b_curve.num_points();
+
+ Hair::Curve curve = hair->get_curve(curve_index);
+ curve_index++;
+
+ if (num_points == curve.num_keys) {
+ /* Number of keys matches. */
+ for (int i = 0; i < num_points; i++) {
+ int point_index = first_point_index + i;
+
+ if (point_index < num_keys) {
+ mP[num_motion_keys] = hair_point_as_float4(b_hair.points[point_index]);
+ num_motion_keys++;
+
+ if (!have_motion) {
+ /* TODO: use epsilon for comparison? Was needed for particles due to
+ * transform, but ideally should not happen anymore. */
+ float4 curve_key = float3_to_float4(hair->curve_keys[i]);
+ curve_key.w = hair->curve_radius[i];
+ have_motion = !(mP[i] == curve_key);
}
}
}
}
+ else {
+ /* Number of keys has changed. Generate an interpolated version
+ * to preserve motion blur. */
+ const float step_size = curve.num_keys > 1 ? 1.0f / (curve.num_keys - 1) : 0.0f;
+ for (int i = 0; i < curve.num_keys; i++) {
+ const float step = i * step_size;
+ mP[num_motion_keys] = interpolate_hair_points(b_hair, first_point_index, num_points, step);
+ num_motion_keys++;
+ }
+ have_motion = true;
+ }
+ }
+
+ /* In case of new attribute, we verify if there really was any motion. */
+ if (new_attribute) {
+ export_hair_motion_validate_attribute(hair, motion_step, num_motion_keys, have_motion);
+ }
+}
+
+/* Hair object. */
+void BlenderSync::sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step)
+{
+ /* Convert Blender hair to Cycles curves. */
+ BL::Hair b_hair(b_ob.data());
+ if (motion) {
+ export_hair_curves_motion(hair, b_hair, motion_step);
+ }
+ else {
+ export_hair_curves(scene, hair, b_hair);
+ }
+}
+
+void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph,
+ BL::Object b_ob,
+ Hair *hair,
+ const vector<Shader *> &used_shaders)
+{
+ /* Compares curve_keys rather than strands in order to handle quick hair
+ * adjustments in dynamic BVH - other methods could probably do this better. */
+ array<float3> oldcurve_keys;
+ array<float> oldcurve_radius;
+ oldcurve_keys.steal_data(hair->curve_keys);
+ oldcurve_radius.steal_data(hair->curve_radius);
+
+ hair->clear();
+ hair->used_shaders = used_shaders;
+
+ if (view_layer.use_hair) {
+ if (b_ob.type() == BL::Object::type_HAIR) {
+ /* Hair object. */
+ sync_hair(hair, b_ob, false);
+ }
+ else {
+ /* Particle hair. */
+ bool need_undeformed = hair->need_attribute(scene, ATTR_STD_GENERATED);
+ BL::Mesh b_mesh = object_to_mesh(
+ b_data, b_ob, b_depsgraph, need_undeformed, Mesh::SUBDIVISION_NONE);
+
+ if (b_mesh) {
+ sync_particle_hair(hair, b_mesh, b_ob, false);
+ free_object_to_mesh(b_data, b_ob, b_mesh);
+ }
+ }
+ }
+
+ /* tag update */
+ const bool rebuild = ((oldcurve_keys != hair->curve_keys) ||
+ (oldcurve_radius != hair->curve_radius));
+
+ hair->tag_update(scene, rebuild);
+}
+
+void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,
+ BL::Object b_ob,
+ Hair *hair,
+ int motion_step)
+{
+ /* Skip if nothing exported. */
+ if (hair->num_keys() == 0) {
+ return;
+ }
+
+ /* Export deformed coordinates. */
+ if (ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
+ if (b_ob.type() == BL::Object::type_HAIR) {
+ /* Hair object. */
+ sync_hair(hair, b_ob, true, motion_step);
+ return;
+ }
+ else {
+ /* Particle hair. */
+ BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_depsgraph, false, Mesh::SUBDIVISION_NONE);
+ if (b_mesh) {
+ sync_particle_hair(hair, b_mesh, b_ob, true, motion_step);
+ free_object_to_mesh(b_data, b_ob, b_mesh);
+ return;
+ }
+ }
}
- mesh->compute_bounds();
+ /* No deformation on this frame, copy coordinates if other frames did have it. */
+ hair->copy_center_to_motion_step(motion_step);
}
CCL_NAMESPACE_END
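
As an illustration (not part of the patch), the sketch below shows the resampling that export_hair_curves_motion() falls back to when a curve's point count changed between frames: sample the new points at evenly spaced parameters and linearly interpolate between neighbours, as in interpolate_hair_points(). The Point struct and the sample_curve()/lerp_point() helpers are simplified stand-ins invented for the example; only the clamp/lerp arithmetic mirrors the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Point {
  float x, y, z, radius;
};

static Point lerp_point(const Point &a, const Point &b, float t)
{
  return {a.x + (b.x - a.x) * t,
          a.y + (b.y - a.y) * t,
          a.z + (b.z - a.z) * t,
          a.radius + (b.radius - a.radius) * t};
}

/* Same mapping as interpolate_hair_points(): step in [0,1] onto the point range. */
static Point sample_curve(const std::vector<Point> &points, float step)
{
  const int num_points = (int)points.size();
  const float curve_t = step * (num_points - 1);
  const int point_a = std::clamp((int)curve_t, 0, num_points - 1);
  const int point_b = std::min(point_a + 1, num_points - 1);
  const float t = curve_t - (float)point_a;
  return lerp_point(points[point_a], points[point_b], t);
}

int main()
{
  /* A curve that now has 5 points, resampled to the 3 motion keys stored for
   * the previous frame (step_size = 1 / (num_keys - 1)). */
  const std::vector<Point> points = {
      {0, 0, 0, 1}, {1, 0, 0, 1}, {2, 0, 0, 1}, {3, 0, 0, 1}, {4, 0, 0, 1}};
  const int num_keys = 3;
  const float step_size = 1.0f / (num_keys - 1);

  for (int i = 0; i < num_keys; i++) {
    const Point p = sample_curve(points, i * step_size);
    std::printf("motion key %d -> (%.1f, %.1f, %.1f) r=%.1f\n", i, p.x, p.y, p.z, p.radius);
  }
  return 0;
}
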
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 98fc0c6dec4..fb9ab9e8c97 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -17,8 +17,19 @@
#include "blender/blender_device.h"
#include "blender/blender_util.h"
+#include "util/util_foreach.h"
+
CCL_NAMESPACE_BEGIN
+enum ComputeDevice {
+ COMPUTE_DEVICE_CPU = 0,
+ COMPUTE_DEVICE_CUDA = 1,
+ COMPUTE_DEVICE_OPENCL = 2,
+ COMPUTE_DEVICE_OPTIX = 3,
+
+ COMPUTE_DEVICE_NUM
+};
+
int blender_device_threads(BL::Scene &b_scene)
{
BL::RenderSettings b_r = b_scene.render();
@@ -40,7 +51,7 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
/* Find network device. */
vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK);
if (!devices.empty()) {
- device = devices.front();
+ return devices.front();
}
}
else if (get_enum(cscene, "device") == 1) {
@@ -57,13 +68,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
}
/* Test if we are using GPU devices. */
- enum ComputeDevice {
- COMPUTE_DEVICE_CPU = 0,
- COMPUTE_DEVICE_CUDA = 1,
- COMPUTE_DEVICE_OPENCL = 2,
- COMPUTE_DEVICE_NUM = 3,
- };
-
ComputeDevice compute_device = (ComputeDevice)get_enum(
cpreferences, "compute_device_type", COMPUTE_DEVICE_NUM, COMPUTE_DEVICE_CPU);
@@ -73,6 +77,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
if (compute_device == COMPUTE_DEVICE_CUDA) {
mask |= DEVICE_MASK_CUDA;
}
+ else if (compute_device == COMPUTE_DEVICE_OPTIX) {
+ /* Cannot use CPU and OptiX devices at the same time right now, so replace the mask. */
+ mask = DEVICE_MASK_OPTIX;
+ }
else if (compute_device == COMPUTE_DEVICE_OPENCL) {
mask |= DEVICE_MASK_OPENCL;
}
@@ -98,6 +106,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
device = Device::get_multi_device(used_devices, threads, background);
}
/* Else keep using the CPU device that was set before. */
+
+ if (!get_boolean(cpreferences, "peer_memory")) {
+ device.has_peer_memory = false;
+ }
}
}
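
For reference, a compilable sketch of the mask selection introduced above: CUDA and OpenCL are OR-ed into the CPU mask, while OptiX replaces it because CPU and OptiX rendering cannot be combined yet. The DeviceTypeMask bit values below are illustrative stand-ins, not the real Cycles constants.

#include <cstdint>
#include <cstdio>

enum ComputeDevice {
  COMPUTE_DEVICE_CPU = 0,
  COMPUTE_DEVICE_CUDA = 1,
  COMPUTE_DEVICE_OPENCL = 2,
  COMPUTE_DEVICE_OPTIX = 3,

  COMPUTE_DEVICE_NUM
};

/* Illustrative bit values only; the real DEVICE_MASK_* constants differ. */
enum DeviceTypeMask : uint32_t {
  DEVICE_MASK_CPU = 1 << 0,
  DEVICE_MASK_CUDA = 1 << 1,
  DEVICE_MASK_OPENCL = 1 << 2,
  DEVICE_MASK_OPTIX = 1 << 3,
};

static uint32_t device_mask_for(ComputeDevice compute_device)
{
  uint32_t mask = DEVICE_MASK_CPU;
  if (compute_device == COMPUTE_DEVICE_CUDA) {
    mask |= DEVICE_MASK_CUDA;
  }
  else if (compute_device == COMPUTE_DEVICE_OPTIX) {
    /* CPU and OptiX cannot be combined yet, so the CPU bit is replaced. */
    mask = DEVICE_MASK_OPTIX;
  }
  else if (compute_device == COMPUTE_DEVICE_OPENCL) {
    mask |= DEVICE_MASK_OPENCL;
  }
  return mask;
}

int main()
{
  std::printf("CUDA mask:  0x%x\n", (unsigned)device_mask_for(COMPUTE_DEVICE_CUDA));
  std::printf("OptiX mask: 0x%x\n", (unsigned)device_mask_for(COMPUTE_DEVICE_OPTIX));
  return 0;
}
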
diff --git a/intern/cycles/blender/blender_device.h b/intern/cycles/blender/blender_device.h
index fd6c045c966..8d2ecac7483 100644
--- a/intern/cycles/blender/blender_device.h
+++ b/intern/cycles/blender/blender_device.h
@@ -18,9 +18,9 @@
#define __BLENDER_DEVICE_H__
#include "MEM_guardedalloc.h"
-#include "RNA_types.h"
#include "RNA_access.h"
#include "RNA_blender_cpp.h"
+#include "RNA_types.h"
#include "device/device.h"
diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp
new file mode 100644
index 00000000000..f7e4623024d
--- /dev/null
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/curves.h"
+#include "render/hair.h"
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_foreach.h"
+
+CCL_NAMESPACE_BEGIN
+
+Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
+ BL::Object &b_ob,
+ BL::Object &b_ob_instance,
+ bool object_updated,
+ bool use_particle_hair)
+{
+ /* Test if we can instance or if the object is modified. */
+ BL::ID b_ob_data = b_ob.data();
+ BL::ID b_key_id = (BKE_object_is_modified(b_ob)) ? b_ob_instance : b_ob_data;
+ GeometryKey key(b_key_id.ptr.data, use_particle_hair);
+ BL::Material material_override = view_layer.material_override;
+ Shader *default_shader = (b_ob.type() == BL::Object::type_VOLUME) ? scene->default_volume :
+ scene->default_surface;
+ Geometry::Type geom_type = (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) ?
+ Geometry::HAIR :
+ Geometry::MESH;
+
+ /* Find shader indices. */
+ vector<Shader *> used_shaders;
+
+ BL::Object::material_slots_iterator slot;
+ for (b_ob.material_slots.begin(slot); slot != b_ob.material_slots.end(); ++slot) {
+ if (material_override) {
+ find_shader(material_override, used_shaders, default_shader);
+ }
+ else {
+ BL::ID b_material(slot->material());
+ find_shader(b_material, used_shaders, default_shader);
+ }
+ }
+
+ if (used_shaders.size() == 0) {
+ if (material_override)
+ find_shader(material_override, used_shaders, default_shader);
+ else
+ used_shaders.push_back(default_shader);
+ }
+
+ /* Test if we need to sync. */
+ Geometry *geom = geometry_map.find(key);
+ bool sync = true;
+ if (geom == NULL) {
+ /* Add new geometry if it did not exist yet. */
+ if (geom_type == Geometry::HAIR) {
+ geom = new Hair();
+ }
+ else {
+ geom = new Mesh();
+ }
+ geometry_map.add(key, geom);
+ }
+ else {
+ /* Test if we need to update existing geometry. */
+ sync = geometry_map.update(geom, b_key_id);
+ }
+
+ if (!sync) {
+ /* If transform was applied to geometry, need full update. */
+ if (object_updated && geom->transform_applied) {
+ ;
+ }
+ /* Test if shaders changed, these can be object level so geometry
+ * does not get tagged for recalc. */
+ else if (geom->used_shaders != used_shaders) {
+ ;
+ }
+ else {
+ /* Even if not tagged for recalc, we may need to sync anyway
+ * because the shader needs different geometry attributes. */
+ bool attribute_recalc = false;
+
+ foreach (Shader *shader, geom->used_shaders) {
+ if (shader->need_update_geometry) {
+ attribute_recalc = true;
+ }
+ }
+
+ if (!attribute_recalc) {
+ return geom;
+ }
+ }
+ }
+
+ /* Ensure we only sync instanced geometry once. */
+ if (geometry_synced.find(geom) != geometry_synced.end()) {
+ return geom;
+ }
+
+ progress.set_sync_status("Synchronizing object", b_ob.name());
+
+ geometry_synced.insert(geom);
+
+ geom->name = ustring(b_ob_data.name().c_str());
+
+ if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) {
+ Hair *hair = static_cast<Hair *>(geom);
+ sync_hair(b_depsgraph, b_ob, hair, used_shaders);
+ }
+ else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ sync_volume(b_ob, mesh, used_shaders);
+ }
+ else {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ sync_mesh(b_depsgraph, b_ob, mesh, used_shaders);
+ }
+
+ return geom;
+}
+
+void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
+ BL::Object &b_ob,
+ Object *object,
+ float motion_time,
+ bool use_particle_hair)
+{
+ /* Ensure we only sync instanced geometry once. */
+ Geometry *geom = object->geometry;
+
+ if (geometry_motion_synced.find(geom) != geometry_motion_synced.end())
+ return;
+
+ geometry_motion_synced.insert(geom);
+
+ /* Ensure we only motion sync geometry that also had geometry synced, to avoid
+ * unnecessary work and to ensure that its attributes were clear. */
+ if (geometry_synced.find(geom) == geometry_synced.end())
+ return;
+
+ /* Find time matching motion step required by geometry. */
+ int motion_step = geom->motion_step(motion_time);
+ if (motion_step < 0) {
+ return;
+ }
+
+ if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) {
+ Hair *hair = static_cast<Hair *>(geom);
+ sync_hair_motion(b_depsgraph, b_ob, hair, motion_step);
+ }
+ else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) {
+ /* No volume motion blur support yet. */
+ }
+ else {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ sync_mesh_motion(b_depsgraph, b_ob, mesh, motion_step);
+ }
+}
+
+CCL_NAMESPACE_END
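
To illustrate why instanced objects end up sharing one Cycles Geometry, here is a small standalone sketch built around the GeometryKey comparison defined in blender_id_map.h below: two instances of the same unmodified object data produce equal keys and therefore a single map entry, while the particle-hair flag keeps hair geometry separate. The std::map and the string values are stand-ins for geometry_map and the actual Hair/Mesh objects.

#include <cstdio>
#include <map>
#include <string>

struct GeometryKey {
  void *id;
  bool use_particle_hair;

  bool operator<(const GeometryKey &k) const
  {
    if (id < k.id)
      return true;
    if (id == k.id)
      return use_particle_hair < k.use_particle_hair;
    return false;
  }
};

int main()
{
  int shared_mesh_data = 0; /* stands in for the Blender object-data pointer */
  std::map<GeometryKey, std::string> geometry_map;

  const GeometryKey instance_a{&shared_mesh_data, false};
  const GeometryKey instance_b{&shared_mesh_data, false};
  const GeometryKey hair_of_a{&shared_mesh_data, true};

  geometry_map.emplace(instance_a, "Mesh");
  geometry_map.emplace(instance_b, "Mesh (duplicate key, ignored)");
  geometry_map.emplace(hair_of_a, "Hair");

  /* Prints 2: both mesh instances share one geometry, the hair is separate. */
  std::printf("geometries: %zu\n", geometry_map.size());
  return 0;
}
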
diff --git a/intern/cycles/blender/blender_id_map.h b/intern/cycles/blender/blender_id_map.h
new file mode 100644
index 00000000000..b5f6aaa67a8
--- /dev/null
+++ b/intern/cycles/blender/blender_id_map.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BLENDER_ID_MAP_H__
+#define __BLENDER_ID_MAP_H__
+
+#include <string.h>
+
+#include "util/util_map.h"
+#include "util/util_set.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* ID Map
+ *
+ * Utility class to map between Blender datablocks and Cycles data structures,
+ * and keep track of recalc tags from the dependency graph. */
+
+template<typename K, typename T> class id_map {
+ public:
+ id_map(vector<T *> *scene_data_)
+ {
+ scene_data = scene_data_;
+ }
+
+ T *find(const BL::ID &id)
+ {
+ return find(id.ptr.owner_id);
+ }
+
+ T *find(const K &key)
+ {
+ if (b_map.find(key) != b_map.end()) {
+ T *data = b_map[key];
+ return data;
+ }
+
+ return NULL;
+ }
+
+ void set_recalc(const BL::ID &id)
+ {
+ b_recalc.insert(id.ptr.data);
+ }
+
+ void set_recalc(void *id_ptr)
+ {
+ b_recalc.insert(id_ptr);
+ }
+
+ bool has_recalc()
+ {
+ return !(b_recalc.empty());
+ }
+
+ void pre_sync()
+ {
+ used_set.clear();
+ }
+
+ /* Add new data. */
+ void add(const K &key, T *data)
+ {
+ assert(find(key) == NULL);
+ scene_data->push_back(data);
+ b_map[key] = data;
+ used(data);
+ }
+
+ /* Update existing data. */
+ bool update(T *data, const BL::ID &id)
+ {
+ return update(data, id, id);
+ }
+ bool update(T *data, const BL::ID &id, const BL::ID &parent)
+ {
+ bool recalc = (b_recalc.find(id.ptr.data) != b_recalc.end());
+ if (parent.ptr.data && parent.ptr.data != id.ptr.data) {
+ recalc = recalc || (b_recalc.find(parent.ptr.data) != b_recalc.end());
+ }
+ used(data);
+ return recalc;
+ }
+
+ /* Combined add and update as needed. */
+ bool add_or_update(T **r_data, const BL::ID &id)
+ {
+ return add_or_update(r_data, id, id, id.ptr.owner_id);
+ }
+ bool add_or_update(T **r_data, const BL::ID &id, const K &key)
+ {
+ return add_or_update(r_data, id, id, key);
+ }
+ bool add_or_update(T **r_data, const BL::ID &id, const BL::ID &parent, const K &key)
+ {
+ T *data = find(key);
+ bool recalc;
+
+ if (!data) {
+ /* Add data if it didn't exist yet. */
+ data = new T();
+ add(key, data);
+ recalc = true;
+ }
+ else {
+ /* Check if an update is needed. */
+ recalc = update(data, id, parent);
+ }
+
+ *r_data = data;
+ return recalc;
+ }
+
+ /* Check if data with the given key is still in use. */
+
+ bool is_used(const K &key)
+ {
+ T *data = find(key);
+ return (data) ? used_set.find(data) != used_set.end() : false;
+ }
+
+ void used(T *data)
+ {
+ /* tag data as still in use */
+ used_set.insert(data);
+ }
+
+ void set_default(T *data)
+ {
+ b_map[NULL] = data;
+ }
+
+ bool post_sync(bool do_delete = true)
+ {
+ /* remove unused data */
+ vector<T *> new_scene_data;
+ typename vector<T *>::iterator it;
+ bool deleted = false;
+
+ for (it = scene_data->begin(); it != scene_data->end(); it++) {
+ T *data = *it;
+
+ if (do_delete && used_set.find(data) == used_set.end()) {
+ delete data;
+ deleted = true;
+ }
+ else
+ new_scene_data.push_back(data);
+ }
+
+ *scene_data = new_scene_data;
+
+ /* update mapping */
+ map<K, T *> new_map;
+ typedef pair<const K, T *> TMapPair;
+ typename map<K, T *>::iterator jt;
+
+ for (jt = b_map.begin(); jt != b_map.end(); jt++) {
+ TMapPair &pair = *jt;
+
+ if (used_set.find(pair.second) != used_set.end())
+ new_map[pair.first] = pair.second;
+ }
+
+ used_set.clear();
+ b_recalc.clear();
+ b_map = new_map;
+
+ return deleted;
+ }
+
+ const map<K, T *> &key_to_scene_data()
+ {
+ return b_map;
+ }
+
+ protected:
+ vector<T *> *scene_data;
+ map<K, T *> b_map;
+ set<T *> used_set;
+ set<void *> b_recalc;
+};
+
+/* Object Key
+ *
+ * To uniquely identify instances, we use the parent, object and persistent instance ID.
+ * We also export a separate object for a mesh and its particle hair. */
+
+enum { OBJECT_PERSISTENT_ID_SIZE = 8 /* MAX_DUPLI_RECUR in Blender. */ };
+
+struct ObjectKey {
+ void *parent;
+ int id[OBJECT_PERSISTENT_ID_SIZE];
+ void *ob;
+ bool use_particle_hair;
+
+ ObjectKey(void *parent_, int id_[OBJECT_PERSISTENT_ID_SIZE], void *ob_, bool use_particle_hair_)
+ : parent(parent_), ob(ob_), use_particle_hair(use_particle_hair_)
+ {
+ if (id_)
+ memcpy(id, id_, sizeof(id));
+ else
+ memset(id, 0, sizeof(id));
+ }
+
+ bool operator<(const ObjectKey &k) const
+ {
+ if (ob < k.ob) {
+ return true;
+ }
+ else if (ob == k.ob) {
+ if (parent < k.parent) {
+ return true;
+ }
+ else if (parent == k.parent) {
+ if (use_particle_hair < k.use_particle_hair) {
+ return true;
+ }
+ else if (use_particle_hair == k.use_particle_hair) {
+ return memcmp(id, k.id, sizeof(id)) < 0;
+ }
+ }
+ }
+
+ return false;
+ }
+};
+
+/* Geometry Key
+ *
+ * We export separate geometry for a mesh and its particle hair, so the key needs to
+ * distinguish between them. */
+
+struct GeometryKey {
+ void *id;
+ bool use_particle_hair;
+
+ GeometryKey(void *id, bool use_particle_hair) : id(id), use_particle_hair(use_particle_hair)
+ {
+ }
+
+ bool operator<(const GeometryKey &k) const
+ {
+ if (id < k.id) {
+ return true;
+ }
+ else if (id == k.id) {
+ if (use_particle_hair < k.use_particle_hair) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+};
+
+/* Particle System Key */
+
+struct ParticleSystemKey {
+ void *ob;
+ int id[OBJECT_PERSISTENT_ID_SIZE];
+
+ ParticleSystemKey(void *ob_, int id_[OBJECT_PERSISTENT_ID_SIZE]) : ob(ob_)
+ {
+ if (id_)
+ memcpy(id, id_, sizeof(id));
+ else
+ memset(id, 0, sizeof(id));
+ }
+
+ bool operator<(const ParticleSystemKey &k) const
+ {
+ /* first id is particle index, we don't compare that */
+ if (ob < k.ob)
+ return true;
+ else if (ob == k.ob)
+ return memcmp(id + 1, k.id + 1, sizeof(int) * (OBJECT_PERSISTENT_ID_SIZE - 1)) < 0;
+
+ return false;
+ }
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BLENDER_ID_MAP_H__ */
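
A minimal sketch of the id_map lifecycle described above (find or add by key, tag data as used, then drop everything untouched after sync). The Thing struct, the find_or_add lambda and the plain std::map/std::set are simplified stand-ins for the templated class; the real post_sync() additionally rebuilds the key map and clears the recalc set.

#include <cstdio>
#include <map>
#include <set>
#include <vector>

struct Thing {
  int value = 0;
};

int main()
{
  std::vector<Thing *> scene_data;
  std::map<void *, Thing *> b_map;
  std::set<Thing *> used_set;

  int key_a = 0, key_b = 0; /* stand-ins for Blender ID pointers */

  auto find_or_add = [&](void *key) -> Thing * {
    auto it = b_map.find(key);
    if (it != b_map.end()) {
      used_set.insert(it->second); /* update(): tag existing data as still used */
      return it->second;
    }
    Thing *data = new Thing(); /* add(): new data enters the scene list and the map */
    scene_data.push_back(data);
    b_map[key] = data;
    used_set.insert(data);
    return data;
  };

  find_or_add(&key_a);
  find_or_add(&key_b);

  /* The next sync only touches key_a, so the cleanup pass below frees the rest. */
  used_set.clear();
  find_or_add(&key_a);

  std::vector<Thing *> kept;
  for (Thing *data : scene_data) {
    if (used_set.count(data))
      kept.push_back(data);
    else
      delete data;
  }
  scene_data.swap(kept);

  std::printf("remaining after post_sync: %zu\n", scene_data.size()); /* prints 1 */
  return 0;
}
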
diff --git a/intern/cycles/blender/blender_image.cpp b/intern/cycles/blender/blender_image.cpp
new file mode 100644
index 00000000000..459dc1779fb
--- /dev/null
+++ b/intern/cycles/blender/blender_image.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MEM_guardedalloc.h"
+
+#include "blender/blender_image.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Packed Images */
+
+BlenderImageLoader::BlenderImageLoader(BL::Image b_image, int frame)
+ : b_image(b_image), frame(frame), free_cache(!b_image.has_data())
+{
+}
+
+bool BlenderImageLoader::load_metadata(ImageMetaData &metadata)
+{
+ metadata.width = b_image.size()[0];
+ metadata.height = b_image.size()[1];
+ metadata.depth = 1;
+ metadata.channels = b_image.channels();
+
+ if (b_image.is_float()) {
+ if (metadata.channels == 1) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT;
+ }
+ else if (metadata.channels == 4) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ }
+ else {
+ return false;
+ }
+
+ /* Float images are already converted on the Blender side,
+ * no need to do anything in Cycles. */
+ metadata.colorspace = u_colorspace_raw;
+ }
+ else {
+ if (metadata.channels == 1) {
+ metadata.type = IMAGE_DATA_TYPE_BYTE;
+ }
+ else if (metadata.channels == 4) {
+ metadata.type = IMAGE_DATA_TYPE_BYTE4;
+ }
+ else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool BlenderImageLoader::load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixels_size,
+ const bool associate_alpha)
+{
+ const size_t num_pixels = ((size_t)metadata.width) * metadata.height;
+ const int channels = metadata.channels;
+ const int tile = 0; /* TODO(lukas): Support tiles here? */
+
+ if (b_image.is_float()) {
+ /* image data */
+ float *image_pixels;
+ image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile);
+
+ if (image_pixels && num_pixels * channels == pixels_size) {
+ memcpy(pixels, image_pixels, pixels_size * sizeof(float));
+ }
+ else {
+ if (channels == 1) {
+ memset(pixels, 0, num_pixels * sizeof(float));
+ }
+ else {
+ const size_t num_pixels_safe = pixels_size / channels;
+ float *fp = (float *)pixels;
+ for (int i = 0; i < num_pixels_safe; i++, fp += channels) {
+ fp[0] = 1.0f;
+ fp[1] = 0.0f;
+ fp[2] = 1.0f;
+ if (channels == 4) {
+ fp[3] = 1.0f;
+ }
+ }
+ }
+ }
+
+ if (image_pixels) {
+ MEM_freeN(image_pixels);
+ }
+ }
+ else {
+ unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile);
+
+ if (image_pixels && num_pixels * channels == pixels_size) {
+ memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
+ }
+ else {
+ if (channels == 1) {
+ memset(pixels, 0, pixels_size * sizeof(unsigned char));
+ }
+ else {
+ const size_t num_pixels_safe = pixels_size / channels;
+ unsigned char *cp = (unsigned char *)pixels;
+ for (size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
+ cp[0] = 255;
+ cp[1] = 0;
+ cp[2] = 255;
+ if (channels == 4) {
+ cp[3] = 255;
+ }
+ }
+ }
+ }
+
+ if (image_pixels) {
+ MEM_freeN(image_pixels);
+ }
+
+ if (associate_alpha) {
+ /* Premultiply, byte images are always straight for Blender. */
+ unsigned char *cp = (unsigned char *)pixels;
+ for (size_t i = 0; i < num_pixels; i++, cp += channels) {
+ cp[0] = (cp[0] * cp[3]) >> 8;
+ cp[1] = (cp[1] * cp[3]) >> 8;
+ cp[2] = (cp[2] * cp[3]) >> 8;
+ }
+ }
+ }
+
+ /* Free image buffers to save memory during render. */
+ if (free_cache) {
+ b_image.buffers_free();
+ }
+
+ return true;
+}
+
+string BlenderImageLoader::name() const
+{
+ return BL::Image(b_image).name();
+}
+
+bool BlenderImageLoader::equals(const ImageLoader &other) const
+{
+ const BlenderImageLoader &other_loader = (const BlenderImageLoader &)other;
+ return b_image == other_loader.b_image && frame == other_loader.frame;
+}
+
+/* Point Density */
+
+BlenderPointDensityLoader::BlenderPointDensityLoader(BL::Depsgraph b_depsgraph,
+ BL::ShaderNodeTexPointDensity b_node)
+ : b_depsgraph(b_depsgraph), b_node(b_node)
+{
+}
+
+bool BlenderPointDensityLoader::load_metadata(ImageMetaData &metadata)
+{
+ metadata.channels = 4;
+ metadata.width = b_node.resolution();
+ metadata.height = metadata.width;
+ metadata.depth = metadata.width;
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ return true;
+}
+
+bool BlenderPointDensityLoader::load_pixels(const ImageMetaData &,
+ void *pixels,
+ const size_t,
+ const bool)
+{
+ int length;
+ b_node.calc_point_density(b_depsgraph, &length, (float **)&pixels);
+ return true;
+}
+
+void BlenderSession::builtin_images_load()
+{
+ /* Force builtin images to be loaded along with Blender data sync. This
+ * is needed because we may be reading from depsgraph evaluated data which
+ * can be freed by Blender before Cycles reads it.
+ *
+ * TODO: the assumption that no further access to builtin image data will
+ * happen is really weak, and likely to break in the future. We should find
+ * a better solution to hand over the data directly to the image manager
+ * instead of through callbacks whose timing is difficult to control. */
+ ImageManager *manager = session->scene->image_manager;
+ Device *device = session->device;
+ manager->device_load_builtin(device, session->scene, session->progress);
+}
+
+string BlenderPointDensityLoader::name() const
+{
+ return BL::ShaderNodeTexPointDensity(b_node).name();
+}
+
+bool BlenderPointDensityLoader::equals(const ImageLoader &other) const
+{
+ const BlenderPointDensityLoader &other_loader = (const BlenderPointDensityLoader &)other;
+ return b_node == other_loader.b_node && b_depsgraph == other_loader.b_depsgraph;
+}
+
+CCL_NAMESPACE_END
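
A tiny worked example of the associate_alpha step in BlenderImageLoader::load_pixels() above: byte images arrive with straight alpha, so each colour channel is multiplied by alpha using an 8-bit shift rather than a float multiply. The pixel values are arbitrary.

#include <cstdio>

int main()
{
  unsigned char rgba[4] = {200, 100, 50, 128}; /* straight (unassociated) alpha */

  /* Same operation as the associate_alpha loop: channel * alpha, scaled back to bytes. */
  for (int c = 0; c < 3; c++) {
    rgba[c] = (unsigned char)((rgba[c] * rgba[3]) >> 8);
  }

  /* 200 -> 100, 100 -> 50, 50 -> 25 with alpha 128 (roughly 0.5). */
  std::printf("premultiplied: %d %d %d %d\n", rgba[0], rgba[1], rgba[2], rgba[3]);
  return 0;
}
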
diff --git a/intern/cycles/blender/blender_image.h b/intern/cycles/blender/blender_image.h
new file mode 100644
index 00000000000..b58a159a6ba
--- /dev/null
+++ b/intern/cycles/blender/blender_image.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BLENDER_IMAGE_H__
+#define __BLENDER_IMAGE_H__
+
+#include "RNA_blender_cpp.h"
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BlenderImageLoader : public ImageLoader {
+ public:
+ BlenderImageLoader(BL::Image b_image, int frame);
+
+ bool load_metadata(ImageMetaData &metadata) override;
+ bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixels_size,
+ const bool associate_alpha) override;
+ string name() const override;
+ bool equals(const ImageLoader &other) const override;
+
+ BL::Image b_image;
+ int frame;
+ bool free_cache;
+};
+
+class BlenderPointDensityLoader : public ImageLoader {
+ public:
+ BlenderPointDensityLoader(BL::Depsgraph depsgraph, BL::ShaderNodeTexPointDensity b_node);
+
+ bool load_metadata(ImageMetaData &metadata) override;
+ bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixels_size,
+ const bool associate_alpha) override;
+ string name() const override;
+ bool equals(const ImageLoader &other) const override;
+
+ BL::Depsgraph b_depsgraph;
+ BL::ShaderNodeTexPointDensity b_node;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BLENDER_IMAGE_H__ */
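
As a sketch of the two-step loader protocol declared above (load_metadata() first, then load_pixels() into a caller-provided buffer), here is a standalone toy loader that fills a constant-colour float image. The Loader/MetaData types are simplified stand-ins for ImageLoader/ImageMetaData, and ConstantColorLoader is invented for the example.

#include <cstdio>
#include <string>
#include <vector>

struct MetaData {
  int width = 0, height = 0, channels = 0;
};

class Loader {
 public:
  virtual ~Loader() = default;
  virtual bool load_metadata(MetaData &metadata) = 0;
  virtual bool load_pixels(const MetaData &metadata, float *pixels, size_t pixels_size) = 0;
  virtual std::string name() const = 0;
};

class ConstantColorLoader : public Loader {
 public:
  ConstantColorLoader(float r, float g, float b) : r_(r), g_(g), b_(b) {}

  bool load_metadata(MetaData &metadata) override
  {
    metadata.width = 2;
    metadata.height = 2;
    metadata.channels = 4;
    return true;
  }

  bool load_pixels(const MetaData &metadata, float *pixels, size_t pixels_size) override
  {
    const size_t num_pixels = (size_t)metadata.width * metadata.height;
    if (pixels_size < num_pixels * metadata.channels) {
      return false;
    }
    for (size_t i = 0; i < num_pixels; i++, pixels += metadata.channels) {
      pixels[0] = r_;
      pixels[1] = g_;
      pixels[2] = b_;
      pixels[3] = 1.0f;
    }
    return true;
  }

  std::string name() const override
  {
    return "constant_color";
  }

 private:
  float r_, g_, b_;
};

int main()
{
  ConstantColorLoader loader(1.0f, 0.5f, 0.25f);
  MetaData metadata;
  loader.load_metadata(metadata);

  std::vector<float> pixels((size_t)metadata.width * metadata.height * metadata.channels);
  loader.load_pixels(metadata, pixels.data(), pixels.size());
  std::printf("%s: first pixel %.2f %.2f %.2f %.2f\n",
              loader.name().c_str(), pixels[0], pixels[1], pixels[2], pixels[3]);
  return 0;
}
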
diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp
new file mode 100644
index 00000000000..6f95821e31e
--- /dev/null
+++ b/intern/cycles/blender/blender_light.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/light.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_hash.h"
+
+CCL_NAMESPACE_BEGIN
+
+void BlenderSync::sync_light(BL::Object &b_parent,
+ int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
+ BL::Object &b_ob,
+ BL::Object &b_ob_instance,
+ int random_id,
+ Transform &tfm,
+ bool *use_portal)
+{
+ /* test if we need to sync */
+ Light *light;
+ ObjectKey key(b_parent, persistent_id, b_ob_instance, false);
+ BL::Light b_light(b_ob.data());
+
+ /* Update if either object or light data changed. */
+ if (!light_map.add_or_update(&light, b_ob, b_parent, key)) {
+ Shader *shader;
+ if (!shader_map.add_or_update(&shader, b_light)) {
+ if (light->is_portal)
+ *use_portal = true;
+ return;
+ }
+ }
+
+ /* type */
+ switch (b_light.type()) {
+ case BL::Light::type_POINT: {
+ BL::PointLight b_point_light(b_light);
+ light->size = b_point_light.shadow_soft_size();
+ light->type = LIGHT_POINT;
+ break;
+ }
+ case BL::Light::type_SPOT: {
+ BL::SpotLight b_spot_light(b_light);
+ light->size = b_spot_light.shadow_soft_size();
+ light->type = LIGHT_SPOT;
+ light->spot_angle = b_spot_light.spot_size();
+ light->spot_smooth = b_spot_light.spot_blend();
+ break;
+ }
+ /* Hemi lights were removed in 2.8. */
+ // case BL::Light::type_HEMI: {
+ // light->type = LIGHT_DISTANT;
+ // light->size = 0.0f;
+ // break;
+ // }
+ case BL::Light::type_SUN: {
+ BL::SunLight b_sun_light(b_light);
+ light->angle = b_sun_light.angle();
+ light->type = LIGHT_DISTANT;
+ break;
+ }
+ case BL::Light::type_AREA: {
+ BL::AreaLight b_area_light(b_light);
+ light->size = 1.0f;
+ light->axisu = transform_get_column(&tfm, 0);
+ light->axisv = transform_get_column(&tfm, 1);
+ light->sizeu = b_area_light.size();
+ switch (b_area_light.shape()) {
+ case BL::AreaLight::shape_SQUARE:
+ light->sizev = light->sizeu;
+ light->round = false;
+ break;
+ case BL::AreaLight::shape_RECTANGLE:
+ light->sizev = b_area_light.size_y();
+ light->round = false;
+ break;
+ case BL::AreaLight::shape_DISK:
+ light->sizev = light->sizeu;
+ light->round = true;
+ break;
+ case BL::AreaLight::shape_ELLIPSE:
+ light->sizev = b_area_light.size_y();
+ light->round = true;
+ break;
+ }
+ light->type = LIGHT_AREA;
+ break;
+ }
+ }
+
+ /* strength */
+ light->strength = get_float3(b_light.color());
+ light->strength *= BL::PointLight(b_light).energy();
+
+ /* location and (inverted!) direction */
+ light->co = transform_get_column(&tfm, 3);
+ light->dir = -transform_get_column(&tfm, 2);
+ light->tfm = tfm;
+
+ /* shader */
+ vector<Shader *> used_shaders;
+ find_shader(b_light, used_shaders, scene->default_light);
+ light->shader = used_shaders[0];
+
+ /* shadow */
+ PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
+ light->cast_shadow = get_boolean(clight, "cast_shadow");
+ light->use_mis = get_boolean(clight, "use_multiple_importance_sampling");
+
+ int samples = get_int(clight, "samples");
+ if (get_boolean(cscene, "use_square_samples"))
+ light->samples = samples * samples;
+ else
+ light->samples = samples;
+
+ light->max_bounces = get_int(clight, "max_bounces");
+
+ if (b_ob != b_ob_instance) {
+ light->random_id = random_id;
+ }
+ else {
+ light->random_id = hash_uint2(hash_string(b_ob.name().c_str()), 0);
+ }
+
+ if (light->type == LIGHT_AREA)
+ light->is_portal = get_boolean(clight, "is_portal");
+ else
+ light->is_portal = false;
+
+ if (light->is_portal)
+ *use_portal = true;
+
+ /* visibility */
+ uint visibility = object_ray_visibility(b_ob);
+ light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
+ light->use_glossy = (visibility & PATH_RAY_GLOSSY) != 0;
+ light->use_transmission = (visibility & PATH_RAY_TRANSMIT) != 0;
+ light->use_scatter = (visibility & PATH_RAY_VOLUME_SCATTER) != 0;
+
+ /* tag */
+ light->tag_update(scene);
+}
+
+void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
+{
+ BL::World b_world = b_scene.world();
+
+ if (b_world) {
+ PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
+
+ enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
+ int sampling_method = get_enum(cworld, "sampling_method", SAMPLING_NUM, SAMPLING_AUTOMATIC);
+ bool sample_as_light = (sampling_method != SAMPLING_NONE);
+
+ if (sample_as_light || use_portal) {
+ /* test if we need to sync */
+ Light *light;
+ ObjectKey key(b_world, 0, b_world, false);
+
+ if (light_map.add_or_update(&light, b_world, b_world, key) || world_recalc ||
+ b_world.ptr.data != world_map) {
+ light->type = LIGHT_BACKGROUND;
+ if (sampling_method == SAMPLING_MANUAL) {
+ light->map_resolution = get_int(cworld, "sample_map_resolution");
+ }
+ else {
+ light->map_resolution = 0;
+ }
+ light->shader = scene->default_background;
+ light->use_mis = sample_as_light;
+ light->max_bounces = get_int(cworld, "max_bounces");
+
+ /* force enable light again when world is resynced */
+ light->is_enabled = true;
+
+ int samples = get_int(cworld, "samples");
+ if (get_boolean(cscene, "use_square_samples"))
+ light->samples = samples * samples;
+ else
+ light->samples = samples;
+
+ light->tag_update(scene);
+ light_map.set_recalc(b_world);
+ }
+ }
+ }
+
+ world_map = b_world.ptr.data;
+ world_recalc = false;
+ viewport_parameters = BlenderViewportParameters(b_v3d);
+}
+
+CCL_NAMESPACE_END
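
A small sketch of the use_square_samples handling that appears in both sync_light() and sync_background_light() above: the per-light sample count is squared when the scene option is enabled, otherwise used as-is.

#include <cstdio>

static int light_samples(int samples, bool use_square_samples)
{
  return use_square_samples ? samples * samples : samples;
}

int main()
{
  std::printf("samples=4, squared:  %d\n", light_samples(4, true));  /* 16 */
  std::printf("samples=4, straight: %d\n", light_samples(4, false)); /* 4 */
  return 0;
}
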
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index de594f4fb6c..49407799fcd 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -14,20 +14,23 @@
* limitations under the License.
*/
+#include "render/camera.h"
+#include "render/colorspace.h"
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
-#include "render/camera.h"
-#include "blender/blender_sync.h"
#include "blender/blender_session.h"
+#include "blender/blender_sync.h"
#include "blender/blender_util.h"
#include "subd/subd_patch.h"
#include "subd/subd_split.h"
#include "util/util_algorithm.h"
+#include "util/util_disjoint_set.h"
#include "util/util_foreach.h"
+#include "util/util_hash.h"
#include "util/util_logging.h"
#include "util/util_math.h"
@@ -275,102 +278,99 @@ static void mikk_compute_tangents(
genTangSpaceDefault(&context);
}
-/* Create Volume Attribute */
-
-static void create_mesh_volume_attribute(
- BL::Object &b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std, float frame)
+/* Create sculpt vertex color attributes. */
+static void attr_create_sculpt_vertex_color(Scene *scene,
+ Mesh *mesh,
+ BL::Mesh &b_mesh,
+ bool subdivision)
{
- BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
+ BL::Mesh::sculpt_vertex_colors_iterator l;
- if (!b_domain)
- return;
+ for (b_mesh.sculpt_vertex_colors.begin(l); l != b_mesh.sculpt_vertex_colors.end(); ++l) {
+ const bool active_render = l->active_render();
+ AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
+ ustring vcol_name = ustring(l->name().c_str());
- mesh->volume_isovalue = b_domain.clipping();
-
- Attribute *attr = mesh->attributes.add(std);
- VoxelAttribute *volume_data = attr->data_voxel();
- ImageMetaData metadata;
- bool animated = false;
- bool use_alpha = true;
-
- volume_data->manager = image_manager;
- volume_data->slot = image_manager->add_image(Attribute::standard_name(std),
- b_ob.ptr.data,
- animated,
- frame,
- INTERPOLATION_LINEAR,
- EXTENSION_CLIP,
- use_alpha,
- metadata);
-}
+ const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
+ mesh->need_attribute(scene, vcol_std);
-static void create_mesh_volume_attributes(Scene *scene, BL::Object &b_ob, Mesh *mesh, float frame)
-{
- /* for smoke volume rendering */
- if (mesh->need_attribute(scene, ATTR_STD_VOLUME_DENSITY))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_DENSITY, frame);
- if (mesh->need_attribute(scene, ATTR_STD_VOLUME_COLOR))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_COLOR, frame);
- if (mesh->need_attribute(scene, ATTR_STD_VOLUME_FLAME))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME, frame);
- if (mesh->need_attribute(scene, ATTR_STD_VOLUME_HEAT))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT, frame);
- if (mesh->need_attribute(scene, ATTR_STD_VOLUME_TEMPERATURE))
- create_mesh_volume_attribute(
- b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_TEMPERATURE, frame);
- if (mesh->need_attribute(scene, ATTR_STD_VOLUME_VELOCITY))
- create_mesh_volume_attribute(
- b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame);
+ if (!need_vcol) {
+ continue;
+ }
+
+ AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
+ Attribute *vcol_attr = attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_VERTEX);
+ vcol_attr->std = vcol_std;
+
+ float4 *cdata = vcol_attr->data_float4();
+ int numverts = b_mesh.vertices.length();
+
+ for (int i = 0; i < numverts; i++) {
+ *(cdata++) = get_float4(l->data[i].color());
+ }
+ }
}
/* Create vertex color attributes. */
static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivision)
{
- if (subdivision) {
- BL::Mesh::vertex_colors_iterator l;
+ BL::Mesh::vertex_colors_iterator l;
+
+ for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
+ const bool active_render = l->active_render();
+ AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
+ ustring vcol_name = ustring(l->name().c_str());
- for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
- if (!mesh->need_attribute(scene, ustring(l->name().c_str())))
- continue;
+ const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
+ mesh->need_attribute(scene, vcol_std);
- Attribute *attr = mesh->subd_attributes.add(
- ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
+ if (!need_vcol) {
+ continue;
+ }
+
+ Attribute *vcol_attr = NULL;
+
+ if (subdivision) {
+ if (active_render) {
+ vcol_attr = mesh->subd_attributes.add(vcol_std, vcol_name);
+ }
+ else {
+ vcol_attr = mesh->subd_attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+ }
BL::Mesh::polygons_iterator p;
- uchar4 *cdata = attr->data_uchar4();
+ uchar4 *cdata = vcol_attr->data_uchar4();
for (b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
int n = p->loop_total();
for (int i = 0; i < n; i++) {
- float3 color = get_float3(l->data[p->loop_start() + i].color());
+ float4 color = get_float4(l->data[p->loop_start() + i].color());
/* Compress/encode vertex color using the sRGB curve. */
- *(cdata++) = color_float_to_byte(color_srgb_to_linear_v3(color));
+ *(cdata++) = color_float4_to_uchar4(color_srgb_to_linear_v4(color));
}
}
}
- }
- else {
- BL::Mesh::vertex_colors_iterator l;
- for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
- if (!mesh->need_attribute(scene, ustring(l->name().c_str())))
- continue;
-
- Attribute *attr = mesh->attributes.add(
- ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
+ else {
+ if (active_render) {
+ vcol_attr = mesh->attributes.add(vcol_std, vcol_name);
+ }
+ else {
+ vcol_attr = mesh->attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+ }
BL::Mesh::loop_triangles_iterator t;
- uchar4 *cdata = attr->data_uchar4();
+ uchar4 *cdata = vcol_attr->data_uchar4();
for (b_mesh.loop_triangles.begin(t); t != b_mesh.loop_triangles.end(); ++t) {
int3 li = get_int3(t->loops());
- float3 c1 = get_float3(l->data[li[0]].color());
- float3 c2 = get_float3(l->data[li[1]].color());
- float3 c3 = get_float3(l->data[li[2]].color());
+ float4 c1 = get_float4(l->data[li[0]].color());
+ float4 c2 = get_float4(l->data[li[1]].color());
+ float4 c3 = get_float4(l->data[li[2]].color());
/* Compress/encode vertex color using the sRGB curve. */
- cdata[0] = color_float_to_byte(color_srgb_to_linear_v3(c1));
- cdata[1] = color_float_to_byte(color_srgb_to_linear_v3(c2));
- cdata[2] = color_float_to_byte(color_srgb_to_linear_v3(c3));
+ cdata[0] = color_float4_to_uchar4(color_srgb_to_linear_v4(c1));
+ cdata[1] = color_float4_to_uchar4(color_srgb_to_linear_v4(c2));
+ cdata[2] = color_float4_to_uchar4(color_srgb_to_linear_v4(c3));
cdata += 3;
}
}
@@ -678,6 +678,55 @@ static void attr_create_pointiness(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, b
}
}
+/* The Random Per Island attribute is a random float associated with each
+ * connected component (island) of the mesh. The attribute is computed by
+ * first classifying the vertices into different sets using a Disjoint Set
+ * data structure. Then the index of the root of each vertex (which is the
+ * representative of the set the vertex belongs to) is hashed and stored.
+ *
+ * We are using a face attribute to avoid interpolation during rendering,
+ * allowing the user to safely hash the output further. Had we used a vertex
+ * attribute, the interpolation would introduce very slight variations,
+ * making the output unsafe to hash. */
+static void attr_create_random_per_island(Scene *scene,
+ Mesh *mesh,
+ BL::Mesh &b_mesh,
+ bool subdivision)
+{
+ if (!mesh->need_attribute(scene, ATTR_STD_RANDOM_PER_ISLAND)) {
+ return;
+ }
+
+ int number_of_vertices = b_mesh.vertices.length();
+ if (number_of_vertices == 0) {
+ return;
+ }
+
+ DisjointSet vertices_sets(number_of_vertices);
+
+ BL::Mesh::edges_iterator e;
+ for (b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e) {
+ vertices_sets.join(e->vertices()[0], e->vertices()[1]);
+ }
+
+ AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
+ Attribute *attribute = attributes.add(ATTR_STD_RANDOM_PER_ISLAND);
+ float *data = attribute->data_float();
+
+ if (!subdivision) {
+ BL::Mesh::loop_triangles_iterator t;
+ for (b_mesh.loop_triangles.begin(t); t != b_mesh.loop_triangles.end(); ++t) {
+ data[t->index()] = hash_uint_to_float(vertices_sets.find(t->vertices()[0]));
+ }
+ }
+ else {
+ BL::Mesh::polygons_iterator p;
+ for (b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
+ data[p->index()] = hash_uint_to_float(vertices_sets.find(p->vertices()[0]));
+ }
+ }
+}
+
/* Create Mesh */
static void create_mesh(Scene *scene,
@@ -798,6 +847,8 @@ static void create_mesh(Scene *scene,
*/
attr_create_pointiness(scene, mesh, b_mesh, subdivision);
attr_create_vertex_color(scene, mesh, b_mesh, subdivision);
+ attr_create_sculpt_vertex_color(scene, mesh, b_mesh, subdivision);
+ attr_create_random_per_island(scene, mesh, b_mesh, subdivision);
if (subdivision) {
attr_create_subd_uv_map(scene, mesh, b_mesh, subdivide_uvs);
@@ -806,9 +857,9 @@ static void create_mesh(Scene *scene,
attr_create_uv_map(scene, mesh, b_mesh);
}
- /* for volume objects, create a matrix to transform from object space to
+ /* For volume objects, create a matrix to transform from object space to
* mesh texture space. this does not work with deformations but that can
- * probably only be done well with a volume grid mapping of coordinates */
+ * probably only be done well with a volume grid mapping of coordinates. */
if (mesh->need_attribute(scene, ATTR_STD_GENERATED_TRANSFORM)) {
Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED_TRANSFORM);
Transform *tfm = attr->data_transform();
@@ -877,13 +928,13 @@ static void sync_mesh_fluid_motion(BL::Object &b_ob, Scene *scene, Mesh *mesh)
if (scene->need_motion() == Scene::MOTION_NONE)
return;
- BL::DomainFluidSettings b_fluid_domain = object_fluid_domain_find(b_ob);
+ BL::FluidDomainSettings b_fluid_domain = object_fluid_liquid_domain_find(b_ob);
if (!b_fluid_domain)
return;
/* If the mesh has modifiers following the fluid domain we can't export motion. */
- if (b_fluid_domain.fluid_mesh_vertices.length() != mesh->verts.size())
+ if (b_fluid_domain.mesh_vertices.length() != mesh->verts.size())
return;
/* Find or add attribute */
@@ -900,93 +951,21 @@ static void sync_mesh_fluid_motion(BL::Object &b_ob, Scene *scene, Mesh *mesh)
float relative_time = motion_times[step] * scene->motion_shutter_time() * 0.5f;
float3 *mP = attr_mP->data_float3() + step * mesh->verts.size();
- BL::DomainFluidSettings::fluid_mesh_vertices_iterator fvi;
+ BL::FluidDomainSettings::mesh_vertices_iterator svi;
int i = 0;
- for (b_fluid_domain.fluid_mesh_vertices.begin(fvi);
- fvi != b_fluid_domain.fluid_mesh_vertices.end();
- ++fvi, ++i) {
- mP[i] = P[i] + get_float3(fvi->velocity()) * relative_time;
+ for (b_fluid_domain.mesh_vertices.begin(svi); svi != b_fluid_domain.mesh_vertices.end();
+ ++svi, ++i) {
+ mP[i] = P[i] + get_float3(svi->velocity()) * relative_time;
}
}
}
-Mesh *BlenderSync::sync_mesh(BL::Depsgraph &b_depsgraph,
- BL::Object &b_ob,
- BL::Object &b_ob_instance,
- bool object_updated,
- bool show_self,
- bool show_particles)
+void BlenderSync::sync_mesh(BL::Depsgraph b_depsgraph,
+ BL::Object b_ob,
+ Mesh *mesh,
+ const vector<Shader *> &used_shaders)
{
- /* test if we can instance or if the object is modified */
- BL::ID b_ob_data = b_ob.data();
- BL::ID key = (BKE_object_is_modified(b_ob)) ? b_ob_instance : b_ob_data;
- BL::Material material_override = view_layer.material_override;
-
- /* find shader indices */
- vector<Shader *> used_shaders;
-
- BL::Object::material_slots_iterator slot;
- for (b_ob.material_slots.begin(slot); slot != b_ob.material_slots.end(); ++slot) {
- if (material_override) {
- find_shader(material_override, used_shaders, scene->default_surface);
- }
- else {
- BL::ID b_material(slot->material());
- find_shader(b_material, used_shaders, scene->default_surface);
- }
- }
-
- if (used_shaders.size() == 0) {
- if (material_override)
- find_shader(material_override, used_shaders, scene->default_surface);
- else
- used_shaders.push_back(scene->default_surface);
- }
-
- /* test if we need to sync */
- int requested_geometry_flags = Mesh::GEOMETRY_NONE;
- if (view_layer.use_surfaces) {
- requested_geometry_flags |= Mesh::GEOMETRY_TRIANGLES;
- }
- if (view_layer.use_hair) {
- requested_geometry_flags |= Mesh::GEOMETRY_CURVES;
- }
- Mesh *mesh;
-
- if (!mesh_map.sync(&mesh, key)) {
- /* if transform was applied to mesh, need full update */
- if (object_updated && mesh->transform_applied)
- ;
- /* test if shaders changed, these can be object level so mesh
- * does not get tagged for recalc */
- else if (mesh->used_shaders != used_shaders)
- ;
- else if (requested_geometry_flags != mesh->geometry_flags)
- ;
- else {
- /* even if not tagged for recalc, we may need to sync anyway
- * because the shader needs different mesh attributes */
- bool attribute_recalc = false;
-
- foreach (Shader *shader, mesh->used_shaders)
- if (shader->need_update_mesh)
- attribute_recalc = true;
-
- if (!attribute_recalc)
- return mesh;
- }
- }
-
- /* ensure we only sync instanced meshes once */
- if (mesh_synced.find(mesh) != mesh_synced.end())
- return mesh;
-
- progress.set_sync_status("Synchronizing object", b_ob.name());
-
- mesh_synced.insert(mesh);
-
- /* create derived mesh */
array<int> oldtriangles;
array<Mesh::SubdFace> oldsubd_faces;
array<int> oldsubd_face_corners;
@@ -994,146 +973,73 @@ Mesh *BlenderSync::sync_mesh(BL::Depsgraph &b_depsgraph,
oldsubd_faces.steal_data(mesh->subd_faces);
oldsubd_face_corners.steal_data(mesh->subd_face_corners);
- /* compares curve_keys rather than strands in order to handle quick hair
- * adjustments in dynamic BVH - other methods could probably do this better*/
- array<float3> oldcurve_keys;
- array<float> oldcurve_radius;
- oldcurve_keys.steal_data(mesh->curve_keys);
- oldcurve_radius.steal_data(mesh->curve_radius);
-
mesh->clear();
mesh->used_shaders = used_shaders;
- mesh->name = ustring(b_ob_data.name().c_str());
- if (requested_geometry_flags != Mesh::GEOMETRY_NONE) {
+ mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
+
+ if (view_layer.use_surfaces) {
/* Adaptive subdivision setup. Not for baking since that requires
* exact mapping to the Blender mesh. */
- if (scene->bake_manager->get_baking()) {
- mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
- }
- else {
+ if (!scene->bake_manager->get_baking()) {
mesh->subdivision_type = object_subdivision_type(b_ob, preview, experimental);
}
/* For some reason, meshes do not need this... */
bool need_undeformed = mesh->need_attribute(scene, ATTR_STD_GENERATED);
-
BL::Mesh b_mesh = object_to_mesh(
b_data, b_ob, b_depsgraph, need_undeformed, mesh->subdivision_type);
if (b_mesh) {
/* Sync mesh itself. */
- if (view_layer.use_surfaces && show_self) {
- if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE)
- create_subd_mesh(scene, mesh, b_ob, b_mesh, used_shaders, dicing_rate, max_subdivisions);
- else
- create_mesh(scene, mesh, b_mesh, used_shaders, false);
-
- create_mesh_volume_attributes(scene, b_ob, mesh, b_scene.frame_current());
- }
-
- /* Sync hair curves. */
- if (view_layer.use_hair && show_particles &&
- mesh->subdivision_type == Mesh::SUBDIVISION_NONE) {
- sync_curves(mesh, b_mesh, b_ob, false);
- }
+ if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE)
+ create_subd_mesh(
+ scene, mesh, b_ob, b_mesh, mesh->used_shaders, dicing_rate, max_subdivisions);
+ else
+ create_mesh(scene, mesh, b_mesh, mesh->used_shaders, false);
free_object_to_mesh(b_data, b_ob, b_mesh);
}
}
- mesh->geometry_flags = requested_geometry_flags;
- /* fluid motion */
+ /* Mesh fluid motion (Mantaflow). */
sync_mesh_fluid_motion(b_ob, scene, mesh);
/* tag update */
bool rebuild = (oldtriangles != mesh->triangles) || (oldsubd_faces != mesh->subd_faces) ||
- (oldsubd_face_corners != mesh->subd_face_corners) ||
- (oldcurve_keys != mesh->curve_keys) || (oldcurve_radius != mesh->curve_radius);
+ (oldsubd_face_corners != mesh->subd_face_corners);
mesh->tag_update(scene, rebuild);
-
- return mesh;
}
-void BlenderSync::sync_mesh_motion(BL::Depsgraph &b_depsgraph,
- BL::Object &b_ob,
- Object *object,
- float motion_time)
+void BlenderSync::sync_mesh_motion(BL::Depsgraph b_depsgraph,
+ BL::Object b_ob,
+ Mesh *mesh,
+ int motion_step)
{
- /* ensure we only sync instanced meshes once */
- Mesh *mesh = object->mesh;
-
- if (mesh_motion_synced.find(mesh) != mesh_motion_synced.end())
- return;
-
- mesh_motion_synced.insert(mesh);
-
- /* ensure we only motion sync meshes that also had mesh synced, to avoid
- * unnecessary work and to ensure that its attributes were clear */
- if (mesh_synced.find(mesh) == mesh_synced.end())
- return;
-
- /* Find time matching motion step required by mesh. */
- int motion_step = mesh->motion_step(motion_time);
- if (motion_step < 0) {
+ /* Fluid motion blur already exported. */
+ BL::FluidDomainSettings b_fluid_domain = object_fluid_liquid_domain_find(b_ob);
+ if (b_fluid_domain) {
return;
}
- /* skip empty meshes */
- const size_t numverts = mesh->verts.size();
- const size_t numkeys = mesh->curve_keys.size();
-
- if (!numverts && !numkeys)
+ /* Skip if no vertices were exported. */
+ size_t numverts = mesh->verts.size();
+ if (numverts == 0) {
return;
+ }
- /* skip objects without deforming modifiers. this is not totally reliable,
- * would need a more extensive check to see which objects are animated */
+ /* Skip objects without deforming modifiers. This is not totally reliable; it
+ * would need a more extensive check to see which objects are animated. */
BL::Mesh b_mesh(PointerRNA_NULL);
-
- /* fluid motion is exported immediate with mesh, skip here */
- BL::DomainFluidSettings b_fluid_domain = object_fluid_domain_find(b_ob);
- if (b_fluid_domain)
- return;
-
if (ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
/* get derived mesh */
b_mesh = object_to_mesh(b_data, b_ob, b_depsgraph, false, Mesh::SUBDIVISION_NONE);
}
- if (!b_mesh) {
- /* if we have no motion blur on this frame, but on other frames, copy */
- if (numverts) {
- /* triangles */
- Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if (attr_mP) {
- Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
- Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
- float3 *P = &mesh->verts[0];
- float3 *N = (attr_N) ? attr_N->data_float3() : NULL;
-
- memcpy(attr_mP->data_float3() + motion_step * numverts, P, sizeof(float3) * numverts);
- if (attr_mN)
- memcpy(attr_mN->data_float3() + motion_step * numverts, N, sizeof(float3) * numverts);
- }
- }
-
- if (numkeys) {
- /* curves */
- Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if (attr_mP) {
- float3 *keys = &mesh->curve_keys[0];
- memcpy(attr_mP->data_float3() + motion_step * numkeys, keys, sizeof(float3) * numkeys);
- }
- }
-
- return;
- }
-
- /* TODO(sergey): Perform preliminary check for number of verticies. */
- if (numverts) {
+ /* TODO(sergey): Perform preliminary check for number of vertices. */
+ if (b_mesh) {
+ /* Export deformed coordinates. */
/* Find attributes. */
Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
@@ -1198,14 +1104,13 @@ void BlenderSync::sync_mesh_motion(BL::Depsgraph &b_depsgraph,
}
}
}
- }
- /* hair motion */
- if (numkeys)
- sync_curves(mesh, b_mesh, b_ob, true, motion_step);
+ free_object_to_mesh(b_data, b_ob, b_mesh);
+ return;
+ }
- /* free derived mesh */
- free_object_to_mesh(b_data, b_ob, b_mesh);
+ /* No deformation on this frame; copy the coordinates in case other frames do have it. */
+ mesh->copy_center_to_motion_step(motion_step);
}
CCL_NAMESPACE_END
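
A minimal sketch of what a copy_center_to_motion_step()-style helper does, reconstructed from the fallback code removed above (copying rest-pose positions, and normals when present, into the motion attribute slot when a frame has no deformation). The standalone function name below is illustrative only and not part of the patch; it assumes the Cycles attribute API already used in this file.

/* Hedged sketch: copy the center (rest) vertex positions, and normals when stored,
 * into the given motion step of the mesh motion attributes. */
static void copy_center_to_motion_step_sketch(Mesh *mesh, int motion_step)
{
  const size_t numverts = mesh->verts.size();
  Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
  if (numverts == 0 || attr_mP == NULL) {
    return;
  }

  /* Vertex positions for this motion step. */
  memcpy(attr_mP->data_float3() + motion_step * numverts,
         &mesh->verts[0],
         sizeof(float3) * numverts);

  /* Vertex normals, if motion normals are stored. */
  Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
  Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
  if (attr_mN && attr_N) {
    memcpy(attr_mN->data_float3() + motion_step * numverts,
           attr_N->data_float3(),
           sizeof(float3) * numverts);
  }
}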
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 095ecd59985..3ea6892a349 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -15,14 +15,14 @@
*/
#include "render/camera.h"
-#include "render/integrator.h"
#include "render/graph.h"
+#include "render/integrator.h"
#include "render/light.h"
#include "render/mesh.h"
-#include "render/object.h"
-#include "render/scene.h"
#include "render/nodes.h"
+#include "render/object.h"
#include "render/particles.h"
+#include "render/scene.h"
#include "render/shader.h"
#include "blender/blender_object_cull.h"
@@ -59,7 +59,7 @@ bool BlenderSync::BKE_object_is_modified(BL::Object &b_ob)
return false;
}
-bool BlenderSync::object_is_mesh(BL::Object &b_ob)
+bool BlenderSync::object_is_geometry(BL::Object &b_ob)
{
BL::ID b_ob_data = b_ob.data();
@@ -67,10 +67,16 @@ bool BlenderSync::object_is_mesh(BL::Object &b_ob)
return false;
}
- if (b_ob.type() == BL::Object::type_CURVE) {
+ BL::Object::type_enum type = b_ob.type();
+
+ if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
+ /* Will be exported attached to mesh. */
+ return true;
+ }
+ else if (type == BL::Object::type_CURVE) {
/* Skip exporting curves without faces, overhead can be
* significant if there are many for path animation. */
- BL::Curve b_curve(b_ob.data());
+ BL::Curve b_curve(b_ob_data);
return (b_curve.bevel_object() || b_curve.extrude() != 0.0f || b_curve.bevel_depth() != 0.0f ||
b_curve.dimensions() == BL::Curve::dimensions_2D || b_ob.modifiers.length());
@@ -88,204 +94,14 @@ bool BlenderSync::object_is_light(BL::Object &b_ob)
return (b_ob_data && b_ob_data.is_a(&RNA_Light));
}
-static uint object_ray_visibility(BL::Object &b_ob)
-{
- PointerRNA cvisibility = RNA_pointer_get(&b_ob.ptr, "cycles_visibility");
- uint flag = 0;
-
- flag |= get_boolean(cvisibility, "camera") ? PATH_RAY_CAMERA : 0;
- flag |= get_boolean(cvisibility, "diffuse") ? PATH_RAY_DIFFUSE : 0;
- flag |= get_boolean(cvisibility, "glossy") ? PATH_RAY_GLOSSY : 0;
- flag |= get_boolean(cvisibility, "transmission") ? PATH_RAY_TRANSMIT : 0;
- flag |= get_boolean(cvisibility, "shadow") ? PATH_RAY_SHADOW : 0;
- flag |= get_boolean(cvisibility, "scatter") ? PATH_RAY_VOLUME_SCATTER : 0;
-
- return flag;
-}
-
-/* Light */
-
-void BlenderSync::sync_light(BL::Object &b_parent,
- int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
- BL::Object &b_ob,
- BL::Object &b_ob_instance,
- int random_id,
- Transform &tfm,
- bool *use_portal)
-{
- /* test if we need to sync */
- Light *light;
- ObjectKey key(b_parent, persistent_id, b_ob_instance);
-
- if (!light_map.sync(&light, b_ob, b_parent, key)) {
- if (light->is_portal)
- *use_portal = true;
- return;
- }
-
- BL::Light b_light(b_ob.data());
-
- /* type */
- switch (b_light.type()) {
- case BL::Light::type_POINT: {
- BL::PointLight b_point_light(b_light);
- light->size = b_point_light.shadow_soft_size();
- light->type = LIGHT_POINT;
- break;
- }
- case BL::Light::type_SPOT: {
- BL::SpotLight b_spot_light(b_light);
- light->size = b_spot_light.shadow_soft_size();
- light->type = LIGHT_SPOT;
- light->spot_angle = b_spot_light.spot_size();
- light->spot_smooth = b_spot_light.spot_blend();
- break;
- }
- /* Hemi were removed from 2.8 */
- // case BL::Light::type_HEMI: {
- // light->type = LIGHT_DISTANT;
- // light->size = 0.0f;
- // break;
- // }
- case BL::Light::type_SUN: {
- BL::SunLight b_sun_light(b_light);
- light->size = b_sun_light.shadow_soft_size();
- light->type = LIGHT_DISTANT;
- break;
- }
- case BL::Light::type_AREA: {
- BL::AreaLight b_area_light(b_light);
- light->size = 1.0f;
- light->axisu = transform_get_column(&tfm, 0);
- light->axisv = transform_get_column(&tfm, 1);
- light->sizeu = b_area_light.size();
- switch (b_area_light.shape()) {
- case BL::AreaLight::shape_SQUARE:
- light->sizev = light->sizeu;
- light->round = false;
- break;
- case BL::AreaLight::shape_RECTANGLE:
- light->sizev = b_area_light.size_y();
- light->round = false;
- break;
- case BL::AreaLight::shape_DISK:
- light->sizev = light->sizeu;
- light->round = true;
- break;
- case BL::AreaLight::shape_ELLIPSE:
- light->sizev = b_area_light.size_y();
- light->round = true;
- break;
- }
- light->type = LIGHT_AREA;
- break;
- }
- }
-
- /* location and (inverted!) direction */
- light->co = transform_get_column(&tfm, 3);
- light->dir = -transform_get_column(&tfm, 2);
- light->tfm = tfm;
-
- /* shader */
- vector<Shader *> used_shaders;
- find_shader(b_light, used_shaders, scene->default_light);
- light->shader = used_shaders[0];
-
- /* shadow */
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
- PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
- light->cast_shadow = get_boolean(clight, "cast_shadow");
- light->use_mis = get_boolean(clight, "use_multiple_importance_sampling");
-
- int samples = get_int(clight, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->samples = samples * samples;
- else
- light->samples = samples;
-
- light->max_bounces = get_int(clight, "max_bounces");
-
- if (b_ob != b_ob_instance) {
- light->random_id = random_id;
- }
- else {
- light->random_id = hash_int_2d(hash_string(b_ob.name().c_str()), 0);
- }
-
- if (light->type == LIGHT_AREA)
- light->is_portal = get_boolean(clight, "is_portal");
- else
- light->is_portal = false;
-
- if (light->is_portal)
- *use_portal = true;
-
- /* visibility */
- uint visibility = object_ray_visibility(b_ob);
- light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
- light->use_glossy = (visibility & PATH_RAY_GLOSSY) != 0;
- light->use_transmission = (visibility & PATH_RAY_TRANSMIT) != 0;
- light->use_scatter = (visibility & PATH_RAY_VOLUME_SCATTER) != 0;
-
- /* tag */
- light->tag_update(scene);
-}
-
-void BlenderSync::sync_background_light(bool use_portal)
-{
- BL::World b_world = b_scene.world();
-
- if (b_world) {
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
- PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
-
- enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
- int sampling_method = get_enum(cworld, "sampling_method", SAMPLING_NUM, SAMPLING_AUTOMATIC);
- bool sample_as_light = (sampling_method != SAMPLING_NONE);
-
- if (sample_as_light || use_portal) {
- /* test if we need to sync */
- Light *light;
- ObjectKey key(b_world, 0, b_world);
-
- if (light_map.sync(&light, b_world, b_world, key) || world_recalc ||
- b_world.ptr.data != world_map) {
- light->type = LIGHT_BACKGROUND;
- if (sampling_method == SAMPLING_MANUAL) {
- light->map_resolution = get_int(cworld, "sample_map_resolution");
- }
- else {
- light->map_resolution = 0;
- }
- light->shader = scene->default_background;
- light->use_mis = sample_as_light;
- light->max_bounces = get_int(cworld, "max_bounces");
-
- int samples = get_int(cworld, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->samples = samples * samples;
- else
- light->samples = samples;
-
- light->tag_update(scene);
- light_map.set_recalc(b_world);
- }
- }
- }
-
- world_map = b_world.ptr.data;
- world_recalc = false;
-}
-
/* Object */
Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
BL::ViewLayer &b_view_layer,
BL::DepsgraphObjectInstance &b_instance,
float motion_time,
- bool show_self,
- bool show_particles,
+ bool use_particle_hair,
+ bool show_lights,
BlenderObjectCulling &culling,
bool *use_portal)
{
@@ -304,11 +120,14 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
/* light is handled separately */
if (!motion && object_is_light(b_ob)) {
+ if (!show_lights) {
+ return NULL;
+ }
+
/* TODO: don't use lights for excluded layers used as mask layer,
* when dynamic overrides are back. */
#if 0
- if(!((layer_flag & view_layer.holdout_layer) &&
- (layer_flag & view_layer.exclude_layer)))
+ if (!((layer_flag & view_layer.holdout_layer) && (layer_flag & view_layer.exclude_layer)))
#endif
{
sync_light(b_parent,
@@ -324,7 +143,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
}
/* only interested in object that we can create meshes from */
- if (!object_is_mesh(b_ob)) {
+ if (!object_is_geometry(b_ob)) {
return NULL;
}
@@ -345,13 +164,14 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
/* TODO: make holdout objects on excluded layer invisible for non-camera rays. */
#if 0
- if(use_holdout && (layer_flag & view_layer.exclude_layer)) {
+ if (use_holdout && (layer_flag & view_layer.exclude_layer)) {
visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA);
}
#endif
/* Clear camera visibility for indirect only objects. */
- bool use_indirect_only = b_parent.indirect_only_get(PointerRNA_NULL, b_view_layer);
+ bool use_indirect_only = !use_holdout &&
+ b_parent.indirect_only_get(PointerRNA_NULL, b_view_layer);
if (use_indirect_only) {
visibility &= ~PATH_RAY_CAMERA;
}
@@ -362,7 +182,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
}
/* key to lookup object */
- ObjectKey key(b_parent, persistent_id, b_ob_instance);
+ ObjectKey key(b_parent, persistent_id, b_ob_instance, use_particle_hair);
Object *object;
/* motion vector case */
@@ -377,8 +197,8 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
}
/* mesh deformation */
- if (object->mesh)
- sync_mesh_motion(b_depsgraph, b_ob, object, motion_time);
+ if (object->geometry)
+ sync_geometry_motion(b_depsgraph, b_ob, object, motion_time, use_particle_hair);
}
return object;
@@ -387,12 +207,12 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
/* test if we need to sync */
bool object_updated = false;
- if (object_map.sync(&object, b_ob, b_parent, key))
+ if (object_map.add_or_update(&object, b_ob, b_parent, key))
object_updated = true;
/* mesh sync */
- object->mesh = sync_mesh(
- b_depsgraph, b_ob, b_ob_instance, object_updated, show_self, show_particles);
+ object->geometry = sync_geometry(
+ b_depsgraph, b_ob, b_ob_instance, object_updated, use_particle_hair);
/* special case not tracked by object update flags */
@@ -414,6 +234,12 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
object_updated = true;
}
+ float shadow_terminator_offset = get_float(cobject, "shadow_terminator_offset");
+ if (shadow_terminator_offset != object->shadow_terminator_offset) {
+ object->shadow_terminator_offset = shadow_terminator_offset;
+ object_updated = true;
+ }
+
/* sync the asset name for Cryptomatte */
BL::Object parent = b_ob.parent();
ustring parent_name;
@@ -434,31 +260,33 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
/* object sync
* transform comparison should not be needed, but duplis don't work perfect
* in the depsgraph and may not signal changes, so this is a workaround */
- if (object_updated || (object->mesh && object->mesh->need_update) || tfm != object->tfm) {
+ if (object_updated || (object->geometry && object->geometry->need_update) ||
+ tfm != object->tfm) {
object->name = b_ob.name().c_str();
object->pass_id = b_ob.pass_index();
+ object->color = get_float3(b_ob.color());
object->tfm = tfm;
object->motion.clear();
/* motion blur */
Scene::MotionType need_motion = scene->need_motion();
- if (need_motion != Scene::MOTION_NONE && object->mesh) {
- Mesh *mesh = object->mesh;
- mesh->use_motion_blur = false;
- mesh->motion_steps = 0;
+ if (need_motion != Scene::MOTION_NONE && object->geometry) {
+ Geometry *geom = object->geometry;
+ geom->use_motion_blur = false;
+ geom->motion_steps = 0;
uint motion_steps;
if (need_motion == Scene::MOTION_BLUR) {
- motion_steps = object_motion_steps(b_parent, b_ob);
- mesh->motion_steps = motion_steps;
+ motion_steps = object_motion_steps(b_parent, b_ob, Object::MAX_MOTION_STEPS);
+ geom->motion_steps = motion_steps;
if (motion_steps && object_use_deform_motion(b_parent, b_ob)) {
- mesh->use_motion_blur = true;
+ geom->use_motion_blur = true;
}
}
else {
motion_steps = 3;
- mesh->motion_steps = motion_steps;
+ geom->motion_steps = motion_steps;
}
object->motion.clear();
@@ -483,7 +311,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
else {
object->dupli_generated = make_float3(0.0f, 0.0f, 0.0f);
object->dupli_uv = make_float2(0.0f, 0.0f);
- object->random_id = hash_int_2d(hash_string(object->name.c_str()), 0);
+ object->random_id = hash_uint2(hash_string(object->name.c_str()), 0);
}
object->tag_update(scene);
@@ -499,7 +327,9 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
/* Object Loop */
-void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, float motion_time)
+void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
+ BL::SpaceView3D &b_v3d,
+ float motion_time)
{
/* layer data */
bool motion = motion_time != 0.0f;
@@ -507,13 +337,13 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, float motion_time)
if (!motion) {
/* prepare for sync */
light_map.pre_sync();
- mesh_map.pre_sync();
+ geometry_map.pre_sync();
object_map.pre_sync();
particle_system_map.pre_sync();
motion_times.clear();
}
else {
- mesh_motion_synced.clear();
+ geometry_motion_synced.clear();
}
/* initialize culling */
@@ -522,6 +352,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, float motion_time)
/* object loop */
bool cancel = false;
bool use_portal = false;
+ const bool show_lights = BlenderViewportParameters(b_v3d).use_scene_lights;
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
@@ -532,21 +363,35 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, float motion_time)
BL::DepsgraphObjectInstance b_instance = *b_instance_iter;
BL::Object b_ob = b_instance.object();
- /* load per-object culling data */
+ /* Viewport visibility. */
+ const bool show_in_viewport = !b_v3d || b_ob.visible_in_viewport_get(b_v3d);
+ if (show_in_viewport == false) {
+ continue;
+ }
+
+ /* Load per-object culling data. */
culling.init_object(scene, b_ob);
- /* test if object needs to be hidden */
- const bool show_self = b_instance.show_self();
- const bool show_particles = b_instance.show_particles();
+ /* Object itself. */
+ if (b_instance.show_self()) {
+ sync_object(b_depsgraph,
+ b_view_layer,
+ b_instance,
+ motion_time,
+ false,
+ show_lights,
+ culling,
+ &use_portal);
+ }
- if (show_self || show_particles) {
- /* object itself */
+ /* Particle hair as separate object. */
+ if (b_instance.show_particles() && object_has_particle_hair(b_ob)) {
sync_object(b_depsgraph,
b_view_layer,
b_instance,
motion_time,
- show_self,
- show_particles,
+ true,
+ show_lights,
culling,
&use_portal);
}
@@ -557,13 +402,13 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, float motion_time)
progress.set_sync_status("");
if (!cancel && !motion) {
- sync_background_light(use_portal);
+ sync_background_light(b_v3d, use_portal);
/* handle removed data and modified pointers */
if (light_map.post_sync())
scene->light_manager->tag_update(scene);
- if (mesh_map.post_sync())
- scene->mesh_manager->tag_update(scene);
+ if (geometry_map.post_sync())
+ scene->geometry_manager->tag_update(scene);
if (object_map.post_sync())
scene->object_manager->tag_update(scene);
if (particle_system_map.post_sync())
@@ -571,11 +416,12 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, float motion_time)
}
if (motion)
- mesh_motion_synced.clear();
+ geometry_motion_synced.clear();
}
void BlenderSync::sync_motion(BL::RenderSettings &b_render,
BL::Depsgraph &b_depsgraph,
+ BL::SpaceView3D &b_v3d,
BL::Object &b_override,
int width,
int height,
@@ -613,12 +459,15 @@ void BlenderSync::sync_motion(BL::RenderSettings &b_render,
b_engine.frame_set(frame, subframe);
python_thread_state_save(python_thread_state);
sync_camera_motion(b_render, b_cam, width, height, 0.0f);
- sync_objects(b_depsgraph, 0.0f);
+ sync_objects(b_depsgraph, b_v3d, 0.0f);
}
- /* always sample these times for camera motion */
- motion_times.insert(-1.0f);
- motion_times.insert(1.0f);
+ /* Insert motion times from camera. Motion times from other objects
+ * have already been added in a sync_objects call. */
+ uint camera_motion_steps = object_motion_steps(b_cam, b_cam);
+ for (size_t step = 0; step < camera_motion_steps; step++) {
+ motion_times.insert(scene->camera->motion_time(step));
+ }
/* note iteration over motion_times set happens in sorted order */
foreach (float relative_time, motion_times) {
@@ -643,13 +492,11 @@ void BlenderSync::sync_motion(BL::RenderSettings &b_render,
b_engine.frame_set(frame, subframe);
python_thread_state_save(python_thread_state);
- /* sync camera, only supports two times at the moment */
- if (relative_time == -1.0f || relative_time == 1.0f) {
- sync_camera_motion(b_render, b_cam, width, height, relative_time);
- }
+ /* Syncs camera motion if relative_time is one of the camera's motion times. */
+ sync_camera_motion(b_render, b_cam, width, height, relative_time);
/* sync object */
- sync_objects(b_depsgraph, relative_time);
+ sync_objects(b_depsgraph, b_v3d, relative_time);
}
/* we need to set the python thread state again because this
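
The motion-time loop above replaces the hard-coded -1.0/+1.0 shutter times with one entry per camera motion step, taken from scene->camera->motion_time(step). A small sketch of the step-to-relative-time mapping this relies on, assuming uniform spacing over the shutter; the formula is a reconstruction that reproduces the old -1, 0, +1 times for the default three steps, not code quoted from the patch.

/* Assumed mapping from motion step index to relative shutter time in [-1, 1]. */
static float motion_time_sketch(uint step, uint motion_steps)
{
  if (motion_steps <= 1) {
    return 0.0f; /* Only the center frame. */
  }
  return 2.0f * (float)step / (float)(motion_steps - 1) - 1.0f;
}

/* With motion_steps == 3 this yields -1.0, 0.0 and +1.0, i.e. the two times the
 * removed code inserted explicitly plus the center frame. */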
diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp
index 74f8fb1dc53..bebecb364eb 100644
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -19,6 +19,7 @@
#include "render/camera.h"
#include "blender/blender_object_cull.h"
+#include "blender/blender_util.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index d74f132ed60..e5eab1ae62b 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -39,7 +39,7 @@ bool BlenderSync::sync_dupli_particle(BL::Object &b_ob,
object->hide_on_missing_motion = true;
/* test if we need particle data */
- if (!object->mesh->need_attribute(scene, ATTR_STD_PARTICLE))
+ if (!object->geometry->need_attribute(scene, ATTR_STD_PARTICLE))
return false;
/* don't handle child particles yet */
@@ -53,10 +53,10 @@ bool BlenderSync::sync_dupli_particle(BL::Object &b_ob,
ParticleSystem *psys;
bool first_use = !particle_system_map.is_used(key);
- bool need_update = particle_system_map.sync(&psys, b_ob, b_instance.object(), key);
+ bool need_update = particle_system_map.add_or_update(&psys, b_ob, b_instance.object(), key);
/* no update needed? */
- if (!need_update && !object->mesh->need_update && !scene->object_manager->need_update)
+ if (!need_update && !object->geometry->need_update && !scene->object_manager->need_update)
return true;
/* first time used in this sync loop? clear and tag update */
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index ffd1c70a4e4..25c77b74ce3 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -19,8 +19,9 @@
#include "blender/CCL_api.h"
#include "blender/blender_device.h"
-#include "blender/blender_sync.h"
#include "blender/blender_session.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
#include "render/denoising.h"
#include "render/merge.h"
@@ -30,15 +31,17 @@
#include "util/util_logging.h"
#include "util/util_md5.h"
#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
#include "util/util_path.h"
#include "util/util_string.h"
+#include "util/util_task.h"
#include "util/util_types.h"
#ifdef WITH_OSL
# include "render/osl.h"
-# include <OSL/oslquery.h>
# include <OSL/oslconfig.h>
+# include <OSL/oslquery.h>
#endif
#ifdef WITH_OPENCL
@@ -59,6 +62,12 @@ void *pylong_as_voidptr_typesafe(PyObject *object)
return PyLong_AsVoidPtr(object);
}
+PyObject *pyunicode_from_string(const char *str)
+{
+ /* Ignore errors if device API returns invalid UTF-8 strings. */
+ return PyUnicode_DecodeUTF8(str, strlen(str), "ignore");
+}
+
/* Synchronize debug flags from a given Blender scene.
* Return truth when device list needs invalidation.
*/
@@ -81,6 +90,9 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
+ /* Synchronize OptiX flags. */
+ flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
+ flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
/* Synchronize OpenCL device type. */
switch (get_enum(cscene, "debug_opencl_device_type")) {
case 0:
@@ -138,7 +150,7 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
const char *result = _PyUnicode_AsString(py_str);
if (result) {
/* 99% of the time this is enough but we better support non unicode
- * chars since blender doesnt limit this.
+ * chars since blender doesn't limit this.
*/
return result;
}
@@ -151,7 +163,7 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
return PyBytes_AS_STRING(*coerce);
}
else {
- /* Clear the error, so Cycles can be at leadt used without
+ /* Clear the error, so Cycles can be at least used without
* GPU and OSL support,
*/
PyErr_Clear();
@@ -177,6 +189,8 @@ static PyObject *init_func(PyObject * /*self*/, PyObject *args)
BlenderSession::headless = headless;
+ DebugFlags().running_inside_blender = true;
+
VLOG(2) << "Debug flags initialized to:\n" << DebugFlags();
Py_RETURN_NONE;
@@ -192,14 +206,15 @@ static PyObject *exit_func(PyObject * /*self*/, PyObject * /*args*/)
static PyObject *create_func(PyObject * /*self*/, PyObject *args)
{
- PyObject *pyengine, *pypreferences, *pydata, *pyregion, *pyv3d, *pyrv3d;
+ PyObject *pyengine, *pypreferences, *pydata, *pyscreen, *pyregion, *pyv3d, *pyrv3d;
int preview_osl;
if (!PyArg_ParseTuple(args,
- "OOOOOOi",
+ "OOOOOOOi",
&pyengine,
&pypreferences,
&pydata,
+ &pyscreen,
&pyregion,
&pyv3d,
&pyrv3d,
@@ -208,6 +223,8 @@ static PyObject *create_func(PyObject * /*self*/, PyObject *args)
}
/* RNA */
+ ID *bScreen = (ID *)PyLong_AsVoidPtr(pyscreen);
+
PointerRNA engineptr;
RNA_pointer_create(NULL, &RNA_RenderEngine, (void *)PyLong_AsVoidPtr(pyengine), &engineptr);
BL::RenderEngine engine(engineptr);
@@ -222,15 +239,15 @@ static PyObject *create_func(PyObject * /*self*/, PyObject *args)
BL::BlendData data(dataptr);
PointerRNA regionptr;
- RNA_pointer_create(NULL, &RNA_Region, pylong_as_voidptr_typesafe(pyregion), &regionptr);
+ RNA_pointer_create(bScreen, &RNA_Region, pylong_as_voidptr_typesafe(pyregion), &regionptr);
BL::Region region(regionptr);
PointerRNA v3dptr;
- RNA_pointer_create(NULL, &RNA_SpaceView3D, pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
+ RNA_pointer_create(bScreen, &RNA_SpaceView3D, pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
BL::SpaceView3D v3d(v3dptr);
PointerRNA rv3dptr;
- RNA_pointer_create(NULL, &RNA_RegionView3D, pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
+ RNA_pointer_create(bScreen, &RNA_RegionView3D, pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
BL::RegionView3D rv3d(rv3dptr);
/* create session */
@@ -284,22 +301,18 @@ static PyObject *render_func(PyObject * /*self*/, PyObject *args)
static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
{
PyObject *pysession, *pydepsgraph, *pyobject;
- PyObject *pypixel_array, *pyresult;
const char *pass_type;
- int num_pixels, depth, object_id, pass_filter;
+ int pass_filter, width, height;
if (!PyArg_ParseTuple(args,
- "OOOsiiOiiO",
+ "OOOsiii",
&pysession,
&pydepsgraph,
&pyobject,
&pass_type,
&pass_filter,
- &object_id,
- &pypixel_array,
- &num_pixels,
- &depth,
- &pyresult))
+ &width,
+ &height))
return NULL;
BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(pysession);
@@ -312,23 +325,9 @@ static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyobject), &objectptr);
BL::Object b_object(objectptr);
- void *b_result = PyLong_AsVoidPtr(pyresult);
-
- PointerRNA bakepixelptr;
- RNA_pointer_create(NULL, &RNA_BakePixel, PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
- BL::BakePixel b_bake_pixel(bakepixelptr);
-
python_thread_state_save(&session->python_thread_state);
- session->bake(b_depsgraph,
- b_object,
- pass_type,
- pass_filter,
- object_id,
- b_bake_pixel,
- (size_t)num_pixels,
- depth,
- (float *)b_result);
+ session->bake(b_depsgraph, b_object, pass_type, pass_filter, width, height);
python_thread_state_restore(&session->python_thread_state);
@@ -420,10 +419,11 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
for (size_t i = 0; i < devices.size(); i++) {
DeviceInfo &device = devices[i];
string type_name = Device::string_from_type(device.type);
- PyObject *device_tuple = PyTuple_New(3);
- PyTuple_SET_ITEM(device_tuple, 0, PyUnicode_FromString(device.description.c_str()));
- PyTuple_SET_ITEM(device_tuple, 1, PyUnicode_FromString(type_name.c_str()));
- PyTuple_SET_ITEM(device_tuple, 2, PyUnicode_FromString(device.id.c_str()));
+ PyObject *device_tuple = PyTuple_New(4);
+ PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
+ PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
+ PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
+ PyTuple_SET_ITEM(device_tuple, 3, PyBool_FromLong(device.has_peer_memory));
PyTuple_SET_ITEM(ret, i, device_tuple);
}
@@ -634,7 +634,7 @@ static PyObject *osl_compile_func(PyObject * /*self*/, PyObject *args)
static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/)
{
string system_info = Device::device_capabilities();
- return PyUnicode_FromString(system_info.c_str());
+ return pyunicode_from_string(system_info.c_str());
}
#ifdef WITH_OPENCL
@@ -937,6 +937,15 @@ static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *a
Py_RETURN_NONE;
}
+static PyObject *clear_resumable_chunk_func(PyObject * /*self*/, PyObject * /*value*/)
+{
+ VLOG(1) << "Clear resumable render";
+ BlenderSession::num_resumable_chunks = 0;
+ BlenderSession::current_resumable_chunk = 0;
+
+ Py_RETURN_NONE;
+}
+
static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*/)
{
BlenderSession::print_render_stats = true;
@@ -946,14 +955,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_opencl = false;
+ bool has_cuda = false, has_optix = false, has_opencl = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
+ has_optix |= (device_type == DEVICE_OPTIX);
has_opencl |= (device_type == DEVICE_OPENCL);
}
- PyObject *list = PyTuple_New(2);
+ PyObject *list = PyTuple_New(3);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
- PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_opencl));
+ PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
+ PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl));
return list;
}
@@ -992,6 +1003,7 @@ static PyMethodDef methods[] = {
/* Resumable render */
{"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
{"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
+ {"clear_resumable_chunk", clear_resumable_chunk_func, METH_NOARGS, ""},
/* Compute Device selection */
{"get_device_types", get_device_types_func, METH_VARARGS, ""},
@@ -1066,5 +1078,14 @@ void *CCL_python_module_init()
Py_INCREF(Py_False);
#endif /* WITH_EMBREE */
+ if (ccl::openimagedenoise_supported()) {
+ PyModule_AddObject(mod, "with_openimagedenoise", Py_True);
+ Py_INCREF(Py_True);
+ }
+ else {
+ PyModule_AddObject(mod, "with_openimagedenoise", Py_False);
+ Py_INCREF(Py_False);
+ }
+
return (void *)mod;
}
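
The new pyunicode_from_string() helper above decodes with the "ignore" error handler so device strings that are not valid UTF-8 do not raise. A short illustration of the difference against the strict PyUnicode_FromString() calls it replaces; the sample byte string is hypothetical.

#include <Python.h>
#include <cstring>

static void utf8_decode_sketch()
{
  /* Hypothetical device description containing an invalid UTF-8 byte. */
  const char *description = "GeForce \xFF RTX";

  /* Strict decoding: returns NULL and sets a UnicodeDecodeError. */
  PyObject *strict = PyUnicode_FromString(description);
  if (strict == NULL) {
    PyErr_Clear();
  }

  /* Lenient decoding, as pyunicode_from_string() does: the bad byte is dropped. */
  PyObject *lenient = PyUnicode_DecodeUTF8(description, strlen(description), "ignore");

  Py_XDECREF(lenient);
  Py_XDECREF(strict);
}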
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 29a97bf6546..391a1b8f473 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -16,12 +16,13 @@
#include <stdlib.h>
+#include "device/device.h"
#include "render/background.h"
#include "render/buffers.h"
#include "render/camera.h"
-#include "device/device.h"
-#include "render/integrator.h"
+#include "render/colorspace.h"
#include "render/film.h"
+#include "render/integrator.h"
#include "render/light.h"
#include "render/mesh.h"
#include "render/object.h"
@@ -40,8 +41,8 @@
#include "util/util_progress.h"
#include "util/util_time.h"
-#include "blender/blender_sync.h"
#include "blender/blender_session.h"
+#include "blender/blender_sync.h"
#include "blender/blender_util.h"
CCL_NAMESPACE_BEGIN
@@ -113,11 +114,6 @@ BlenderSession::~BlenderSession()
free_session();
}
-void BlenderSession::create()
-{
- create_session();
-}
-
void BlenderSession::create_session()
{
SessionParams session_params = BlenderSync::get_session_params(
@@ -142,21 +138,13 @@ void BlenderSession::create_session()
scene = new Scene(scene_params, session->device);
scene->name = b_scene.name();
- /* setup callbacks for builtin image support */
- scene->image_manager->builtin_image_info_cb = function_bind(
- &BlenderSession::builtin_image_info, this, _1, _2, _3);
- scene->image_manager->builtin_image_pixels_cb = function_bind(
- &BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5);
- scene->image_manager->builtin_image_float_pixels_cb = function_bind(
- &BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5);
-
session->scene = scene;
/* There is no single depsgraph to use for the entire render.
* So we need to handle this differently.
*
- * We could loop over the final render result render layers in pipeline and keep Cycles unaware of multiple layers,
- * or perhaps move syncing further down in the pipeline.
+ * We could loop over the final render result render layers in pipeline and keep Cycles unaware
+ * of multiple layers, or perhaps move syncing further down in the pipeline.
*/
/* create sync */
sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
@@ -170,7 +158,7 @@ void BlenderSession::create_session()
/* set buffer parameters */
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_v3d, b_rv3d, scene->camera, width, height);
+ b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
session->reset(buffer_params, session_params.samples);
b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -180,9 +168,13 @@ void BlenderSession::create_session()
void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsgraph)
{
+ /* Update data, scene and depsgraph pointers. These can change after undo. */
this->b_data = b_data;
this->b_depsgraph = b_depsgraph;
this->b_scene = b_depsgraph.scene_eval();
+ if (sync) {
+ sync->reset(this->b_data, this->b_scene);
+ }
if (preview_osl) {
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -198,8 +190,12 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
height = render_resolution_y(b_render);
}
- if (session == NULL) {
- create();
+ bool is_new_session = (session == NULL);
+ if (is_new_session) {
+ /* Initialize session and remember it was just created, so as not to
+ * re-create it below.
+ */
+ create_session();
}
if (b_v3d) {
@@ -218,8 +214,10 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
/* if scene or session parameters changed, it's easier to simply re-create
* them rather than trying to distinguish which settings need to be updated
*/
- free_session();
- create_session();
+ if (!is_new_session) {
+ free_session();
+ create_session();
+ }
return;
}
@@ -241,8 +239,13 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL);
BL::RegionView3D b_null_region_view3d(PointerRNA_NULL);
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
+ BufferParams buffer_params = BlenderSync::get_buffer_params(b_render,
+ b_null_space_view3d,
+ b_null_region_view3d,
+ scene->camera,
+ width,
+ height,
+ session_params.denoising.use);
session->reset(buffer_params, session_params.samples);
b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -253,9 +256,7 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
void BlenderSession::free_session()
{
- if (sync)
- delete sync;
-
+ delete sync;
delete session;
}
@@ -276,8 +277,6 @@ static ShaderEvalType get_shader_type(const string &pass_type)
return SHADER_EVAL_GLOSSY_COLOR;
else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0)
return SHADER_EVAL_TRANSMISSION_COLOR;
- else if (strcmp(shader_type, "SUBSURFACE_COLOR") == 0)
- return SHADER_EVAL_SUBSURFACE_COLOR;
else if (strcmp(shader_type, "EMIT") == 0)
return SHADER_EVAL_EMISSION;
@@ -294,8 +293,6 @@ static ShaderEvalType get_shader_type(const string &pass_type)
return SHADER_EVAL_GLOSSY;
else if (strcmp(shader_type, "TRANSMISSION") == 0)
return SHADER_EVAL_TRANSMISSION;
- else if (strcmp(shader_type, "SUBSURFACE") == 0)
- return SHADER_EVAL_SUBSURFACE;
/* extra */
else if (strcmp(shader_type, "ENVIRONMENT") == 0)
@@ -327,6 +324,7 @@ static void end_render_result(BL::RenderEngine &b_engine,
void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
bool do_update_only,
+ bool do_read_only,
bool highlight)
{
int x = rtile.x - session->tile_manager.params.full_x;
@@ -352,7 +350,23 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
BL::RenderLayer b_rlay = *b_single_rlay;
- if (do_update_only) {
+ if (do_read_only) {
+ /* copy each pass */
+ BL::RenderLayer::passes_iterator b_iter;
+
+ for (b_rlay.passes.begin(b_iter); b_iter != b_rlay.passes.end(); ++b_iter) {
+ BL::RenderPass b_pass(*b_iter);
+
+ /* find matching pass type */
+ PassType pass_type = BlenderSync::get_pass_type(b_pass);
+ int components = b_pass.channels();
+
+ rtile.buffers->set_pass_rect(pass_type, components, (float *)b_pass.rect());
+ }
+
+ end_render_result(b_engine, b_rr, false, false, false);
+ }
+ else if (do_update_only) {
/* Sample would be zero at initial tile update, which is only needed
* to tag tile form blender side as IN PROGRESS for proper highlight
* no buffers should be sent to blender yet. For denoise we also
@@ -360,21 +374,26 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE);
if (merge) {
- update_render_result(b_rr, b_rlay, rtile);
+ update_render_result(b_rlay, rtile);
}
end_render_result(b_engine, b_rr, true, highlight, merge);
}
else {
/* Write final render result. */
- write_render_result(b_rr, b_rlay, rtile);
+ write_render_result(b_rlay, rtile);
end_render_result(b_engine, b_rr, false, false, true);
}
}
+void BlenderSession::read_render_tile(RenderTile &rtile)
+{
+ do_write_update_render_tile(rtile, false, true, false);
+}
+
void BlenderSession::write_render_tile(RenderTile &rtile)
{
- do_write_update_render_tile(rtile, false, false);
+ do_write_update_render_tile(rtile, false, false, false);
}
void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight)
@@ -384,9 +403,9 @@ void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight)
* would need to be investigated a bit further, but for now shall be fine
*/
if (!b_engine.is_preview())
- do_write_update_render_tile(rtile, true, highlight);
+ do_write_update_render_tile(rtile, true, false, highlight);
else
- do_write_update_render_tile(rtile, false, false);
+ do_write_update_render_tile(rtile, false, false, false);
}
static void add_cryptomatte_layer(BL::RenderResult &b_rr, string name, string manifest)
@@ -454,14 +473,13 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
session->update_render_tile_cb = function_bind(
&BlenderSession::update_render_tile, this, _1, _2);
+ BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+
/* get buffer parameters */
SessionParams session_params = BlenderSync::get_session_params(
- b_engine, b_userpref, b_scene, background);
+ b_engine, b_userpref, b_scene, background, b_view_layer);
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_v3d, b_rv3d, scene->camera, width, height);
-
- /* render each layer */
- BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+ b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
/* temporary render result to find needed passes and views */
BL::RenderResult b_rr = begin_render_result(
@@ -471,33 +489,26 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
BL::RenderLayer b_rlay = *b_single_rlay;
b_rlay_name = b_view_layer.name();
- /* add passes */
- vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer);
- buffer_params.passes = passes;
-
- PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
- bool full_denoising = get_boolean(crl, "use_denoising");
- bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
+ /* Update denoising parameters. */
+ session->set_denoising(session_params.denoising);
- bool run_denoising = full_denoising || write_denoising_passes;
+ bool use_denoising = session_params.denoising.use;
+ bool store_denoising_passes = session_params.denoising.store_passes;
- session->tile_manager.schedule_denoising = run_denoising;
- buffer_params.denoising_data_pass = run_denoising;
+ buffer_params.denoising_data_pass = use_denoising || store_denoising_passes;
buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
- buffer_params.denoising_prefiltered_pass = write_denoising_passes;
-
- session->params.run_denoising = run_denoising;
- session->params.full_denoising = full_denoising;
- session->params.write_denoising_passes = write_denoising_passes;
- session->params.denoising.radius = get_int(crl, "denoising_radius");
- session->params.denoising.strength = get_float(crl, "denoising_strength");
- session->params.denoising.feature_strength = get_float(crl, "denoising_feature_strength");
- session->params.denoising.relative_pca = get_boolean(crl, "denoising_relative_pca");
+ buffer_params.denoising_prefiltered_pass = store_denoising_passes &&
+ session_params.denoising.type == DENOISER_NLM;
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass;
+ /* Add passes */
+ vector<Pass> passes = sync->sync_render_passes(
+ b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising);
+ buffer_params.passes = passes;
+
scene->film->pass_alpha_threshold = b_view_layer.pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
@@ -526,19 +537,20 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
builtin_images_load();
/* Attempt to free all data which is held by Blender side, since at this
- * point we knwo that we've got everything to render current view layer.
+ * point we know that we've got everything to render current view layer.
*/
- /* At the moment we only free if we are not doing multi-view (or if we are rendering the last view).
- * See T58142/D4239 for discussion.
+ /* At the moment we only free if we are not doing multi-view
+ * (or if we are rendering the last view). See T58142/D4239 for discussion.
*/
if (view_index == num_views - 1) {
free_blender_memory_if_possible();
}
- /* Make sure all views have different noise patterns. - hardcoded value just to make it random */
+ /* Make sure all views have different noise patterns; the hardcoded value just serves to
+ * randomize the seed per view. */
if (view_index != 0) {
- scene->integrator->seed += hash_int_2d(scene->integrator->seed,
- hash_int(view_index * 0xdeadbeef));
+ scene->integrator->seed += hash_uint2(scene->integrator->seed,
+ hash_uint2(view_index * 0xdeadbeef, 0));
scene->integrator->tag_update(scene);
}
@@ -600,25 +612,6 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
#endif
}
-static void populate_bake_data(BakeData *data,
- const int object_id,
- BL::BakePixel &pixel_array,
- const int num_pixels)
-{
- BL::BakePixel bp = pixel_array;
-
- int i;
- for (i = 0; i < num_pixels; i++) {
- if (bp.object_id() == object_id) {
- data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
- }
- else {
- data->set_null(i);
- }
- bp = bp.next();
- }
-}
-
static int bake_pass_filter_get(const int pass_filter)
{
int flag = BAKE_FILTER_NONE;
@@ -636,8 +629,6 @@ static int bake_pass_filter_get(const int pass_filter)
flag |= BAKE_FILTER_GLOSSY;
if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0)
flag |= BAKE_FILTER_TRANSMISSION;
- if ((pass_filter & BL::BakeSettings::pass_filter_SUBSURFACE) != 0)
- flag |= BAKE_FILTER_SUBSURFACE;
if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0)
flag |= BAKE_FILTER_EMISSION;
@@ -651,43 +642,26 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
BL::Object &b_object,
const string &pass_type,
const int pass_filter,
- const int object_id,
- BL::BakePixel &pixel_array,
- const size_t num_pixels,
- const int /*depth*/,
- float result[])
+ const int bake_width,
+ const int bake_height)
{
b_depsgraph = b_depsgraph_;
ShaderEvalType shader_type = get_shader_type(pass_type);
-
- /* Set baking flag in advance, so kernel loading can check if we need
- * any baking capabilities.
- */
- scene->bake_manager->set_baking(true);
-
- /* ensure kernels are loaded before we do any scene updates */
- session->load_kernels();
-
- if (shader_type == SHADER_EVAL_UV) {
- /* force UV to be available */
- Pass::add(PASS_UV, scene->film->passes);
- }
-
int bake_pass_filter = bake_pass_filter_get(pass_filter);
- bake_pass_filter = BakeManager::shader_type_to_pass_filter(shader_type, bake_pass_filter);
- /* force use_light_pass to be true if we bake more than just colors */
- if (bake_pass_filter & ~BAKE_FILTER_COLOR) {
- Pass::add(PASS_LIGHT, scene->film->passes);
- }
+ /* Initialize the bake manager before we load the baking kernels. */
+ scene->bake_manager->set(scene, b_object.name(), shader_type, bake_pass_filter);
- /* create device and update scene */
- scene->film->tag_update(scene);
- scene->integrator->tag_update(scene);
+ /* Passes are identified by name, so in order to return the combined pass we need to set the
+ * name. */
+ Pass::add(PASS_COMBINED, scene->film->passes, "Combined");
+
+ session->read_bake_tile_cb = function_bind(&BlenderSession::read_render_tile, this, _1);
+ session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
if (!session->progress.get_cancel()) {
- /* update scene */
+ /* Sync scene. */
BL::Object b_camera_override(b_engine.camera_override());
sync->sync_camera(b_render, b_camera_override, width, height, "");
sync->sync_data(
@@ -695,76 +669,46 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
builtin_images_load();
}
- BakeData *bake_data = NULL;
+ /* Object might have been disabled for rendering or excluded in some
+ * other way; in that case Blender will report a warning afterwards. */
+ bool object_found = false;
+ foreach (Object *ob, scene->objects) {
+ if (ob->name == b_object.name()) {
+ object_found = true;
+ break;
+ }
+ }
- if (!session->progress.get_cancel()) {
- /* get buffer parameters */
+ if (object_found && !session->progress.get_cancel()) {
+ /* Get session and buffer parameters. */
SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_v3d, b_rv3d, scene->camera, width, height);
-
- scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
-
- /* set number of samples */
- session->tile_manager.set_samples(session_params.samples);
- session->reset(buffer_params, session_params.samples);
- session->update_scene();
-
- /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */
- size_t object_index = OBJECT_NONE;
- int tri_offset = 0;
-
- for (size_t i = 0; i < scene->objects.size(); i++) {
- if (strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
- object_index = i;
- tri_offset = scene->objects[i]->mesh->tri_offset;
- break;
- }
- }
-
- /* Object might have been disabled for rendering or excluded in some
- * other way, in that case Blender will report a warning afterwards. */
- if (object_index != OBJECT_NONE) {
- int object = object_index;
+ session_params.progressive_refine = false;
- bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
- populate_bake_data(bake_data, object_id, pixel_array, num_pixels);
- }
+ BufferParams buffer_params;
+ buffer_params.width = bake_width;
+ buffer_params.height = bake_height;
+ buffer_params.passes = scene->film->passes;
- /* set number of samples */
+ /* Update session. */
session->tile_manager.set_samples(session_params.samples);
session->reset(buffer_params, session_params.samples);
- session->update_scene();
session->progress.set_update_callback(
function_bind(&BlenderSession::update_bake_progress, this));
}
/* Perform bake. Check cancel to avoid crash with incomplete scene data. */
- if (!session->progress.get_cancel() && bake_data) {
- scene->bake_manager->bake(scene->device,
- &scene->dscene,
- scene,
- session->progress,
- shader_type,
- bake_pass_filter,
- bake_data,
- result);
+ if (object_found && !session->progress.get_cancel()) {
+ session->start();
+ session->wait();
}
- /* free all memory used (host and device), so we wouldn't leave render
- * engine with extra memory allocated
- */
-
- session->device_free();
-
- delete sync;
- sync = NULL;
+ session->read_bake_tile_cb = function_null;
+ session->write_render_tile_cb = function_null;
}
-void BlenderSession::do_write_update_render_result(BL::RenderResult &b_rr,
- BL::RenderLayer &b_rlay,
+void BlenderSession::do_write_update_render_result(BL::RenderLayer &b_rlay,
RenderTile &rtile,
bool do_update_only)
{
@@ -791,18 +735,13 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult &b_rr,
for (b_rlay.passes.begin(b_iter); b_iter != b_rlay.passes.end(); ++b_iter) {
BL::RenderPass b_pass(*b_iter);
-
- /* find matching pass type */
- PassType pass_type = BlenderSync::get_pass_type(b_pass);
int components = b_pass.channels();
- bool read = false;
- if (pass_type != PASS_NONE) {
- /* copy pixels */
- read = buffers->get_pass_rect(
- pass_type, exposure, sample, components, &pixels[0], b_pass.name());
- }
- else {
+ /* Copy pixels from regular render passes. */
+ bool read = buffers->get_pass_rect(b_pass.name(), exposure, sample, components, &pixels[0]);
+
+ /* If not a regular pass, try to read it as a denoising pass. */
+ if (!read) {
int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
if (denoising_offset >= 0) {
read = buffers->get_denoising_pass_rect(
@@ -820,26 +759,19 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult &b_rr,
else {
/* copy combined pass */
BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
- if (buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0], "Combined"))
+ if (buffers->get_pass_rect("Combined", exposure, sample, 4, &pixels[0]))
b_combined_pass.rect(&pixels[0]);
}
-
- /* tag result as updated */
- b_engine.update_result(b_rr);
}
-void BlenderSession::write_render_result(BL::RenderResult &b_rr,
- BL::RenderLayer &b_rlay,
- RenderTile &rtile)
+void BlenderSession::write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
{
- do_write_update_render_result(b_rr, b_rlay, rtile, false);
+ do_write_update_render_result(b_rlay, rtile, false);
}
-void BlenderSession::update_render_result(BL::RenderResult &b_rr,
- BL::RenderLayer &b_rlay,
- RenderTile &rtile)
+void BlenderSession::update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
{
- do_write_update_render_result(b_rr, b_rlay, rtile, true);
+ do_write_update_render_result(b_rlay, rtile, true);
}
void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
@@ -857,16 +789,16 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
if (session->params.modified(session_params) || scene->params.modified(scene_params)) {
free_session();
create_session();
- return;
}
/* increase samples, but never decrease */
session->set_samples(session_params.samples);
+ session->set_denoising_start_sample(session_params.denoising.start_sample);
session->set_pause(session_pause);
/* copy recalc flags, outside of mutex so we can decide to do the real
* synchronization at a later time to not block on running updates */
- sync->sync_recalc(b_depsgraph_);
+ sync->sync_recalc(b_depsgraph_, b_v3d);
/* don't do synchronization if on pause */
if (session_pause) {
@@ -892,21 +824,36 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
else
sync->sync_camera(b_render, b_camera_override, width, height, "");
- builtin_images_load();
+ /* get buffer parameters */
+ BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- /* unlock */
- session->scene->mutex.unlock();
+ if (!buffer_params.denoising_data_pass) {
+ session_params.denoising.use = false;
+ }
+
+ session->set_denoising(session_params.denoising);
+
+ /* Update film if denoising data was enabled or disabled. */
+ if (scene->film->denoising_data_pass != buffer_params.denoising_data_pass) {
+ scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
+ scene->film->tag_update(scene);
+ }
/* reset if needed */
if (scene->need_reset()) {
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_v3d, b_rv3d, scene->camera, width, height);
session->reset(buffer_params, session_params.samples);
+ /* After session reset, so device is not accessing image data anymore. */
+ builtin_images_load();
+
/* reset time */
start_resize_time = 0.0;
}
+ /* unlock */
+ session->scene->mutex.unlock();
+
/* Start rendering thread, if it's not running already. Do this
* after all scene data has been synced at least once. */
session->start();
@@ -961,7 +908,7 @@ bool BlenderSession::draw(int w, int h)
SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_v3d, b_rv3d, scene->camera, width, height);
+ b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session_pause == false) {
@@ -979,7 +926,7 @@ bool BlenderSession::draw(int w, int h)
/* draw */
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_render, b_v3d, b_rv3d, scene->camera, width, height);
+ b_render, b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use);
DeviceDrawParams draw_params;
if (session->params.display_buffer_linear) {
@@ -1057,8 +1004,9 @@ void BlenderSession::update_status_progress()
}
double current_time = time_dt();
- /* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
- * For headless rendering, only report when something significant changes to keep the console output readable. */
+ /* When rendering in a window, redraw the status at least once per second to keep the elapsed and
+ * remaining time up-to-date. For headless rendering, only report when something significant
+ * changes to keep the console output readable. */
if (status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
b_engine.update_stats("", (timestatus + scene_status + status).c_str());
b_engine.update_memory_stats(mem_used, mem_peak);
@@ -1119,324 +1067,6 @@ void BlenderSession::test_cancel()
session->progress.set_cancel("Cancelled");
}
-/* A builtin image file name is actually an image datablock name with the
- * absolute sequence frame number concatenated via the '@' character.
- *
- * This function splits the frame from the builtin name.
- */
-int BlenderSession::builtin_image_frame(const string &builtin_name)
-{
- int last = builtin_name.find_last_of('@');
- return atoi(builtin_name.substr(last + 1, builtin_name.size() - last - 1).c_str());
-}
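
For illustration only, a self-contained sketch of the "name@frame" convention described in the comment above. The image name, frame value, and helper name below are hypothetical and not part of this patch; it simply mirrors the find_last_of('@') / atoi split.

#include <cstdlib>
#include <string>

/* Illustrative sketch: split a builtin name such as "Image.001@47" into its frame
 * number, mirroring the '@' convention above. Assumes the '@' separator is present. */
static int example_builtin_image_frame(const std::string &builtin_name)
{
  const size_t at = builtin_name.find_last_of('@');
  /* "Image.001@47" -> atoi("47") -> 47 */
  return std::atoi(builtin_name.substr(at + 1).c_str());
}
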
-
-void BlenderSession::builtin_image_info(const string &builtin_name,
- void *builtin_data,
- ImageMetaData &metadata)
-{
- /* empty image */
- metadata.width = 1;
- metadata.height = 1;
-
- if (!builtin_data)
- return;
-
- /* recover ID pointer */
- PointerRNA ptr;
- RNA_id_pointer_create((ID *)builtin_data, &ptr);
- BL::ID b_id(ptr);
-
- if (b_id.is_a(&RNA_Image)) {
- /* image data */
- BL::Image b_image(b_id);
-
- metadata.builtin_free_cache = !b_image.has_data();
- metadata.is_float = b_image.is_float();
- metadata.width = b_image.size()[0];
- metadata.height = b_image.size()[1];
- metadata.depth = 1;
- metadata.channels = b_image.channels();
- }
- else if (b_id.is_a(&RNA_Object)) {
- /* smoke volume data */
- BL::Object b_ob(b_id);
- BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
-
- metadata.is_float = true;
- metadata.depth = 1;
- metadata.channels = 1;
-
- if (!b_domain)
- return;
-
- if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY) ||
- builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME) ||
- builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT) ||
- builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE))
- metadata.channels = 1;
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR))
- metadata.channels = 4;
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY))
- metadata.channels = 3;
- else
- return;
-
- int3 resolution = get_int3(b_domain.domain_resolution());
- int amplify = (b_domain.use_high_resolution()) ? b_domain.amplify() + 1 : 1;
-
- /* Velocity and heat data is always low-resolution. */
- if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY) ||
- builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
- amplify = 1;
- }
-
- metadata.width = resolution.x * amplify;
- metadata.height = resolution.y * amplify;
- metadata.depth = resolution.z * amplify;
- }
- else {
- /* TODO(sergey): Check we're indeed in shader node tree. */
- PointerRNA ptr;
- RNA_pointer_create(NULL, &RNA_Node, builtin_data, &ptr);
- BL::Node b_node(ptr);
- if (b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
- BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
- metadata.channels = 4;
- metadata.width = b_point_density_node.resolution();
- metadata.height = metadata.width;
- metadata.depth = metadata.width;
- metadata.is_float = true;
- }
- }
-}
-
-bool BlenderSession::builtin_image_pixels(const string &builtin_name,
- void *builtin_data,
- unsigned char *pixels,
- const size_t pixels_size,
- const bool free_cache)
-{
- if (!builtin_data) {
- return false;
- }
-
- const int frame = builtin_image_frame(builtin_name);
-
- PointerRNA ptr;
- RNA_id_pointer_create((ID *)builtin_data, &ptr);
- BL::Image b_image(ptr);
-
- const int width = b_image.size()[0];
- const int height = b_image.size()[1];
- const int channels = b_image.channels();
-
- unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame);
- const size_t num_pixels = ((size_t)width) * height;
-
- if (image_pixels && num_pixels * channels == pixels_size) {
- memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
- }
- else {
- if (channels == 1) {
- memset(pixels, 0, pixels_size * sizeof(unsigned char));
- }
- else {
- const size_t num_pixels_safe = pixels_size / channels;
- unsigned char *cp = pixels;
- for (size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
- cp[0] = 255;
- cp[1] = 0;
- cp[2] = 255;
- if (channels == 4) {
- cp[3] = 255;
- }
- }
- }
- }
-
- if (image_pixels) {
- MEM_freeN(image_pixels);
- }
-
- /* Free image buffers to save memory during render. */
- if (free_cache) {
- b_image.buffers_free();
- }
-
- /* Premultiply, byte images are always straight for Blender. */
- unsigned char *cp = pixels;
- for (size_t i = 0; i < num_pixels; i++, cp += channels) {
- cp[0] = (cp[0] * cp[3]) >> 8;
- cp[1] = (cp[1] * cp[3]) >> 8;
- cp[2] = (cp[2] * cp[3]) >> 8;
- }
- return true;
-}
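
A quick, hedged arithmetic sketch of the byte premultiply used above: shifting right by 8 divides by 256 rather than 255, a cheap approximation that can lose one step of precision. The values below are made up and the comparison helper is not part of this patch.

#include <cstdio>

int main()
{
  const int c = 255, a = 255;                   /* hypothetical straight-alpha byte values */
  const int premul_shift = (c * a) >> 8;        /* 254: divide by 256, as in the code above */
  const int premul_exact = (c * a + 127) / 255; /* 255: exact rounded divide by 255 */
  printf("shift=%d exact=%d\n", premul_shift, premul_exact);
  return 0;
}
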
-
-bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
- void *builtin_data,
- float *pixels,
- const size_t pixels_size,
- const bool free_cache)
-{
- if (!builtin_data) {
- return false;
- }
-
- PointerRNA ptr;
- RNA_id_pointer_create((ID *)builtin_data, &ptr);
- BL::ID b_id(ptr);
-
- if (b_id.is_a(&RNA_Image)) {
- /* image data */
- BL::Image b_image(b_id);
- int frame = builtin_image_frame(builtin_name);
-
- const int width = b_image.size()[0];
- const int height = b_image.size()[1];
- const int channels = b_image.channels();
-
- float *image_pixels;
- image_pixels = image_get_float_pixels_for_frame(b_image, frame);
- const size_t num_pixels = ((size_t)width) * height;
-
- if (image_pixels && num_pixels * channels == pixels_size) {
- memcpy(pixels, image_pixels, pixels_size * sizeof(float));
- }
- else {
- if (channels == 1) {
- memset(pixels, 0, num_pixels * sizeof(float));
- }
- else {
- const size_t num_pixels_safe = pixels_size / channels;
- float *fp = pixels;
- for (int i = 0; i < num_pixels_safe; i++, fp += channels) {
- fp[0] = 1.0f;
- fp[1] = 0.0f;
- fp[2] = 1.0f;
- if (channels == 4) {
- fp[3] = 1.0f;
- }
- }
- }
- }
-
- if (image_pixels) {
- MEM_freeN(image_pixels);
- }
-
- /* Free image buffers to save memory during render. */
- if (free_cache) {
- b_image.buffers_free();
- }
-
- return true;
- }
- else if (b_id.is_a(&RNA_Object)) {
- /* smoke volume data */
- BL::Object b_ob(b_id);
- BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
-
- if (!b_domain) {
- return false;
- }
-
- int3 resolution = get_int3(b_domain.domain_resolution());
- int length, amplify = (b_domain.use_high_resolution()) ? b_domain.amplify() + 1 : 1;
-
- /* Velocity and heat data is always low-resolution. */
- if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY) ||
- builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
- amplify = 1;
- }
-
- const int width = resolution.x * amplify;
- const int height = resolution.y * amplify;
- const int depth = resolution.z * amplify;
- const size_t num_pixels = ((size_t)width) * height * depth;
-
- if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
- SmokeDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
- if (length == num_pixels) {
- SmokeDomainSettings_density_grid_get(&b_domain.ptr, pixels);
- return true;
- }
- }
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME)) {
- /* This is in the range 0..1, and is interpreted by the OpenGL smoke viewer
- * as 1500..3000 K, with the first part faded to zero density. */
- SmokeDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
- if (length == num_pixels) {
- SmokeDomainSettings_flame_grid_get(&b_domain.ptr, pixels);
- return true;
- }
- }
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR)) {
- /* the RGB is "premultiplied" by density for better interpolation results */
- SmokeDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
- if (length == num_pixels * 4) {
- SmokeDomainSettings_color_grid_get(&b_domain.ptr, pixels);
- return true;
- }
- }
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY)) {
- SmokeDomainSettings_velocity_grid_get_length(&b_domain.ptr, &length);
- if (length == num_pixels * 3) {
- SmokeDomainSettings_velocity_grid_get(&b_domain.ptr, pixels);
- return true;
- }
- }
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
- SmokeDomainSettings_heat_grid_get_length(&b_domain.ptr, &length);
- if (length == num_pixels) {
- SmokeDomainSettings_heat_grid_get(&b_domain.ptr, pixels);
- return true;
- }
- }
- else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) {
- SmokeDomainSettings_temperature_grid_get_length(&b_domain.ptr, &length);
- if (length == num_pixels) {
- SmokeDomainSettings_temperature_grid_get(&b_domain.ptr, pixels);
- return true;
- }
- }
- else {
- fprintf(
- stderr, "Cycles error: unknown volume attribute %s, skipping\n", builtin_name.c_str());
- pixels[0] = 0.0f;
- return false;
- }
-
- fprintf(stderr, "Cycles error: unexpected smoke volume resolution, skipping\n");
- }
- else {
- /* We originally were passing view_layer here, but in reality we need a
- * depsgraph to pass to the RE_point_density_minmax() function.
- */
- /* TODO(sergey): Check we're indeed in shader node tree. */
- PointerRNA ptr;
- RNA_pointer_create(NULL, &RNA_Node, builtin_data, &ptr);
- BL::Node b_node(ptr);
- if (b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
- BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
- int length;
- b_point_density_node.calc_point_density(b_depsgraph, &length, &pixels);
- }
- }
-
- return false;
-}
-
-void BlenderSession::builtin_images_load()
-{
- /* Force builtin images to be loaded along with Blender data sync. This
- * is needed because we may be reading from depsgraph evaluated data which
- * can be freed by Blender before Cycles reads it. */
- ImageManager *manager = session->scene->image_manager;
- Device *device = session->device;
- manager->device_load_builtin(device, session->scene, session->progress);
-}
-
void BlenderSession::update_resumable_tile_manager(int num_samples)
{
const int num_resumable_chunks = BlenderSession::num_resumable_chunks,
@@ -1470,8 +1100,8 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
/* Round after doing the multiplications with num_chunks and num_samples_per_chunk
* to allow for many small chunks. */
- int rounded_range_start_sample = (int)floor(range_start_sample + 0.5f);
- int rounded_range_num_samples = max((int)floor(range_num_samples + 0.5f), 1);
+ int rounded_range_start_sample = (int)floorf(range_start_sample + 0.5f);
+ int rounded_range_num_samples = max((int)floorf(range_num_samples + 0.5f), 1);
/* Make sure we don't overshoot. */
if (rounded_range_start_sample + rounded_range_num_samples > num_samples) {
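
As a small worked example of the floorf(x + 0.5f) rounding above (all numbers hypothetical; the chunk arithmetic itself sits outside this hunk), here is a standalone sketch. The overshoot clamp at the end is illustrative only, since the body of the check above is not shown in this hunk.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
  /* Hypothetical fractional sample range for one resumable chunk. */
  const float range_start_sample = 93.75f;
  const float range_num_samples = 31.25f;
  const int num_samples = 250;

  int start = (int)floorf(range_start_sample + 0.5f);           /* 94 */
  int num = std::max((int)floorf(range_num_samples + 0.5f), 1); /* 31, never below 1 */

  /* Overshoot guard, roughly mirroring the check above. */
  if (start + num > num_samples) {
    num = num_samples - start;
  }
  printf("start=%d num=%d\n", start, num);
  return 0;
}
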
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index f0107d4e0b1..34e952e312b 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -17,15 +17,19 @@
#ifndef __BLENDER_SESSION_H__
#define __BLENDER_SESSION_H__
+#include "RNA_blender_cpp.h"
+
#include "device/device.h"
+
+#include "render/bake.h"
#include "render/scene.h"
#include "render/session.h"
-#include "render/bake.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
+class BlenderSync;
class ImageMetaData;
class Scene;
class Session;
@@ -49,8 +53,6 @@ class BlenderSession {
~BlenderSession();
- void create();
-
/* session */
void create_session();
void free_session();
@@ -64,18 +66,16 @@ class BlenderSession {
BL::Object &b_object,
const string &pass_type,
const int custom_flag,
- const int object_id,
- BL::BakePixel &pixel_array,
- const size_t num_pixels,
- const int depth,
- float pixels[]);
+ const int bake_width,
+ const int bake_height);
- void write_render_result(BL::RenderResult &b_rr, BL::RenderLayer &b_rlay, RenderTile &rtile);
+ void write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
void write_render_tile(RenderTile &rtile);
+ void read_render_tile(RenderTile &rtile);
 /* Update functions are used to update the display buffer only after a sample was
 * rendered; they are only needed for better visual feedback. */
- void update_render_result(BL::RenderResult &b_rr, BL::RenderLayer &b_rlay, RenderTile &rtile);
+ void update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
void update_render_tile(RenderTile &rtile, bool highlight);
/* interactive updates */
@@ -150,24 +150,14 @@ class BlenderSession {
protected:
void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);
- void do_write_update_render_result(BL::RenderResult &b_rr,
- BL::RenderLayer &b_rlay,
+ void do_write_update_render_result(BL::RenderLayer &b_rlay,
RenderTile &rtile,
bool do_update_only);
- void do_write_update_render_tile(RenderTile &rtile, bool do_update_only, bool highlight);
-
- int builtin_image_frame(const string &builtin_name);
- void builtin_image_info(const string &builtin_name, void *builtin_data, ImageMetaData &metadata);
- bool builtin_image_pixels(const string &builtin_name,
- void *builtin_data,
- unsigned char *pixels,
- const size_t pixels_size,
- const bool free_cache);
- bool builtin_image_float_pixels(const string &builtin_name,
- void *builtin_data,
- float *pixels,
- const size_t pixels_size,
- const bool free_cache);
+ void do_write_update_render_tile(RenderTile &rtile,
+ bool do_update_only,
+ bool do_read_only,
+ bool highlight);
+
void builtin_images_load();
/* Update tile manager to reflect resumable render settings. */
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 169c4d414a6..33e73b5a4b9 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -15,6 +15,7 @@
*/
#include "render/background.h"
+#include "render/colorspace.h"
#include "render/graph.h"
#include "render/light.h"
#include "render/nodes.h"
@@ -22,14 +23,15 @@
#include "render/scene.h"
#include "render/shader.h"
-#include "blender/blender_texture.h"
+#include "blender/blender_image.h"
#include "blender/blender_sync.h"
+#include "blender/blender_texture.h"
#include "blender/blender_util.h"
#include "util/util_debug.h"
#include "util/util_foreach.h"
-#include "util/util_string.h"
#include "util/util_set.h"
+#include "util/util_string.h"
#include "util/util_task.h"
CCL_NAMESPACE_BEGIN
@@ -89,6 +91,12 @@ template<typename NodeType> static ExtensionType get_image_extension(NodeType &b
return (ExtensionType)validate_enum_value(value, EXTENSION_NUM_TYPES, EXTENSION_REPEAT);
}
+static ImageAlphaType get_image_alpha_type(BL::Image &b_image)
+{
+ int value = b_image.alpha_mode();
+ return (ImageAlphaType)validate_enum_value(value, IMAGE_ALPHA_NUM_TYPES, IMAGE_ALPHA_AUTO);
+}
+
/* Graph */
static BL::NodeSocket get_node_output(BL::Node &b_node, const string &name)
@@ -201,24 +209,6 @@ static void get_tex_mapping(TextureMapping *mapping, BL::TexMapping &b_mapping)
mapping->z_mapping = (TextureMapping::Mapping)b_mapping.mapping_z();
}
-static void get_tex_mapping(TextureMapping *mapping, BL::ShaderNodeMapping &b_mapping)
-{
- if (!b_mapping)
- return;
-
- mapping->translation = get_float3(b_mapping.translation());
- mapping->rotation = get_float3(b_mapping.rotation());
- mapping->scale = get_float3(b_mapping.scale());
- mapping->type = (TextureMapping::Type)b_mapping.vector_type();
-
- mapping->use_minmax = b_mapping.use_min() || b_mapping.use_max();
-
- if (b_mapping.use_min())
- mapping->min = get_float3(b_mapping.min());
- if (b_mapping.use_max())
- mapping->max = get_float3(b_mapping.max());
-}
-
static ShaderNode *add_node(Scene *scene,
BL::RenderEngine &b_engine,
BL::BlendData &b_data,
@@ -308,18 +298,38 @@ static ShaderNode *add_node(Scene *scene,
else if (b_node.is_a(&RNA_ShaderNodeRGBToBW)) {
node = new RGBToBWNode();
}
+ else if (b_node.is_a(&RNA_ShaderNodeMapRange)) {
+ BL::ShaderNodeMapRange b_map_range_node(b_node);
+ MapRangeNode *map_range_node = new MapRangeNode();
+ map_range_node->clamp = b_map_range_node.clamp();
+ map_range_node->type = (NodeMapRangeType)b_map_range_node.interpolation_type();
+ node = map_range_node;
+ }
+ else if (b_node.is_a(&RNA_ShaderNodeClamp)) {
+ BL::ShaderNodeClamp b_clamp_node(b_node);
+ ClampNode *clamp_node = new ClampNode();
+ clamp_node->type = (NodeClampType)b_clamp_node.clamp_type();
+ node = clamp_node;
+ }
else if (b_node.is_a(&RNA_ShaderNodeMath)) {
BL::ShaderNodeMath b_math_node(b_node);
- MathNode *math = new MathNode();
- math->type = (NodeMath)b_math_node.operation();
- math->use_clamp = b_math_node.use_clamp();
- node = math;
+ MathNode *math_node = new MathNode();
+ math_node->type = (NodeMathType)b_math_node.operation();
+ math_node->use_clamp = b_math_node.use_clamp();
+ node = math_node;
}
else if (b_node.is_a(&RNA_ShaderNodeVectorMath)) {
BL::ShaderNodeVectorMath b_vector_math_node(b_node);
- VectorMathNode *vmath = new VectorMathNode();
- vmath->type = (NodeVectorMath)b_vector_math_node.operation();
- node = vmath;
+ VectorMathNode *vector_math_node = new VectorMathNode();
+ vector_math_node->type = (NodeVectorMathType)b_vector_math_node.operation();
+ node = vector_math_node;
+ }
+ else if (b_node.is_a(&RNA_ShaderNodeVectorRotate)) {
+ BL::ShaderNodeVectorRotate b_vector_rotate_node(b_node);
+ VectorRotateNode *vector_rotate_node = new VectorRotateNode();
+ vector_rotate_node->type = (NodeVectorRotateType)b_vector_rotate_node.rotation_type();
+ vector_rotate_node->invert = b_vector_rotate_node.invert();
+ node = vector_rotate_node;
}
else if (b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
BL::ShaderNodeVectorTransform b_vector_transform_node(b_node);
@@ -341,9 +351,7 @@ static ShaderNode *add_node(Scene *scene,
else if (b_node.is_a(&RNA_ShaderNodeMapping)) {
BL::ShaderNodeMapping b_mapping_node(b_node);
MappingNode *mapping = new MappingNode();
-
- get_tex_mapping(&mapping->tex_mapping, b_mapping_node);
-
+ mapping->type = (NodeMappingType)b_mapping_node.vector_type();
node = mapping;
}
else if (b_node.is_a(&RNA_ShaderNodeFresnel)) {
@@ -376,16 +384,16 @@ static ShaderNode *add_node(Scene *scene,
switch (b_aniso_node.distribution()) {
case BL::ShaderNodeBsdfAnisotropic::distribution_BECKMANN:
- aniso->distribution = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
+ aniso->distribution = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
break;
case BL::ShaderNodeBsdfAnisotropic::distribution_GGX:
- aniso->distribution = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
+ aniso->distribution = CLOSURE_BSDF_MICROFACET_GGX_ID;
break;
case BL::ShaderNodeBsdfAnisotropic::distribution_MULTI_GGX:
- aniso->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID;
+ aniso->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
break;
case BL::ShaderNodeBsdfAnisotropic::distribution_ASHIKHMIN_SHIRLEY:
- aniso->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
+ aniso->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
break;
}
@@ -591,6 +599,15 @@ static ShaderNode *add_node(Scene *scene,
else if (b_node.is_a(&RNA_ShaderNodeHairInfo)) {
node = new HairInfoNode();
}
+ else if (b_node.is_a(&RNA_ShaderNodeVolumeInfo)) {
+ node = new VolumeInfoNode();
+ }
+ else if (b_node.is_a(&RNA_ShaderNodeVertexColor)) {
+ BL::ShaderNodeVertexColor b_vertex_color_node(b_node);
+ VertexColorNode *vertex_color_node = new VertexColorNode();
+ vertex_color_node->layer_name = b_vertex_color_node.layer_name();
+ node = vertex_color_node;
+ }
else if (b_node.is_a(&RNA_ShaderNodeBump)) {
BL::ShaderNodeBump b_bump_node(b_node);
BumpNode *bump = new BumpNode();
@@ -603,16 +620,16 @@ static ShaderNode *add_node(Scene *scene,
/* create script node */
BL::ShaderNodeScript b_script_node(b_node);
- OSLShaderManager *manager = (OSLShaderManager *)scene->shader_manager;
+ ShaderManager *manager = scene->shader_manager;
string bytecode_hash = b_script_node.bytecode_hash();
if (!bytecode_hash.empty()) {
- node = manager->osl_node("", bytecode_hash, b_script_node.bytecode());
+ node = OSLShaderManager::osl_node(manager, "", bytecode_hash, b_script_node.bytecode());
}
else {
string absolute_filepath = blender_absolute_path(
b_data, b_ntree, b_script_node.filepath());
- node = manager->osl_node(absolute_filepath, "");
+ node = OSLShaderManager::osl_node(manager, absolute_filepath, "");
}
}
#else
@@ -625,7 +642,27 @@ static ShaderNode *add_node(Scene *scene,
BL::Image b_image(b_image_node.image());
BL::ImageUser b_image_user(b_image_node.image_user());
ImageTextureNode *image = new ImageTextureNode();
+
+ image->interpolation = get_image_interpolation(b_image_node);
+ image->extension = get_image_extension(b_image_node);
+ image->projection = (NodeImageProjection)b_image_node.projection();
+ image->projection_blend = b_image_node.projection_blend();
+ BL::TexMapping b_texture_mapping(b_image_node.texture_mapping());
+ get_tex_mapping(&image->tex_mapping, b_texture_mapping);
+
if (b_image) {
+ PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
+ image->colorspace = get_enum_identifier(colorspace_ptr, "name");
+
+ image->animated = b_image_node.image_user().use_auto_refresh();
+ image->alpha_type = get_image_alpha_type(b_image);
+
+ image->tiles.clear();
+ BL::Image::tiles_iterator b_iter;
+ for (b_image.tiles.begin(b_iter); b_iter != b_image.tiles.end(); ++b_iter) {
+ image->tiles.push_back(b_iter->number());
+ }
+
 /* builtin images will use callback-based reading because
 * they can only be loaded correctly from the Blender side
 */
@@ -642,37 +679,14 @@ static ShaderNode *add_node(Scene *scene,
*/
int scene_frame = b_scene.frame_current();
int image_frame = image_user_frame_number(b_image_user, scene_frame);
- image->filename = b_image.name() + "@" + string_printf("%d", image_frame);
- image->builtin_data = b_image.ptr.data;
+ image->handle = scene->image_manager->add_image(
+ new BlenderImageLoader(b_image, image_frame), image->image_params());
}
else {
- image->filename = image_user_file_path(b_image_user, b_image, b_scene.frame_current());
- image->builtin_data = NULL;
- }
-
- image->animated = b_image_node.image_user().use_auto_refresh();
- image->use_alpha = b_image.use_alpha();
-
- /* TODO: restore */
- /* TODO(sergey): Does not work properly when we change builtin type. */
-#if 0
- if(b_image.is_updated()) {
- scene->image_manager->tag_reload_image(
- image->filename.string(),
- image->builtin_data,
- get_image_interpolation(b_image_node),
- get_image_extension(b_image_node),
- image->use_alpha);
+ image->filename = image_user_file_path(
+ b_image_user, b_image, b_scene.frame_current(), true);
}
-#endif
}
- image->color_space = (NodeImageColorSpace)b_image_node.color_space();
- image->projection = (NodeImageProjection)b_image_node.projection();
- image->interpolation = get_image_interpolation(b_image_node);
- image->extension = get_image_extension(b_image_node);
- image->projection_blend = b_image_node.projection_blend();
- BL::TexMapping b_texture_mapping(b_image_node.texture_mapping());
- get_tex_mapping(&image->tex_mapping, b_texture_mapping);
node = image;
}
else if (b_node.is_a(&RNA_ShaderNodeTexEnvironment)) {
@@ -680,7 +694,19 @@ static ShaderNode *add_node(Scene *scene,
BL::Image b_image(b_env_node.image());
BL::ImageUser b_image_user(b_env_node.image_user());
EnvironmentTextureNode *env = new EnvironmentTextureNode();
+
+ env->interpolation = get_image_interpolation(b_env_node);
+ env->projection = (NodeEnvironmentProjection)b_env_node.projection();
+ BL::TexMapping b_texture_mapping(b_env_node.texture_mapping());
+ get_tex_mapping(&env->tex_mapping, b_texture_mapping);
+
if (b_image) {
+ PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
+ env->colorspace = get_enum_identifier(colorspace_ptr, "name");
+
+ env->animated = b_env_node.image_user().use_auto_refresh();
+ env->alpha_type = get_image_alpha_type(b_image);
+
bool is_builtin = b_image.packed_file() || b_image.source() == BL::Image::source_GENERATED ||
b_image.source() == BL::Image::source_MOVIE ||
(b_engine.is_preview() && b_image.source() != BL::Image::source_SEQUENCE);
@@ -688,35 +714,14 @@ static ShaderNode *add_node(Scene *scene,
if (is_builtin) {
int scene_frame = b_scene.frame_current();
int image_frame = image_user_frame_number(b_image_user, scene_frame);
- env->filename = b_image.name() + "@" + string_printf("%d", image_frame);
- env->builtin_data = b_image.ptr.data;
+ env->handle = scene->image_manager->add_image(new BlenderImageLoader(b_image, image_frame),
+ env->image_params());
}
else {
- env->filename = image_user_file_path(b_image_user, b_image, b_scene.frame_current());
- env->builtin_data = NULL;
- }
-
- env->animated = b_env_node.image_user().use_auto_refresh();
- env->use_alpha = b_image.use_alpha();
-
- /* TODO: restore */
- /* TODO(sergey): Does not work properly when we change builtin type. */
-#if 0
- if(b_image.is_updated()) {
- scene->image_manager->tag_reload_image(
- env->filename.string(),
- env->builtin_data,
- get_image_interpolation(b_env_node),
- EXTENSION_REPEAT,
- env->use_alpha);
+ env->filename = image_user_file_path(
+ b_image_user, b_image, b_scene.frame_current(), false);
}
-#endif
}
- env->color_space = (NodeImageColorSpace)b_env_node.color_space();
- env->interpolation = get_image_interpolation(b_env_node);
- env->projection = (NodeEnvironmentProjection)b_env_node.projection();
- BL::TexMapping b_texture_mapping(b_env_node.texture_mapping());
- get_tex_mapping(&env->tex_mapping, b_texture_mapping);
node = env;
}
else if (b_node.is_a(&RNA_ShaderNodeTexGradient)) {
@@ -730,9 +735,9 @@ static ShaderNode *add_node(Scene *scene,
else if (b_node.is_a(&RNA_ShaderNodeTexVoronoi)) {
BL::ShaderNodeTexVoronoi b_voronoi_node(b_node);
VoronoiTextureNode *voronoi = new VoronoiTextureNode();
- voronoi->coloring = (NodeVoronoiColoring)b_voronoi_node.coloring();
- voronoi->metric = (NodeVoronoiDistanceMetric)b_voronoi_node.distance();
+ voronoi->dimensions = b_voronoi_node.voronoi_dimensions();
voronoi->feature = (NodeVoronoiFeature)b_voronoi_node.feature();
+ voronoi->metric = (NodeVoronoiDistanceMetric)b_voronoi_node.distance();
BL::TexMapping b_texture_mapping(b_voronoi_node.texture_mapping());
get_tex_mapping(&voronoi->tex_mapping, b_texture_mapping);
node = voronoi;
@@ -749,6 +754,8 @@ static ShaderNode *add_node(Scene *scene,
BL::ShaderNodeTexWave b_wave_node(b_node);
WaveTextureNode *wave = new WaveTextureNode();
wave->type = (NodeWaveType)b_wave_node.wave_type();
+ wave->bands_direction = (NodeWaveBandsDirection)b_wave_node.bands_direction();
+ wave->rings_direction = (NodeWaveRingsDirection)b_wave_node.rings_direction();
wave->profile = (NodeWaveProfile)b_wave_node.wave_profile();
BL::TexMapping b_texture_mapping(b_wave_node.texture_mapping());
get_tex_mapping(&wave->tex_mapping, b_texture_mapping);
@@ -775,17 +782,19 @@ static ShaderNode *add_node(Scene *scene,
else if (b_node.is_a(&RNA_ShaderNodeTexNoise)) {
BL::ShaderNodeTexNoise b_noise_node(b_node);
NoiseTextureNode *noise = new NoiseTextureNode();
+ noise->dimensions = b_noise_node.noise_dimensions();
BL::TexMapping b_texture_mapping(b_noise_node.texture_mapping());
get_tex_mapping(&noise->tex_mapping, b_texture_mapping);
node = noise;
}
else if (b_node.is_a(&RNA_ShaderNodeTexMusgrave)) {
BL::ShaderNodeTexMusgrave b_musgrave_node(b_node);
- MusgraveTextureNode *musgrave = new MusgraveTextureNode();
- musgrave->type = (NodeMusgraveType)b_musgrave_node.musgrave_type();
+ MusgraveTextureNode *musgrave_node = new MusgraveTextureNode();
+ musgrave_node->type = (NodeMusgraveType)b_musgrave_node.musgrave_type();
+ musgrave_node->dimensions = b_musgrave_node.musgrave_dimensions();
BL::TexMapping b_texture_mapping(b_musgrave_node.texture_mapping());
- get_tex_mapping(&musgrave->tex_mapping, b_texture_mapping);
- node = musgrave;
+ get_tex_mapping(&musgrave_node->tex_mapping, b_texture_mapping);
+ node = musgrave_node;
}
else if (b_node.is_a(&RNA_ShaderNodeTexCoord)) {
BL::ShaderNodeTexCoord b_tex_coord_node(b_node);
@@ -804,6 +813,15 @@ static ShaderNode *add_node(Scene *scene,
sky->sun_direction = normalize(get_float3(b_sky_node.sun_direction()));
sky->turbidity = b_sky_node.turbidity();
sky->ground_albedo = b_sky_node.ground_albedo();
+ sky->sun_disc = b_sky_node.sun_disc();
+ sky->sun_size = b_sky_node.sun_size();
+ sky->sun_intensity = b_sky_node.sun_intensity();
+ sky->sun_elevation = b_sky_node.sun_elevation();
+ sky->sun_rotation = b_sky_node.sun_rotation();
+ sky->altitude = 1000.0f * b_sky_node.altitude();
+ sky->air_density = b_sky_node.air_density();
+ sky->dust_density = b_sky_node.dust_density();
+ sky->ozone_density = b_sky_node.ozone_density();
BL::TexMapping b_texture_mapping(b_sky_node.texture_mapping());
get_tex_mapping(&sky->tex_mapping, b_texture_mapping);
node = sky;
@@ -824,6 +842,12 @@ static ShaderNode *add_node(Scene *scene,
}
node = ies;
}
+ else if (b_node.is_a(&RNA_ShaderNodeTexWhiteNoise)) {
+ BL::ShaderNodeTexWhiteNoise b_tex_white_noise_node(b_node);
+ WhiteNoiseTextureNode *white_noise_node = new WhiteNoiseTextureNode();
+ white_noise_node->dimensions = b_tex_white_noise_node.noise_dimensions();
+ node = white_noise_node;
+ }
else if (b_node.is_a(&RNA_ShaderNodeNormalMap)) {
BL::ShaderNodeNormalMap b_normal_map_node(b_node);
NormalMapNode *nmap = new NormalMapNode();
@@ -849,22 +873,13 @@ static ShaderNode *add_node(Scene *scene,
else if (b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
PointDensityTextureNode *point_density = new PointDensityTextureNode();
- point_density->filename = b_point_density_node.name();
point_density->space = (NodeTexVoxelSpace)b_point_density_node.space();
point_density->interpolation = get_image_interpolation(b_point_density_node);
- point_density->builtin_data = b_point_density_node.ptr.data;
- point_density->image_manager = scene->image_manager;
-
- /* TODO(sergey): Use more proper update flag. */
- if (true) {
- point_density->add_image();
- b_point_density_node.cache_point_density(b_depsgraph);
- scene->image_manager->tag_reload_image(point_density->filename.string(),
- point_density->builtin_data,
- point_density->interpolation,
- EXTENSION_CLIP,
- true);
- }
+ point_density->handle = scene->image_manager->add_image(
+ new BlenderPointDensityLoader(b_depsgraph, b_point_density_node),
+ point_density->image_params());
+
+ b_point_density_node.cache_point_density(b_depsgraph);
node = point_density;
 /* Transformation from world space to texture space.
@@ -899,6 +914,12 @@ static ShaderNode *add_node(Scene *scene,
disp->attribute = "";
node = disp;
}
+ else if (b_node.is_a(&RNA_ShaderNodeOutputAOV)) {
+ BL::ShaderNodeOutputAOV b_aov_node(b_node);
+ OutputAOVNode *aov = new OutputAOVNode();
+ aov->name = b_aov_node.name();
+ node = aov;
+ }
if (node) {
node->name = b_node.name();
@@ -910,7 +931,7 @@ static ShaderNode *add_node(Scene *scene,
static bool node_use_modified_socket_name(ShaderNode *node)
{
- if (node->special_type == SHADER_SPECIAL_TYPE_SCRIPT)
+ if (node->special_type == SHADER_SPECIAL_TYPE_OSL)
return false;
return true;
@@ -1153,8 +1174,10 @@ static void add_nodes(Scene *scene,
BL::NodeTree::links_iterator b_link;
for (b_ntree.links.begin(b_link); b_link != b_ntree.links.end(); ++b_link) {
- /* Ignore invalid links to avoid unwanted cycles created in graph. */
- if (!b_link->is_valid()) {
+ /* Ignore invalid links to avoid unwanted cycles created in graph.
+ * Also ignore links with unavailable sockets. */
+ if (!(b_link->is_valid() && b_link->from_socket().enabled() &&
+ b_link->to_socket().enabled())) {
continue;
}
/* get blender link data */
@@ -1217,12 +1240,11 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
Shader *shader;
/* test if we need to sync */
- if (shader_map.sync(&shader, b_mat) || shader->need_sync_object || update_all) {
+ if (shader_map.add_or_update(&shader, b_mat) || update_all) {
ShaderGraph *graph = new ShaderGraph();
shader->name = b_mat.name().c_str();
shader->pass_id = b_mat.pass_index();
- shader->need_sync_object = false;
/* create nodes */
if (b_mat.use_nodes() && b_mat.node_tree()) {
@@ -1246,6 +1268,7 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
shader->volume_sampling_method = get_volume_sampling(cmat);
shader->volume_interpolation_method = get_volume_interpolation(cmat);
+ shader->volume_step_rate = get_float(cmat, "volume_step_rate");
shader->displacement_method = get_displacement_method(cmat);
shader->set_graph(graph);
@@ -1284,19 +1307,23 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
/* Sync World */
-void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, bool update_all)
+void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all)
{
Background *background = scene->background;
Background prevbackground = *background;
BL::World b_world = b_scene.world();
- if (world_recalc || update_all || b_world.ptr.data != world_map) {
+ BlenderViewportParameters new_viewport_parameters(b_v3d);
+
+ if (world_recalc || update_all || b_world.ptr.data != world_map ||
+ viewport_parameters.modified(new_viewport_parameters)) {
Shader *shader = scene->default_background;
ShaderGraph *graph = new ShaderGraph();
/* create nodes */
- if (b_world && b_world.use_nodes() && b_world.node_tree()) {
+ if (new_viewport_parameters.use_scene_world && b_world && b_world.use_nodes() &&
+ b_world.node_tree()) {
BL::ShaderNodeTree b_ntree(b_world.node_tree());
add_nodes(scene, b_engine, b_data, b_depsgraph, b_scene, graph, b_ntree);
@@ -1306,8 +1333,9 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, bool update_all)
shader->heterogeneous_volume = !get_boolean(cworld, "homogeneous_volume");
shader->volume_sampling_method = get_volume_sampling(cworld);
shader->volume_interpolation_method = get_volume_interpolation(cworld);
+ shader->volume_step_rate = get_float(cworld, "volume_step_size");
}
- else if (b_world) {
+ else if (new_viewport_parameters.use_scene_world && b_world) {
BackgroundNode *background = new BackgroundNode();
background->color = get_float3(b_world.color());
graph->add(background);
@@ -1315,6 +1343,61 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, bool update_all)
ShaderNode *out = graph->output();
graph->connect(background->output("Background"), out->input("Surface"));
}
+ else if (!new_viewport_parameters.use_scene_world) {
+ float3 world_color;
+ if (b_world) {
+ world_color = get_float3(b_world.color());
+ }
+ else {
+ world_color = make_float3(0.0f, 0.0f, 0.0f);
+ }
+
+ BackgroundNode *background = new BackgroundNode();
+ graph->add(background);
+
+ LightPathNode *light_path = new LightPathNode();
+ graph->add(light_path);
+
+ MixNode *mix_scene_with_background = new MixNode();
+ mix_scene_with_background->color2 = world_color;
+ graph->add(mix_scene_with_background);
+
+ EnvironmentTextureNode *texture_environment = new EnvironmentTextureNode();
+ texture_environment->tex_mapping.type = TextureMapping::VECTOR;
+ texture_environment->tex_mapping.rotation[2] = new_viewport_parameters.studiolight_rotate_z;
+ texture_environment->filename = new_viewport_parameters.studiolight_path;
+ graph->add(texture_environment);
+
+ MixNode *mix_intensity = new MixNode();
+ mix_intensity->type = NODE_MIX_MUL;
+ mix_intensity->fac = 1.0f;
+ mix_intensity->color2 = make_float3(new_viewport_parameters.studiolight_intensity,
+ new_viewport_parameters.studiolight_intensity,
+ new_viewport_parameters.studiolight_intensity);
+ graph->add(mix_intensity);
+
+ TextureCoordinateNode *texture_coordinate = new TextureCoordinateNode();
+ graph->add(texture_coordinate);
+
+ MixNode *mix_background_with_environment = new MixNode();
+ mix_background_with_environment->fac = new_viewport_parameters.studiolight_background_alpha;
+ mix_background_with_environment->color1 = world_color;
+ graph->add(mix_background_with_environment);
+
+ ShaderNode *out = graph->output();
+
+ graph->connect(texture_coordinate->output("Generated"),
+ texture_environment->input("Vector"));
+ graph->connect(texture_environment->output("Color"), mix_intensity->input("Color1"));
+ graph->connect(light_path->output("Is Camera Ray"), mix_scene_with_background->input("Fac"));
+ graph->connect(mix_intensity->output("Color"), mix_scene_with_background->input("Color1"));
+ graph->connect(mix_intensity->output("Color"),
+ mix_background_with_environment->input("Color2"));
+ graph->connect(mix_background_with_environment->output("Color"),
+ mix_scene_with_background->input("Color2"));
+ graph->connect(mix_scene_with_background->output("Color"), background->input("Color"));
+ graph->connect(background->output("Background"), out->input("Surface"));
+ }
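
A hedged sketch of what the node graph built above evaluates to: camera rays see the world color blended toward the studio-light environment by studiolight_background_alpha, while all other rays are lit by the studio-light environment scaled by studiolight_intensity. The Float3/lerp3 helpers below are stand-ins, not Cycles types, and "env" stands for the environment texture already multiplied by the intensity (the mix_intensity node).

struct Float3 {
  float x, y, z;
};

static Float3 lerp3(const Float3 &a, const Float3 &b, float t)
{
  return {a.x + (b.x - a.x) * t, a.y + (b.y - a.y) * t, a.z + (b.z - a.z) * t};
}

/* Illustrative only: approximate result of the viewport world graph above. */
static Float3 viewport_world_color(const Float3 &env,
                                   const Float3 &world_color,
                                   float studiolight_background_alpha,
                                   bool is_camera_ray)
{
  /* mix_background_with_environment: world color vs. environment for camera rays. */
  const Float3 camera_background = lerp3(world_color, env, studiolight_background_alpha);
  /* mix_scene_with_background, driven by the "Is Camera Ray" light path output. */
  return is_camera_ray ? camera_background : env;
}
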
if (b_world) {
/* AO */
@@ -1348,16 +1431,7 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, bool update_all)
}
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-
- /* when doing preview render check for BI's transparency settings,
- * this is so because Blender's preview render routines are not able
- * to tweak all cycles's settings depending on different circumstances
- */
- if (b_engine.is_preview() == false)
- background->transparent = get_boolean(cscene, "film_transparent");
- else
- background->transparent = b_scene.render().alpha_mode() ==
- BL::RenderSettings::alpha_mode_TRANSPARENT;
+ background->transparent = b_scene.render().film_transparent();
if (background->transparent) {
background->transparent_glass = get_boolean(cscene, "film_transparent_glass");
@@ -1368,7 +1442,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, bool update_all)
background->transparent_roughness_threshold = 0.0f;
}
- background->use_shader = view_layer.use_background_shader;
+ background->use_shader = view_layer.use_background_shader |
+ viewport_parameters.custom_viewport_parameters();
background->use_ao = background->use_ao && view_layer.use_background_ao;
if (background->modified(prevbackground))
@@ -1391,7 +1466,7 @@ void BlenderSync::sync_lights(BL::Depsgraph &b_depsgraph, bool update_all)
Shader *shader;
/* test if we need to sync */
- if (shader_map.sync(&shader, b_light) || update_all) {
+ if (shader_map.add_or_update(&shader, b_light) || update_all) {
ShaderGraph *graph = new ShaderGraph();
/* create nodes */
@@ -1403,16 +1478,9 @@ void BlenderSync::sync_lights(BL::Depsgraph &b_depsgraph, bool update_all)
add_nodes(scene, b_engine, b_data, b_depsgraph, b_scene, graph, b_ntree);
}
else {
- float strength = 1.0f;
-
- if (b_light.type() == BL::Light::type_POINT || b_light.type() == BL::Light::type_SPOT ||
- b_light.type() == BL::Light::type_AREA) {
- strength = 100.0f;
- }
-
EmissionNode *emission = new EmissionNode();
- emission->color = get_float3(b_light.color());
- emission->strength = strength;
+ emission->color = make_float3(1.0f, 1.0f, 1.0f);
+ emission->strength = 1.0f;
graph->add(emission);
ShaderNode *out = graph->output();
@@ -1425,7 +1493,7 @@ void BlenderSync::sync_lights(BL::Depsgraph &b_depsgraph, bool update_all)
}
}
-void BlenderSync::sync_shaders(BL::Depsgraph &b_depsgraph)
+void BlenderSync::sync_shaders(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d)
{
/* for auto refresh images */
bool auto_refresh_update = false;
@@ -1438,12 +1506,9 @@ void BlenderSync::sync_shaders(BL::Depsgraph &b_depsgraph)
shader_map.pre_sync();
- sync_world(b_depsgraph, auto_refresh_update);
+ sync_world(b_depsgraph, b_v3d, auto_refresh_update);
sync_lights(b_depsgraph, auto_refresh_update);
sync_materials(b_depsgraph, auto_refresh_update);
-
- /* false = don't delete unused shaders, not supported */
- shader_map.post_sync(false);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 28d0c554f22..ee90b4dfbfe 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -16,6 +16,7 @@
#include "render/background.h"
#include "render/camera.h"
+#include "render/curves.h"
#include "render/film.h"
#include "render/graph.h"
#include "render/integrator.h"
@@ -25,19 +26,19 @@
#include "render/object.h"
#include "render/scene.h"
#include "render/shader.h"
-#include "render/curves.h"
#include "device/device.h"
#include "blender/blender_device.h"
-#include "blender/blender_sync.h"
#include "blender/blender_session.h"
+#include "blender/blender_sync.h"
#include "blender/blender_util.h"
#include "util/util_debug.h"
#include "util/util_foreach.h"
-#include "util/util_opengl.h"
#include "util/util_hash.h"
+#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
CCL_NAMESPACE_BEGIN
@@ -56,7 +57,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
b_scene(b_scene),
shader_map(&scene->shaders),
object_map(&scene->objects),
- mesh_map(&scene->meshes),
+ geometry_map(&scene->geometry),
light_map(&scene->lights),
particle_system_map(&scene->particle_systems),
world_map(NULL),
@@ -78,15 +79,21 @@ BlenderSync::~BlenderSync()
{
}
+void BlenderSync::reset(BL::BlendData &b_data, BL::Scene &b_scene)
+{
+ /* Update data and scene pointers in case they change in session reset,
+ * for example after undo. */
+ this->b_data = b_data;
+ this->b_scene = b_scene;
+}
+
/* Sync */
-void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph)
+void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d)
{
 /* Sync recalc flags from Blender to Cycles. The actual update is done separately,
 * so we can do it later on if doing it immediately is not suitable. */
- bool has_updated_objects = b_depsgraph.id_type_updated(BL::DriverTarget::id_type_OBJECT);
-
if (experimental) {
/* Mark all meshes as needing to be exported again if dicing changed. */
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -108,10 +115,15 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph)
}
if (dicing_prop_changed) {
- for (const pair<void *, Mesh *> &iter : mesh_map.key_to_scene_data()) {
- Mesh *mesh = iter.second;
- if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
- mesh_map.set_recalc(iter.first);
+ for (const pair<const GeometryKey, Geometry *> &iter : geometry_map.key_to_scene_data()) {
+ Geometry *geom = iter.second;
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
+ PointerRNA id_ptr;
+ RNA_id_pointer_create((::ID *)iter.first.id, &id_ptr);
+ geometry_map.set_recalc(BL::ID(id_ptr));
+ }
}
}
}
@@ -135,36 +147,49 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph)
/* Object */
else if (b_id.is_a(&RNA_Object)) {
BL::Object b_ob(b_id);
- const bool updated_geometry = b_update->is_updated_geometry();
-
- if (b_update->is_updated_transform()) {
- object_map.set_recalc(b_ob);
- light_map.set_recalc(b_ob);
- }
-
- if (object_is_mesh(b_ob)) {
- if (updated_geometry ||
- (object_subdivision_type(b_ob, preview, experimental) != Mesh::SUBDIVISION_NONE)) {
- BL::ID key = BKE_object_is_modified(b_ob) ? b_ob : b_ob.data();
- mesh_map.set_recalc(key);
+ const bool is_geometry = object_is_geometry(b_ob);
+ const bool is_light = !is_geometry && object_is_light(b_ob);
+
+ if (is_geometry || is_light) {
+ const bool updated_geometry = b_update->is_updated_geometry();
+
+ /* Geometry (mesh, hair, volume). */
+ if (is_geometry) {
+ if (b_update->is_updated_transform() || b_update->is_updated_shading()) {
+ object_map.set_recalc(b_ob);
+ }
+
+ if (updated_geometry ||
+ (object_subdivision_type(b_ob, preview, experimental) != Mesh::SUBDIVISION_NONE)) {
+ BL::ID key = BKE_object_is_modified(b_ob) ? b_ob : b_ob.data();
+ geometry_map.set_recalc(key);
+ }
+
+ if (updated_geometry) {
+ BL::Object::particle_systems_iterator b_psys;
+ for (b_ob.particle_systems.begin(b_psys); b_psys != b_ob.particle_systems.end();
+ ++b_psys) {
+ particle_system_map.set_recalc(b_ob);
+ }
+ }
}
- }
- else if (object_is_light(b_ob)) {
- if (updated_geometry) {
- light_map.set_recalc(b_ob);
+ /* Light */
+ else if (is_light) {
+ if (b_update->is_updated_transform() || b_update->is_updated_shading()) {
+ object_map.set_recalc(b_ob);
+ light_map.set_recalc(b_ob);
+ }
+
+ if (updated_geometry) {
+ light_map.set_recalc(b_ob);
+ }
}
}
-
- if (updated_geometry) {
- BL::Object::particle_systems_iterator b_psys;
- for (b_ob.particle_systems.begin(b_psys); b_psys != b_ob.particle_systems.end(); ++b_psys)
- particle_system_map.set_recalc(b_ob);
- }
}
/* Mesh */
else if (b_id.is_a(&RNA_Mesh)) {
BL::Mesh b_mesh(b_id);
- mesh_map.set_recalc(b_mesh);
+ geometry_map.set_recalc(b_mesh);
}
/* World */
else if (b_id.is_a(&RNA_World)) {
@@ -173,19 +198,16 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph)
world_recalc = true;
}
}
- }
-
- /* Updates shader with object dependency if objects changed. */
- if (has_updated_objects) {
- if (scene->default_background->has_object_dependency) {
- world_recalc = true;
+ /* Volume */
+ else if (b_id.is_a(&RNA_Volume)) {
+ BL::Volume b_volume(b_id);
+ geometry_map.set_recalc(b_volume);
}
+ }
- foreach (Shader *shader, scene->shaders) {
- if (shader->has_object_dependency) {
- shader->need_sync_object = true;
- }
- }
+ BlenderViewportParameters new_viewport_parameters(b_v3d);
+ if (viewport_parameters.modified(new_viewport_parameters)) {
+ world_recalc = true;
}
}
@@ -201,20 +223,23 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
sync_view_layer(b_v3d, b_view_layer);
sync_integrator();
- sync_film();
- sync_shaders(b_depsgraph);
+ sync_film(b_v3d);
+ sync_shaders(b_depsgraph, b_v3d);
sync_images();
- sync_curve_settings();
- mesh_synced.clear(); /* use for objects and motion sync */
+ geometry_synced.clear(); /* use for objects and motion sync */
if (scene->need_motion() == Scene::MOTION_PASS || scene->need_motion() == Scene::MOTION_NONE ||
scene->camera->motion_position == Camera::MOTION_POSITION_CENTER) {
- sync_objects(b_depsgraph);
+ sync_objects(b_depsgraph, b_v3d);
}
- sync_motion(b_render, b_depsgraph, b_override, width, height, python_thread_state);
+ sync_motion(b_render, b_depsgraph, b_v3d, b_override, width, height, python_thread_state);
- mesh_synced.clear();
+ geometry_synced.clear();
+
+ /* Shader sync done at the end, since object sync uses it.
+ * false = don't delete unused shaders, not supported. */
+ shader_map.post_sync(false);
free_data_after_sync(b_depsgraph);
}
@@ -231,6 +256,7 @@ void BlenderSync::sync_integrator()
Integrator *integrator = scene->integrator;
Integrator previntegrator = *integrator;
+ integrator->min_bounce = get_int(cscene, "min_light_bounces");
integrator->max_bounce = get_int(cscene, "max_bounces");
integrator->max_diffuse_bounce = get_int(cscene, "diffuse_bounces");
@@ -238,10 +264,12 @@ void BlenderSync::sync_integrator()
integrator->max_transmission_bounce = get_int(cscene, "transmission_bounces");
integrator->max_volume_bounce = get_int(cscene, "volume_bounces");
+ integrator->transparent_min_bounce = get_int(cscene, "min_transparent_bounces");
integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces");
integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
- integrator->volume_step_size = get_float(cscene, "volume_step_size");
+ integrator->volume_step_rate = (preview) ? get_float(cscene, "volume_preview_step_rate") :
+ get_float(cscene, "volume_step_rate");
integrator->caustics_reflective = get_boolean(cscene, "caustics_reflective");
integrator->caustics_refractive = get_boolean(cscene, "caustics_refractive");
@@ -249,13 +277,13 @@ void BlenderSync::sync_integrator()
integrator->seed = get_int(cscene, "seed");
if (get_boolean(cscene, "use_animated_seed")) {
- integrator->seed = hash_int_2d(b_scene.frame_current(), get_int(cscene, "seed"));
+ integrator->seed = hash_uint2(b_scene.frame_current(), get_int(cscene, "seed"));
if (b_scene.frame_subframe() != 0.0f) {
/* TODO(sergey): Ideally should be some sort of hash_merge,
* but this is good enough for now.
*/
- integrator->seed += hash_int_2d((int)(b_scene.frame_subframe() * (float)INT_MAX),
- get_int(cscene, "seed"));
+ integrator->seed += hash_uint2((int)(b_scene.frame_subframe() * (float)INT_MAX),
+ get_int(cscene, "seed"));
}
}
@@ -287,6 +315,16 @@ void BlenderSync::sync_integrator()
integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");
+ if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
+ integrator->sampling_pattern = SAMPLING_PATTERN_PMJ;
+ integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
+ integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold");
+ }
+ else {
+ integrator->adaptive_min_samples = INT_MAX;
+ integrator->adaptive_threshold = 0.0f;
+ }
+
int diffuse_samples = get_int(cscene, "diffuse_samples");
int glossy_samples = get_int(cscene, "glossy_samples");
int transmission_samples = get_int(cscene, "transmission_samples");
@@ -303,6 +341,8 @@ void BlenderSync::sync_integrator()
integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples;
integrator->subsurface_samples = subsurface_samples * subsurface_samples;
integrator->volume_samples = volume_samples * volume_samples;
+ integrator->adaptive_min_samples = min(
+ integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX);
}
else {
integrator->diffuse_samples = diffuse_samples;
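
A small worked example of the sample-count handling above, with hypothetical values: when the "square samples" branch is taken, the per-type counts and the adaptive minimum are squared, so a UI value of 3 diffuse samples becomes 9 and an adaptive minimum of 64 becomes 4096 (clamped against INT_MAX, as in the hunk above). The square_samples flag below is a local stand-in; the real condition is outside this hunk.

#include <algorithm>
#include <climits>
#include <cstdio>

int main()
{
  /* Hypothetical UI values. */
  const int diffuse_samples = 3;
  const int adaptive_min_samples = 64;
  const bool square_samples = true; /* stand-in for the branch condition above */

  const int effective_diffuse = square_samples ? diffuse_samples * diffuse_samples :
                                                 diffuse_samples; /* 9 */
  const int effective_adaptive_min =
      square_samples ? std::min(adaptive_min_samples * adaptive_min_samples, INT_MAX) :
                       adaptive_min_samples; /* 4096 */

  printf("diffuse=%d adaptive_min=%d\n", effective_diffuse, effective_adaptive_min);
  return 0;
}
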
@@ -337,13 +377,17 @@ void BlenderSync::sync_integrator()
/* Film */
-void BlenderSync::sync_film()
+void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
Film *film = scene->film;
Film prevfilm = *film;
+ if (b_v3d) {
+ film->display_pass = update_viewport_display_passes(b_v3d, film->passes);
+ }
+
film->exposure = get_float(cscene, "film_exposure");
film->filter_type = (FilterType)get_enum(
cscene, "pixel_filter_type", FILTER_NUM_TYPES, FILTER_BLACKMAN_HARRIS);
@@ -369,8 +413,10 @@ void BlenderSync::sync_film()
}
}
- if (film->modified(prevfilm))
+ if (film->modified(prevfilm)) {
film->tag_update(scene);
+ film->tag_passes_update(scene, prevfilm.passes, false);
+ }
}
/* Render Layer */
@@ -383,6 +429,7 @@ void BlenderSync::sync_view_layer(BL::SpaceView3D & /*b_v3d*/, BL::ViewLayer &b_
view_layer.use_background_ao = b_view_layer.use_ao();
view_layer.use_surfaces = b_view_layer.use_solid();
view_layer.use_hair = b_view_layer.use_strand();
+ view_layer.use_volumes = b_view_layer.use_volumes();
/* Material override. */
view_layer.material_override = b_view_layer.material_override();
@@ -451,25 +498,25 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
- MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT);
MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT);
MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
- MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT);
MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT);
MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
- MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR);
MAP_PASS("Emit", PASS_EMISSION);
MAP_PASS("Env", PASS_BACKGROUND);
MAP_PASS("AO", PASS_AO);
MAP_PASS("Shadow", PASS_SHADOW);
+ MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE);
+ MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL);
+
#ifdef __KERNEL_DEBUG__
MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES);
MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES);
@@ -477,6 +524,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
#endif
MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+ MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
+ MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
if (string_startswith(name, cryptomatte_prefix)) {
return PASS_CRYPTOMATTE;
}
@@ -512,10 +561,12 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
return -1;
}
-vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
+vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
+ BL::ViewLayer &b_view_layer,
+ bool adaptive_sampling,
+ const DenoiseParams &denoising)
{
vector<Pass> passes;
- Pass::add(PASS_COMBINED, passes);
/* loop over passes */
BL::RenderLayer::passes_iterator b_pass_iter;
@@ -527,80 +578,85 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
if (pass_type == PASS_MOTION && scene->integrator->motion_blur)
continue;
if (pass_type != PASS_NONE)
- Pass::add(pass_type, passes);
+ Pass::add(pass_type, passes, b_pass.name().c_str());
}
- PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles");
- bool full_denoising = get_boolean(crp, "use_denoising");
- bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
scene->film->denoising_flags = 0;
- if (full_denoising || write_denoising_passes) {
+ if (denoising.use || denoising.store_passes) {
+ if (denoising.type == DENOISER_NLM) {
#define MAP_OPTION(name, flag) \
- if (!get_boolean(crp, name)) \
+ if (!get_boolean(crl, name)) \
scene->film->denoising_flags |= flag;
- MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
- MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
- MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
- MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
- MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
- MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
- MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR);
- MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND);
+ MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
+ MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
+ MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
+ MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
+ MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
+ MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
#undef MAP_OPTION
+ }
b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
}
- if (write_denoising_passes) {
+ if (denoising.store_passes) {
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
+ if (denoising.type == DENOISER_NLM) {
+ b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
+ b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
+ b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
+ }
if (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str());
}
}
+
#ifdef __KERNEL_DEBUG__
- if (get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
+ if (get_boolean(crl, "pass_debug_bvh_traversed_nodes")) {
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
+ Pass::add(PASS_BVH_TRAVERSED_NODES, passes, "Debug BVH Traversed Nodes");
}
- if (get_boolean(crp, "pass_debug_bvh_traversed_instances")) {
+ if (get_boolean(crl, "pass_debug_bvh_traversed_instances")) {
b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes);
+ Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes, "Debug BVH Traversed Instances");
}
- if (get_boolean(crp, "pass_debug_bvh_intersections")) {
+ if (get_boolean(crl, "pass_debug_bvh_intersections")) {
b_engine.add_pass("Debug BVH Intersections", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_BVH_INTERSECTIONS, passes);
+ Pass::add(PASS_BVH_INTERSECTIONS, passes, "Debug BVH Intersections");
}
- if (get_boolean(crp, "pass_debug_ray_bounces")) {
+ if (get_boolean(crl, "pass_debug_ray_bounces")) {
b_engine.add_pass("Debug Ray Bounces", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_RAY_BOUNCES, passes);
+ Pass::add(PASS_RAY_BOUNCES, passes, "Debug Ray Bounces");
}
#endif
- if (get_boolean(crp, "pass_debug_render_time")) {
+ if (get_boolean(crl, "pass_debug_render_time")) {
b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_RENDER_TIME, passes);
+ Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
+ }
+ if (get_boolean(crl, "pass_debug_sample_count")) {
+ b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
+ Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
}
- if (get_boolean(crp, "use_pass_volume_direct")) {
+ if (get_boolean(crl, "use_pass_volume_direct")) {
b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_DIRECT, passes);
+ Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
}
- if (get_boolean(crp, "use_pass_volume_indirect")) {
+ if (get_boolean(crl, "use_pass_volume_indirect")) {
b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_INDIRECT, passes);
+ Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd");
}
/* Cryptomatte stores two ID/weight pairs per RGBA layer.
- * User facing paramter is the number of pairs. */
- int crypto_depth = min(16, get_int(crp, "pass_crypto_depth")) / 2;
+ * User facing parameter is the number of pairs. */
+ int crypto_depth = divide_up(min(16, get_int(crl, "pass_crypto_depth")), 2);
scene->film->cryptomatte_depth = crypto_depth;
scene->film->cryptomatte_passes = CRYPT_NONE;
- if (get_boolean(crp, "use_pass_crypto_object")) {
- for (int i = 0; i < crypto_depth; ++i) {
+ if (get_boolean(crl, "use_pass_crypto_object")) {
+ for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Object%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
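For reference, a standalone sketch (not Cycles code) of the cryptomatte pass-count math used above: the user-facing depth is a number of ID/weight pairs, each RGBA pass holds two pairs, and the count is capped at 16 pairs. The "Crypto" pass-name prefix below is a placeholder for the actual cryptomatte_prefix value.

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Round-up integer division, mirroring what divide_up() is used for above.
static int divide_up(int x, int y) { return (x + y - 1) / y; }

int main() {
  const int user_depth = 7;  // user-facing number of ID/weight pairs
  // Two ID/weight pairs fit in one RGBA pass, capped at 16 pairs total.
  const int crypto_depth = divide_up(std::min(16, user_depth), 2);

  std::vector<std::string> passes;
  for (int i = 0; i < crypto_depth; i++) {
    char name[64];
    // "Crypto" is an illustrative prefix only, not the real cryptomatte_prefix.
    std::snprintf(name, sizeof(name), "CryptoObject%02d", i);
    passes.push_back(name);
  }
  for (const std::string &p : passes)
    std::printf("%s\n", p.c_str());  // CryptoObject00 .. CryptoObject03
  return 0;
}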
@@ -608,8 +664,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
CRYPT_OBJECT);
}
- if (get_boolean(crp, "use_pass_crypto_material")) {
- for (int i = 0; i < crypto_depth; ++i) {
+ if (get_boolean(crl, "use_pass_crypto_material")) {
+ for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Material%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
@@ -617,8 +673,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
CRYPT_MATERIAL);
}
- if (get_boolean(crp, "use_pass_crypto_asset")) {
- for (int i = 0; i < crypto_depth; ++i) {
+ if (get_boolean(crl, "use_pass_crypto_asset")) {
+ for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
@@ -626,11 +682,33 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
CRYPT_ASSET);
}
- if (get_boolean(crp, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) {
+ if (get_boolean(crl, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) {
scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes |
CRYPT_ACCURATE);
}
+ if (adaptive_sampling) {
+ Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
+ if (!get_boolean(crl, "pass_debug_sample_count")) {
+ Pass::add(PASS_SAMPLE_COUNT, passes);
+ }
+ }
+
+ RNA_BEGIN (&crl, b_aov, "aovs") {
+ bool is_color = (get_enum(b_aov, "type") == 1);
+ string name = get_string(b_aov, "name");
+
+ if (is_color) {
+ b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str());
+ Pass::add(PASS_AOV_COLOR, passes, name.c_str());
+ }
+ else {
+ b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str());
+ Pass::add(PASS_AOV_VALUE, passes, name.c_str());
+ }
+ }
+ RNA_END;
+
return passes;
}
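A minimal standalone illustration of the AOV branching in the hunk above, using an assumed AOV list in place of the real RNA iteration: color AOVs become 4-channel RGBA passes, value AOVs become single-channel passes.

#include <cstdio>
#include <string>
#include <vector>

struct AOV {
  std::string name;
  bool is_color;  // mirrors get_enum(b_aov, "type") == 1 above
};

int main() {
  // Hypothetical AOV list; in Cycles this comes from the view layer RNA.
  const std::vector<AOV> aovs = {{"WetMask", false}, {"BaseColor", true}};

  for (const AOV &aov : aovs) {
    const int channels = aov.is_color ? 4 : 1;
    const char *chan_id = aov.is_color ? "RGBA" : "X";
    const char *pass_type = aov.is_color ? "PASS_AOV_COLOR" : "PASS_AOV_VALUE";
    std::printf("add_pass(\"%s\", %d, \"%s\") -> %s\n",
                aov.name.c_str(), channels, chan_id, pass_type);
  }
  return 0;
}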
@@ -677,6 +755,11 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
params.num_bvh_time_steps = RNA_int_get(&cscene, "debug_bvh_time_steps");
+ PointerRNA csscene = RNA_pointer_get(&b_scene.ptr, "cycles_curves");
+ params.hair_subdivisions = get_int(csscene, "subdivisions");
+ params.hair_shape = (CurveShapeType)get_enum(
+ csscene, "shape", CURVE_NUM_SHAPE_TYPES, CURVE_THICK);
+
if (background && params.shadingsystem != SHADINGSYSTEM_OSL)
params.persistent_data = r.use_persistent_data();
else
@@ -696,20 +779,10 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
params.texture_limit = 0;
}
- /* TODO(sergey): Once OSL supports per-microarchitecture optimization get
- * rid of this.
- */
- if (params.shadingsystem == SHADINGSYSTEM_OSL) {
- params.bvh_layout = BVH_LAYOUT_BVH4;
- }
- else {
- params.bvh_layout = DebugFlags().cpu.bvh_layout;
- }
+ params.bvh_layout = DebugFlags().cpu.bvh_layout;
+
+ params.background = background;
-#ifdef WITH_EMBREE
- params.bvh_layout = RNA_boolean_get(&cscene, "use_bvh_embree") ? BVH_LAYOUT_EMBREE :
- params.bvh_layout;
-#endif
return params;
}
@@ -724,7 +797,8 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background)
SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
BL::Preferences &b_preferences,
BL::Scene &b_scene,
- bool background)
+ bool background,
+ BL::ViewLayer b_view_layer)
{
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -753,7 +827,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
preview_samples = preview_samples * preview_samples;
}
- if (get_enum(cscene, "progressive") == 0) {
+ if (get_enum(cscene, "progressive") == 0 && (params.device.type != DEVICE_OPTIX)) {
if (background) {
params.samples = aa_samples;
}
@@ -802,7 +876,21 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
params.tile_order = TILE_BOTTOM_TO_TOP;
}
- /* other parameters */
+ /* Denoising */
+ params.denoising = get_denoise_params(b_scene, b_view_layer, background);
+
+ if (params.denoising.use) {
+ /* Add additional denoising devices if we are rendering and denoising
+ * with different devices. */
+ params.device.add_denoising_devices(params.denoising.type);
+
+ /* Check if denoiser is supported by device. */
+ if (!(params.device.denoisers & params.denoising.type)) {
+ params.denoising.use = false;
+ }
+ }
+
+ /* Viewport Performance */
params.start_resolution = get_int(cscene, "preview_start_resolution");
params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
@@ -813,20 +901,10 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* progressive refine */
BL::RenderSettings b_r = b_scene.render();
- params.progressive_refine = (b_engine.is_preview() ||
- get_boolean(cscene, "use_progressive_refine")) &&
- !b_r.use_save_buffers();
-
- if (params.progressive_refine) {
- BL::Scene::view_layers_iterator b_view_layer;
- for (b_scene.view_layers.begin(b_view_layer); b_view_layer != b_scene.view_layers.end();
- ++b_view_layer) {
- PointerRNA crl = RNA_pointer_get(&b_view_layer->ptr, "cycles");
- if (get_boolean(crl, "use_denoising")) {
- params.progressive_refine = false;
- }
- }
- }
+ params.progressive_refine = b_engine.is_preview() ||
+ get_boolean(cscene, "use_progressive_refine");
+ if (b_r.use_save_buffers())
+ params.progressive_refine = false;
if (background) {
if (params.progressive_refine)
@@ -861,7 +939,66 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
BlenderSession::print_render_stats;
+ params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
+
return params;
}
+DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
+ BL::ViewLayer &b_view_layer,
+ bool background)
+{
+ DenoiseParams denoising;
+ PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+
+ if (background) {
+ /* Final Render Denoising */
+ denoising.use = get_boolean(cscene, "use_denoising");
+ denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE);
+
+ if (b_view_layer) {
+ PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles");
+ if (!get_boolean(clayer, "use_denoising")) {
+ denoising.use = false;
+ }
+
+ denoising.radius = get_int(clayer, "denoising_radius");
+ denoising.strength = get_float(clayer, "denoising_strength");
+ denoising.feature_strength = get_float(clayer, "denoising_feature_strength");
+ denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca");
+
+ denoising.input_passes = (DenoiserInput)get_enum(
+ clayer,
+ (denoising.type == DENOISER_OPTIX) ? "denoising_optix_input_passes" :
+ "denoising_openimagedenoise_input_passes",
+ DENOISER_INPUT_NUM,
+ DENOISER_INPUT_RGB_ALBEDO_NORMAL);
+
+ denoising.store_passes = get_boolean(clayer, "denoising_store_passes");
+ }
+ }
+ else {
+ /* Viewport Denoising */
+ denoising.use = get_boolean(cscene, "use_preview_denoising");
+ denoising.type = (DenoiserType)get_enum(
+ cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.start_sample = get_int(cscene, "preview_denoising_start_sample");
+
+ /* Auto select fastest denoiser. */
+ if (denoising.type == DENOISER_NONE) {
+ if (!Device::available_devices(DEVICE_MASK_OPTIX).empty()) {
+ denoising.type = DENOISER_OPTIX;
+ }
+ else if (openimagedenoise_supported()) {
+ denoising.type = DENOISER_OPENIMAGEDENOISE;
+ }
+ else {
+ denoising.use = false;
+ }
+ }
+ }
+
+ return denoising;
+}
+
CCL_NAMESPACE_END
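The viewport branch of get_denoise_params() above falls back from OptiX to OpenImageDenoise and finally disables denoising. A standalone sketch of that selection order, with plain booleans standing in for the device and library queries:

#include <cstdio>

enum DenoiserType { DENOISER_NONE, DENOISER_OPTIX, DENOISER_OPENIMAGEDENOISE };

// Standalone sketch of the viewport fallback above; the two booleans stand in
// for Device::available_devices(DEVICE_MASK_OPTIX) and openimagedenoise_supported().
static DenoiserType auto_select(bool have_optix_device, bool have_oidn, bool *use)
{
  if (have_optix_device)
    return DENOISER_OPTIX;
  if (have_oidn)
    return DENOISER_OPENIMAGEDENOISE;
  *use = false;  // no denoiser available, turn viewport denoising off
  return DENOISER_NONE;
}

int main()
{
  bool use = true;
  DenoiserType type = auto_select(/*have_optix_device=*/false, /*have_oidn=*/true, &use);
  std::printf("type=%d use=%d\n", type, use);  // type=2 use=1
  return 0;
}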
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 00afceebde3..a551ec31e04 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -18,11 +18,12 @@
#define __BLENDER_SYNC_H__
#include "MEM_guardedalloc.h"
-#include "RNA_types.h"
#include "RNA_access.h"
#include "RNA_blender_cpp.h"
+#include "RNA_types.h"
-#include "blender/blender_util.h"
+#include "blender/blender_id_map.h"
+#include "blender/blender_viewport.h"
#include "render/scene.h"
#include "render/session.h"
@@ -36,8 +37,10 @@ CCL_NAMESPACE_BEGIN
class Background;
class BlenderObjectCulling;
+class BlenderViewportParameters;
class Camera;
class Film;
+class Hair;
class Light;
class Mesh;
class Object;
@@ -58,8 +61,10 @@ class BlenderSync {
Progress &progress);
~BlenderSync();
+ void reset(BL::BlendData &b_data, BL::Scene &b_scene);
+
/* sync */
- void sync_recalc(BL::Depsgraph &b_depsgraph);
+ void sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d);
void sync_data(BL::RenderSettings &b_render,
BL::Depsgraph &b_depsgraph,
BL::SpaceView3D &b_v3d,
@@ -68,7 +73,10 @@ class BlenderSync {
int height,
void **python_thread_state);
void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer);
- vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+ vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer,
+ BL::ViewLayer &b_view_layer,
+ bool adaptive_sampling,
+ const DenoiseParams &denoising);
void sync_integrator();
void sync_camera(BL::RenderSettings &b_render,
BL::Object &b_override,
@@ -87,55 +95,96 @@ class BlenderSync {
/* get parameters */
static SceneParams get_scene_params(BL::Scene &b_scene, bool background);
- static SessionParams get_session_params(BL::RenderEngine &b_engine,
- BL::Preferences &b_userpref,
- BL::Scene &b_scene,
- bool background);
+ static SessionParams get_session_params(
+ BL::RenderEngine &b_engine,
+ BL::Preferences &b_userpref,
+ BL::Scene &b_scene,
+ bool background,
+ BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL));
static bool get_session_pause(BL::Scene &b_scene, bool background);
static BufferParams get_buffer_params(BL::RenderSettings &b_render,
BL::SpaceView3D &b_v3d,
BL::RegionView3D &b_rv3d,
Camera *cam,
int width,
- int height);
+ int height,
+ const bool use_denoiser);
static PassType get_pass_type(BL::RenderPass &b_pass);
static int get_denoising_pass(BL::RenderPass &b_pass);
private:
+ static DenoiseParams get_denoise_params(BL::Scene &b_scene,
+ BL::ViewLayer &b_view_layer,
+ bool background);
+
/* sync */
void sync_lights(BL::Depsgraph &b_depsgraph, bool update_all);
void sync_materials(BL::Depsgraph &b_depsgraph, bool update_all);
- void sync_objects(BL::Depsgraph &b_depsgraph, float motion_time = 0.0f);
+ void sync_objects(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, float motion_time = 0.0f);
void sync_motion(BL::RenderSettings &b_render,
BL::Depsgraph &b_depsgraph,
+ BL::SpaceView3D &b_v3d,
BL::Object &b_override,
int width,
int height,
void **python_thread_state);
- void sync_film();
+ void sync_film(BL::SpaceView3D &b_v3d);
void sync_view();
- void sync_world(BL::Depsgraph &b_depsgraph, bool update_all);
- void sync_shaders(BL::Depsgraph &b_depsgraph);
- void sync_curve_settings();
+ /* Shader */
+ void sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all);
+ void sync_shaders(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d);
void sync_nodes(Shader *shader, BL::ShaderNodeTree &b_ntree);
- Mesh *sync_mesh(BL::Depsgraph &b_depsgrpah,
- BL::Object &b_ob,
- BL::Object &b_ob_instance,
- bool object_updated,
- bool show_self,
- bool show_particles);
- void sync_curves(
- Mesh *mesh, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0);
+
+ /* Object */
Object *sync_object(BL::Depsgraph &b_depsgraph,
BL::ViewLayer &b_view_layer,
BL::DepsgraphObjectInstance &b_instance,
float motion_time,
- bool show_self,
- bool show_particles,
+ bool use_particle_hair,
+ bool show_lights,
BlenderObjectCulling &culling,
bool *use_portal);
+
+ /* Volume */
+ void sync_volume(BL::Object &b_ob, Mesh *mesh, const vector<Shader *> &used_shaders);
+
+ /* Mesh */
+ void sync_mesh(BL::Depsgraph b_depsgraph,
+ BL::Object b_ob,
+ Mesh *mesh,
+ const vector<Shader *> &used_shaders);
+ void sync_mesh_motion(BL::Depsgraph b_depsgraph, BL::Object b_ob, Mesh *mesh, int motion_step);
+
+ /* Hair */
+ void sync_hair(BL::Depsgraph b_depsgraph,
+ BL::Object b_ob,
+ Hair *hair,
+ const vector<Shader *> &used_shaders);
+ void sync_hair_motion(BL::Depsgraph b_depsgraph, BL::Object b_ob, Hair *hair, int motion_step);
+ void sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step = 0);
+ void sync_particle_hair(
+ Hair *hair, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0);
+ bool object_has_particle_hair(BL::Object b_ob);
+
+ /* Camera */
+ void sync_camera_motion(
+ BL::RenderSettings &b_render, BL::Object &b_ob, int width, int height, float motion_time);
+
+ /* Geometry */
+ Geometry *sync_geometry(BL::Depsgraph &b_depsgrpah,
+ BL::Object &b_ob,
+ BL::Object &b_ob_instance,
+ bool object_updated,
+ bool use_particle_hair);
+ void sync_geometry_motion(BL::Depsgraph &b_depsgraph,
+ BL::Object &b_ob,
+ Object *object,
+ float motion_time,
+ bool use_particle_hair);
+
+ /* Light */
void sync_light(BL::Object &b_parent,
int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
BL::Object &b_ob,
@@ -143,15 +192,9 @@ class BlenderSync {
int random_id,
Transform &tfm,
bool *use_portal);
- void sync_background_light(bool use_portal);
- void sync_mesh_motion(BL::Depsgraph &b_depsgraph,
- BL::Object &b_ob,
- Object *object,
- float motion_time);
- void sync_camera_motion(
- BL::RenderSettings &b_render, BL::Object &b_ob, int width, int height, float motion_time);
+ void sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal);
- /* particles */
+ /* Particles */
bool sync_dupli_particle(BL::Object &b_ob,
BL::DepsgraphObjectInstance &b_instance,
Object *object);
@@ -165,7 +208,7 @@ class BlenderSync {
/* util */
void find_shader(BL::ID &id, vector<Shader *> &used_shaders, Shader *default_shader);
bool BKE_object_is_modified(BL::Object &b_ob);
- bool object_is_mesh(BL::Object &b_ob);
+ bool object_is_geometry(BL::Object &b_ob);
bool object_is_light(BL::Object &b_ob);
/* variables */
@@ -175,14 +218,15 @@ class BlenderSync {
id_map<void *, Shader> shader_map;
id_map<ObjectKey, Object> object_map;
- id_map<void *, Mesh> mesh_map;
+ id_map<GeometryKey, Geometry> geometry_map;
id_map<ObjectKey, Light> light_map;
id_map<ParticleSystemKey, ParticleSystem> particle_system_map;
- set<Mesh *> mesh_synced;
- set<Mesh *> mesh_motion_synced;
+ set<Geometry *> geometry_synced;
+ set<Geometry *> geometry_motion_synced;
set<float> motion_times;
void *world_map;
bool world_recalc;
+ BlenderViewportParameters viewport_parameters;
Scene *scene;
bool preview;
@@ -198,6 +242,7 @@ class BlenderSync {
use_background_ao(true),
use_surfaces(true),
use_hair(true),
+ use_volumes(true),
samples(0),
bound_samples(false)
{
@@ -209,6 +254,7 @@ class BlenderSync {
bool use_background_ao;
bool use_surfaces;
bool use_hair;
+ bool use_volumes;
int samples;
bool bound_samples;
} view_layer;
diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h
index 896bf62da70..8ab061aaed9 100644
--- a/intern/cycles/blender/blender_texture.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -17,8 +17,8 @@
#ifndef __BLENDER_TEXTURE_H__
#define __BLENDER_TEXTURE_H__
-#include <stdlib.h>
#include "blender/blender_sync.h"
+#include <stdlib.h>
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index e68f92474bf..ad90a5f8d52 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -32,10 +32,10 @@
* todo: clean this up ... */
extern "C" {
-void BKE_image_user_frame_calc(void *iuser, int cfra);
+void BKE_image_user_frame_calc(void *ima, void *iuser, int cfra);
void BKE_image_user_file_path(void *iuser, void *ima, char *path);
-unsigned char *BKE_image_get_pixels_for_frame(void *image, int frame);
-float *BKE_image_get_float_pixels_for_frame(void *image, int frame);
+unsigned char *BKE_image_get_pixels_for_frame(void *image, int frame, int tile);
+float *BKE_image_get_float_pixels_for_frame(void *image, int frame, int tile);
}
CCL_NAMESPACE_BEGIN
@@ -43,10 +43,10 @@ CCL_NAMESPACE_BEGIN
void python_thread_state_save(void **python_thread_state);
void python_thread_state_restore(void **python_thread_state);
-static inline BL::Mesh object_to_mesh(BL::BlendData &data,
+static inline BL::Mesh object_to_mesh(BL::BlendData & /*data*/,
BL::Object &object,
- BL::Depsgraph &depsgraph,
- bool calc_undeformed,
+ BL::Depsgraph & /*depsgraph*/,
+ bool /*calc_undeformed*/,
Mesh::SubdivisionType subdivision_type)
{
/* TODO: make this work with copy-on-write, modifiers are already evaluated. */
@@ -54,8 +54,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData &data,
bool subsurf_mod_show_render = false;
bool subsurf_mod_show_viewport = false;
- if(subdivision_type != Mesh::SUBDIVISION_NONE) {
- BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
+ if (subdivision_type != Mesh::SUBDIVISION_NONE) {
+ BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length() - 1];
subsurf_mod_show_render = subsurf_mod.show_render();
subsurf_mod_show_viewport = subsurf_mod.show_viewport();
@@ -75,16 +75,18 @@ static inline BL::Mesh object_to_mesh(BL::BlendData &data,
* UV are not empty. */
if (mesh.is_editmode() ||
(mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) {
- mesh = data.meshes.new_from_object(depsgraph, object, false, false);
+ BL::Depsgraph depsgraph(PointerRNA_NULL);
+ mesh = object.to_mesh(false, depsgraph);
}
}
else {
- mesh = data.meshes.new_from_object(depsgraph, object, true, calc_undeformed);
+ BL::Depsgraph depsgraph(PointerRNA_NULL);
+ mesh = object.to_mesh(false, depsgraph);
}
#if 0
- if(subdivision_type != Mesh::SUBDIVISION_NONE) {
- BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
+ if (subdivision_type != Mesh::SUBDIVISION_NONE) {
+ BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length() - 1];
subsurf_mod.show_render(subsurf_mod_show_render);
subsurf_mod.show_viewport(subsurf_mod_show_viewport);
@@ -102,11 +104,13 @@ static inline BL::Mesh object_to_mesh(BL::BlendData &data,
return mesh;
}
-static inline void free_object_to_mesh(BL::BlendData &data, BL::Object &object, BL::Mesh &mesh)
+static inline void free_object_to_mesh(BL::BlendData & /*data*/,
+ BL::Object &object,
+ BL::Mesh &mesh)
{
/* Free mesh if we didn't just use the existing one. */
if (object.data().ptr.data != mesh.ptr.data) {
- data.meshes.remove(mesh, false, true, false);
+ object.to_mesh_clear();
}
}
@@ -155,7 +159,7 @@ static inline void curvemapping_to_array(BL::CurveMapping &cumap, array<float> &
data.resize(size);
for (int i = 0; i < size; i++) {
float t = (float)i / (float)(size - 1);
- data[i] = curve.evaluate(t);
+ data[i] = cumap.evaluate(curve, t);
}
}
@@ -193,15 +197,16 @@ static inline void curvemapping_color_to_array(BL::CurveMapping &cumap,
BL::CurveMap mapI = cumap.curves[3];
for (int i = 0; i < size; i++) {
const float t = min_x + (float)i / (float)(size - 1) * range_x;
- data[i] = make_float3(mapR.evaluate(mapI.evaluate(t)),
- mapG.evaluate(mapI.evaluate(t)),
- mapB.evaluate(mapI.evaluate(t)));
+ data[i] = make_float3(cumap.evaluate(mapR, cumap.evaluate(mapI, t)),
+ cumap.evaluate(mapG, cumap.evaluate(mapI, t)),
+ cumap.evaluate(mapB, cumap.evaluate(mapI, t)));
}
}
else {
for (int i = 0; i < size; i++) {
float t = min_x + (float)i / (float)(size - 1) * range_x;
- data[i] = make_float3(mapR.evaluate(t), mapG.evaluate(t), mapB.evaluate(t));
+ data[i] = make_float3(
+ cumap.evaluate(mapR, t), cumap.evaluate(mapG, t), cumap.evaluate(mapB, t));
}
}
}
@@ -226,28 +231,37 @@ static inline int render_resolution_y(BL::RenderSettings &b_render)
return b_render.resolution_y() * b_render.resolution_percentage() / 100;
}
-static inline string image_user_file_path(BL::ImageUser &iuser, BL::Image &ima, int cfra)
+static inline string image_user_file_path(BL::ImageUser &iuser,
+ BL::Image &ima,
+ int cfra,
+ bool load_tiled)
{
char filepath[1024];
- BKE_image_user_frame_calc(iuser.ptr.data, cfra);
+ iuser.tile(0);
+ BKE_image_user_frame_calc(NULL, iuser.ptr.data, cfra);
BKE_image_user_file_path(iuser.ptr.data, ima.ptr.data, filepath);
- return string(filepath);
+
+ string filepath_str = string(filepath);
+ if (load_tiled && ima.source() == BL::Image::source_TILED) {
+ string_replace(filepath_str, "1001", "<UDIM>");
+ }
+ return filepath_str;
}
static inline int image_user_frame_number(BL::ImageUser &iuser, int cfra)
{
- BKE_image_user_frame_calc(iuser.ptr.data, cfra);
+ BKE_image_user_frame_calc(NULL, iuser.ptr.data, cfra);
return iuser.frame_current();
}
-static inline unsigned char *image_get_pixels_for_frame(BL::Image &image, int frame)
+static inline unsigned char *image_get_pixels_for_frame(BL::Image &image, int frame, int tile)
{
- return BKE_image_get_pixels_for_frame(image.ptr.data, frame);
+ return BKE_image_get_pixels_for_frame(image.ptr.data, frame, tile);
}
-static inline float *image_get_float_pixels_for_frame(BL::Image &image, int frame)
+static inline float *image_get_float_pixels_for_frame(BL::Image &image, int frame, int tile)
{
- return BKE_image_get_float_pixels_for_frame(image.ptr.data, frame);
+ return BKE_image_get_float_pixels_for_frame(image.ptr.data, frame, tile);
}
static inline void render_add_metadata(BL::RenderResult &b_rr, string name, string value)
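A standalone sketch of the UDIM path handling added above: for tiled images, the resolved path for tile 1001 is rewritten with a <UDIM> token so per-tile numbers can be substituted later. The string_replace() helper below is a simplified stand-in (first occurrence only) for the Cycles utility:

#include <cstdio>
#include <string>

// Simplified replacement helper for illustration; the real util replaces in place too.
static void string_replace(std::string &s, const std::string &from, const std::string &to)
{
  const size_t pos = s.find(from);
  if (pos != std::string::npos)
    s.replace(pos, from.size(), to);
}

int main()
{
  std::string path = "/textures/wood.1001.png";  // hypothetical tiled image path
  string_replace(path, "1001", "<UDIM>");
  std::printf("%s\n", path.c_str());  // /textures/wood.<UDIM>.png
  return 0;
}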
@@ -469,7 +483,9 @@ static inline void mesh_texture_space(BL::Mesh &b_mesh, float3 &loc, float3 &siz
}
/* Object motion steps, returns 0 if no motion blur needed. */
-static inline uint object_motion_steps(BL::Object &b_parent, BL::Object &b_ob)
+static inline uint object_motion_steps(BL::Object &b_parent,
+ BL::Object &b_ob,
+ const int max_steps = INT_MAX)
{
/* Get motion enabled and steps from object itself. */
PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
@@ -478,7 +494,7 @@ static inline uint object_motion_steps(BL::Object &b_parent, BL::Object &b_ob)
return 0;
}
- uint steps = max(1, get_int(cobject, "motion_steps"));
+ int steps = max(1, get_int(cobject, "motion_steps"));
/* Also check parent object, so motion blur and steps can be
* controlled by dupligroup duplicator for linked groups. */
@@ -496,7 +512,7 @@ static inline uint object_motion_steps(BL::Object &b_parent, BL::Object &b_ob)
/* Use uneven number of steps so we get one keyframe at the current frame,
* and use 2^(steps - 1) so objects with more/fewer steps still have samples
* at the same times, to avoid sampling at many different times. */
- return (2 << (steps - 1)) + 1;
+ return min((2 << (steps - 1)) + 1, max_steps);
}
/* object uses deformation motion blur */
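A standalone check of the sample-count formula in object_motion_steps() above: the user value is clamped to at least one step, the count is (2 << (steps - 1)) + 1, optionally capped by max_steps, so one sample always lands on the current frame and different step settings share sample times.

#include <algorithm>
#include <climits>
#include <cstdio>

// Standalone sketch of the step-count math above: an odd count keeps one
// sample exactly at the current frame, and power-of-two spacing keeps sample
// times aligned across objects with different step settings.
static int motion_sample_count(int user_steps, int max_steps = INT_MAX)
{
  const int steps = std::max(1, user_steps);
  return std::min((2 << (steps - 1)) + 1, max_steps);
}

int main()
{
  for (int s = 1; s <= 4; s++)
    std::printf("motion_steps=%d -> %d samples\n", s, motion_sample_count(s));
  // 1 -> 3, 2 -> 5, 3 -> 9, 4 -> 17
  return 0;
}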
@@ -517,37 +533,40 @@ static inline bool object_use_deform_motion(BL::Object &b_parent, BL::Object &b_
return use_deform_motion;
}
-static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object &b_ob)
+static inline BL::FluidDomainSettings object_fluid_liquid_domain_find(BL::Object &b_ob)
{
BL::Object::modifiers_iterator b_mod;
for (b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
- if (b_mod->is_a(&RNA_SmokeModifier)) {
- BL::SmokeModifier b_smd(*b_mod);
+ if (b_mod->is_a(&RNA_FluidModifier)) {
+ BL::FluidModifier b_mmd(*b_mod);
- if (b_smd.smoke_type() == BL::SmokeModifier::smoke_type_DOMAIN)
- return b_smd.domain_settings();
+ if (b_mmd.fluid_type() == BL::FluidModifier::fluid_type_DOMAIN &&
+ b_mmd.domain_settings().domain_type() == BL::FluidDomainSettings::domain_type_LIQUID) {
+ return b_mmd.domain_settings();
+ }
}
}
- return BL::SmokeDomainSettings(PointerRNA_NULL);
+ return BL::FluidDomainSettings(PointerRNA_NULL);
}
-static inline BL::DomainFluidSettings object_fluid_domain_find(BL::Object b_ob)
+static inline BL::FluidDomainSettings object_fluid_gas_domain_find(BL::Object &b_ob)
{
BL::Object::modifiers_iterator b_mod;
for (b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
- if (b_mod->is_a(&RNA_FluidSimulationModifier)) {
- BL::FluidSimulationModifier b_fmd(*b_mod);
- BL::FluidSettings fss = b_fmd.settings();
+ if (b_mod->is_a(&RNA_FluidModifier)) {
+ BL::FluidModifier b_mmd(*b_mod);
- if (fss.type() == BL::FluidSettings::type_DOMAIN)
- return (BL::DomainFluidSettings)b_fmd.settings();
+ if (b_mmd.fluid_type() == BL::FluidModifier::fluid_type_DOMAIN &&
+ b_mmd.domain_settings().domain_type() == BL::FluidDomainSettings::domain_type_GAS) {
+ return b_mmd.domain_settings();
+ }
}
}
- return BL::DomainFluidSettings(PointerRNA_NULL);
+ return BL::FluidDomainSettings(PointerRNA_NULL);
}
static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob,
@@ -576,209 +595,20 @@ static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob,
return Mesh::SUBDIVISION_NONE;
}
-/* ID Map
- *
- * Utility class to keep in sync with blender data.
- * Used for objects, meshes, lights and shaders. */
-
-template<typename K, typename T> class id_map {
- public:
- id_map(vector<T *> *scene_data_)
- {
- scene_data = scene_data_;
- }
-
- T *find(const BL::ID &id)
- {
- return find(id.ptr.id.data);
- }
-
- T *find(const K &key)
- {
- if (b_map.find(key) != b_map.end()) {
- T *data = b_map[key];
- return data;
- }
-
- return NULL;
- }
-
- void set_recalc(const BL::ID &id)
- {
- b_recalc.insert(id.ptr.data);
- }
-
- void set_recalc(void *id_ptr)
- {
- b_recalc.insert(id_ptr);
- }
-
- bool has_recalc()
- {
- return !(b_recalc.empty());
- }
-
- void pre_sync()
- {
- used_set.clear();
- }
-
- bool sync(T **r_data, const BL::ID &id)
- {
- return sync(r_data, id, id, id.ptr.id.data);
- }
-
- bool sync(T **r_data, const BL::ID &id, const BL::ID &parent, const K &key)
- {
- T *data = find(key);
- bool recalc;
-
- if (!data) {
- /* add data if it didn't exist yet */
- data = new T();
- scene_data->push_back(data);
- b_map[key] = data;
- recalc = true;
- }
- else {
- recalc = (b_recalc.find(id.ptr.data) != b_recalc.end());
- if (parent.ptr.data)
- recalc = recalc || (b_recalc.find(parent.ptr.data) != b_recalc.end());
- }
-
- used(data);
-
- *r_data = data;
- return recalc;
- }
-
- bool is_used(const K &key)
- {
- T *data = find(key);
- return (data) ? used_set.find(data) != used_set.end() : false;
- }
-
- void used(T *data)
- {
- /* tag data as still in use */
- used_set.insert(data);
- }
-
- void set_default(T *data)
- {
- b_map[NULL] = data;
- }
-
- bool post_sync(bool do_delete = true)
- {
- /* remove unused data */
- vector<T *> new_scene_data;
- typename vector<T *>::iterator it;
- bool deleted = false;
-
- for (it = scene_data->begin(); it != scene_data->end(); it++) {
- T *data = *it;
-
- if (do_delete && used_set.find(data) == used_set.end()) {
- delete data;
- deleted = true;
- }
- else
- new_scene_data.push_back(data);
- }
-
- *scene_data = new_scene_data;
-
- /* update mapping */
- map<K, T *> new_map;
- typedef pair<const K, T *> TMapPair;
- typename map<K, T *>::iterator jt;
-
- for (jt = b_map.begin(); jt != b_map.end(); jt++) {
- TMapPair &pair = *jt;
-
- if (used_set.find(pair.second) != used_set.end())
- new_map[pair.first] = pair.second;
- }
-
- used_set.clear();
- b_recalc.clear();
- b_map = new_map;
-
- return deleted;
- }
-
- const map<K, T *> &key_to_scene_data()
- {
- return b_map;
- }
-
- protected:
- vector<T *> *scene_data;
- map<K, T *> b_map;
- set<T *> used_set;
- set<void *> b_recalc;
-};
-
-/* Object Key */
-
-enum { OBJECT_PERSISTENT_ID_SIZE = 16 };
-
-struct ObjectKey {
- void *parent;
- int id[OBJECT_PERSISTENT_ID_SIZE];
- void *ob;
-
- ObjectKey(void *parent_, int id_[OBJECT_PERSISTENT_ID_SIZE], void *ob_)
- : parent(parent_), ob(ob_)
- {
- if (id_)
- memcpy(id, id_, sizeof(id));
- else
- memset(id, 0, sizeof(id));
- }
-
- bool operator<(const ObjectKey &k) const
- {
- if (ob < k.ob) {
- return true;
- }
- else if (ob == k.ob) {
- if (parent < k.parent)
- return true;
- else if (parent == k.parent)
- return memcmp(id, k.id, sizeof(id)) < 0;
- }
-
- return false;
- }
-};
-
-/* Particle System Key */
-
-struct ParticleSystemKey {
- void *ob;
- int id[OBJECT_PERSISTENT_ID_SIZE];
-
- ParticleSystemKey(void *ob_, int id_[OBJECT_PERSISTENT_ID_SIZE]) : ob(ob_)
- {
- if (id_)
- memcpy(id, id_, sizeof(id));
- else
- memset(id, 0, sizeof(id));
- }
+static inline uint object_ray_visibility(BL::Object &b_ob)
+{
+ PointerRNA cvisibility = RNA_pointer_get(&b_ob.ptr, "cycles_visibility");
+ uint flag = 0;
- bool operator<(const ParticleSystemKey &k) const
- {
- /* first id is particle index, we don't compare that */
- if (ob < k.ob)
- return true;
- else if (ob == k.ob)
- return memcmp(id + 1, k.id + 1, sizeof(int) * (OBJECT_PERSISTENT_ID_SIZE - 1)) < 0;
+ flag |= get_boolean(cvisibility, "camera") ? PATH_RAY_CAMERA : 0;
+ flag |= get_boolean(cvisibility, "diffuse") ? PATH_RAY_DIFFUSE : 0;
+ flag |= get_boolean(cvisibility, "glossy") ? PATH_RAY_GLOSSY : 0;
+ flag |= get_boolean(cvisibility, "transmission") ? PATH_RAY_TRANSMIT : 0;
+ flag |= get_boolean(cvisibility, "shadow") ? PATH_RAY_SHADOW : 0;
+ flag |= get_boolean(cvisibility, "scatter") ? PATH_RAY_VOLUME_SCATTER : 0;
- return false;
- }
-};
+ return flag;
+}
class EdgeMap {
public:
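object_ray_visibility() above simply ORs one flag per enabled visibility toggle. A standalone sketch with placeholder bit values (the real PATH_RAY_* flags are defined in the Cycles kernel headers and differ from these):

#include <cstdint>
#include <cstdio>

// Placeholder bit values for illustration only.
enum : uint32_t {
  RAY_CAMERA = 1 << 0,
  RAY_DIFFUSE = 1 << 1,
  RAY_GLOSSY = 1 << 2,
  RAY_TRANSMIT = 1 << 3,
  RAY_SHADOW = 1 << 4,
  RAY_VOLUME_SCATTER = 1 << 5,
};

struct Visibility {
  bool camera, diffuse, glossy, transmission, shadow, scatter;
};

static uint32_t ray_visibility(const Visibility &v)
{
  uint32_t flag = 0;
  flag |= v.camera ? RAY_CAMERA : 0;
  flag |= v.diffuse ? RAY_DIFFUSE : 0;
  flag |= v.glossy ? RAY_GLOSSY : 0;
  flag |= v.transmission ? RAY_TRANSMIT : 0;
  flag |= v.shadow ? RAY_SHADOW : 0;
  flag |= v.scatter ? RAY_VOLUME_SCATTER : 0;
  return flag;
}

int main()
{
  const Visibility v = {true, true, false, true, true, false};
  std::printf("visibility mask: 0x%x\n", (unsigned)ray_visibility(v));  // 0x1b
  return 0;
}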
diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp
new file mode 100644
index 00000000000..73ef5f94720
--- /dev/null
+++ b/intern/cycles/blender/blender_viewport.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "blender_viewport.h"
+
+#include "blender_util.h"
+
+CCL_NAMESPACE_BEGIN
+
+BlenderViewportParameters::BlenderViewportParameters()
+ : use_scene_world(true),
+ use_scene_lights(true),
+ studiolight_rotate_z(0.0f),
+ studiolight_intensity(1.0f),
+ studiolight_background_alpha(1.0f),
+ studiolight_path(ustring())
+{
+}
+
+BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
+ : BlenderViewportParameters()
+{
+ /* We only copy the parameters if we are in look dev mode, otherwise the
+ * defaults are used. These defaults mimic normal render settings. */
+ if (b_v3d && b_v3d.shading().type() == BL::View3DShading::type_RENDERED) {
+ use_scene_world = b_v3d.shading().use_scene_world_render();
+ use_scene_lights = b_v3d.shading().use_scene_lights_render();
+ if (!use_scene_world) {
+ studiolight_rotate_z = b_v3d.shading().studiolight_rotate_z();
+ studiolight_intensity = b_v3d.shading().studiolight_intensity();
+ studiolight_background_alpha = b_v3d.shading().studiolight_background_alpha();
+ studiolight_path = b_v3d.shading().selected_studio_light().path();
+ }
+ }
+}
+
+/* Check if two instances are different. */
+const bool BlenderViewportParameters::modified(const BlenderViewportParameters &other) const
+{
+ return use_scene_world != other.use_scene_world || use_scene_lights != other.use_scene_lights ||
+ studiolight_rotate_z != other.studiolight_rotate_z ||
+ studiolight_intensity != other.studiolight_intensity ||
+ studiolight_background_alpha != other.studiolight_background_alpha ||
+ studiolight_path != other.studiolight_path;
+}
+
+const bool BlenderViewportParameters::custom_viewport_parameters() const
+{
+ return !(use_scene_world && use_scene_lights);
+}
+
+PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceView3D &b_v3d)
+{
+ PassType display_pass = PASS_NONE;
+ if (b_v3d) {
+ BL::View3DShading b_view3dshading = b_v3d.shading();
+ PointerRNA cshading = RNA_pointer_get(&b_view3dshading.ptr, "cycles");
+ display_pass = (PassType)get_enum(cshading, "render_pass", -1, -1);
+ }
+ return display_pass;
+}
+
+PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes)
+{
+ if (b_v3d) {
+ PassType display_pass = BlenderViewportParameters::get_viewport_display_render_pass(b_v3d);
+
+ passes.clear();
+ Pass::add(display_pass, passes);
+
+ return display_pass;
+ }
+ return PASS_NONE;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h
new file mode 100644
index 00000000000..7c6c9c4d274
--- /dev/null
+++ b/intern/cycles/blender/blender_viewport.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BLENDER_VIEWPORT_H__
+#define __BLENDER_VIEWPORT_H__
+
+#include "MEM_guardedalloc.h"
+#include "RNA_access.h"
+#include "RNA_blender_cpp.h"
+#include "RNA_types.h"
+
+#include "render/film.h"
+#include "util/util_param.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BlenderViewportParameters {
+ private:
+ bool use_scene_world;
+ bool use_scene_lights;
+ float studiolight_rotate_z;
+ float studiolight_intensity;
+ float studiolight_background_alpha;
+ ustring studiolight_path;
+
+ BlenderViewportParameters();
+ BlenderViewportParameters(BL::SpaceView3D &b_v3d);
+
+ const bool modified(const BlenderViewportParameters &other) const;
+ const bool custom_viewport_parameters() const;
+ friend class BlenderSync;
+
+ public:
+ /* Retrieve the render pass that needs to be displayed on the given `SpaceView3D`.
+ * When the `b_v3d` parameter is not given, `PASS_NONE` will be returned. */
+ static PassType get_viewport_display_render_pass(BL::SpaceView3D &b_v3d);
+};
+
+PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes);
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/blender/blender_volume.cpp b/intern/cycles/blender/blender_volume.cpp
new file mode 100644
index 00000000000..80591e0eec8
--- /dev/null
+++ b/intern/cycles/blender/blender_volume.cpp
@@ -0,0 +1,387 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/colorspace.h"
+#include "render/image.h"
+#include "render/image_vdb.h"
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#ifdef WITH_OPENVDB
+# include <openvdb/openvdb.h>
+openvdb::GridBase::ConstPtr BKE_volume_grid_openvdb_for_read(const struct Volume *volume,
+ struct VolumeGrid *grid);
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* TODO: verify this is not loading unnecessary attributes. */
+class BlenderSmokeLoader : public ImageLoader {
+ public:
+ BlenderSmokeLoader(BL::Object &b_ob, AttributeStandard attribute)
+ : b_domain(object_fluid_gas_domain_find(b_ob)), attribute(attribute)
+ {
+ BL::Mesh b_mesh(b_ob.data());
+ mesh_texture_space(b_mesh, texspace_loc, texspace_size);
+ }
+
+ bool load_metadata(ImageMetaData &metadata) override
+ {
+ if (!b_domain) {
+ return false;
+ }
+
+ if (attribute == ATTR_STD_VOLUME_DENSITY || attribute == ATTR_STD_VOLUME_FLAME ||
+ attribute == ATTR_STD_VOLUME_HEAT || attribute == ATTR_STD_VOLUME_TEMPERATURE) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT;
+ metadata.channels = 1;
+ }
+ else if (attribute == ATTR_STD_VOLUME_COLOR) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ metadata.channels = 4;
+ }
+ else if (attribute == ATTR_STD_VOLUME_VELOCITY) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ metadata.channels = 3;
+ }
+ else {
+ return false;
+ }
+
+ int3 resolution = get_int3(b_domain.domain_resolution());
+ int amplify = (b_domain.use_noise()) ? b_domain.noise_scale() : 1;
+
+ /* Velocity and heat data is always low-resolution. */
+ if (attribute == ATTR_STD_VOLUME_VELOCITY || attribute == ATTR_STD_VOLUME_HEAT) {
+ amplify = 1;
+ }
+
+ metadata.width = resolution.x * amplify;
+ metadata.height = resolution.y * amplify;
+ metadata.depth = resolution.z * amplify;
+
+ /* Create a matrix to transform from object space to mesh texture space.
+ * This does not work with deformations but that can probably only be done
+ * well with a volume grid mapping of coordinates. */
+ metadata.transform_3d = transform_translate(-texspace_loc) * transform_scale(texspace_size);
+ metadata.use_transform_3d = true;
+
+ return true;
+ }
+
+ bool load_pixels(const ImageMetaData &, void *pixels, const size_t, const bool) override
+ {
+ if (!b_domain) {
+ return false;
+ }
+#ifdef WITH_FLUID
+ int3 resolution = get_int3(b_domain.domain_resolution());
+ int length, amplify = (b_domain.use_noise()) ? b_domain.noise_scale() : 1;
+
+ /* Velocity and heat data is always low-resolution. */
+ if (attribute == ATTR_STD_VOLUME_VELOCITY || attribute == ATTR_STD_VOLUME_HEAT) {
+ amplify = 1;
+ }
+
+ const int width = resolution.x * amplify;
+ const int height = resolution.y * amplify;
+ const int depth = resolution.z * amplify;
+ const size_t num_pixels = ((size_t)width) * height * depth;
+
+ float *fpixels = (float *)pixels;
+
+ if (attribute == ATTR_STD_VOLUME_DENSITY) {
+ FluidDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
+ if (length == num_pixels) {
+ FluidDomainSettings_density_grid_get(&b_domain.ptr, fpixels);
+ return true;
+ }
+ }
+ else if (attribute == ATTR_STD_VOLUME_FLAME) {
+ /* this is in range 0..1, and interpreted by the OpenGL smoke viewer
+ * as 1500..3000 K with the first part faded to zero density */
+ FluidDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
+ if (length == num_pixels) {
+ FluidDomainSettings_flame_grid_get(&b_domain.ptr, fpixels);
+ return true;
+ }
+ }
+ else if (attribute == ATTR_STD_VOLUME_COLOR) {
+ /* the RGB is "premultiplied" by density for better interpolation results */
+ FluidDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
+ if (length == num_pixels * 4) {
+ FluidDomainSettings_color_grid_get(&b_domain.ptr, fpixels);
+ return true;
+ }
+ }
+ else if (attribute == ATTR_STD_VOLUME_VELOCITY) {
+ FluidDomainSettings_velocity_grid_get_length(&b_domain.ptr, &length);
+ if (length == num_pixels * 3) {
+ FluidDomainSettings_velocity_grid_get(&b_domain.ptr, fpixels);
+ return true;
+ }
+ }
+ else if (attribute == ATTR_STD_VOLUME_HEAT) {
+ FluidDomainSettings_heat_grid_get_length(&b_domain.ptr, &length);
+ if (length == num_pixels) {
+ FluidDomainSettings_heat_grid_get(&b_domain.ptr, fpixels);
+ return true;
+ }
+ }
+ else if (attribute == ATTR_STD_VOLUME_TEMPERATURE) {
+ FluidDomainSettings_temperature_grid_get_length(&b_domain.ptr, &length);
+ if (length == num_pixels) {
+ FluidDomainSettings_temperature_grid_get(&b_domain.ptr, fpixels);
+ return true;
+ }
+ }
+ else {
+ fprintf(stderr,
+ "Cycles error: unknown volume attribute %s, skipping\n",
+ Attribute::standard_name(attribute));
+ fpixels[0] = 0.0f;
+ return false;
+ }
+#else
+ (void)pixels;
+#endif
+ fprintf(stderr, "Cycles error: unexpected smoke volume resolution, skipping\n");
+ return false;
+ }
+
+ string name() const override
+ {
+ return Attribute::standard_name(attribute);
+ }
+
+ bool equals(const ImageLoader &other) const override
+ {
+ const BlenderSmokeLoader &other_loader = (const BlenderSmokeLoader &)other;
+ return b_domain == other_loader.b_domain && attribute == other_loader.attribute;
+ }
+
+ BL::FluidDomainSettings b_domain;
+ float3 texspace_loc, texspace_size;
+ AttributeStandard attribute;
+};
+
+static void sync_smoke_volume(Scene *scene, BL::Object &b_ob, Mesh *mesh, float frame)
+{
+ BL::FluidDomainSettings b_domain = object_fluid_gas_domain_find(b_ob);
+ if (!b_domain) {
+ return;
+ }
+
+ AttributeStandard attributes[] = {ATTR_STD_VOLUME_DENSITY,
+ ATTR_STD_VOLUME_COLOR,
+ ATTR_STD_VOLUME_FLAME,
+ ATTR_STD_VOLUME_HEAT,
+ ATTR_STD_VOLUME_TEMPERATURE,
+ ATTR_STD_VOLUME_VELOCITY,
+ ATTR_STD_NONE};
+
+ for (int i = 0; attributes[i] != ATTR_STD_NONE; i++) {
+ AttributeStandard std = attributes[i];
+ if (!mesh->need_attribute(scene, std)) {
+ continue;
+ }
+
+ mesh->volume_clipping = b_domain.clipping();
+
+ Attribute *attr = mesh->attributes.add(std);
+
+ ImageLoader *loader = new BlenderSmokeLoader(b_ob, std);
+ ImageParams params;
+ params.frame = frame;
+
+ attr->data_voxel() = scene->image_manager->add_image(loader, params);
+ }
+}
+
+class BlenderVolumeLoader : public VDBImageLoader {
+ public:
+ BlenderVolumeLoader(BL::BlendData &b_data, BL::Volume &b_volume, const string &grid_name)
+ : VDBImageLoader(grid_name), b_data(b_data), b_volume(b_volume), unload(false)
+ {
+ }
+
+ bool load_metadata(ImageMetaData &metadata) override
+ {
+ b_volume.grids.load(b_data.ptr.data);
+ BL::VolumeGrid b_volume_grid = find_grid();
+
+ if (!b_volume_grid) {
+ return false;
+ }
+
+ unload = !b_volume_grid.is_loaded();
+
+#ifdef WITH_OPENVDB
+ Volume *volume = (Volume *)b_volume.ptr.data;
+ VolumeGrid *volume_grid = (VolumeGrid *)b_volume_grid.ptr.data;
+ grid = BKE_volume_grid_openvdb_for_read(volume, volume_grid);
+#endif
+
+ return VDBImageLoader::load_metadata(metadata);
+ }
+
+ bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixel_size,
+ const bool associate_alpha) override
+ {
+ b_volume.grids.load(b_data.ptr.data);
+ BL::VolumeGrid b_volume_grid = find_grid();
+
+ if (!b_volume_grid) {
+ return false;
+ }
+
+ return VDBImageLoader::load_pixels(metadata, pixels, pixel_size, associate_alpha);
+ }
+
+ bool equals(const ImageLoader &other) const override
+ {
+ /* TODO: detect multiple volume datablocks with the same filepath. */
+ const BlenderVolumeLoader &other_loader = (const BlenderVolumeLoader &)other;
+ return b_volume == other_loader.b_volume && grid_name == other_loader.grid_name;
+ }
+
+ void cleanup() override
+ {
+ VDBImageLoader::cleanup();
+
+ BL::VolumeGrid b_volume_grid = find_grid();
+ if (b_volume_grid && unload) {
+ b_volume_grid.unload();
+ }
+ }
+
+ /* Find grid with matching name. The grid pointer is not stored in the class since
+ * grids may be unloaded before we load the pixels, for example for motion
+ * blur where we move between frames. */
+ BL::VolumeGrid find_grid()
+ {
+#ifdef WITH_OPENVDB
+ BL::Volume::grids_iterator b_grid_iter;
+ for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) {
+ if (b_grid_iter->name() == grid_name) {
+ return *b_grid_iter;
+ }
+ }
+#endif
+
+ return BL::VolumeGrid(PointerRNA_NULL);
+ }
+
+ BL::BlendData b_data;
+ BL::Volume b_volume;
+ bool unload;
+};
+
+static void sync_volume_object(BL::BlendData &b_data, BL::Object &b_ob, Scene *scene, Mesh *mesh)
+{
+ BL::Volume b_volume(b_ob.data());
+ b_volume.grids.load(b_data.ptr.data);
+
+ BL::VolumeRender b_render(b_volume.render());
+
+ mesh->volume_clipping = b_render.clipping();
+ mesh->volume_step_size = b_render.step_size();
+ mesh->volume_object_space = (b_render.space() == BL::VolumeRender::space_OBJECT);
+
+ /* Find grid with matching name. */
+ BL::Volume::grids_iterator b_grid_iter;
+ for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) {
+ BL::VolumeGrid b_grid = *b_grid_iter;
+ ustring name = ustring(b_grid.name());
+ AttributeStandard std = ATTR_STD_NONE;
+
+ if (name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
+ std = ATTR_STD_VOLUME_DENSITY;
+ }
+ else if (name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR)) {
+ std = ATTR_STD_VOLUME_COLOR;
+ }
+ else if (name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME)) {
+ std = ATTR_STD_VOLUME_FLAME;
+ }
+ else if (name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
+ std = ATTR_STD_VOLUME_HEAT;
+ }
+ else if (name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) {
+ std = ATTR_STD_VOLUME_TEMPERATURE;
+ }
+ else if (name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY)) {
+ std = ATTR_STD_VOLUME_VELOCITY;
+ }
+
+ if ((std != ATTR_STD_NONE && mesh->need_attribute(scene, std)) ||
+ mesh->need_attribute(scene, name)) {
+ Attribute *attr = (std != ATTR_STD_NONE) ?
+ mesh->attributes.add(std) :
+ mesh->attributes.add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL);
+
+ ImageLoader *loader = new BlenderVolumeLoader(b_data, b_volume, name.string());
+ ImageParams params;
+ params.frame = b_volume.grids.frame();
+
+ attr->data_voxel() = scene->image_manager->add_image(loader, params);
+ }
+ }
+}
+
+/* If the voxel attributes change, we need to rebuild the bounding mesh. */
+static vector<int> get_voxel_image_slots(Mesh *mesh)
+{
+ vector<int> slots;
+ for (const Attribute &attr : mesh->attributes.attributes) {
+ if (attr.element == ATTR_ELEMENT_VOXEL) {
+ slots.push_back(attr.data_voxel().svm_slot());
+ }
+ }
+
+ return slots;
+}
+
+void BlenderSync::sync_volume(BL::Object &b_ob, Mesh *mesh, const vector<Shader *> &used_shaders)
+{
+ vector<int> old_voxel_slots = get_voxel_image_slots(mesh);
+
+ mesh->clear();
+ mesh->used_shaders = used_shaders;
+
+ if (view_layer.use_volumes) {
+ if (b_ob.type() == BL::Object::type_VOLUME) {
+ /* Volume object. Create only attributes, bounding mesh will then
+ * be automatically generated later. */
+ sync_volume_object(b_data, b_ob, scene, mesh);
+ }
+ else {
+ /* Smoke domain. */
+ sync_smoke_volume(scene, b_ob, mesh, b_scene.frame_current());
+ }
+ }
+
+ /* Tag update. */
+ bool rebuild = (old_voxel_slots != get_voxel_image_slots(mesh));
+ mesh->tag_update(scene, rebuild);
+}
+
+CCL_NAMESPACE_END
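sync_volume_object() above maps OpenVDB grid names onto Cycles' standard volume attributes and falls back to a custom float voxel attribute for anything else. A standalone sketch of that mapping; the lowercase names are assumed values of Attribute::standard_name() and may not match exactly:

#include <cstdio>
#include <map>
#include <string>

int main()
{
  // Assumed name -> standard-attribute mapping, mirroring the chain of
  // comparisons in sync_volume_object() above; the enum names are stand-ins.
  const std::map<std::string, std::string> standard = {
      {"density", "ATTR_STD_VOLUME_DENSITY"},
      {"color", "ATTR_STD_VOLUME_COLOR"},
      {"flame", "ATTR_STD_VOLUME_FLAME"},
      {"heat", "ATTR_STD_VOLUME_HEAT"},
      {"temperature", "ATTR_STD_VOLUME_TEMPERATURE"},
      {"velocity", "ATTR_STD_VOLUME_VELOCITY"},
  };

  const std::string grid_names[] = {"density", "velocity", "my_custom_grid"};
  for (const std::string &name : grid_names) {
    auto it = standard.find(name);
    if (it != standard.end())
      std::printf("%s -> %s\n", name.c_str(), it->second.c_str());
    else
      std::printf("%s -> custom float voxel attribute\n", name.c_str());
  }
  return 0;
}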
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 36bbd937e1a..8b8f3ca7265 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -9,12 +9,11 @@ set(INC_SYS
set(SRC
bvh.cpp
bvh2.cpp
- bvh4.cpp
- bvh8.cpp
bvh_binning.cpp
bvh_build.cpp
bvh_embree.cpp
bvh_node.cpp
+ bvh_optix.cpp
bvh_sort.cpp
bvh_split.cpp
bvh_unaligned.cpp
@@ -23,12 +22,11 @@ set(SRC
set(SRC_HEADERS
bvh.h
bvh2.h
- bvh4.h
- bvh8.h
bvh_binning.h
bvh_build.h
bvh_embree.h
bvh_node.h
+ bvh_optix.h
bvh_params.h
bvh_sort.h
bvh_split.h
@@ -37,9 +35,16 @@ set(SRC_HEADERS
set(LIB
cycles_render
+ cycles_util
)
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
+if(WITH_CYCLES_EMBREE)
+ list(APPEND LIB
+ ${EMBREE_LIBRARIES}
+ )
+endif()
+
cycles_add_library(cycles_bvh "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 53c66777928..e9e67fd1305 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -17,18 +17,15 @@
#include "bvh/bvh.h"
+#include "render/hair.h"
#include "render/mesh.h"
#include "render/object.h"
#include "bvh/bvh2.h"
-#include "bvh/bvh4.h"
-#include "bvh/bvh8.h"
#include "bvh/bvh_build.h"
+#include "bvh/bvh_embree.h"
#include "bvh/bvh_node.h"
-
-#ifdef WITH_EMBREE
-# include "bvh/bvh_embree.h"
-#endif
+#include "bvh/bvh_optix.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
@@ -43,14 +40,12 @@ const char *bvh_layout_name(BVHLayout layout)
switch (layout) {
case BVH_LAYOUT_BVH2:
return "BVH2";
- case BVH_LAYOUT_BVH4:
- return "BVH4";
- case BVH_LAYOUT_BVH8:
- return "BVH8";
case BVH_LAYOUT_NONE:
return "NONE";
case BVH_LAYOUT_EMBREE:
return "EMBREE";
+ case BVH_LAYOUT_OPTIX:
+ return "OPTIX";
case BVH_LAYOUT_ALL:
return "ALL";
}
@@ -71,7 +66,11 @@ BVHLayout BVHParams::best_bvh_layout(BVHLayout requested_layout, BVHLayoutMask s
/* This is a mask of supported BVH layouts which are narrower than the
* requested one.
*/
- const BVHLayoutMask allowed_layouts_mask = (supported_layouts & (requested_layout_mask - 1));
+ BVHLayoutMask allowed_layouts_mask = (supported_layouts & (requested_layout_mask - 1));
+ /* If the requested layout is not supported, choose from the supported layouts instead. */
+ if (allowed_layouts_mask == 0) {
+ allowed_layouts_mask = supported_layouts;
+ }
/* We get widest from allowed ones and convert mask to actual layout. */
const BVHLayoutMask widest_allowed_layout_mask = __bsr(allowed_layouts_mask);
return (BVHLayout)(1 << widest_allowed_layout_mask);
@@ -90,23 +89,31 @@ int BVHStackEntry::encodeIdx() const
/* BVH */
-BVH::BVH(const BVHParams &params_, const vector<Object *> &objects_)
- : params(params_), objects(objects_)
+BVH::BVH(const BVHParams &params_,
+ const vector<Geometry *> &geometry_,
+ const vector<Object *> &objects_)
+ : params(params_), geometry(geometry_), objects(objects_)
{
}
-BVH *BVH::create(const BVHParams &params, const vector<Object *> &objects)
+BVH *BVH::create(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects)
{
switch (params.bvh_layout) {
case BVH_LAYOUT_BVH2:
- return new BVH2(params, objects);
- case BVH_LAYOUT_BVH4:
- return new BVH4(params, objects);
- case BVH_LAYOUT_BVH8:
- return new BVH8(params, objects);
+ return new BVH2(params, geometry, objects);
case BVH_LAYOUT_EMBREE:
#ifdef WITH_EMBREE
- return new BVHEmbree(params, objects);
+ return new BVHEmbree(params, geometry, objects);
+#else
+ break;
+#endif
+ case BVH_LAYOUT_OPTIX:
+#ifdef WITH_OPTIX
+ return new BVHOptiX(params, geometry, objects);
+#else
+ break;
#endif
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
@@ -198,36 +205,34 @@ void BVH::refit_primitives(int start, int end, BoundBox &bbox, uint &visibility)
}
else {
/* Primitives. */
- const Mesh *mesh = ob->mesh;
-
if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
/* Curves. */
- int str_offset = (params.top_level) ? mesh->curve_offset : 0;
- Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+ const Hair *hair = static_cast<const Hair *>(ob->geometry);
+ int prim_offset = (params.top_level) ? hair->prim_offset : 0;
+ Hair::Curve curve = hair->get_curve(pidx - prim_offset);
int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
- curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
-
- visibility |= PATH_RAY_CURVE;
+ curve.bounds_grow(k, &hair->curve_keys[0], &hair->curve_radius[0], bbox);
/* Motion curves. */
- if (mesh->use_motion_blur) {
- Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (hair->use_motion_blur) {
+ Attribute *attr = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (attr) {
- size_t mesh_size = mesh->curve_keys.size();
- size_t steps = mesh->motion_steps - 1;
+ size_t hair_size = hair->curve_keys.size();
+ size_t steps = hair->motion_steps - 1;
float3 *key_steps = attr->data_float3();
for (size_t i = 0; i < steps; i++)
- curve.bounds_grow(k, key_steps + i * mesh_size, &mesh->curve_radius[0], bbox);
+ curve.bounds_grow(k, key_steps + i * hair_size, &hair->curve_radius[0], bbox);
}
}
}
else {
/* Triangles. */
- int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
- Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+ const Mesh *mesh = static_cast<const Mesh *>(ob->geometry);
+ int prim_offset = (params.top_level) ? mesh->prim_offset : 0;
+ Mesh::Triangle triangle = mesh->get_triangle(pidx - prim_offset);
const float3 *vpos = &mesh->verts[0];
triangle.bounds_grow(vpos, bbox);
@@ -257,7 +262,7 @@ void BVH::pack_triangle(int idx, float4 tri_verts[3])
{
int tob = pack.prim_object[idx];
assert(tob >= 0 && tob < objects.size());
- const Mesh *mesh = objects[tob]->mesh;
+ const Mesh *mesh = static_cast<const Mesh *>(objects[tob]->geometry);
int tidx = pack.prim_index[idx];
Mesh::Triangle t = mesh->get_triangle(tidx);
@@ -305,9 +310,6 @@ void BVH::pack_primitives()
pack.prim_tri_index[i] = -1;
}
pack.prim_visibility[i] = ob->visibility_for_tracing();
- if (pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
- pack.prim_visibility[i] |= PATH_RAY_CURVE;
- }
}
else {
pack.prim_tri_index[i] = -1;
@@ -320,23 +322,14 @@ void BVH::pack_primitives()
void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
{
- /* The BVH's for instances are built separately, but for traversal all
- * BVH's are stored in global arrays. This function merges them into the
- * top level BVH, adjusting indexes and offsets where appropriate.
- */
- const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4);
- const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
-
/* Adjust primitive index to point to the triangle in the global array, for
- * meshes with transform applied and already in the top level BVH.
+ * geometry with transform applied and already in the top level BVH.
*/
- for (size_t i = 0; i < pack.prim_index.size(); i++)
+ for (size_t i = 0; i < pack.prim_index.size(); i++) {
if (pack.prim_index[i] != -1) {
- if (pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
- pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset;
- else
- pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset;
+ pack.prim_index[i] += objects[pack.prim_object[i]]->geometry->prim_offset;
}
+ }
/* track offsets of instanced BVH data in global array */
size_t prim_offset = pack.prim_index.size();
@@ -356,26 +349,17 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
size_t pack_leaf_nodes_offset = leaf_nodes_size;
size_t object_offset = 0;
- map<Mesh *, int> mesh_map;
+ foreach (Geometry *geom, geometry) {
+ BVH *bvh = geom->bvh;
- foreach (Object *ob, objects) {
- Mesh *mesh = ob->mesh;
- BVH *bvh = mesh->bvh;
-
- if (mesh->need_build_bvh()) {
- if (mesh_map.find(mesh) == mesh_map.end()) {
- prim_index_size += bvh->pack.prim_index.size();
- prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
- nodes_size += bvh->pack.nodes.size();
- leaf_nodes_size += bvh->pack.leaf_nodes.size();
-
- mesh_map[mesh] = 1;
- }
+ if (geom->need_build_bvh(params.bvh_layout)) {
+ prim_index_size += bvh->pack.prim_index.size();
+ prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
+ nodes_size += bvh->pack.nodes.size();
+ leaf_nodes_size += bvh->pack.leaf_nodes.size();
}
}
- mesh_map.clear();
-
pack.prim_index.resize(prim_index_size);
pack.prim_type.resize(prim_index_size);
pack.prim_object.resize(prim_index_size);
@@ -400,34 +384,35 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
int4 *pack_leaf_nodes = (pack.leaf_nodes.size()) ? &pack.leaf_nodes[0] : NULL;
float2 *pack_prim_time = (pack.prim_time.size()) ? &pack.prim_time[0] : NULL;
+ map<Geometry *, int> geometry_map;
+
/* merge */
foreach (Object *ob, objects) {
- Mesh *mesh = ob->mesh;
+ Geometry *geom = ob->geometry;
/* We assume that if mesh doesn't need own BVH it was already included
* into a top-level BVH and no packing here is needed.
*/
- if (!mesh->need_build_bvh()) {
+ if (!geom->need_build_bvh(params.bvh_layout)) {
pack.object_node[object_offset++] = 0;
continue;
}
    /* if the geometry was already added once, don't add it again, but reuse
     * the node offset that was stored for it */
- map<Mesh *, int>::iterator it = mesh_map.find(mesh);
+ map<Geometry *, int>::iterator it = geometry_map.find(geom);
- if (mesh_map.find(mesh) != mesh_map.end()) {
+ if (geometry_map.find(geom) != geometry_map.end()) {
int noffset = it->second;
pack.object_node[object_offset++] = noffset;
continue;
}
- BVH *bvh = mesh->bvh;
+ BVH *bvh = geom->bvh;
int noffset = nodes_offset;
int noffset_leaf = nodes_leaf_offset;
- int mesh_tri_offset = mesh->tri_offset;
- int mesh_curve_offset = mesh->curve_offset;
+ int geom_prim_offset = geom->prim_offset;
/* fill in node indexes for instances */
if (bvh->pack.root_index == -1)
@@ -435,7 +420,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
else
pack.object_node[object_offset++] = noffset;
- mesh_map[mesh] = pack.object_node[object_offset - 1];
+ geometry_map[geom] = pack.object_node[object_offset - 1];
/* merge primitive, object and triangle indexes */
if (bvh->pack.prim_index.size()) {
@@ -448,11 +433,11 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
for (size_t i = 0; i < bvh_prim_index_size; i++) {
if (bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
- pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset;
+ pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
pack_prim_tri_index[pack_prim_index_offset] = -1;
}
else {
- pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset;
+ pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
pack_prim_tri_index[pack_prim_index_offset] = bvh_prim_tri_index[i] +
pack_prim_tri_verts_offset;
}
@@ -499,53 +484,21 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
for (size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
size_t nsize, nsize_bbox;
if (bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
- if (use_obvh) {
- nsize = BVH_UNALIGNED_ONODE_SIZE;
- nsize_bbox = BVH_UNALIGNED_ONODE_SIZE - 1;
- }
- else {
- nsize = use_qbvh ? BVH_UNALIGNED_QNODE_SIZE : BVH_UNALIGNED_NODE_SIZE;
- nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE - 1 : 0;
- }
+ nsize = BVH_UNALIGNED_NODE_SIZE;
+ nsize_bbox = 0;
}
else {
- if (use_obvh) {
- nsize = BVH_ONODE_SIZE;
- nsize_bbox = BVH_ONODE_SIZE - 1;
- }
- else {
- nsize = (use_qbvh) ? BVH_QNODE_SIZE : BVH_NODE_SIZE;
- nsize_bbox = (use_qbvh) ? BVH_QNODE_SIZE - 1 : 0;
- }
+ nsize = BVH_NODE_SIZE;
+ nsize_bbox = 0;
}
memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox * sizeof(int4));
/* Modify offsets into arrays */
int4 data = bvh_nodes[i + nsize_bbox];
- int4 data1 = bvh_nodes[i + nsize_bbox - 1];
- if (use_obvh) {
- data.z += (data.z < 0) ? -noffset_leaf : noffset;
- data.w += (data.w < 0) ? -noffset_leaf : noffset;
- data.x += (data.x < 0) ? -noffset_leaf : noffset;
- data.y += (data.y < 0) ? -noffset_leaf : noffset;
- data1.z += (data1.z < 0) ? -noffset_leaf : noffset;
- data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
- data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
- data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
- }
- else {
- data.z += (data.z < 0) ? -noffset_leaf : noffset;
- data.w += (data.w < 0) ? -noffset_leaf : noffset;
- if (use_qbvh) {
- data.x += (data.x < 0) ? -noffset_leaf : noffset;
- data.y += (data.y < 0) ? -noffset_leaf : noffset;
- }
- }
+ data.z += (data.z < 0) ? -noffset_leaf : noffset;
+ data.w += (data.w < 0) ? -noffset_leaf : noffset;
pack_nodes[pack_nodes_offset + nsize_bbox] = data;
- if (use_obvh) {
- pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
- }
/* Usually this copies nothing, but we better
* be prepared for possible node size extension.
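To make the simplified remapping above concrete: with triangles and curves now sharing a single per-geometry prim_offset, the fix-up in BVH::pack_instances() no longer needs to branch on primitive type. A minimal standalone sketch of that loop follows; the *Stub types are placeholders for illustration, not Cycles classes.

#include <cstddef>
#include <vector>

/* Placeholder stand-ins for Geometry and Object. */
struct GeometryStub {
  int prim_offset; /* where this geometry's primitives start in the global arrays */
};
struct ObjectStub {
  const GeometryStub *geometry;
};

/* Shift every real primitive index by the offset of its geometry; entries of -1
 * mark instance leaves and are left untouched, matching the hunk above. */
static void remap_prim_indices(std::vector<int> &prim_index,
                               const std::vector<int> &prim_object,
                               const std::vector<ObjectStub> &objects)
{
  for (size_t i = 0; i < prim_index.size(); i++) {
    if (prim_index[i] != -1) {
      prim_index[i] += objects[prim_object[i]].geometry->prim_offset;
    }
  }
}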
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index edce3ca6f2a..6639e06b0bc 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -26,11 +26,14 @@
CCL_NAMESPACE_BEGIN
class Stats;
+class Device;
+class DeviceScene;
class BVHNode;
struct BVHStackEntry;
class BVHParams;
class BoundBox;
class LeafNode;
+class Geometry;
class Object;
class Progress;
@@ -73,7 +76,7 @@ struct PackedBVH {
}
};
-enum BVH_TYPE { bvh2, bvh4, bvh8 };
+enum BVH_TYPE { bvh2 };
/* BVH */
@@ -81,18 +84,27 @@ class BVH {
public:
PackedBVH pack;
BVHParams params;
+ vector<Geometry *> geometry;
vector<Object *> objects;
- static BVH *create(const BVHParams &params, const vector<Object *> &objects);
+ static BVH *create(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects);
virtual ~BVH()
{
}
virtual void build(Progress &progress, Stats *stats = NULL);
+ virtual void copy_to_device(Progress & /*progress*/, DeviceScene * /*dscene*/)
+ {
+ }
+
void refit(Progress &progress);
protected:
- BVH(const BVHParams &params, const vector<Object *> &objects);
+ BVH(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects);
/* Refit range of primitives. */
void refit_primitives(int start, int end, BoundBox &bbox, uint &visibility);
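A hedged sketch of how the widened interface above might be called; the Scene, Progress and DeviceScene arguments and the build_top_level_bvh() wrapper are assumed for illustration only and are not part of this diff.

#include "bvh/bvh.h"
#include "bvh/bvh_params.h"
#include "render/scene.h"

/* Assumed call-site: the top-level BVH is now created from the scene's geometry
 * list as well as its object list, then uploaded explicitly. */
static ccl::BVH *build_top_level_bvh(ccl::Scene *scene,
                                     ccl::Progress &progress,
                                     ccl::DeviceScene *dscene)
{
  ccl::BVHParams bparams;
  bparams.top_level = true;
  bparams.bvh_layout = ccl::BVH_LAYOUT_BVH2;

  ccl::BVH *bvh = ccl::BVH::create(bparams, scene->geometry, scene->objects);
  bvh->build(progress);
  bvh->copy_to_device(progress, dscene); /* base-class default is a no-op */
  return bvh;
}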
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
index f419d413ef6..c903070429e 100644
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -25,7 +25,10 @@
CCL_NAMESPACE_BEGIN
-BVH2::BVH2(const BVHParams &params_, const vector<Object *> &objects_) : BVH(params_, objects_)
+BVH2::BVH2(const BVHParams &params_,
+ const vector<Geometry *> &geometry_,
+ const vector<Object *> &objects_)
+ : BVH(params_, geometry_, objects_)
{
}
diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h
index c6a4e6fa73a..fa3e45b72d2 100644
--- a/intern/cycles/bvh/bvh2.h
+++ b/intern/cycles/bvh/bvh2.h
@@ -46,7 +46,9 @@ class BVH2 : public BVH {
protected:
/* constructor */
friend class BVH;
- BVH2(const BVHParams &params, const vector<Object *> &objects);
+ BVH2(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects);
/* Building process. */
virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
deleted file mode 100644
index 850bdf5b8b4..00000000000
--- a/intern/cycles/bvh/bvh4.cpp
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Adapted from code copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bvh/bvh4.h"
-
-#include "render/mesh.h"
-#include "render/object.h"
-
-#include "bvh/bvh_node.h"
-#include "bvh/bvh_unaligned.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Can we avoid this somehow or make more generic?
- *
- * Perhaps we can merge nodes in actual tree and make our
- * life easier all over the place.
- */
-
-BVH4::BVH4(const BVHParams &params_, const vector<Object *> &objects_) : BVH(params_, objects_)
-{
- params.bvh_layout = BVH_LAYOUT_BVH4;
-}
-
-namespace {
-
-BVHNode *bvh_node_merge_children_recursively(const BVHNode *node)
-{
- if (node->is_leaf()) {
- return new LeafNode(*reinterpret_cast<const LeafNode *>(node));
- }
- /* Collect nodes of one layer deeper, allowing us to have more childrem in
- * an inner layer. */
- assert(node->num_children() <= 2);
- const BVHNode *children[4];
- const BVHNode *child0 = node->get_child(0);
- const BVHNode *child1 = node->get_child(1);
- int num_children = 0;
- if (child0->is_leaf()) {
- children[num_children++] = child0;
- }
- else {
- children[num_children++] = child0->get_child(0);
- children[num_children++] = child0->get_child(1);
- }
- if (child1->is_leaf()) {
- children[num_children++] = child1;
- }
- else {
- children[num_children++] = child1->get_child(0);
- children[num_children++] = child1->get_child(1);
- }
- /* Merge children in subtrees. */
- BVHNode *children4[4];
- for (int i = 0; i < num_children; ++i) {
- children4[i] = bvh_node_merge_children_recursively(children[i]);
- }
- /* Allocate new node. */
- BVHNode *node4 = new InnerNode(node->bounds, children4, num_children);
- /* TODO(sergey): Consider doing this from the InnerNode() constructor.
- * But in order to do this nicely need to think of how to pass all the
- * parameters there. */
- if (node->is_unaligned) {
- node4->is_unaligned = true;
- node4->aligned_space = new Transform();
- *node4->aligned_space = *node->aligned_space;
- }
- return node4;
-}
-
-} // namespace
-
-BVHNode *BVH4::widen_children_nodes(const BVHNode *root)
-{
- if (root == NULL) {
- return NULL;
- }
- if (root->is_leaf()) {
- return const_cast<BVHNode *>(root);
- }
- BVHNode *root4 = bvh_node_merge_children_recursively(root);
- /* TODO(sergey): Pack children nodes to parents which has less that 4
- * children. */
- return root4;
-}
-
-void BVH4::pack_leaf(const BVHStackEntry &e, const LeafNode *leaf)
-{
- float4 data[BVH_QNODE_LEAF_SIZE];
- memset(data, 0, sizeof(data));
- if (leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
- /* object */
- data[0].x = __int_as_float(~(leaf->lo));
- data[0].y = __int_as_float(0);
- }
- else {
- /* triangle */
- data[0].x = __int_as_float(leaf->lo);
- data[0].y = __int_as_float(leaf->hi);
- }
- data[0].z = __uint_as_float(leaf->visibility);
- if (leaf->num_triangles() != 0) {
- data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
- }
-
- memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4) * BVH_QNODE_LEAF_SIZE);
-}
-
-void BVH4::pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
- bool has_unaligned = false;
- /* Check whether we have to create unaligned node or all nodes are aligned
- * and we can cut some corner here.
- */
- if (params.use_unaligned_nodes) {
- for (int i = 0; i < num; i++) {
- if (en[i].node->is_unaligned) {
- has_unaligned = true;
- break;
- }
- }
- }
- if (has_unaligned) {
- /* There's no unaligned children, pack into AABB node. */
- pack_unaligned_inner(e, en, num);
- }
- else {
- /* Create unaligned node with orientation transform for each of the
- * children.
- */
- pack_aligned_inner(e, en, num);
- }
-}
-
-void BVH4::pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
- BoundBox bounds[4];
- int child[4];
- for (int i = 0; i < num; ++i) {
- bounds[i] = en[i].node->bounds;
- child[i] = en[i].encodeIdx();
- }
- pack_aligned_node(
- e.idx, bounds, child, e.node->visibility, e.node->time_from, e.node->time_to, num);
-}
-
-void BVH4::pack_aligned_node(int idx,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num)
-{
- float4 data[BVH_QNODE_SIZE];
- memset(data, 0, sizeof(data));
-
- data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
- data[0].y = time_from;
- data[0].z = time_to;
-
- for (int i = 0; i < num; i++) {
- float3 bb_min = bounds[i].min;
- float3 bb_max = bounds[i].max;
-
- data[1][i] = bb_min.x;
- data[2][i] = bb_max.x;
- data[3][i] = bb_min.y;
- data[4][i] = bb_max.y;
- data[5][i] = bb_min.z;
- data[6][i] = bb_max.z;
-
- data[7][i] = __int_as_float(child[i]);
- }
-
- for (int i = num; i < 4; i++) {
- /* We store BB which would never be recorded as intersection
- * so kernel might safely assume there are always 4 child nodes.
- */
- data[1][i] = FLT_MAX;
- data[2][i] = -FLT_MAX;
-
- data[3][i] = FLT_MAX;
- data[4][i] = -FLT_MAX;
-
- data[5][i] = FLT_MAX;
- data[6][i] = -FLT_MAX;
-
- data[7][i] = __int_as_float(0);
- }
-
- memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_QNODE_SIZE);
-}
-
-void BVH4::pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
- Transform aligned_space[4];
- BoundBox bounds[4];
- int child[4];
- for (int i = 0; i < num; ++i) {
- aligned_space[i] = en[i].node->get_aligned_space();
- bounds[i] = en[i].node->bounds;
- child[i] = en[i].encodeIdx();
- }
- pack_unaligned_node(e.idx,
- aligned_space,
- bounds,
- child,
- e.node->visibility,
- e.node->time_from,
- e.node->time_to,
- num);
-}
-
-void BVH4::pack_unaligned_node(int idx,
- const Transform *aligned_space,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num)
-{
- float4 data[BVH_UNALIGNED_QNODE_SIZE];
- memset(data, 0, sizeof(data));
-
- data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
- data[0].y = time_from;
- data[0].z = time_to;
-
- for (int i = 0; i < num; i++) {
- Transform space = BVHUnaligned::compute_node_transform(bounds[i], aligned_space[i]);
-
- data[1][i] = space.x.x;
- data[2][i] = space.x.y;
- data[3][i] = space.x.z;
-
- data[4][i] = space.y.x;
- data[5][i] = space.y.y;
- data[6][i] = space.y.z;
-
- data[7][i] = space.z.x;
- data[8][i] = space.z.y;
- data[9][i] = space.z.z;
-
- data[10][i] = space.x.w;
- data[11][i] = space.y.w;
- data[12][i] = space.z.w;
-
- data[13][i] = __int_as_float(child[i]);
- }
-
- for (int i = num; i < 4; i++) {
- /* We store BB which would never be recorded as intersection
- * so kernel might safely assume there are always 4 child nodes.
- */
-
- data[1][i] = NAN;
- data[2][i] = NAN;
- data[3][i] = NAN;
-
- data[4][i] = NAN;
- data[5][i] = NAN;
- data[6][i] = NAN;
-
- data[7][i] = NAN;
- data[8][i] = NAN;
- data[9][i] = NAN;
-
- data[10][i] = NAN;
- data[11][i] = NAN;
- data[12][i] = NAN;
-
- data[13][i] = __int_as_float(0);
- }
-
- memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_UNALIGNED_QNODE_SIZE);
-}
-
-/* Quad SIMD Nodes */
-
-void BVH4::pack_nodes(const BVHNode *root)
-{
- /* Calculate size of the arrays required. */
- const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
- const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
- assert(num_leaf_nodes <= num_nodes);
- const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
- size_t node_size;
- if (params.use_unaligned_nodes) {
- const size_t num_unaligned_nodes = root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
- node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) +
- (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE;
- }
- else {
- node_size = num_inner_nodes * BVH_QNODE_SIZE;
- }
- /* Resize arrays. */
- pack.nodes.clear();
- pack.leaf_nodes.clear();
- /* For top level BVH, first merge existing BVH's so we know the offsets. */
- if (params.top_level) {
- pack_instances(node_size, num_leaf_nodes * BVH_QNODE_LEAF_SIZE);
- }
- else {
- pack.nodes.resize(node_size);
- pack.leaf_nodes.resize(num_leaf_nodes * BVH_QNODE_LEAF_SIZE);
- }
-
- int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
- vector<BVHStackEntry> stack;
- stack.reserve(BVHParams::MAX_DEPTH * 2);
- if (root->is_leaf()) {
- stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
- }
- else {
- stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += root->has_unaligned() ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE;
- }
-
- while (stack.size()) {
- BVHStackEntry e = stack.back();
- stack.pop_back();
-
- if (e.node->is_leaf()) {
- /* leaf node */
- const LeafNode *leaf = reinterpret_cast<const LeafNode *>(e.node);
- pack_leaf(e, leaf);
- }
- else {
- /* Inner node. */
- /* Collect nodes. */
- const BVHNode *children[4];
- const int num_children = e.node->num_children();
- /* Push entries on the stack. */
- for (int i = 0; i < num_children; ++i) {
- int idx;
- children[i] = e.node->get_child(i);
- assert(children[i] != NULL);
- if (children[i]->is_leaf()) {
- idx = nextLeafNodeIdx++;
- }
- else {
- idx = nextNodeIdx;
- nextNodeIdx += children[i]->has_unaligned() ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE;
- }
- stack.push_back(BVHStackEntry(children[i], idx));
- }
- /* Set node. */
- pack_inner(e, &stack[stack.size() - num_children], num_children);
- }
- }
-
- assert(node_size == nextNodeIdx);
- /* Root index to start traversal at, to handle case of single leaf node. */
- pack.root_index = (root->is_leaf()) ? -1 : 0;
-}
-
-void BVH4::refit_nodes()
-{
- assert(!params.top_level);
-
- BoundBox bbox = BoundBox::empty;
- uint visibility = 0;
- refit_node(0, (pack.root_index == -1) ? true : false, bbox, visibility);
-}
-
-void BVH4::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility)
-{
- if (leaf) {
- /* Refit leaf node. */
- int4 *data = &pack.leaf_nodes[idx];
- int4 c = data[0];
-
- BVH::refit_primitives(c.x, c.y, bbox, visibility);
-
- /* TODO(sergey): This is actually a copy of pack_leaf(),
- * but this chunk of code only knows actual data and has
- * no idea about BVHNode.
- *
- * Would be nice to de-duplicate code, but trying to make
- * making code more general ends up in much nastier code
- * in my opinion so far.
- *
- * Same applies to the inner nodes case below.
- */
- float4 leaf_data[BVH_QNODE_LEAF_SIZE];
- leaf_data[0].x = __int_as_float(c.x);
- leaf_data[0].y = __int_as_float(c.y);
- leaf_data[0].z = __uint_as_float(visibility);
- leaf_data[0].w = __uint_as_float(c.w);
- memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4) * BVH_QNODE_LEAF_SIZE);
- }
- else {
- int4 *data = &pack.nodes[idx];
- bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0;
- int4 c;
- if (is_unaligned) {
- c = data[13];
- }
- else {
- c = data[7];
- }
- /* Refit inner node, set bbox from children. */
- BoundBox child_bbox[4] = {BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty};
- uint child_visibility[4] = {0};
- int num_nodes = 0;
-
- for (int i = 0; i < 4; ++i) {
- if (c[i] != 0) {
- refit_node((c[i] < 0) ? -c[i] - 1 : c[i], (c[i] < 0), child_bbox[i], child_visibility[i]);
- ++num_nodes;
- bbox.grow(child_bbox[i]);
- visibility |= child_visibility[i];
- }
- }
-
- if (is_unaligned) {
- Transform aligned_space[4] = {
- transform_identity(), transform_identity(), transform_identity(), transform_identity()};
- pack_unaligned_node(
- idx, aligned_space, child_bbox, &c[0], visibility, 0.0f, 1.0f, num_nodes);
- }
- else {
- pack_aligned_node(idx, child_bbox, &c[0], visibility, 0.0f, 1.0f, num_nodes);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h
deleted file mode 100644
index 38b0961d3df..00000000000
--- a/intern/cycles/bvh/bvh4.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Adapted from code copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __BVH4_H__
-#define __BVH4_H__
-
-#include "bvh/bvh.h"
-#include "bvh/bvh_params.h"
-
-#include "util/util_types.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class BVHNode;
-struct BVHStackEntry;
-class BVHParams;
-class BoundBox;
-class LeafNode;
-class Object;
-class Progress;
-
-#define BVH_QNODE_SIZE 8
-#define BVH_QNODE_LEAF_SIZE 1
-#define BVH_UNALIGNED_QNODE_SIZE 14
-
-/* BVH4
- *
- * Quad BVH, with each node having four children, to use with SIMD instructions.
- */
-class BVH4 : public BVH {
- protected:
- /* constructor */
- friend class BVH;
- BVH4(const BVHParams &params, const vector<Object *> &objects);
-
- /* Building process. */
- virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
-
- /* pack */
- void pack_nodes(const BVHNode *root) override;
-
- void pack_leaf(const BVHStackEntry &e, const LeafNode *leaf);
- void pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-
- void pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
- void pack_aligned_node(int idx,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num);
-
- void pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
- void pack_unaligned_node(int idx,
- const Transform *aligned_space,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num);
-
- /* refit */
- void refit_nodes() override;
- void refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __BVH4_H__ */
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
deleted file mode 100644
index e812d806b94..00000000000
--- a/intern/cycles/bvh/bvh8.cpp
+++ /dev/null
@@ -1,539 +0,0 @@
-/*
-* Original code Copyright 2017, Intel Corporation
-* Modifications Copyright 2018, Blender Foundation.
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions are met:
-*
-* * Redistributions of source code must retain the above copyright notice,
-* this list of conditions and the following disclaimer.
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the distribution.
-* * Neither the name of Intel Corporation nor the names of its contributors
-* may be used to endorse or promote products derived from this software
-* without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#include "bvh/bvh8.h"
-
-#include "render/mesh.h"
-#include "render/object.h"
-
-#include "bvh/bvh_node.h"
-#include "bvh/bvh_unaligned.h"
-
-CCL_NAMESPACE_BEGIN
-
-BVH8::BVH8(const BVHParams &params_, const vector<Object *> &objects_) : BVH(params_, objects_)
-{
-}
-
-namespace {
-
-BVHNode *bvh_node_merge_children_recursively(const BVHNode *node)
-{
- if (node->is_leaf()) {
- return new LeafNode(*reinterpret_cast<const LeafNode *>(node));
- }
- /* Collect nodes of two layer deeper, allowing us to have more childrem in
- * an inner layer. */
- assert(node->num_children() <= 2);
- const BVHNode *children[8];
- const BVHNode *child0 = node->get_child(0);
- const BVHNode *child1 = node->get_child(1);
- int num_children = 0;
- if (child0->is_leaf()) {
- children[num_children++] = child0;
- }
- else {
- const BVHNode *child00 = child0->get_child(0), *child01 = child0->get_child(1);
- if (child00->is_leaf()) {
- children[num_children++] = child00;
- }
- else {
- children[num_children++] = child00->get_child(0);
- children[num_children++] = child00->get_child(1);
- }
- if (child01->is_leaf()) {
- children[num_children++] = child01;
- }
- else {
- children[num_children++] = child01->get_child(0);
- children[num_children++] = child01->get_child(1);
- }
- }
- if (child1->is_leaf()) {
- children[num_children++] = child1;
- }
- else {
- const BVHNode *child10 = child1->get_child(0), *child11 = child1->get_child(1);
- if (child10->is_leaf()) {
- children[num_children++] = child10;
- }
- else {
- children[num_children++] = child10->get_child(0);
- children[num_children++] = child10->get_child(1);
- }
- if (child11->is_leaf()) {
- children[num_children++] = child11;
- }
- else {
- children[num_children++] = child11->get_child(0);
- children[num_children++] = child11->get_child(1);
- }
- }
- /* Merge children in subtrees. */
- BVHNode *children4[8];
- for (int i = 0; i < num_children; ++i) {
- children4[i] = bvh_node_merge_children_recursively(children[i]);
- }
- /* Allocate new node. */
- BVHNode *node8 = new InnerNode(node->bounds, children4, num_children);
- /* TODO(sergey): Consider doing this from the InnerNode() constructor.
- * But in order to do this nicely need to think of how to pass all the
- * parameters there. */
- if (node->is_unaligned) {
- node8->is_unaligned = true;
- node8->aligned_space = new Transform();
- *node8->aligned_space = *node->aligned_space;
- }
- return node8;
-}
-
-} // namespace
-
-BVHNode *BVH8::widen_children_nodes(const BVHNode *root)
-{
- if (root == NULL) {
- return NULL;
- }
- if (root->is_leaf()) {
- return const_cast<BVHNode *>(root);
- }
- BVHNode *root8 = bvh_node_merge_children_recursively(root);
- /* TODO(sergey): Pack children nodes to parents which has less that 4
- * children. */
- return root8;
-}
-
-void BVH8::pack_leaf(const BVHStackEntry &e, const LeafNode *leaf)
-{
- float4 data[BVH_ONODE_LEAF_SIZE];
- memset(data, 0, sizeof(data));
- if (leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
- /* object */
- data[0].x = __int_as_float(~(leaf->lo));
- data[0].y = __int_as_float(0);
- }
- else {
- /* triangle */
- data[0].x = __int_as_float(leaf->lo);
- data[0].y = __int_as_float(leaf->hi);
- }
- data[0].z = __uint_as_float(leaf->visibility);
- if (leaf->num_triangles() != 0) {
- data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
- }
-
- memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4) * BVH_ONODE_LEAF_SIZE);
-}
-
-void BVH8::pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
- bool has_unaligned = false;
- /* Check whether we have to create unaligned node or all nodes are aligned
- * and we can cut some corner here.
- */
- if (params.use_unaligned_nodes) {
- for (int i = 0; i < num; i++) {
- if (en[i].node->is_unaligned) {
- has_unaligned = true;
- break;
- }
- }
- }
- if (has_unaligned) {
- /* There's no unaligned children, pack into AABB node. */
- pack_unaligned_inner(e, en, num);
- }
- else {
- /* Create unaligned node with orientation transform for each of the
- * children.
- */
- pack_aligned_inner(e, en, num);
- }
-}
-
-void BVH8::pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
- BoundBox bounds[8];
- int child[8];
- for (int i = 0; i < num; ++i) {
- bounds[i] = en[i].node->bounds;
- child[i] = en[i].encodeIdx();
- }
- pack_aligned_node(
- e.idx, bounds, child, e.node->visibility, e.node->time_from, e.node->time_to, num);
-}
-
-void BVH8::pack_aligned_node(int idx,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num)
-{
- float8 data[8];
- memset(data, 0, sizeof(data));
-
- data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
- data[0].b = time_from;
- data[0].c = time_to;
-
- for (int i = 0; i < num; i++) {
- float3 bb_min = bounds[i].min;
- float3 bb_max = bounds[i].max;
-
- data[1][i] = bb_min.x;
- data[2][i] = bb_max.x;
- data[3][i] = bb_min.y;
- data[4][i] = bb_max.y;
- data[5][i] = bb_min.z;
- data[6][i] = bb_max.z;
-
- data[7][i] = __int_as_float(child[i]);
- }
-
- for (int i = num; i < 8; i++) {
- /* We store BB which would never be recorded as intersection
- * so kernel might safely assume there are always 4 child nodes.
- */
- data[1][i] = FLT_MAX;
- data[2][i] = -FLT_MAX;
-
- data[3][i] = FLT_MAX;
- data[4][i] = -FLT_MAX;
-
- data[5][i] = FLT_MAX;
- data[6][i] = -FLT_MAX;
-
- data[7][i] = __int_as_float(0);
- }
-
- memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_ONODE_SIZE);
-}
-
-void BVH8::pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num)
-{
- Transform aligned_space[8];
- BoundBox bounds[8];
- int child[8];
- for (int i = 0; i < num; ++i) {
- aligned_space[i] = en[i].node->get_aligned_space();
- bounds[i] = en[i].node->bounds;
- child[i] = en[i].encodeIdx();
- }
- pack_unaligned_node(e.idx,
- aligned_space,
- bounds,
- child,
- e.node->visibility,
- e.node->time_from,
- e.node->time_to,
- num);
-}
-
-void BVH8::pack_unaligned_node(int idx,
- const Transform *aligned_space,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num)
-{
- float8 data[BVH_UNALIGNED_ONODE_SIZE];
- memset(data, 0, sizeof(data));
-
- data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
- data[0].b = time_from;
- data[0].c = time_to;
-
- for (int i = 0; i < num; i++) {
- Transform space = BVHUnaligned::compute_node_transform(bounds[i], aligned_space[i]);
-
- data[1][i] = space.x.x;
- data[2][i] = space.x.y;
- data[3][i] = space.x.z;
-
- data[4][i] = space.y.x;
- data[5][i] = space.y.y;
- data[6][i] = space.y.z;
-
- data[7][i] = space.z.x;
- data[8][i] = space.z.y;
- data[9][i] = space.z.z;
-
- data[10][i] = space.x.w;
- data[11][i] = space.y.w;
- data[12][i] = space.z.w;
-
- data[13][i] = __int_as_float(child[i]);
- }
-
- for (int i = num; i < 8; i++) {
- /* We store BB which would never be recorded as intersection
- * so kernel might safely assume there are always 4 child nodes.
- */
-
- data[1][i] = NAN;
- data[2][i] = NAN;
- data[3][i] = NAN;
-
- data[4][i] = NAN;
- data[5][i] = NAN;
- data[6][i] = NAN;
-
- data[7][i] = NAN;
- data[8][i] = NAN;
- data[9][i] = NAN;
-
- data[10][i] = NAN;
- data[11][i] = NAN;
- data[12][i] = NAN;
-
- data[13][i] = __int_as_float(0);
- }
-
- memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_UNALIGNED_ONODE_SIZE);
-}
-
-/* Quad SIMD Nodes */
-
-void BVH8::pack_nodes(const BVHNode *root)
-{
- /* Calculate size of the arrays required. */
- const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
- const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
- assert(num_leaf_nodes <= num_nodes);
- const size_t num_inner_nodes = num_nodes - num_leaf_nodes;
- size_t node_size;
- if (params.use_unaligned_nodes) {
- const size_t num_unaligned_nodes = root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT);
- node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) +
- (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE;
- }
- else {
- node_size = num_inner_nodes * BVH_ONODE_SIZE;
- }
- /* Resize arrays. */
- pack.nodes.clear();
- pack.leaf_nodes.clear();
- /* For top level BVH, first merge existing BVH's so we know the offsets. */
- if (params.top_level) {
- pack_instances(node_size, num_leaf_nodes * BVH_ONODE_LEAF_SIZE);
- }
- else {
- pack.nodes.resize(node_size);
- pack.leaf_nodes.resize(num_leaf_nodes * BVH_ONODE_LEAF_SIZE);
- }
-
- int nextNodeIdx = 0, nextLeafNodeIdx = 0;
-
- vector<BVHStackEntry> stack;
- stack.reserve(BVHParams::MAX_DEPTH * 2);
- if (root->is_leaf()) {
- stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++));
- }
- else {
- stack.push_back(BVHStackEntry(root, nextNodeIdx));
- nextNodeIdx += root->has_unaligned() ? BVH_UNALIGNED_ONODE_SIZE : BVH_ONODE_SIZE;
- }
-
- while (stack.size()) {
- BVHStackEntry e = stack.back();
- stack.pop_back();
-
- if (e.node->is_leaf()) {
- /* leaf node */
- const LeafNode *leaf = reinterpret_cast<const LeafNode *>(e.node);
- pack_leaf(e, leaf);
- }
- else {
- /* Inner node. */
- /* Collect nodes. */
- const BVHNode *children[8];
- int num_children = e.node->num_children();
- /* Push entries on the stack. */
- for (int i = 0; i < num_children; ++i) {
- int idx;
- children[i] = e.node->get_child(i);
- if (children[i]->is_leaf()) {
- idx = nextLeafNodeIdx++;
- }
- else {
- idx = nextNodeIdx;
- nextNodeIdx += children[i]->has_unaligned() ? BVH_UNALIGNED_ONODE_SIZE : BVH_ONODE_SIZE;
- }
- stack.push_back(BVHStackEntry(children[i], idx));
- }
- /* Set node. */
- pack_inner(e, &stack[stack.size() - num_children], num_children);
- }
- }
-
- assert(node_size == nextNodeIdx);
- /* Root index to start traversal at, to handle case of single leaf node. */
- pack.root_index = (root->is_leaf()) ? -1 : 0;
-}
-
-void BVH8::refit_nodes()
-{
- assert(!params.top_level);
-
- BoundBox bbox = BoundBox::empty;
- uint visibility = 0;
- refit_node(0, (pack.root_index == -1) ? true : false, bbox, visibility);
-}
-
-void BVH8::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility)
-{
- if (leaf) {
- int4 *data = &pack.leaf_nodes[idx];
- int4 c = data[0];
- /* Refit leaf node. */
- for (int prim = c.x; prim < c.y; prim++) {
- int pidx = pack.prim_index[prim];
- int tob = pack.prim_object[prim];
- Object *ob = objects[tob];
-
- if (pidx == -1) {
- /* Object instance. */
- bbox.grow(ob->bounds);
- }
- else {
- /* Primitives. */
- const Mesh *mesh = ob->mesh;
-
- if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
- /* Curves. */
- int str_offset = (params.top_level) ? mesh->curve_offset : 0;
- Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
- int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
-
- curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
-
- visibility |= PATH_RAY_CURVE;
-
- /* Motion curves. */
- if (mesh->use_motion_blur) {
- Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if (attr) {
- size_t mesh_size = mesh->curve_keys.size();
- size_t steps = mesh->motion_steps - 1;
- float3 *key_steps = attr->data_float3();
-
- for (size_t i = 0; i < steps; i++) {
- curve.bounds_grow(k, key_steps + i * mesh_size, &mesh->curve_radius[0], bbox);
- }
- }
- }
- }
- else {
- /* Triangles. */
- int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
- Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
- const float3 *vpos = &mesh->verts[0];
-
- triangle.bounds_grow(vpos, bbox);
-
- /* Motion triangles. */
- if (mesh->use_motion_blur) {
- Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if (attr) {
- size_t mesh_size = mesh->verts.size();
- size_t steps = mesh->motion_steps - 1;
- float3 *vert_steps = attr->data_float3();
-
- for (size_t i = 0; i < steps; i++) {
- triangle.bounds_grow(vert_steps + i * mesh_size, bbox);
- }
- }
- }
- }
- }
-
- visibility |= ob->visibility;
- }
-
- float4 leaf_data[BVH_ONODE_LEAF_SIZE];
- leaf_data[0].x = __int_as_float(c.x);
- leaf_data[0].y = __int_as_float(c.y);
- leaf_data[0].z = __uint_as_float(visibility);
- leaf_data[0].w = __uint_as_float(c.w);
- memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4) * BVH_ONODE_LEAF_SIZE);
- }
- else {
- float8 *data = (float8 *)&pack.nodes[idx];
- bool is_unaligned = (__float_as_uint(data[0].a) & PATH_RAY_NODE_UNALIGNED) != 0;
- /* Refit inner node, set bbox from children. */
- BoundBox child_bbox[8] = {BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty,
- BoundBox::empty};
- int child[8];
- uint child_visibility[8] = {0};
- int num_nodes = 0;
-
- for (int i = 0; i < 8; ++i) {
- child[i] = __float_as_int(data[(is_unaligned) ? 13 : 7][i]);
-
- if (child[i] != 0) {
- refit_node((child[i] < 0) ? -child[i] - 1 : child[i],
- (child[i] < 0),
- child_bbox[i],
- child_visibility[i]);
- ++num_nodes;
- bbox.grow(child_bbox[i]);
- visibility |= child_visibility[i];
- }
- }
-
- if (is_unaligned) {
- Transform aligned_space[8] = {transform_identity(),
- transform_identity(),
- transform_identity(),
- transform_identity(),
- transform_identity(),
- transform_identity(),
- transform_identity(),
- transform_identity()};
- pack_unaligned_node(
- idx, aligned_space, child_bbox, child, visibility, 0.0f, 1.0f, num_nodes);
- }
- else {
- pack_aligned_node(idx, child_bbox, child, visibility, 0.0f, 1.0f, num_nodes);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
deleted file mode 100644
index fc07eadcada..00000000000
--- a/intern/cycles/bvh/bvh8.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Original code Copyright 2017, Intel Corporation
-* Modifications Copyright 2018, Blender Foundation.
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions are met:
-*
-* * Redistributions of source code must retain the above copyright notice,
-* this list of conditions and the following disclaimer.
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the distribution.
-* * Neither the name of Intel Corporation nor the names of its contributors
-* may be used to endorse or promote products derived from this software
-* without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef __BVH8_H__
-#define __BVH8_H__
-
-#include "bvh/bvh.h"
-#include "bvh/bvh_params.h"
-
-#include "util/util_types.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class BVHNode;
-struct BVHStackEntry;
-class BVHParams;
-class BoundBox;
-class LeafNode;
-class Object;
-class Progress;
-
-#define BVH_ONODE_SIZE 16
-#define BVH_ONODE_LEAF_SIZE 1
-#define BVH_UNALIGNED_ONODE_SIZE 28
-
-/* BVH8
-*
-* Octo BVH, with each node having eight children, to use with SIMD instructions.
-*/
-class BVH8 : public BVH {
- protected:
- /* constructor */
- friend class BVH;
- BVH8(const BVHParams &params, const vector<Object *> &objects);
-
- /* Building process. */
- virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
-
- /* pack */
- void pack_nodes(const BVHNode *root) override;
-
- void pack_leaf(const BVHStackEntry &e, const LeafNode *leaf);
- void pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
-
- void pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
- void pack_aligned_node(int idx,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num);
-
- void pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num);
- void pack_unaligned_node(int idx,
- const Transform *aligned_space,
- const BoundBox *bounds,
- const int *child,
- const uint visibility,
- const float time_from,
- const float time_to,
- const int num);
-
- /* refit */
- void refit_nodes() override;
- void refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __BVH8_H__ */
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 1d9b006e8cb..86ab7b00815 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -22,64 +22,23 @@
#include "bvh/bvh_params.h"
#include "bvh_split.h"
+#include "render/curves.h"
+#include "render/hair.h"
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
-#include "render/curves.h"
#include "util/util_algorithm.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_progress.h"
-#include "util/util_stack_allocator.h"
+#include "util/util_queue.h"
#include "util/util_simd.h"
+#include "util/util_stack_allocator.h"
#include "util/util_time.h"
-#include "util/util_queue.h"
CCL_NAMESPACE_BEGIN
-/* BVH Build Task */
-
-class BVHBuildTask : public Task {
- public:
- BVHBuildTask(
- BVHBuild *build, InnerNode *node, int child, const BVHObjectBinning &range, int level)
- : range_(range)
- {
- run = function_bind(&BVHBuild::thread_build_node, build, node, child, &range_, level);
- }
-
- private:
- BVHObjectBinning range_;
-};
-
-class BVHSpatialSplitBuildTask : public Task {
- public:
- BVHSpatialSplitBuildTask(BVHBuild *build,
- InnerNode *node,
- int child,
- const BVHRange &range,
- const vector<BVHReference> &references,
- int level)
- : range_(range),
- references_(references.begin() + range.start(), references.begin() + range.end())
- {
- range_.set_start(0);
- run = function_bind(&BVHBuild::thread_build_spatial_split_node,
- build,
- node,
- child,
- &range_,
- &references_,
- level,
- _1);
- }
-
- private:
- BVHRange range_;
- vector<BVHReference> references_;
-};
-
/* Constructor / Destructor */
BVHBuild::BVHBuild(const vector<Object *> &objects_,
@@ -194,23 +153,30 @@ void BVHBuild::add_reference_triangles(BoundBox &root, BoundBox &center, Mesh *m
}
}
-void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh, int i)
+void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair, int i)
{
const Attribute *curve_attr_mP = NULL;
- if (mesh->has_motion_blur()) {
- curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (hair->has_motion_blur()) {
+ curve_attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
}
- const size_t num_curves = mesh->num_curves();
+
+ const PrimitiveType primitive_type =
+ (curve_attr_mP != NULL) ?
+ ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON :
+ PRIMITIVE_MOTION_CURVE_THICK) :
+ ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : PRIMITIVE_CURVE_THICK);
+
+ const size_t num_curves = hair->num_curves();
for (uint j = 0; j < num_curves; j++) {
- const Mesh::Curve curve = mesh->get_curve(j);
- const float *curve_radius = &mesh->curve_radius[0];
+ const Hair::Curve curve = hair->get_curve(j);
+ const float *curve_radius = &hair->curve_radius[0];
for (int k = 0; k < curve.num_keys - 1; k++) {
if (curve_attr_mP == NULL) {
/* Really simple logic for static hair. */
BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
+ curve.bounds_grow(k, &hair->curve_keys[0], curve_radius, bounds);
if (bounds.valid()) {
- int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k);
+ int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
references.push_back(BVHReference(bounds, j, i, packed_type));
root.grow(bounds);
center.grow(bounds.center2());
@@ -223,15 +189,15 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
*/
/* TODO(sergey): Support motion steps for spatially split BVH. */
BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
- const size_t num_keys = mesh->curve_keys.size();
- const size_t num_steps = mesh->motion_steps;
+ curve.bounds_grow(k, &hair->curve_keys[0], curve_radius, bounds);
+ const size_t num_keys = hair->curve_keys.size();
+ const size_t num_steps = hair->motion_steps;
const float3 *key_steps = curve_attr_mP->data_float3();
for (size_t step = 0; step < num_steps - 1; step++) {
curve.bounds_grow(k, key_steps + step * num_keys, curve_radius, bounds);
}
if (bounds.valid()) {
- int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
+ int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
references.push_back(BVHReference(bounds, j, i, packed_type));
root.grow(bounds);
center.grow(bounds.center2());
@@ -244,10 +210,10 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
*/
const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
- const size_t num_steps = mesh->motion_steps;
- const float3 *curve_keys = &mesh->curve_keys[0];
+ const size_t num_steps = hair->motion_steps;
+ const float3 *curve_keys = &hair->curve_keys[0];
const float3 *key_steps = curve_attr_mP->data_float3();
- const size_t num_keys = mesh->curve_keys.size();
+ const size_t num_keys = hair->curve_keys.size();
/* Calculate bounding box of the previous time step.
* Will be reused later to avoid duplicated work on
* calculating BVH time step boundbox.
@@ -287,7 +253,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
bounds.grow(curr_bounds);
if (bounds.valid()) {
const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
- int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
+ int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
references.push_back(BVHReference(bounds, j, i, packed_type, prev_time, curr_time));
root.grow(bounds);
center.grow(bounds.center2());
@@ -302,13 +268,15 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
}
}
-void BVHBuild::add_reference_mesh(BoundBox &root, BoundBox &center, Mesh *mesh, int i)
+void BVHBuild::add_reference_geometry(BoundBox &root, BoundBox &center, Geometry *geom, int i)
{
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
add_reference_triangles(root, center, mesh, i);
}
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
- add_reference_curves(root, center, mesh, i);
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ add_reference_curves(root, center, hair, i);
}
}
@@ -319,16 +287,30 @@ void BVHBuild::add_reference_object(BoundBox &root, BoundBox &center, Object *ob
center.grow(ob->bounds.center2());
}
-static size_t count_curve_segments(Mesh *mesh)
+static size_t count_curve_segments(Hair *hair)
{
- size_t num = 0, num_curves = mesh->num_curves();
+ size_t num = 0, num_curves = hair->num_curves();
for (size_t i = 0; i < num_curves; i++)
- num += mesh->get_curve(i).num_keys - 1;
+ num += hair->get_curve(i).num_keys - 1;
return num;
}
+static size_t count_primitives(Geometry *geom)
+{
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ return mesh->num_triangles();
+ }
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ return count_curve_segments(hair);
+ }
+
+ return 0;
+}
+
void BVHBuild::add_references(BVHRange &root)
{
/* reserve space for references */
@@ -339,24 +321,14 @@ void BVHBuild::add_references(BVHRange &root)
if (!ob->is_traceable()) {
continue;
}
- if (!ob->mesh->is_instanced()) {
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
- num_alloc_references += ob->mesh->num_triangles();
- }
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
- num_alloc_references += count_curve_segments(ob->mesh);
- }
+ if (!ob->geometry->is_instanced()) {
+ num_alloc_references += count_primitives(ob->geometry);
}
else
num_alloc_references++;
}
else {
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
- num_alloc_references += ob->mesh->num_triangles();
- }
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
- num_alloc_references += count_curve_segments(ob->mesh);
- }
+ num_alloc_references += count_primitives(ob->geometry);
}
}
@@ -372,13 +344,13 @@ void BVHBuild::add_references(BVHRange &root)
++i;
continue;
}
- if (!ob->mesh->is_instanced())
- add_reference_mesh(bounds, center, ob->mesh, i);
+ if (!ob->geometry->is_instanced())
+ add_reference_geometry(bounds, center, ob->geometry, i);
else
add_reference_object(bounds, center, ob, i);
}
else
- add_reference_mesh(bounds, center, ob->mesh, i);
+ add_reference_geometry(bounds, center, ob->geometry, i);
i++;
@@ -416,22 +388,6 @@ BVHNode *BVHBuild::run()
}
spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
- if (params.use_spatial_split) {
- /* NOTE: The API here tries to be as much ready for multi-threaded build
- * as possible, but at the same time it tries not to introduce any
- * changes in behavior for until all refactoring needed for threading is
- * finished.
- *
- * So we currently allocate single storage for now, which is only used by
- * the only thread working on the spatial BVH build.
- */
- spatial_storage.resize(TaskScheduler::num_threads() + 1);
- size_t num_bins = max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1;
- foreach (BVHSpatialStorage &storage, spatial_storage) {
- storage.right_bounds.clear();
- }
- spatial_storage[0].right_bounds.resize(num_bins);
- }
spatial_free_index = 0;
need_prim_time = params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0;
@@ -458,7 +414,8 @@ BVHNode *BVHBuild::run()
if (params.use_spatial_split) {
/* Perform multithreaded spatial split build. */
- rootnode = build_node(root, &references, 0, 0);
+ BVHSpatialStorage *local_storage = &spatial_storage.local();
+ rootnode = build_node(root, references, 0, local_storage);
task_pool.wait_work();
}
else {
@@ -468,6 +425,9 @@ BVHNode *BVHBuild::run()
task_pool.wait_work();
}
+  /* Clean up temporary per-thread memory used during the build. */
+ spatial_storage.clear();
+
/* delete if we canceled */
if (rootnode) {
if (progress.get_cancel()) {
@@ -522,41 +482,46 @@ void BVHBuild::progress_update()
progress_start_time = time_dt();
}
-void BVHBuild::thread_build_node(InnerNode *inner, int child, BVHObjectBinning *range, int level)
+void BVHBuild::thread_build_node(InnerNode *inner,
+ int child,
+ const BVHObjectBinning &range,
+ int level)
{
if (progress.get_cancel())
return;
/* build nodes */
- BVHNode *node = build_node(*range, level);
+ BVHNode *node = build_node(range, level);
/* set child in inner node */
inner->children[child] = node;
/* update progress */
- if (range->size() < THREAD_TASK_SIZE) {
+ if (range.size() < THREAD_TASK_SIZE) {
/*rotate(node, INT_MAX, 5);*/
thread_scoped_lock lock(build_mutex);
- progress_count += range->size();
+ progress_count += range.size();
progress_update();
}
}
void BVHBuild::thread_build_spatial_split_node(InnerNode *inner,
int child,
- BVHRange *range,
- vector<BVHReference> *references,
- int level,
- int thread_id)
+ const BVHRange &range,
+ vector<BVHReference> &references,
+ int level)
{
if (progress.get_cancel()) {
return;
}
+ /* Get per-thread memory for spatial split. */
+ BVHSpatialStorage *local_storage = &spatial_storage.local();
+
/* build nodes */
- BVHNode *node = build_node(*range, references, level, thread_id);
+ BVHNode *node = build_node(range, references, level, local_storage);
/* set child in inner node */
inner->children[child] = node;
@@ -579,14 +544,22 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange &range,
for (int i = 0; i < size; i++) {
const BVHReference &ref = references[range.start() + i];
- if (ref.prim_type() & PRIMITIVE_CURVE)
- num_curves++;
- if (ref.prim_type() & PRIMITIVE_MOTION_CURVE)
- num_motion_curves++;
- else if (ref.prim_type() & PRIMITIVE_TRIANGLE)
- num_triangles++;
- else if (ref.prim_type() & PRIMITIVE_MOTION_TRIANGLE)
- num_motion_triangles++;
+ if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
+ if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
+ num_motion_curves++;
+ }
+ else {
+ num_curves++;
+ }
+ }
+ else if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+ if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
+ num_motion_triangles++;
+ }
+ else {
+ num_triangles++;
+ }
+ }
}
return (num_triangles <= params.max_triangle_leaf_size) &&
@@ -668,8 +641,8 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level)
/* Threaded build */
inner = new InnerNode(bounds);
- task_pool.push(new BVHBuildTask(this, inner, 0, left, level + 1), true);
- task_pool.push(new BVHBuildTask(this, inner, 1, right, level + 1), true);
+ task_pool.push([=] { thread_build_node(inner, 0, left, level + 1); });
+ task_pool.push([=] { thread_build_node(inner, 1, right, level + 1); });
}
if (do_unalinged_split) {
@@ -681,9 +654,9 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level)
/* multithreaded spatial split builder */
BVHNode *BVHBuild::build_node(const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
int level,
- int thread_id)
+ BVHSpatialStorage *storage)
{
/* Update progress.
*
@@ -700,18 +673,17 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
if (!(range.size() > 0 && params.top_level && level == 0)) {
if (params.small_enough_for_leaf(range.size(), level)) {
progress_count += range.size();
- return create_leaf_node(range, *references);
+ return create_leaf_node(range, references);
}
}
/* Perform splitting test. */
- BVHSpatialStorage *storage = &spatial_storage[thread_id];
BVHMixedSplit split(this, storage, range, references, level);
if (!(range.size() > 0 && params.top_level && level == 0)) {
if (split.no_split) {
progress_count += range.size();
- return create_leaf_node(range, *references);
+ return create_leaf_node(range, references);
}
}
float leafSAH = params.sah_primitive_cost * split.leafSAH;
@@ -724,7 +696,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
Transform aligned_space;
bool do_unalinged_split = false;
if (params.use_unaligned_nodes && splitSAH > params.unaligned_split_threshold * leafSAH) {
- aligned_space = unaligned_heuristic.compute_aligned_space(range, &references->at(0));
+ aligned_space = unaligned_heuristic.compute_aligned_space(range, &references.at(0));
unaligned_split = BVHMixedSplit(
this, storage, range, references, level, &unaligned_heuristic, &aligned_space);
/* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */
@@ -750,8 +722,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
BoundBox bounds;
if (do_unalinged_split) {
- bounds = unaligned_heuristic.compute_aligned_boundbox(
- range, &references->at(0), aligned_space);
+ bounds = unaligned_heuristic.compute_aligned_boundbox(range, &references.at(0), aligned_space);
}
else {
bounds = range.bounds();
@@ -763,24 +734,35 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
/* Local build. */
/* Build left node. */
- vector<BVHReference> copy(references->begin() + right.start(),
- references->begin() + right.end());
+ vector<BVHReference> right_references(references.begin() + right.start(),
+ references.begin() + right.end());
right.set_start(0);
- BVHNode *leftnode = build_node(left, references, level + 1, thread_id);
+ BVHNode *leftnode = build_node(left, references, level + 1, storage);
/* Build right node. */
- BVHNode *rightnode = build_node(right, &copy, level + 1, thread_id);
+ BVHNode *rightnode = build_node(right, right_references, level + 1, storage);
inner = new InnerNode(bounds, leftnode, rightnode);
}
else {
/* Threaded build. */
inner = new InnerNode(bounds);
- task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 0, left, *references, level + 1),
- true);
- task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 1, right, *references, level + 1),
- true);
+
+ vector<BVHReference> left_references(references.begin() + left.start(),
+ references.begin() + left.end());
+ vector<BVHReference> right_references(references.begin() + right.start(),
+ references.begin() + right.end());
+ right.set_start(0);
+
+    /* Create tasks for the left and right children, capturing small arguments
+     * by copy and moving the reference vectors to avoid extra memory copies. */
+ task_pool.push([=, refs = std::move(left_references)]() mutable {
+ thread_build_spatial_split_node(inner, 0, left, refs, level + 1);
+ });
+ task_pool.push([=, refs = std::move(right_references)]() mutable {
+ thread_build_spatial_split_node(inner, 1, right, refs, level + 1);
+ });
}
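Editorial note: the task pushes above rely on C++14 init-capture to move each per-child reference array into its closure, so queuing a task copies the small arguments but not the vectors. A minimal generic illustration of the pattern (the helper names are hypothetical, not Cycles API):

/* Sketch: 'refs' is moved into the closure rather than copied; 'mutable'
 * lets the task body use or modify its own instance. */
vector<BVHReference> refs = gather_references(); /* hypothetical source */
task_pool.push([=, refs = std::move(refs)]() mutable {
  process_references(refs); /* hypothetical consumer */
});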
if (do_unalinged_split) {
@@ -878,9 +860,6 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
bounds[type_index].grow(ref.bounds());
visibility[type_index] |= objects[ref.prim_object()]->visibility_for_tracing();
- if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
- visibility[type_index] |= PATH_RAY_CURVE;
- }
++num_new_prims;
}
else {
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 9685e26cfac..c35af083fbd 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -35,6 +35,8 @@ class BVHNode;
class BVHSpatialSplitBuildTask;
class BVHParams;
class InnerNode;
+class Geometry;
+class Hair;
class Mesh;
class Object;
class Progress;
@@ -65,16 +67,16 @@ class BVHBuild {
/* Adding references. */
void add_reference_triangles(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
- void add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
- void add_reference_mesh(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
+ void add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair, int i);
+ void add_reference_geometry(BoundBox &root, BoundBox &center, Geometry *geom, int i);
void add_reference_object(BoundBox &root, BoundBox &center, Object *ob, int i);
void add_references(BVHRange &root);
/* Building. */
BVHNode *build_node(const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
int level,
- int thread_id);
+ BVHSpatialStorage *storage);
BVHNode *build_node(const BVHObjectBinning &range, int level);
BVHNode *create_leaf_node(const BVHRange &range, const vector<BVHReference> &references);
BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num);
@@ -84,13 +86,12 @@ class BVHBuild {
/* Threads. */
enum { THREAD_TASK_SIZE = 4096 };
- void thread_build_node(InnerNode *node, int child, BVHObjectBinning *range, int level);
+ void thread_build_node(InnerNode *node, int child, const BVHObjectBinning &range, int level);
void thread_build_spatial_split_node(InnerNode *node,
int child,
- BVHRange *range,
- vector<BVHReference> *references,
- int level,
- int thread_id);
+ const BVHRange &range,
+ vector<BVHReference> &references,
+ int level);
thread_mutex build_mutex;
/* Progress. */
@@ -125,7 +126,7 @@ class BVHBuild {
/* Spatial splitting. */
float spatial_min_overlap;
- vector<BVHSpatialStorage> spatial_storage;
+ enumerable_thread_specific<BVHSpatialStorage> spatial_storage;
size_t spatial_free_index;
thread_spin_lock spatial_spin_lock;
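Editorial note: with spatial_storage now an enumerable_thread_specific container, each worker thread obtains its own scratch storage lazily instead of being handed a thread_id. A minimal sketch of the intended access pattern, assuming TBB-style local() semantics (the exact call site is outside the hunks shown here):

/* Sketch: fetch (and lazily create) this thread's spatial storage, then pass
 * it down the recursive build instead of a thread index. */
BVHSpatialStorage *storage = &spatial_storage.local();
BVHNode *node = build_node(range, references, level, storage);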
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 5ef9622aba2..17e1f86a589 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -14,47 +14,52 @@
* limitations under the License.
*/
-/* This class implemens a ray accelerator for Cycles using Intel's Embree library.
+/* This class implements a ray accelerator for Cycles using Intel's Embree library.
* It supports triangles, curves, object and deformation blur and instancing.
- * Not supported are thick line segments, those have no native equivalent in Embree.
- * They could be implemented using Embree's thick curves, at the expense of wasted memory.
- * User defined intersections for Embree could also be an option, but since Embree only uses aligned BVHs
- * for user geometry, this would come with reduced performance and/or higher memory usage.
*
- * Since Embree allows object to be either curves or triangles but not both, Cycles object IDs are maapped
- * to Embree IDs by multiplying by two and adding one for curves.
+ * Since Embree allows an object to be either curves or triangles but not both, Cycles object IDs are
+ * mapped to Embree IDs by multiplying by two and adding one for curves.
*
- * This implementation shares RTCDevices between Cycles instances. Eventually each instance should get
- * a separate RTCDevice to correctly keep track of memory usage.
+ * This implementation shares RTCDevices between Cycles instances. Eventually each instance should
+ * get a separate RTCDevice to correctly keep track of memory usage.
*
- * Vertex and index buffers are duplicated between Cycles device arrays and Embree. These could be merged,
- * which would requrie changes to intersection refinement, shader setup, mesh light sampling and a few
- * other places in Cycles where direct access to vertex data is required.
+ * Vertex and index buffers are duplicated between Cycles device arrays and Embree. These could be
+ * merged, which would require changes to intersection refinement, shader setup, mesh light
+ * sampling and a few other places in Cycles where direct access to vertex data is required.
*/
#ifdef WITH_EMBREE
+# include <embree3/rtcore_geometry.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
-# include <embree3/rtcore_geometry.h>
# include "bvh/bvh_embree.h"
-/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH. */
+/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH.
+ */
# include "kernel/bvh/bvh_embree.h"
# include "kernel/kernel_compat_cpu.h"
-# include "kernel/split/kernel_split_data_types.h"
# include "kernel/kernel_globals.h"
# include "kernel/kernel_random.h"
+# include "kernel/split/kernel_split_data_types.h"
+# include "render/hair.h"
# include "render/mesh.h"
# include "render/object.h"
+
# include "util/util_foreach.h"
# include "util/util_logging.h"
# include "util/util_progress.h"
+# include "util/util_stats.h"
CCL_NAMESPACE_BEGIN
+static_assert(Object::MAX_MOTION_STEPS <= RTC_MAX_TIME_STEP_COUNT,
+ "Object and Embree max motion steps inconsistent");
+static_assert(Object::MAX_MOTION_STEPS == Geometry::MAX_MOTION_STEPS,
+ "Object and Geometry max motion steps inconsistent");
+
# define IS_HAIR(x) (x & 1)
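For readability, the ID mapping described in the header comment can be summarized by the following hypothetical helpers; the actual code simply inlines the arithmetic at the rtcAttachGeometryByID() call sites:

/* Sketch only: Cycles object index <-> Embree geometry ID. */
static unsigned rtc_id_from_object(int object_id, bool is_hair)
{
  return object_id * 2 + (is_hair ? 1 : 0);
}

static int object_from_rtc_id(unsigned geom_id)
{
  return geom_id / 2; /* IS_HAIR(geom_id) recovers whether it was a curve. */
}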
/* This gets called by Embree at every valid ray/object intersection.
@@ -62,30 +67,9 @@ CCL_NAMESPACE_BEGIN
* as well as filtering for volume objects happen here.
* Cycles' own BVH does that directly inside the traversal calls.
*/
-static void rtc_filter_func(const RTCFilterFunctionNArguments *args)
-{
- /* Current implementation in Cycles assumes only single-ray intersection queries. */
- assert(args->N == 1);
-
- const RTCRay *ray = (RTCRay *)args->ray;
- const RTCHit *hit = (RTCHit *)args->hit;
- CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
- KernelGlobals *kg = ctx->kg;
-
- /* Check if there is backfacing hair to ignore. */
- if (IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) &&
- !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) &&
- !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) {
- if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
- make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
- *args->valid = 0;
- return;
- }
- }
-}
-
static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
{
+ /* Current implementation in Cycles assumes only single-ray intersection queries. */
assert(args->N == 1);
const RTCRay *ray = (RTCRay *)args->ray;
@@ -93,17 +77,6 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
KernelGlobals *kg = ctx->kg;
- /* For all ray types: Check if there is backfacing hair to ignore */
- if (IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) &&
- !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) &&
- !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) {
- if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
- make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
- *args->valid = 0;
- return;
- }
- }
-
switch (ctx->type) {
case CCLIntersectContext::RAY_SHADOW_ALL: {
/* Append the intersection to the end of the array. */
@@ -144,52 +117,68 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
}
break;
}
+ case CCLIntersectContext::RAY_LOCAL:
case CCLIntersectContext::RAY_SSS: {
+ /* Check if it's hitting the correct object. */
+ Intersection current_isect;
+ if (ctx->type == CCLIntersectContext::RAY_SSS) {
+ kernel_embree_convert_sss_hit(kg, ray, hit, &current_isect, ctx->local_object_id);
+ }
+ else {
+ kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+ if (ctx->local_object_id != current_isect.object) {
+ /* This tells Embree to continue tracing. */
+ *args->valid = 0;
+ }
+ }
+
/* No intersection information requested, just return a hit. */
if (ctx->max_hits == 0) {
break;
}
/* Ignore curves. */
- if (hit->geomID & 1) {
+ if (IS_HAIR(hit->geomID)) {
/* This tells Embree to continue tracing. */
*args->valid = 0;
break;
}
/* See triangle_intersect_subsurface() for the native equivalent. */
- for (int i = min(ctx->max_hits, ctx->ss_isect->num_hits) - 1; i >= 0; --i) {
- if (ctx->ss_isect->hits[i].t == ray->tfar) {
+ for (int i = min(ctx->max_hits, ctx->local_isect->num_hits) - 1; i >= 0; --i) {
+ if (ctx->local_isect->hits[i].t == ray->tfar) {
/* This tells Embree to continue tracing. */
*args->valid = 0;
break;
}
}
- ++ctx->ss_isect->num_hits;
- int hit_idx;
+ int hit_idx = 0;
- if (ctx->ss_isect->num_hits <= ctx->max_hits) {
- hit_idx = ctx->ss_isect->num_hits - 1;
- }
- else {
- /* reservoir sampling: if we are at the maximum number of
- * hits, randomly replace element or skip it */
- hit_idx = lcg_step_uint(ctx->lcg_state) % ctx->ss_isect->num_hits;
+ if (ctx->lcg_state) {
- if (hit_idx >= ctx->max_hits) {
- /* This tells Embree to continue tracing. */
- *args->valid = 0;
- break;
+ ++ctx->local_isect->num_hits;
+ if (ctx->local_isect->num_hits <= ctx->max_hits) {
+ hit_idx = ctx->local_isect->num_hits - 1;
+ }
+ else {
+ /* reservoir sampling: if we are at the maximum number of
+ * hits, randomly replace element or skip it */
+ hit_idx = lcg_step_uint(ctx->lcg_state) % ctx->local_isect->num_hits;
+
+ if (hit_idx >= ctx->max_hits) {
+ /* This tells Embree to continue tracing. */
+ *args->valid = 0;
+ break;
+ }
}
}
+ else {
+ ctx->local_isect->num_hits = 1;
+ }
/* record intersection */
- kernel_embree_convert_local_hit(
- kg, ray, hit, &ctx->ss_isect->hits[hit_idx], ctx->sss_object_id);
- ctx->ss_isect->Ng[hit_idx].x = hit->Ng_x;
- ctx->ss_isect->Ng[hit_idx].y = hit->Ng_y;
- ctx->ss_isect->Ng[hit_idx].z = hit->Ng_z;
- ctx->ss_isect->Ng[hit_idx] = normalize(ctx->ss_isect->Ng[hit_idx]);
+ ctx->local_isect->hits[hit_idx] = current_isect;
+ ctx->local_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z));
/* This tells Embree to continue tracing. */
*args->valid = 0;
break;
@@ -230,6 +219,34 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
}
}
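For reference, the reservoir-sampling rule used in the RAY_LOCAL/RAY_SSS branch above can be condensed into a small hypothetical helper (same lcg_step_uint() generator as the kernel code; a return value of -1 means the hit is skipped):

/* Sketch: choose a slot for the n-th hit so that at most max_hits hits are
 * kept, each with equal probability. */
static int reservoir_hit_slot(uint *lcg_state, int num_hits, int max_hits)
{
  if (num_hits <= max_hits) {
    return num_hits - 1; /* Reservoir not full yet, append. */
  }
  const int slot = lcg_step_uint(lcg_state) % num_hits;
  return (slot < max_hits) ? slot : -1; /* -1: skip this hit. */
}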
+static void rtc_filter_func_thick_curve(const RTCFilterFunctionNArguments *args)
+{
+ const RTCRay *ray = (RTCRay *)args->ray;
+ RTCHit *hit = (RTCHit *)args->hit;
+
+ /* Always ignore backfacing intersections. */
+ if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
+ make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+ *args->valid = 0;
+ return;
+ }
+}
+
+static void rtc_filter_occluded_func_thick_curve(const RTCFilterFunctionNArguments *args)
+{
+ const RTCRay *ray = (RTCRay *)args->ray;
+ RTCHit *hit = (RTCHit *)args->hit;
+
+ /* Always ignore backfacing intersections. */
+ if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
+ make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+ *args->valid = 0;
+ return;
+ }
+
+ rtc_filter_occluded_func(args);
+}
+
static size_t unaccounted_mem = 0;
static bool rtc_memory_monitor_func(void *userPtr, const ssize_t bytes, const bool)
@@ -283,16 +300,30 @@ RTCDevice BVHEmbree::rtc_shared_device = NULL;
int BVHEmbree::rtc_shared_users = 0;
thread_mutex BVHEmbree::rtc_shared_mutex;
-BVHEmbree::BVHEmbree(const BVHParams &params_, const vector<Object *> &objects_)
- : BVH(params_, objects_),
+static size_t count_primitives(Geometry *geom)
+{
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ return mesh->num_triangles();
+ }
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ return hair->num_segments();
+ }
+
+ return 0;
+}
+
+BVHEmbree::BVHEmbree(const BVHParams &params_,
+ const vector<Geometry *> &geometry_,
+ const vector<Object *> &objects_)
+ : BVH(params_, geometry_, objects_),
scene(NULL),
mem_used(0),
top_level(NULL),
stats(NULL),
curve_subdivisions(params.curve_subdivisions),
build_quality(RTC_BUILD_QUALITY_REFIT),
- use_curves(params_.curve_flags & CURVE_KN_INTERPOLATE),
- use_ribbons(params.curve_flags & CURVE_KN_RIBBONS),
dynamic_scene(true)
{
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
@@ -305,7 +336,7 @@ BVHEmbree::BVHEmbree(const BVHParams &params_, const vector<Object *> &objects_)
if (ret != 1) {
assert(0);
VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED flag."
- "Ray visiblity will not work.";
+ "Ray visibility will not work.";
}
ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED);
if (ret != 1) {
@@ -416,29 +447,15 @@ void BVHEmbree::build(Progress &progress, Stats *stats_)
if (!ob->is_traceable()) {
continue;
}
- if (!ob->mesh->is_instanced()) {
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
- prim_count += ob->mesh->num_triangles();
- }
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
- for (size_t j = 0; j < ob->mesh->num_curves(); ++j) {
- prim_count += ob->mesh->get_curve(j).num_segments();
- }
- }
+ if (!ob->geometry->is_instanced()) {
+ prim_count += count_primitives(ob->geometry);
}
else {
++prim_count;
}
}
else {
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && ob->mesh->num_triangles() > 0) {
- prim_count += ob->mesh->num_triangles();
- }
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
- for (size_t j = 0; j < ob->mesh->num_curves(); ++j) {
- prim_count += ob->mesh->get_curve(j).num_segments();
- }
- }
+ prim_count += count_primitives(ob->geometry);
}
}
@@ -457,7 +474,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats_)
++i;
continue;
}
- if (!ob->mesh->is_instanced()) {
+ if (!ob->geometry->is_instanced()) {
add_object(ob, i);
}
else {
@@ -495,6 +512,11 @@ void BVHEmbree::build(Progress &progress, Stats *stats_)
stats = NULL;
}
+void BVHEmbree::copy_to_device(Progress & /*progress*/, DeviceScene *dscene)
+{
+ dscene->data.bvh.scene = scene;
+}
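The new copy_to_device() hook separates host-side BVH construction from the device upload; for Embree only the RTCScene handle has to reach the device scene data. A hypothetical caller sketch (the surrounding scene-update code is outside this patch):

/* Sketch: build on the host, then publish the result to the device scene. */
bvh->build(progress, stats);
if (!progress.get_cancel()) {
  bvh->copy_to_device(progress, dscene);
}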
+
BVHNode *BVHEmbree::widen_children_nodes(const BVHNode * /*root*/)
{
assert(!"Must not be called.");
@@ -503,36 +525,57 @@ BVHNode *BVHEmbree::widen_children_nodes(const BVHNode * /*root*/)
void BVHEmbree::add_object(Object *ob, int i)
{
- Mesh *mesh = ob->mesh;
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
- add_triangles(ob, i);
+ Geometry *geom = ob->geometry;
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->num_triangles() > 0) {
+ add_triangles(ob, mesh, i);
+ }
}
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
- add_curves(ob, i);
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ if (hair->num_curves() > 0) {
+ add_curves(ob, hair, i);
+ }
}
}
void BVHEmbree::add_instance(Object *ob, int i)
{
- if (!ob || !ob->mesh) {
+ if (!ob || !ob->geometry) {
assert(0);
return;
}
- BVHEmbree *instance_bvh = (BVHEmbree *)(ob->mesh->bvh);
+ BVHEmbree *instance_bvh = (BVHEmbree *)(ob->geometry->bvh);
if (instance_bvh->top_level != this) {
instance_bvh->top_level = this;
}
- const size_t num_motion_steps = ob->use_motion() ? ob->motion.size() : 1;
+ const size_t num_object_motion_steps = ob->use_motion() ? ob->motion.size() : 1;
+ const size_t num_motion_steps = min(num_object_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+ assert(num_object_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
+
RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_INSTANCE);
rtcSetGeometryInstancedScene(geom_id, instance_bvh->scene);
rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
if (ob->use_motion()) {
+ array<DecomposedTransform> decomp(ob->motion.size());
+ transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
for (size_t step = 0; step < num_motion_steps; ++step) {
- rtcSetGeometryTransform(
- geom_id, step, RTC_FORMAT_FLOAT3X4_ROW_MAJOR, (const float *)&ob->motion[step]);
+ RTCQuaternionDecomposition rtc_decomp;
+ rtcInitQuaternionDecomposition(&rtc_decomp);
+ rtcQuaternionDecompositionSetQuaternion(
+ &rtc_decomp, decomp[step].x.w, decomp[step].x.x, decomp[step].x.y, decomp[step].x.z);
+ rtcQuaternionDecompositionSetScale(
+ &rtc_decomp, decomp[step].y.w, decomp[step].z.w, decomp[step].w.w);
+ rtcQuaternionDecompositionSetTranslation(
+ &rtc_decomp, decomp[step].y.x, decomp[step].y.y, decomp[step].y.z);
+ rtcQuaternionDecompositionSetSkew(
+ &rtc_decomp, decomp[step].z.x, decomp[step].z.y, decomp[step].w.x);
+ rtcSetGeometryTransformQuaternion(geom_id, step, &rtc_decomp);
}
}
else {
@@ -545,30 +588,28 @@ void BVHEmbree::add_instance(Object *ob, int i)
pack.prim_tri_index.push_back_slow(-1);
rtcSetGeometryUserData(geom_id, (void *)instance_bvh->scene);
- rtcSetGeometryMask(geom_id, ob->visibility);
+ rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
rtcCommitGeometry(geom_id);
rtcAttachGeometryByID(scene, geom_id, i * 2);
rtcReleaseGeometry(geom_id);
}
-void BVHEmbree::add_triangles(Object *ob, int i)
+void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
{
size_t prim_offset = pack.prim_index.size();
- Mesh *mesh = ob->mesh;
const Attribute *attr_mP = NULL;
- size_t num_motion_steps = 1;
+ size_t num_geometry_motion_steps = 1;
if (mesh->has_motion_blur()) {
attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (attr_mP) {
- num_motion_steps = mesh->motion_steps;
- if (num_motion_steps > RTC_MAX_TIME_STEP_COUNT) {
- assert(0);
- num_motion_steps = RTC_MAX_TIME_STEP_COUNT;
- }
+ num_geometry_motion_steps = mesh->motion_steps;
}
}
+ const size_t num_motion_steps = min(num_geometry_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+ assert(num_geometry_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
+
const size_t num_triangles = mesh->num_triangles();
RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_TRIANGLE);
rtcSetGeometryBuildQuality(geom_id, build_quality);
@@ -608,9 +649,8 @@ void BVHEmbree::add_triangles(Object *ob, int i)
}
rtcSetGeometryUserData(geom_id, (void *)prim_offset);
- rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
- rtcSetGeometryMask(geom_id, ob->visibility);
+ rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
rtcCommitGeometry(geom_id);
rtcAttachGeometryByID(scene, geom_id, i * 2);
@@ -659,31 +699,35 @@ void BVHEmbree::update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh)
}
}
-void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh)
+void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair)
{
const Attribute *attr_mP = NULL;
size_t num_motion_steps = 1;
- if (mesh->has_motion_blur()) {
- attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (hair->has_motion_blur()) {
+ attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (attr_mP) {
- num_motion_steps = mesh->motion_steps;
+ num_motion_steps = hair->motion_steps;
}
}
- const size_t num_curves = mesh->num_curves();
+ const size_t num_curves = hair->num_curves();
size_t num_keys = 0;
for (size_t j = 0; j < num_curves; ++j) {
- const Mesh::Curve c = mesh->get_curve(j);
+ const Hair::Curve c = hair->get_curve(j);
num_keys += c.num_keys;
}
+ /* Catmull-Rom splines need extra CVs at the beginning and end of each curve. */
+ size_t num_keys_embree = num_keys;
+ num_keys_embree += num_curves * 2;
+
/* Copy the CV data to Embree */
const int t_mid = (num_motion_steps - 1) / 2;
- const float *curve_radius = &mesh->curve_radius[0];
+ const float *curve_radius = &hair->curve_radius[0];
for (int t = 0; t < num_motion_steps; ++t) {
const float3 *verts;
if (t == t_mid || attr_mP == NULL) {
- verts = &mesh->curve_keys[0];
+ verts = &hair->curve_keys[0];
}
else {
int t_ = (t > t_mid) ? (t - 1) : t;
@@ -691,67 +735,53 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh
}
float4 *rtc_verts = (float4 *)rtcSetNewGeometryBuffer(
- geom_id, RTC_BUFFER_TYPE_VERTEX, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_keys);
- float4 *rtc_tangents = NULL;
- if (use_curves) {
- rtc_tangents = (float4 *)rtcSetNewGeometryBuffer(
- geom_id, RTC_BUFFER_TYPE_TANGENT, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_keys);
- assert(rtc_tangents);
- }
+ geom_id, RTC_BUFFER_TYPE_VERTEX, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_keys_embree);
+
assert(rtc_verts);
if (rtc_verts) {
- if (use_curves && rtc_tangents) {
- const size_t num_curves = mesh->num_curves();
- for (size_t j = 0; j < num_curves; ++j) {
- Mesh::Curve c = mesh->get_curve(j);
- int fk = c.first_key;
- rtc_verts[0] = float3_to_float4(verts[fk]);
- rtc_verts[0].w = curve_radius[fk];
- rtc_tangents[0] = float3_to_float4(verts[fk + 1] - verts[fk]);
- rtc_tangents[0].w = curve_radius[fk + 1] - curve_radius[fk];
- ++fk;
- int k = 1;
- for (; k < c.num_segments(); ++k, ++fk) {
- rtc_verts[k] = float3_to_float4(verts[fk]);
- rtc_verts[k].w = curve_radius[fk];
- rtc_tangents[k] = float3_to_float4((verts[fk + 1] - verts[fk - 1]) * 0.5f);
- rtc_tangents[k].w = (curve_radius[fk + 1] - curve_radius[fk - 1]) * 0.5f;
- }
+ const size_t num_curves = hair->num_curves();
+ for (size_t j = 0; j < num_curves; ++j) {
+ Hair::Curve c = hair->get_curve(j);
+ int fk = c.first_key;
+ int k = 1;
+ for (; k < c.num_keys + 1; ++k, ++fk) {
rtc_verts[k] = float3_to_float4(verts[fk]);
rtc_verts[k].w = curve_radius[fk];
- rtc_tangents[k] = float3_to_float4(verts[fk] - verts[fk - 1]);
- rtc_tangents[k].w = curve_radius[fk] - curve_radius[fk - 1];
- rtc_verts += c.num_keys;
- rtc_tangents += c.num_keys;
- }
- }
- else {
- for (size_t j = 0; j < num_keys; ++j) {
- rtc_verts[j] = float3_to_float4(verts[j]);
- rtc_verts[j].w = curve_radius[j];
}
+ /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */
+ rtc_verts[0] = rtc_verts[1];
+ rtc_verts[k] = rtc_verts[k - 1];
+ rtc_verts += c.num_keys + 2;
}
}
}
}
-void BVHEmbree::add_curves(Object *ob, int i)
+void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
{
size_t prim_offset = pack.prim_index.size();
- const Mesh *mesh = ob->mesh;
const Attribute *attr_mP = NULL;
- size_t num_motion_steps = 1;
- if (mesh->has_motion_blur()) {
- attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ size_t num_geometry_motion_steps = 1;
+ if (hair->has_motion_blur()) {
+ attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (attr_mP) {
- num_motion_steps = mesh->motion_steps;
+ num_geometry_motion_steps = hair->motion_steps;
}
}
- const size_t num_curves = mesh->num_curves();
+ const size_t num_motion_steps = min(num_geometry_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+ const PrimitiveType primitive_type =
+ (num_motion_steps > 1) ?
+ ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON :
+ PRIMITIVE_MOTION_CURVE_THICK) :
+ ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : PRIMITIVE_CURVE_THICK);
+
+ assert(num_geometry_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
+
+ const size_t num_curves = hair->num_curves();
size_t num_segments = 0;
for (size_t j = 0; j < num_curves; ++j) {
- Mesh::Curve c = mesh->get_curve(j);
+ Hair::Curve c = hair->get_curve(j);
assert(c.num_segments() > 0);
num_segments += c.num_segments();
}
@@ -766,24 +796,24 @@ void BVHEmbree::add_curves(Object *ob, int i)
size_t prim_tri_index_size = pack.prim_index.size();
pack.prim_tri_index.resize(prim_tri_index_size + num_segments);
- enum RTCGeometryType type = (!use_curves) ?
- RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE :
- (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE :
- RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE);
+ enum RTCGeometryType type = (hair->curve_shape == CURVE_RIBBON ?
+ RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE :
+ RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE);
RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type);
- rtcSetGeometryTessellationRate(geom_id, curve_subdivisions);
+ rtcSetGeometryTessellationRate(geom_id, curve_subdivisions + 1);
unsigned *rtc_indices = (unsigned *)rtcSetNewGeometryBuffer(
geom_id, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT, sizeof(int), num_segments);
size_t rtc_index = 0;
for (size_t j = 0; j < num_curves; ++j) {
- Mesh::Curve c = mesh->get_curve(j);
+ Hair::Curve c = hair->get_curve(j);
for (size_t k = 0; k < c.num_segments(); ++k) {
rtc_indices[rtc_index] = c.first_key + k;
+ /* Offset by the two extra Catmull-Rom CVs added for each preceding curve. */
+ rtc_indices[rtc_index] += j * 2;
/* Cycles specific data. */
pack.prim_object[prim_object_size + rtc_index] = i;
- pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(
- num_motion_steps > 1 ? PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k));
+ pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(primitive_type, k));
pack.prim_index[prim_index_size + rtc_index] = j;
pack.prim_tri_index[prim_tri_index_size + rtc_index] = rtc_index;
@@ -794,12 +824,17 @@ void BVHEmbree::add_curves(Object *ob, int i)
rtcSetGeometryBuildQuality(geom_id, build_quality);
rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
- update_curve_vertex_buffer(geom_id, mesh);
+ update_curve_vertex_buffer(geom_id, hair);
rtcSetGeometryUserData(geom_id, (void *)prim_offset);
- rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
- rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
- rtcSetGeometryMask(geom_id, ob->visibility);
+ if (hair->curve_shape == CURVE_RIBBON) {
+ rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+ }
+ else {
+ rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_thick_curve);
+ rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_thick_curve);
+ }
+ rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
rtcCommitGeometry(geom_id);
rtcAttachGeometryByID(scene, geom_id, i * 2 + 1);
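To make the Catmull-Rom bookkeeping above explicit: every curve adds two duplicated CVs to the Embree vertex buffer, and the index buffer entry for a segment points one CV before the segment's first key. A hypothetical helper capturing the offset used above:

/* Sketch only: Embree vertex-buffer index for segment 'k' of curve 'j'.
 * The '+ 2 * j' skips the CVs duplicated for preceding curves, so the
 * result lands on the CV just before the segment's first key. */
static unsigned rtc_catmull_rom_index(unsigned first_key, unsigned j, unsigned k)
{
  return first_key + k + 2 * j;
}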
@@ -815,10 +850,7 @@ void BVHEmbree::pack_nodes(const BVHNode *)
for (size_t i = 0; i < pack.prim_index.size(); ++i) {
if (pack.prim_index[i] != -1) {
- if (pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
- pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset;
- else
- pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset;
+ pack.prim_index[i] += objects[pack.prim_object[i]]->geometry->prim_offset;
}
}
@@ -832,22 +864,22 @@ void BVHEmbree::pack_nodes(const BVHNode *)
size_t pack_prim_tri_verts_offset = prim_tri_verts_size;
size_t object_offset = 0;
- map<Mesh *, int> mesh_map;
+ map<Geometry *, int> geometry_map;
foreach (Object *ob, objects) {
- Mesh *mesh = ob->mesh;
- BVH *bvh = mesh->bvh;
+ Geometry *geom = ob->geometry;
+ BVH *bvh = geom->bvh;
- if (mesh->need_build_bvh()) {
- if (mesh_map.find(mesh) == mesh_map.end()) {
+ if (geom->need_build_bvh(BVH_LAYOUT_EMBREE)) {
+ if (geometry_map.find(geom) == geometry_map.end()) {
prim_index_size += bvh->pack.prim_index.size();
prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
- mesh_map[mesh] = 1;
+ geometry_map[geom] = 1;
}
}
}
- mesh_map.clear();
+ geometry_map.clear();
pack.prim_index.resize(prim_index_size);
pack.prim_type.resize(prim_index_size);
@@ -865,38 +897,37 @@ void BVHEmbree::pack_nodes(const BVHNode *)
/* merge */
foreach (Object *ob, objects) {
- Mesh *mesh = ob->mesh;
+ Geometry *geom = ob->geometry;
/* We assume that if mesh doesn't need own BVH it was already included
* into a top-level BVH and no packing here is needed.
*/
- if (!mesh->need_build_bvh()) {
+ if (!geom->need_build_bvh(BVH_LAYOUT_EMBREE)) {
pack.object_node[object_offset++] = prim_offset;
continue;
}
- /* if mesh already added once, don't add it again, but used set
+ /* if geom was already added once, don't add it again, but use the already-set
* node offset for this object */
- map<Mesh *, int>::iterator it = mesh_map.find(mesh);
+ map<Geometry *, int>::iterator it = geometry_map.find(geom);
- if (mesh_map.find(mesh) != mesh_map.end()) {
+ if (geometry_map.find(geom) != geometry_map.end()) {
int noffset = it->second;
pack.object_node[object_offset++] = noffset;
continue;
}
- BVHEmbree *bvh = (BVHEmbree *)mesh->bvh;
+ BVHEmbree *bvh = (BVHEmbree *)geom->bvh;
rtc_memory_monitor_func(stats, unaccounted_mem, true);
unaccounted_mem = 0;
- int mesh_tri_offset = mesh->tri_offset;
- int mesh_curve_offset = mesh->curve_offset;
+ int geom_prim_offset = geom->prim_offset;
/* fill in node indexes for instances */
pack.object_node[object_offset++] = prim_offset;
- mesh_map[mesh] = pack.object_node[object_offset - 1];
+ geometry_map[geom] = pack.object_node[object_offset - 1];
/* merge primitive, object and triangle indexes */
if (bvh->pack.prim_index.size()) {
@@ -907,11 +938,11 @@ void BVHEmbree::pack_nodes(const BVHNode *)
for (size_t i = 0; i < bvh_prim_index_size; ++i) {
if (bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
- pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset;
+ pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
pack_prim_tri_index[pack_prim_index_offset] = -1;
}
else {
- pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset;
+ pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
pack_prim_tri_index[pack_prim_index_offset] = bvh_prim_tri_index[i] +
pack_prim_tri_verts_offset;
}
@@ -941,15 +972,22 @@ void BVHEmbree::refit_nodes()
/* Update all vertex buffers, then tell Embree to rebuild/-fit the BVHs. */
unsigned geom_id = 0;
foreach (Object *ob, objects) {
- if (!params.top_level || (ob->is_traceable() && !ob->mesh->is_instanced())) {
- if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && ob->mesh->num_triangles() > 0) {
- update_tri_vertex_buffer(rtcGetGeometry(scene, geom_id), ob->mesh);
- rtcCommitGeometry(rtcGetGeometry(scene, geom_id));
+ if (!params.top_level || (ob->is_traceable() && !ob->geometry->is_instanced())) {
+ Geometry *geom = ob->geometry;
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->num_triangles() > 0) {
+ update_tri_vertex_buffer(rtcGetGeometry(scene, geom_id), mesh);
+ rtcCommitGeometry(rtcGetGeometry(scene, geom_id));
+ }
}
-
- if (params.primitive_mask & PRIMITIVE_ALL_CURVE && ob->mesh->num_curves() > 0) {
- update_curve_vertex_buffer(rtcGetGeometry(scene, geom_id + 1), ob->mesh);
- rtcCommitGeometry(rtcGetGeometry(scene, geom_id + 1));
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ if (hair->num_curves() > 0) {
+ update_curve_vertex_buffer(rtcGetGeometry(scene, geom_id + 1), hair);
+ rtcCommitGeometry(rtcGetGeometry(scene, geom_id + 1));
+ }
}
}
geom_id += 2;
diff --git a/intern/cycles/bvh/bvh_embree.h b/intern/cycles/bvh/bvh_embree.h
index 60702713583..f60a1ca0102 100644
--- a/intern/cycles/bvh/bvh_embree.h
+++ b/intern/cycles/bvh/bvh_embree.h
@@ -31,11 +31,14 @@
CCL_NAMESPACE_BEGIN
+class Geometry;
+class Hair;
class Mesh;
class BVHEmbree : public BVH {
public:
virtual void build(Progress &progress, Stats *stats) override;
+ virtual void copy_to_device(Progress &progress, DeviceScene *dscene) override;
virtual ~BVHEmbree();
RTCScene scene;
static void destroy(RTCScene);
@@ -45,15 +48,17 @@ class BVHEmbree : public BVH {
protected:
friend class BVH;
- BVHEmbree(const BVHParams &params, const vector<Object *> &objects);
+ BVHEmbree(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects);
virtual void pack_nodes(const BVHNode *) override;
virtual void refit_nodes() override;
void add_object(Object *ob, int i);
void add_instance(Object *ob, int i);
- void add_curves(Object *ob, int i);
- void add_triangles(Object *ob, int i);
+ void add_curves(const Object *ob, const Hair *hair, int i);
+ void add_triangles(const Object *ob, const Mesh *mesh, int i);
ssize_t mem_used;
@@ -66,7 +71,7 @@ class BVHEmbree : public BVH {
private:
void delete_rtcScene();
void update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh);
- void update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh);
+ void update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair);
static RTCDevice rtc_shared_device;
static int rtc_shared_users;
@@ -76,7 +81,7 @@ class BVHEmbree : public BVH {
vector<RTCScene> delayed_delete_scenes;
int curve_subdivisions;
enum RTCBuildQuality build_quality;
- bool use_curves, use_ribbons, dynamic_scene;
+ bool dynamic_scene;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp
new file mode 100644
index 00000000000..ccb7ae08625
--- /dev/null
+++ b/intern/cycles/bvh/bvh_optix.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "bvh/bvh_optix.h"
+
+# include "device/device.h"
+
+# include "render/geometry.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVHOptiX::BVHOptiX(const BVHParams &params_,
+ const vector<Geometry *> &geometry_,
+ const vector<Object *> &objects_)
+ : BVH(params_, geometry_, objects_)
+{
+}
+
+BVHOptiX::~BVHOptiX()
+{
+}
+
+void BVHOptiX::build(Progress &, Stats *)
+{
+ if (params.top_level)
+ pack_tlas();
+ else
+ pack_blas();
+}
+
+void BVHOptiX::copy_to_device(Progress &progress, DeviceScene *dscene)
+{
+ progress.set_status("Updating Scene BVH", "Building OptiX acceleration structure");
+
+ Device *const device = dscene->bvh_nodes.device;
+ if (!device->build_optix_bvh(this))
+ progress.set_error("Failed to build OptiX acceleration structure");
+}
+
+void BVHOptiX::pack_blas()
+{
+ // Bottom-level BVH can contain multiple primitive types, so merge them:
+ assert(geometry.size() == 1 && objects.size() == 1); // These are built per-mesh
+ Geometry *const geom = geometry[0];
+
+ if (geom->type == Geometry::HAIR) {
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() > 0) {
+ const size_t num_curves = hair->num_curves();
+ const size_t num_segments = hair->num_segments();
+ pack.prim_type.reserve(pack.prim_type.size() + num_segments);
+ pack.prim_index.reserve(pack.prim_index.size() + num_segments);
+ pack.prim_object.reserve(pack.prim_object.size() + num_segments);
+ // 'pack.prim_time' is only used in geom_curve_intersect.h
+ // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH
+
+ uint type = (hair->use_motion_blur &&
+ hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) ?
+ ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON :
+ PRIMITIVE_MOTION_CURVE_THICK) :
+ ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON :
+ PRIMITIVE_CURVE_THICK);
+
+ for (size_t j = 0; j < num_curves; ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ for (size_t k = 0; k < curve.num_segments(); ++k) {
+ pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(type, k));
+ // Each curve segment points back to its curve index
+ pack.prim_index.push_back_reserved(j);
+ pack.prim_object.push_back_reserved(0);
+ }
+ }
+ }
+ }
+ else if (geom->type == Geometry::MESH) {
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() > 0) {
+ const size_t num_triangles = mesh->num_triangles();
+ pack.prim_type.reserve(pack.prim_type.size() + num_triangles);
+ pack.prim_index.reserve(pack.prim_index.size() + num_triangles);
+ pack.prim_object.reserve(pack.prim_object.size() + num_triangles);
+
+ uint type = PRIMITIVE_TRIANGLE;
+ if (mesh->use_motion_blur && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
+ type = PRIMITIVE_MOTION_TRIANGLE;
+
+ for (size_t k = 0; k < num_triangles; ++k) {
+ pack.prim_type.push_back_reserved(type);
+ pack.prim_index.push_back_reserved(k);
+ pack.prim_object.push_back_reserved(0);
+ }
+ }
+ }
+
+ // Initialize visibility to zero and later update it during top-level build
+ uint prev_visibility = objects[0]->visibility;
+ objects[0]->visibility = 0;
+
+ // Update 'pack.prim_tri_index', 'pack.prim_tri_verts' and 'pack.prim_visibility'
+ pack_primitives();
+
+ // Reset visibility after packing
+ objects[0]->visibility = prev_visibility;
+}
+
+void BVHOptiX::pack_tlas()
+{
+ // Calculate total packed size
+ size_t prim_index_size = 0;
+ size_t prim_tri_verts_size = 0;
+ foreach (Geometry *geom, geometry) {
+ BVH *const bvh = geom->bvh;
+ prim_index_size += bvh->pack.prim_index.size();
+ prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
+ }
+
+ if (prim_index_size == 0)
+ return; // Abort right away if this is an empty BVH
+
+ size_t pack_offset = 0;
+ size_t pack_verts_offset = 0;
+
+ pack.prim_type.resize(prim_index_size);
+ int *pack_prim_type = pack.prim_type.data();
+ pack.prim_index.resize(prim_index_size);
+ int *pack_prim_index = pack.prim_index.data();
+ pack.prim_object.resize(prim_index_size);
+ int *pack_prim_object = pack.prim_object.data();
+ pack.prim_visibility.resize(prim_index_size);
+ uint *pack_prim_visibility = pack.prim_visibility.data();
+ pack.prim_tri_index.resize(prim_index_size);
+ uint *pack_prim_tri_index = pack.prim_tri_index.data();
+ pack.prim_tri_verts.resize(prim_tri_verts_size);
+ float4 *pack_prim_tri_verts = pack.prim_tri_verts.data();
+
+ // Top-level BVH should only contain instances, see 'Geometry::need_build_bvh'
+ // Iterate over scene mesh list instead of objects, since the 'prim_offset' is calculated based
+ // on that list, which may be ordered differently from the object list.
+ foreach (Geometry *geom, geometry) {
+ PackedBVH &bvh_pack = geom->bvh->pack;
+ int geom_prim_offset = geom->prim_offset;
+
+ // Merge visibility flags of all objects and fix object indices for non-instanced geometry
+ int object_index = 0; // Unused for instanced geometry
+ int object_visibility = 0;
+ foreach (Object *ob, objects) {
+ if (ob->geometry == geom) {
+ object_visibility |= ob->visibility_for_tracing();
+ if (!geom->is_instanced()) {
+ object_index = ob->get_device_index();
+ break;
+ }
+ }
+ }
+
+ // Merge primitive, object and triangle indexes
+ if (!bvh_pack.prim_index.empty()) {
+ int *bvh_prim_type = &bvh_pack.prim_type[0];
+ int *bvh_prim_index = &bvh_pack.prim_index[0];
+ uint *bvh_prim_tri_index = &bvh_pack.prim_tri_index[0];
+ uint *bvh_prim_visibility = &bvh_pack.prim_visibility[0];
+
+ for (size_t i = 0; i < bvh_pack.prim_index.size(); i++, pack_offset++) {
+ if (bvh_pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
+ pack_prim_index[pack_offset] = bvh_prim_index[i] + geom_prim_offset;
+ pack_prim_tri_index[pack_offset] = -1;
+ }
+ else {
+ pack_prim_index[pack_offset] = bvh_prim_index[i] + geom_prim_offset;
+ pack_prim_tri_index[pack_offset] = bvh_prim_tri_index[i] + pack_verts_offset;
+ }
+
+ pack_prim_type[pack_offset] = bvh_prim_type[i];
+ pack_prim_object[pack_offset] = object_index;
+ pack_prim_visibility[pack_offset] = bvh_prim_visibility[i] | object_visibility;
+ }
+ }
+
+ // Merge triangle vertex data
+ if (!bvh_pack.prim_tri_verts.empty()) {
+ const size_t prim_tri_size = bvh_pack.prim_tri_verts.size();
+ memcpy(pack_prim_tri_verts + pack_verts_offset,
+ bvh_pack.prim_tri_verts.data(),
+ prim_tri_size * sizeof(float4));
+ pack_verts_offset += prim_tri_size;
+ }
+ }
+}
+
+void BVHOptiX::pack_nodes(const BVHNode *)
+{
+}
+
+void BVHOptiX::refit_nodes()
+{
+ // TODO(pmours): Implement?
+ VLOG(1) << "Refit is not yet implemented for OptiX BVH.";
+}
+
+BVHNode *BVHOptiX::widen_children_nodes(const BVHNode *)
+{
+ return NULL;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/bvh/bvh_optix.h b/intern/cycles/bvh/bvh_optix.h
new file mode 100644
index 00000000000..e4745b093b5
--- /dev/null
+++ b/intern/cycles/bvh/bvh_optix.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_OPTIX_H__
+#define __BVH_OPTIX_H__
+
+#ifdef WITH_OPTIX
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_params.h"
+# include "device/device_memory.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Geometry;
+class Optix;
+
+class BVHOptiX : public BVH {
+ friend class BVH;
+
+ public:
+ BVHOptiX(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects);
+ virtual ~BVHOptiX();
+
+ virtual void build(Progress &progress, Stats *) override;
+ virtual void copy_to_device(Progress &progress, DeviceScene *dscene) override;
+
+ private:
+ void pack_blas();
+ void pack_tlas();
+
+ virtual void pack_nodes(const BVHNode *) override;
+ virtual void refit_nodes() override;
+
+ virtual BVHNode *widen_children_nodes(const BVHNode *) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
+
+#endif /* __BVH_OPTIX_H__ */
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 2731662a39d..1a50742dc33 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -69,9 +69,6 @@ class BVHParams {
/* BVH layout to be built. */
BVHLayout bvh_layout;
- /* Mask of primitives to be included into the BVH. */
- int primitive_mask;
-
/* Use unaligned bounding boxes.
* Only used for curves BVH.
*/
@@ -92,7 +89,6 @@ class BVHParams {
int bvh_type;
/* These are needed for Embree. */
- int curve_flags;
int curve_subdivisions;
/* fixed parameters */
@@ -120,14 +116,11 @@ class BVHParams {
bvh_layout = BVH_LAYOUT_BVH2;
use_unaligned_nodes = false;
- primitive_mask = PRIMITIVE_ALL;
-
num_motion_curve_steps = 0;
num_motion_triangle_steps = 0;
bvh_type = 0;
- curve_flags = 0;
curve_subdivisions = 4;
}
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index 4498a759c08..b01785b547a 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -88,18 +88,6 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool,
const int job_end,
const BVHReferenceCompare &compare);
-class BVHSortTask : public Task {
- public:
- BVHSortTask(TaskPool *task_pool,
- BVHReference *data,
- const int job_start,
- const int job_end,
- const BVHReferenceCompare &compare)
- {
- run = function_bind(bvh_reference_sort_threaded, task_pool, data, job_start, job_end, compare);
- }
-};
-
/* Multi-threaded reference sort. */
static void bvh_reference_sort_threaded(TaskPool *task_pool,
BVHReference *data,
@@ -158,7 +146,8 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool,
have_work = false;
if (left < end) {
if (start < right) {
- task_pool->push(new BVHSortTask(task_pool, data, left, end, compare), true);
+ task_pool->push(
+ function_bind(bvh_reference_sort_threaded, task_pool, data, left, end, compare));
}
else {
start = left;
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index bd261c10d55..4b21f852d7a 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -20,6 +20,7 @@
#include "bvh/bvh_build.h"
#include "bvh/bvh_sort.h"
+#include "render/hair.h"
#include "render/mesh.h"
#include "render/object.h"
@@ -32,7 +33,7 @@ CCL_NAMESPACE_BEGIN
BVHObjectSplit::BVHObjectSplit(BVHBuild *builder,
BVHSpatialStorage *storage,
const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
float nodeSAH,
const BVHUnaligned *unaligned_heuristic,
const Transform *aligned_space)
@@ -42,7 +43,7 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder,
left_bounds(BoundBox::empty),
right_bounds(BoundBox::empty),
storage_(storage),
- references_(references),
+ references_(&references),
unaligned_heuristic_(unaligned_heuristic),
aligned_space_(aligned_space)
{
@@ -132,7 +133,7 @@ void BVHObjectSplit::split(BVHRange &left, BVHRange &right, const BVHRange &rang
BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder,
BVHSpatialStorage *storage,
const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
float nodeSAH,
const BVHUnaligned *unaligned_heuristic,
const Transform *aligned_space)
@@ -140,7 +141,7 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder,
dim(0),
pos(0.0f),
storage_(storage),
- references_(references),
+ references_(&references),
unaligned_heuristic_(unaligned_heuristic),
aligned_space_(aligned_space)
{
@@ -151,7 +152,7 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder,
}
else {
range_bounds = unaligned_heuristic->compute_aligned_boundbox(
- range, &references->at(0), *aligned_space);
+ range, &references_->at(0), *aligned_space);
}
float3 origin = range_bounds.min;
@@ -378,7 +379,7 @@ void BVHSpatialSplit::split_triangle_primitive(const Mesh *mesh,
}
}
-void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh,
+void BVHSpatialSplit::split_curve_primitive(const Hair *hair,
const Transform *tfm,
int prim_index,
int segment_index,
@@ -388,11 +389,11 @@ void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh,
BoundBox &right_bounds)
{
/* curve split: NOTE - Currently ignores curve width and needs to be fixed.*/
- Mesh::Curve curve = mesh->get_curve(prim_index);
+ Hair::Curve curve = hair->get_curve(prim_index);
const int k0 = curve.first_key + segment_index;
const int k1 = k0 + 1;
- float3 v0 = mesh->curve_keys[k0];
- float3 v1 = mesh->curve_keys[k1];
+ float3 v0 = hair->curve_keys[k0];
+ float3 v1 = hair->curve_keys[k1];
if (tfm != NULL) {
v0 = transform_point(tfm, v0);
@@ -436,13 +437,13 @@ void BVHSpatialSplit::split_triangle_reference(const BVHReference &ref,
}
void BVHSpatialSplit::split_curve_reference(const BVHReference &ref,
- const Mesh *mesh,
+ const Hair *hair,
int dim,
float pos,
BoundBox &left_bounds,
BoundBox &right_bounds)
{
- split_curve_primitive(mesh,
+ split_curve_primitive(hair,
NULL,
ref.prim_index(),
PRIMITIVE_UNPACK_SEGMENT(ref.prim_type()),
@@ -455,15 +456,22 @@ void BVHSpatialSplit::split_curve_reference(const BVHReference &ref,
void BVHSpatialSplit::split_object_reference(
const Object *object, int dim, float pos, BoundBox &left_bounds, BoundBox &right_bounds)
{
- Mesh *mesh = object->mesh;
- for (int tri_idx = 0; tri_idx < mesh->num_triangles(); ++tri_idx) {
- split_triangle_primitive(mesh, &object->tfm, tri_idx, dim, pos, left_bounds, right_bounds);
+ Geometry *geom = object->geometry;
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ for (int tri_idx = 0; tri_idx < mesh->num_triangles(); ++tri_idx) {
+ split_triangle_primitive(mesh, &object->tfm, tri_idx, dim, pos, left_bounds, right_bounds);
+ }
}
- for (int curve_idx = 0; curve_idx < mesh->num_curves(); ++curve_idx) {
- Mesh::Curve curve = mesh->get_curve(curve_idx);
- for (int segment_idx = 0; segment_idx < curve.num_keys - 1; ++segment_idx) {
- split_curve_primitive(
- mesh, &object->tfm, curve_idx, segment_idx, dim, pos, left_bounds, right_bounds);
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ for (int curve_idx = 0; curve_idx < hair->num_curves(); ++curve_idx) {
+ Hair::Curve curve = hair->get_curve(curve_idx);
+ for (int segment_idx = 0; segment_idx < curve.num_keys - 1; ++segment_idx) {
+ split_curve_primitive(
+ hair, &object->tfm, curve_idx, segment_idx, dim, pos, left_bounds, right_bounds);
+ }
}
}
}
@@ -481,13 +489,14 @@ void BVHSpatialSplit::split_reference(const BVHBuild &builder,
/* loop over vertices/edges. */
const Object *ob = builder.objects[ref.prim_object()];
- const Mesh *mesh = ob->mesh;
if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+ Mesh *mesh = static_cast<Mesh *>(ob->geometry);
split_triangle_reference(ref, mesh, dim, pos, left_bounds, right_bounds);
}
else if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
- split_curve_reference(ref, mesh, dim, pos, left_bounds, right_bounds);
+ Hair *hair = static_cast<Hair *>(ob->geometry);
+ split_curve_reference(ref, hair, dim, pos, left_bounds, right_bounds);
}
else {
split_object_reference(ob, dim, pos, left_bounds, right_bounds);
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index eddd1c27f49..28ff0e05fc3 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -24,6 +24,8 @@
CCL_NAMESPACE_BEGIN
class BVHBuild;
+class Hair;
+class Mesh;
struct Transform;
/* Object Split */
@@ -42,7 +44,7 @@ class BVHObjectSplit {
BVHObjectSplit(BVHBuild *builder,
BVHSpatialStorage *storage,
const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
float nodeSAH,
const BVHUnaligned *unaligned_heuristic = NULL,
const Transform *aligned_space = NULL);
@@ -80,7 +82,7 @@ class BVHSpatialSplit {
BVHSpatialSplit(const BVHBuild &builder,
BVHSpatialStorage *storage,
const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
float nodeSAH,
const BVHUnaligned *unaligned_heuristic = NULL,
const Transform *aligned_space = NULL);
@@ -113,7 +115,7 @@ class BVHSpatialSplit {
float pos,
BoundBox &left_bounds,
BoundBox &right_bounds);
- void split_curve_primitive(const Mesh *mesh,
+ void split_curve_primitive(const Hair *hair,
const Transform *tfm,
int prim_index,
int segment_index,
@@ -134,7 +136,7 @@ class BVHSpatialSplit {
BoundBox &left_bounds,
BoundBox &right_bounds);
void split_curve_reference(const BVHReference &ref,
- const Mesh *mesh,
+ const Hair *hair,
int dim,
float pos,
BoundBox &left_bounds,
@@ -185,7 +187,7 @@ class BVHMixedSplit {
__forceinline BVHMixedSplit(BVHBuild *builder,
BVHSpatialStorage *storage,
const BVHRange &range,
- vector<BVHReference> *references,
+ vector<BVHReference> &references,
int level,
const BVHUnaligned *unaligned_heuristic = NULL,
const Transform *aligned_space = NULL)
@@ -195,7 +197,7 @@ class BVHMixedSplit {
}
else {
bounds = unaligned_heuristic->compute_aligned_boundbox(
- range, &references->at(0), *aligned_space);
+ range, &references.at(0), *aligned_space);
}
/* find split candidates. */
float area = bounds.safe_area();
@@ -218,7 +220,7 @@ class BVHMixedSplit {
/* leaf SAH is the lowest => create leaf. */
minSAH = min(min(leafSAH, object.sah), spatial.sah);
- no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range, *references));
+ no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range, references));
}
__forceinline void split(BVHBuild *builder,
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index 1843ca403a5..c969b361643 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -16,7 +16,7 @@
#include "bvh/bvh_unaligned.h"
-#include "render/mesh.h"
+#include "render/hair.h"
#include "render/object.h"
#include "bvh/bvh_binning.h"
@@ -68,13 +68,14 @@ bool BVHUnaligned::compute_aligned_space(const BVHReference &ref, Transform *ali
const Object *object = objects_[ref.prim_object()];
const int packed_type = ref.prim_type();
const int type = (packed_type & PRIMITIVE_ALL);
- if (type & PRIMITIVE_CURVE) {
+ /* No motion blur curves here, as we can't fit them to aligned boxes well. */
+ if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
const int curve_index = ref.prim_index();
const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
- const Mesh *mesh = object->mesh;
- const Mesh::Curve &curve = mesh->get_curve(curve_index);
+ const Hair *hair = static_cast<const Hair *>(object->geometry);
+ const Hair::Curve &curve = hair->get_curve(curve_index);
const int key = curve.first_key + segment;
- const float3 v1 = mesh->curve_keys[key], v2 = mesh->curve_keys[key + 1];
+ const float3 v1 = hair->curve_keys[key], v2 = hair->curve_keys[key + 1];
float length;
const float3 axis = normalize_len(v2 - v1, &length);
if (length > 1e-6f) {
@@ -93,13 +94,14 @@ BoundBox BVHUnaligned::compute_aligned_prim_boundbox(const BVHReference &prim,
const Object *object = objects_[prim.prim_object()];
const int packed_type = prim.prim_type();
const int type = (packed_type & PRIMITIVE_ALL);
- if (type & PRIMITIVE_CURVE) {
+ /* No motion blur curves here, as we can't fit them to aligned boxes well. */
+ if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
const int curve_index = prim.prim_index();
const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
- const Mesh *mesh = object->mesh;
- const Mesh::Curve &curve = mesh->get_curve(curve_index);
+ const Hair *hair = static_cast<const Hair *>(object->geometry);
+ const Hair::Curve &curve = hair->get_curve(curve_index);
curve.bounds_grow(
- segment, &mesh->curve_keys[0], &mesh->curve_radius[0], aligned_space, bounds);
+ segment, &hair->curve_keys[0], &hair->curve_radius[0], aligned_space, bounds);
}
else {
bounds = prim.bounds().transformed(&aligned_space);
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 5bf681792ca..b09f442bd16 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -133,9 +133,9 @@ if(CYCLES_STANDALONE_REPOSITORY)
set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB")
####
- # embree
+ # Embree
if(WITH_CYCLES_EMBREE)
- find_package(embree 3.2.4 REQUIRED)
+ find_package(Embree 3.8.0 REQUIRED)
endif()
####
diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake
index 0efd8bb7ea8..13328a8b6bf 100644
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -8,8 +8,64 @@ endfunction()
macro(cycles_add_library target library_deps)
add_library(${target} ${ARGN})
- if(NOT ("${library_deps}" STREQUAL ""))
- target_link_libraries(${target} "${library_deps}")
+
+ # On Windows certain libraries have two sets of binaries: one for debug builds and one for
+ # release builds. The root of this requirement lies in the ABI, but that's outside of the
+ # scope of this comment.
+ #
+ # CMake has a native way of dealing with this, which is specifying what build type the
+ # libraries are provided for:
+ #
+ #   target_link_libraries(target optimized|debug|general <libraries>)
+ #
+ # The build type is to be provided as a separate argument to the function.
+ #
+ # CMake's variables for libraries will contain the build type in such cases. For example:
+ #
+ #   set(FOO_LIBRARIES optimized libfoo.lib debug libfoo_d.lib)
+ #
+ # Complications start with a single argument for library_deps: all the elements are put
+ # into a list, so "${FOO_LIBRARIES}" becomes "optimized;libfoo.lib;debug;libfoo_d.lib".
+ # This makes it impossible to pass it as-is to target_link_libraries(), since the build type
+ # keywords would be treated as libraries to be linked against, causing missing-library
+ # errors for optimized.lib.
+ #
+ # What this code does is traverse library_deps and extract information about whether a
+ # library is to be provided as general, debug or optimized. It is a little state machine
+ # which keeps track of which build type the library is to be provided for:
+ #
+ # - If the word "debug" or "optimized" is found, the next element in the list is expected to
+ #   be a library, which will be passed to target_link_libraries() under the corresponding
+ #   build type.
+ #
+ # - If there is no "debug" or "optimized", the library is used for all build types.
+ #
+ # NOTE: If separate libraries for debug and release are needed, every library in the list
+ # must be prefixed explicitly.
+ #
+ # Use: "optimized libfoo optimized libbar debug libfoo_d debug libbar_d"
+ # NOT: "optimized libfoo libbar debug libfoo_d libbar_d"
+ #
+ # TODO(sergey): This is the same as in Blender's own CMake. Find a way to avoid the
+ # duplication somehow, in a way which still allows building Cycles standalone.
+ if(NOT "${library_deps}" STREQUAL "")
+ set(next_library_mode "")
+ foreach(library ${library_deps})
+ string(TOLOWER "${library}" library_lower)
+ if(("${library_lower}" STREQUAL "optimized") OR
+ ("${library_lower}" STREQUAL "debug"))
+ set(next_library_mode "${library_lower}")
+ else()
+ if("${next_library_mode}" STREQUAL "optimized")
+ target_link_libraries(${target} optimized ${library})
+ elseif("${next_library_mode}" STREQUAL "debug")
+ target_link_libraries(${target} debug ${library})
+ else()
+ target_link_libraries(${target} ${library})
+ endif()
+ set(next_library_mode "")
+ endif()
+ endforeach()
endif()
+
cycles_set_solution_folder(${target})
endmacro()
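
The pairing rule described in the comment above is easiest to see in isolation. The following standalone C++ sketch (illustrative only, not part of the patch) mirrors the same little state machine: a "debug" or "optimized" token sets the mode for the single element that follows it, and everything else is linked for all build types.

    #include <iostream>
    #include <string>
    #include <vector>

    // Illustrative only: mimics the CMake loop above. A "debug" or "optimized"
    // token applies to the *next* element only; anything else is a library that
    // is linked under the pending mode, or under all build types if none is set.
    int main()
    {
      const std::vector<std::string> deps = {
          "optimized", "libfoo.lib", "debug", "libfoo_d.lib", "libcommon.lib"};

      std::string next_mode;
      for (const std::string &token : deps) {
        if (token == "optimized" || token == "debug") {
          next_mode = token;
        }
        else {
          const std::string mode = next_mode.empty() ? "general" : next_mode;
          std::cout << "link " << token << " (" << mode << ")\n";
          next_mode.clear();
        }
      }
      return 0;
    }
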
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 75f4a72bee3..ca366722eb7 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -29,17 +29,22 @@ set(SRC
device_memory.cpp
device_multi.cpp
device_opencl.cpp
+ device_optix.cpp
device_split_kernel.cpp
device_task.cpp
)
+set(SRC_CUDA
+ cuda/device_cuda.h
+ cuda/device_cuda_impl.cpp
+)
+
set(SRC_OPENCL
- opencl/opencl.h
+ opencl/device_opencl.h
+ opencl/device_opencl_impl.cpp
opencl/memory_manager.h
-
- opencl/opencl_split.cpp
- opencl/opencl_util.cpp
opencl/memory_manager.cpp
+ opencl/opencl_util.cpp
)
if(WITH_CYCLES_NETWORK)
@@ -59,7 +64,9 @@ set(SRC_HEADERS
)
set(LIB
-
+ cycles_render
+ cycles_kernel
+ cycles_util
)
if(WITH_CUDA_DYNLOAD)
@@ -77,16 +84,34 @@ if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
if(WITH_CYCLES_DEVICE_OPENCL)
+ list(APPEND LIB
+ extern_clew
+ )
add_definitions(-DWITH_OPENCL)
endif()
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
+if(WITH_CYCLES_DEVICE_OPTIX)
+ add_definitions(-DWITH_OPTIX)
+endif()
if(WITH_CYCLES_DEVICE_MULTI)
add_definitions(-DWITH_MULTI)
endif()
+if(WITH_OPENIMAGEDENOISE)
+ add_definitions(-DWITH_OPENIMAGEDENOISE)
+ add_definitions(-DOIDN_STATIC_LIB)
+ list(APPEND INC_SYS
+ ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+ )
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ ${TBB_LIBRARIES}
+ )
+endif()
+
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
new file mode 100644
index 00000000000..e5e3e24165d
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device.h"
+# include "device/device_denoising.h"
+# include "device/device_split_kernel.h"
+
+# include "util/util_map.h"
+# include "util/util_task.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDASplitKernel;
+
+class CUDADevice : public Device {
+
+ friend class CUDASplitKernelFunction;
+ friend class CUDASplitKernel;
+ friend class CUDAContextScope;
+
+ public:
+ DedicatedTaskPool task_pool;
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule, cuFilterModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+ CUDASplitKernel *split_kernel;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+ /* If true, mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+
+ struct PixelMem {
+ GLuint cuPBO;
+ CUgraphicsResource cuPBOresource;
+ GLuint cuTexId;
+ int w, h;
+ };
+ map<device_ptr, PixelMem> pixel_mem_map;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ /* Kernels */
+ struct {
+ bool loaded;
+
+ CUfunction adaptive_stopping;
+ CUfunction adaptive_filter_x;
+ CUfunction adaptive_filter_y;
+ CUfunction adaptive_scale_samples;
+ int adaptive_num_threads_per_block;
+ } functions;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ bool use_split_kernel();
+
+ virtual string compile_kernel_get_common_cflags(
+ const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
+
+ string compile_kernel(const DeviceRequestedFeatures &requested_features,
+ const char *name,
+ const char *base = "cuda",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
+
+ void load_functions();
+
+ void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task);
+
+ bool denoising_construct_transform(DenoisingTask *task);
+
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task);
+
+ bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
+
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task);
+
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task);
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task);
+
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task);
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+
+ void denoise(RenderTile &rtile, DenoisingTask &denoising);
+
+ void adaptive_sampling_filter(uint filter_sample,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream = 0);
+ void adaptive_sampling_post(RenderTile &rtile,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream = 0);
+
+ void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
+
+ void film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half);
+
+ void shader(DeviceTask &task);
+
+ CUdeviceptr map_pixels(device_ptr mem);
+
+ void unmap_pixels(device_ptr mem);
+
+ void pixels_alloc(device_memory &mem);
+
+ void pixels_copy_from(device_memory &mem, int y, int w, int h);
+
+ void pixels_free(device_memory &mem);
+
+ void draw_pixels(device_memory &mem,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params) override;
+
+ void thread_run(DeviceTask &task);
+
+ virtual void task_add(DeviceTask &task) override;
+
+ virtual void task_wait() override;
+
+ virtual void task_cancel() override;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
new file mode 100644
index 00000000000..3a2eb8df95b
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -0,0 +1,2683 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <limits.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_cuda.h"
+# include "device/device_intern.h"
+# include "device/device_split_kernel.h"
+
+# include "render/buffers.h"
+
+# include "kernel/filter/filter_defines.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+# include "kernel/split/kernel_split_data_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+# ifndef WITH_CUDA_DYNLOAD
+
+/* Transparently implement some functions, so the majority of the file does not need
+ * to worry about the difference between dynamically loaded and linked CUDA at all.
+ */
+
+namespace {
+
+const char *cuewErrorString(CUresult result)
+{
+ /* We can only give the error code here without major code duplication, which
+ * should be enough since dynamic loading is only being disabled by folks
+ * who know what they're doing anyway.
+ *
+ * NOTE: Avoid calling from several threads.
+ */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+
+} /* namespace */
+# endif /* WITH_CUDA_DYNLOAD */
+
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+ CUDADevice *device;
+
+ public:
+ explicit CUDASplitKernel(CUDADevice *device);
+
+ virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory &kernel_globals,
+ device_memory &kernel_data_,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs);
+
+ virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
+};
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+# define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
+ : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+ background = background_;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+ cuFilterModule = 0;
+
+ split_kernel = NULL;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ functions.loaded = false;
+
+ /* Initialize CUDA. */
+ CUresult result = cuInit(0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = cuDeviceGet(&cuDevice, cuDevId);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+ cuewErrorString(result)));
+ return;
+ }
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ if (background) {
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+ }
+ else {
+ result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+ background = true;
+ }
+ }
+
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+ task_pool.cancel();
+
+ delete split_kernel;
+
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ set_error(string_printf(
+ "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Ensure array access over the link is possible as well (for 3D textures)
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Enable peer access in both directions
+ {
+ const CUDAContextScope scope(this);
+ CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+bool CUDADevice::use_split_kernel()
+{
+ return DebugFlags().cuda.split_kernel;
+}
+
+/* Common NVCC flags which stay the same regardless of shading model or
+ * kernel sources MD5, and only depend on compiler or compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(
+ const DeviceRequestedFeatures &requested_features, bool filter, bool split)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (!filter && use_adaptive_compilation()) {
+ cflags += " " + requested_features.get_build_options();
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+# ifdef WITH_CYCLES_DEBUG
+ cflags += " -D__KERNEL_DEBUG__";
+# endif
+
+ if (split) {
+ cflags += " -D__SPLIT__";
+ }
+
+ return cflags;
+}
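
For a 64-bit build with adaptive compilation disabled and no CYCLES_CUDA_EXTRA_CFLAGS set, the flags assembled above reduce to a fixed string plus the include path. A small sketch (illustrative only; the include path is a made-up placeholder) that reproduces the formatting:

    #include <cstdio>
    #include <string>

    // Illustrative only: reproduces the formatting of
    // compile_kernel_get_common_cflags() for a 64-bit build with an assumed
    // include path; adaptive compilation and extra cflags are omitted.
    int main()
    {
      const int machine = 64;
      const std::string include_path = "/path/to/cycles/source";  // Hypothetical path.
      char cflags[512];
      std::snprintf(cflags, sizeof(cflags),
                    "-m%d --ptxas-options=\"-v\" --use_fast_math -DNVCC -I\"%s\"",
                    machine, include_path.c_str());
      std::puts(cflags);
      return 0;
    }
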
+
+string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+ /* We include cflags into the md5 so that changing the CUDA toolkit or other
+ * compiler command line arguments makes sure the cubin gets re-built.
+ */
+ string common_cflags = compile_kernel_get_common_cflags(
+ requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
+ const string cubin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ set_error(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return string();
+ }
+
+ const int nvcc_cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ if (nvcc_cuda_version < 80) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 8.0 or newer.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ return string();
+ }
+ else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102)) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 and 10.2 are officially supported.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(cubin);
+
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("kernels", path_join(base, string_printf("%s.cu", name))));
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=%s_%d%d "
+ "--%s \"%s\" "
+ "-o \"%s\" "
+ "%s",
+ nvcc,
+ kernel_arch,
+ major,
+ minor,
+ kernel_ext,
+ source_path.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(cubin)) {
+ set_error(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
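
The PTX fallback loop above probes compute_XY files from the device's own compute capability downwards until it reaches compute_30. As an illustration only (assuming a hypothetical 7.5 device and the default "kernel" name), the probe order can be reproduced with this standalone sketch:

    #include <cstdio>

    // Illustrative only: prints the compute_XY candidates probed by the PTX
    // fallback loop in compile_kernel(), for an assumed 7.5 device.
    int main()
    {
      int major = 7, minor = 5;
      while (major >= 3) {
        std::printf("lib/kernel_compute_%d%d.ptx\n", major, minor);
        if (minor > 0) {
          minor--;
        }
        else {
          major--;
          minor = 9;
        }
      }
      return 0;
    }
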
+
+bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
+{
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if (cuFilterModule && cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(requested_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
+ string cubin = compile_kernel(requested_features, kernel_name);
+ if (cubin.empty())
+ return false;
+
+ const char *filter_name = "filter";
+ string filter_cubin = compile_kernel(requested_features, filter_name);
+ if (filter_cubin.empty())
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf(
+ "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
+
+ if (path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
+ filter_cubin.c_str(),
+ cuewErrorString(result)));
+
+ if (result == CUDA_SUCCESS) {
+ reserve_local_memory(requested_features);
+ }
+
+ load_functions();
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::load_functions()
+{
+ /* TODO: load all functions here. */
+ if (functions.loaded) {
+ return;
+ }
+ functions.loaded = true;
+
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
+
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
+
+ int unused_min_blocks;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
+ &functions.adaptive_num_threads_per_block,
+ functions.adaptive_scale_samples,
+ NULL,
+ 0,
+ 0));
+}
+
+void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
+{
+ if (use_split_kernel()) {
+ /* Split kernel mostly uses global memory and adaptive compilation,
+ * so it is difficult to predict how much local memory is needed currently. */
+ return;
+ }
+
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ CUDAContextScope scope(this);
+
+ size_t total = 0, free_before = 0, free_after = 0;
+ cuMemGetInfo(&free_before, &total);
+
+ /* Get kernel function. */
+ CUfunction cuRender;
+
+ if (requested_features.use_baking) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else if (requested_features.use_integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+
+ /* Launch the kernel; using just 1 block appears sufficient to reserve
+ * memory for all multiprocessors. It would still be good to do this in
+ * parallel for the multi GPU case to make it faster. */
+ CUdeviceptr d_work_tiles = 0;
+ uint total_work_size = 0;
+
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ cuMemGetInfo(&free_after, &total);
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+ /* Amount of device memory to keep free after texture memory
+ * and working memory allocations respectively. We set the working
+ * memory limit headroom lower so that some space is left after all
+ * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
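
As a worked example of this policy: with 16 GiB of system RAM, half (8 GiB) exceeds the 4 GiB default, so map_host_limit becomes 16 - 4 = 12 GiB; with 6 GiB of RAM, half (3 GiB) is smaller, so the limit is 3 GiB. A standalone sketch of the same arithmetic (illustrative only):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: same policy as init_host_memory(). Leave at least
    // half of the system RAM or 4 GiB free, whichever is smaller.
    static uint64_t map_host_limit_for(uint64_t system_ram)
    {
      const uint64_t default_limit = 4ull * 1024 * 1024 * 1024;
      if (system_ram == 0) {
        return 0;  // Unknown RAM size: disable mapped host memory.
      }
      return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
    }

    int main()
    {
      const uint64_t GiB = 1024ull * 1024 * 1024;
      std::printf("16 GiB -> %llu GiB\n",
                  (unsigned long long)(map_host_limit_for(16 * GiB) / GiB));
      std::printf(" 6 GiB -> %llu GiB\n",
                  (unsigned long long)(map_host_limit_for(6 * GiB) / GiB));
      return 0;
    }
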
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ texture_info.copy_to_device();
+ need_texture_info = false;
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ /* Can only move textures allocated on this device (and not those from peer devices).
+ * And need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+ /* Potentially need to call back into multi device, so pointer mapping
+ * and peer devices are updated. This is also necessary since the device
+ * pointer may just be a key here, so cannot be accessed and freed directly.
+ * Unfortunately it does mean that memory is reallocated on all other
+ * devices as well, which is potentially dangerous when still in use (since
+ * a thread rendering on another device would only be caught in this mutex
+ * if it so happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
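
The candidate chosen in each iteration above follows a simple greedy rule: prefer image textures over other textures, and among equals prefer the larger allocation. A minimal standalone sketch of just that comparison, using illustrative types rather than the Cycles ones:

    #include <cstddef>
    #include <vector>

    // Illustrative only: greedy choice used when deciding which texture to
    // move to host memory first. Prefer images; break ties by size.
    struct Alloc {
      bool is_image;
      size_t size;
    };

    static const Alloc *pick_texture_to_move(const std::vector<Alloc> &allocs)
    {
      const Alloc *best = nullptr;
      bool best_is_image = false;
      size_t best_size = 0;
      for (const Alloc &a : allocs) {
        if (a.is_image > best_is_image || (a.is_image == best_is_image && a.size > best_size)) {
          best = &a;
          best_is_image = a.is_image;
          best_size = a.size;
        }
      }
      return best;  // Null if nothing is movable.
    }

    int main()
    {
      std::vector<Alloc> allocs = {{false, 512}, {true, 64}, {true, 256}};
      const Alloc *best = pick_texture_to_move(allocs);
      return (best && best->is_image && best->size == 256) ? 0 : 1;
    }
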
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+ /* A call to device_memory::host_free() should be preceded by
+ * a call to device_memory::device_free() for host memory
+ * allocated by a device to be handled properly. Two exceptions
+ * are here and a call in OptiXDevice::generic_alloc(), where
+ * the current host memory can be assumed to be allocated by
+ * device_memory::host_alloc(), not by a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
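
Stripped of the CUDA calls and the texture-eviction step, the placement decision in generic_alloc() boils down to: device memory while size plus headroom still fits, otherwise mapped host memory if the device supports it and the host-map budget allows. A simplified standalone sketch (illustrative only; it omits the move_texture_to_host handling):

    #include <cstddef>
    #include <cstdio>

    // Illustrative only: the placement decision in generic_alloc(), without the
    // CUDA calls and without the move_texture_to_host / texture-eviction logic.
    enum class Placement { Device, MappedHost, Failed };

    static Placement choose_placement(size_t size,
                                      size_t headroom,
                                      size_t free_device,
                                      bool can_map_host,
                                      size_t map_host_used,
                                      size_t map_host_limit)
    {
      if (size + headroom < free_device) {
        return Placement::Device;  // Fits in device memory with headroom to spare.
      }
      if (can_map_host && map_host_used + size < map_host_limit) {
        return Placement::MappedHost;  // Fall back to host memory mapped into the device.
      }
      return Placement::Failed;  // Out of both device and shared host memory.
    }

    int main()
    {
      const size_t MiB = 1024 * 1024;
      Placement p = choose_placement(512 * MiB, 32 * MiB, 2048 * MiB, true, 0, 3072 * MiB);
      std::printf("placement: %d\n", (int)p);
      return 0;
    }
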
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ /* If cmem.use_mapped_host is true, reference counting is used
+ * to safely free a mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuda_assert(cuMemFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
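
Mapped host allocations that are shared between devices are released through the reference count above: only the last device to free its mapping actually releases the host memory. A minimal standalone sketch of that last-one-out pattern, with plain malloc/free standing in for the CUDA host allocation:

    #include <cstdio>
    #include <cstdlib>

    // Illustrative only: last-one-out-frees semantics for a host allocation
    // shared by several devices, mirroring mem.shared_counter in generic_free().
    struct SharedHostAlloc {
      void *pointer = nullptr;
      int counter = 0;
    };

    static void release(SharedHostAlloc &alloc)
    {
      if (alloc.counter > 0 && --alloc.counter == 0) {
        std::free(alloc.pointer);  // Stand-in for cuMemFreeHost().
        alloc.pointer = nullptr;
      }
    }

    int main()
    {
      SharedHostAlloc alloc;
      alloc.pointer = std::malloc(64);
      alloc.counter = 2;  // Two devices adopted the same host allocation.

      release(alloc);  // First device: memory stays alive.
      release(alloc);  // Second device: memory is actually freed.

      std::printf("pointer is %s\n", alloc.pointer ? "still alive" : "freed");
      return 0;
    }
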
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_PIXELS) {
+ assert(!"mem_copy_to not supported for pixels.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_copy_from(mem, y, w, h);
+ }
+ else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_free(mem);
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void CUDADevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (!mem.is_resident(this)) {
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+ /* 3D texture using array, there is no API for linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Kepler+, bindless textures. */
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number
+ * of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ /* Set mapping and tag that we need to (re-)upload to the device. */
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ need_texture_info = true;
+}
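
The slot handling at the end of tex_alloc() grows texture_info in chunks of 128 entries so that registering many textures does not resize the array every time. A standalone sketch of that growth policy (illustrative only):

    #include <cstdio>
    #include <vector>

    // Illustrative only: grow-in-chunks policy used for the texture_info array.
    // Resizing to "slot + 128" amortizes reallocations across many textures.
    static void ensure_slot(std::vector<int> &info, unsigned slot)
    {
      if (slot >= info.size()) {
        info.resize(slot + 128);
      }
    }

    int main()
    {
      std::vector<int> info;
      for (unsigned slot = 0; slot < 300; slot++) {
        ensure_slot(info, slot);
      }
      std::printf("array grew to %zu entries for 300 slots\n", info.size());
      return 0;
    }
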
+
+void CUDADevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ generic_free(mem);
+ }
+ }
+}
+
+# define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert( \
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1) / threads; \
+ int yblocks = ((h) + threads - 1) / threads;
+
+# define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
+
+/* Similar to the above, but for 1-dimensional blocks. */
+# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+ int threads_per_block; \
+ cuda_assert( \
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
+ int yblocks = h;
+
+# define CUDA_LAUNCH_KERNEL_1D(func, args) \
+ cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
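
As a worked example of CUDA_GET_BLOCKSIZE: for a kernel limited to 1024 threads per block, threads = sqrt(1024) = 32, so a 1920x1080 area launches xblocks = ceil(1920/32) = 60 and yblocks = ceil(1080/32) = 34. The same arithmetic as a standalone sketch (the 1024 limit is an assumption for illustration):

    #include <cmath>
    #include <cstdio>

    // Illustrative only: the arithmetic behind CUDA_GET_BLOCKSIZE, with an
    // assumed limit of 1024 threads per block and a 1920x1080 area.
    int main()
    {
      const int threads_per_block = 1024;
      const int w = 1920, h = 1080;

      const int threads = (int)std::sqrt((float)threads_per_block);  // 32
      const int xblocks = (w + threads - 1) / threads;               // 60
      const int yblocks = (h + threads - 1) / threads;               // 34

      std::printf("%d threads per side, %d x %d blocks\n", threads, xblocks, yblocks);
      return 0;
    }
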
+
+bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+ int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
+
+ if (have_error())
+ return false;
+
+ CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr scale_ptr = 0;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
+
+ {
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
+
+ void *calc_difference_args[] = {&guide_ptr,
+ &variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &channel_offset,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {
+ &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference,
+ &image_ptr,
+ &out_ptr,
+ &weightAccum,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &channel_offset,
+ &r,
+ &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+ }
+
+ {
+ CUfunction cuNLMNormalize;
+ cuda_assert(
+ cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+ CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+ }
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = task->nlm_k_2;
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+ if (have_error())
+ return false;
+
+ CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(
+ cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &pass_stride,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
+{
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(
+ cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(
+ cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &scale,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterWriteFeature;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDetectOutliers;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {
+ &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
+{
+ denoising.functions.construct_transform = function_bind(
+ &CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.accumulate = function_bind(
+ &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
+ denoising.functions.divide_shadow = function_bind(
+ &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(
+ &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(
+ &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(
+ &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(
+ &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+ denoising.functions.detect_outliers = function_bind(
+ &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+ denoising.buffer.gpu_temporary_mem = true;
+
+ denoising.run_denoising(rtile);
+}
+
+void CUDADevice::adaptive_sampling_filter(uint filter_sample,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream)
+{
+ const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+
+  /* These are a series of tiny kernels because there is no grid synchronization
+   * from within a kernel, so the work is split across multiple kernel launches. */
+ uint total_work_size = wtile->h * wtile->w;
+ void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
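+  /* The same argument block is reused for all three launches below; only
+   * total_work_size is updated in between. */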
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+ total_work_size = wtile->h;
+ num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+ total_work_size = wtile->w;
+ num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+}
+
+void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream)
+{
+ const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+ uint total_work_size = wtile->h * wtile->w;
+
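+  /* Rescale pixels that stopped sampling early so their accumulated values
+   * match the final sample count, as in the CPU adaptive_sampling_post. */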
+ void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args,
+ 0));
+}
+
+void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else if (task.integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ WorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
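+  /* Pick enough samples per launch to keep roughly min_blocks *
+   * num_threads_per_block threads busy across the tile. */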
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+ if (task.adaptive_sampling.use) {
+ step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+ }
+
+ /* Render all samples. */
+ int start_sample = rtile.start_sample;
+ int end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = min(step_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ rtile.sample = sample + wtile->num_samples;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void CUDADevice::film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half)
+{
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilmConvert;
+ CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
+ CUdeviceptr d_buffer = (CUdeviceptr)buffer;
+
+ /* get kernel function */
+ if (rgba_half) {
+ cuda_assert(
+ cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+ }
+
+ float sample_scale = 1.0f / (task.sample + 1);
+
+ /* pass in parameters */
+ void *args[] = {&d_rgba,
+ &d_buffer,
+ &sample_scale,
+ &task.x,
+ &task.y,
+ &task.w,
+ &task.h,
+ &task.offset,
+ &task.stride};
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
+
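+  /* Use a roughly square 2D thread block within the kernel's maximum threads
+   * per block. */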
+ int xthreads = (int)sqrt(threads_per_block);
+ int ythreads = (int)sqrt(threads_per_block);
+ int xblocks = (task.w + xthreads - 1) / xthreads;
+ int yblocks = (task.h + ythreads - 1) / ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(cuFilmConvert,
+ xblocks,
+ yblocks,
+ 1, /* blocks */
+ xthreads,
+ ythreads,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
+
+ cuda_assert(cuCtxSynchronize());
+}
+
+void CUDADevice::shader(DeviceTask &task)
+{
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuShader;
+ CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
+ CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
+
+ /* get kernel function */
+ if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
+ }
+
+ /* do tasks in smaller chunks, so we can cancel it */
+ const int shader_chunk_size = 65536;
+ const int start = task.shader_x;
+ const int end = task.shader_x + task.shader_w;
+ int offset = task.offset;
+
+ bool canceled = false;
+ for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
+ for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+ int shader_w = min(shader_chunk_size, end - shader_x);
+
+ /* pass in parameters */
+ void *args[8];
+ int arg = 0;
+ args[arg++] = &d_input;
+ args[arg++] = &d_output;
+ args[arg++] = &task.shader_eval_type;
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ args[arg++] = &task.shader_filter;
+ }
+ args[arg++] = &shader_x;
+ args[arg++] = &shader_w;
+ args[arg++] = &offset;
+ args[arg++] = &sample;
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+ int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuLaunchKernel(cuShader,
+ xblocks,
+ 1,
+ 1, /* blocks */
+ threads_per_block,
+ 1,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ if (task.get_cancel()) {
+ canceled = true;
+ break;
+ }
+ }
+
+ task.update_progress(NULL);
+ }
+}
+
+CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
+{
+ if (!background) {
+ PixelMem pmem = pixel_mem_map[mem];
+ CUdeviceptr buffer;
+
+ size_t bytes;
+ cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+ cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+
+ return buffer;
+ }
+
+ return (CUdeviceptr)mem;
+}
+
+void CUDADevice::unmap_pixels(device_ptr mem)
+{
+ if (!background) {
+ PixelMem pmem = pixel_mem_map[mem];
+
+ cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+ }
+}
+
+void CUDADevice::pixels_alloc(device_memory &mem)
+{
+ PixelMem pmem;
+
+ pmem.w = mem.data_width;
+ pmem.h = mem.data_height;
+
+ CUDAContextScope scope(this);
+
+ glGenBuffers(1, &pmem.cuPBO);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ if (mem.data_type == TYPE_HALF)
+ glBufferData(
+ GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
+ else
+ glBufferData(
+ GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ glActiveTexture(GL_TEXTURE0);
+ glGenTextures(1, &pmem.cuTexId);
+ glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+ if (mem.data_type == TYPE_HALF)
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+ else
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ CUresult result = cuGraphicsGLRegisterBuffer(
+ &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+
+ if (result == CUDA_SUCCESS) {
+ mem.device_pointer = pmem.cuTexId;
+ pixel_mem_map[mem.device_pointer] = pmem;
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ return;
+ }
+ else {
+ /* failed to register buffer, fallback to no interop */
+ glDeleteBuffers(1, &pmem.cuPBO);
+ glDeleteTextures(1, &pmem.cuTexId);
+
+ background = true;
+ }
+}
+
+void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
+{
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+ CUDAContextScope scope(this);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+ size_t offset = sizeof(uchar) * 4 * y * w;
+ memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
+ glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+void CUDADevice::pixels_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+ CUDAContextScope scope(this);
+
+ cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+ glDeleteBuffers(1, &pmem.cuPBO);
+ glDeleteTextures(1, &pmem.cuTexId);
+
+ pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+ mem.device_pointer = 0;
+
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CUDADevice::draw_pixels(device_memory &mem,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params)
+{
+ assert(mem.type == MEM_PIXELS);
+
+ if (!background) {
+ const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+ float *vpointer;
+
+ CUDAContextScope scope(this);
+
+    /* for multi devices, this assumes the inefficient approach of allocating
+     * all pixels on the device even though we only render to a subset */
+ size_t offset = 4 * y * w;
+
+ if (mem.data_type == TYPE_HALF)
+ offset *= sizeof(GLhalf);
+ else
+ offset *= sizeof(uint8_t);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+ if (mem.data_type == TYPE_HALF) {
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
+ }
+ else {
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
+ }
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ GLint shader_program;
+ if (use_fallback_shader) {
+ if (!bind_fallback_display_space_shader(dw, dh)) {
+ return;
+ }
+ shader_program = fallback_shader_program;
+ }
+ else {
+ draw_params.bind_display_space_shader_cb();
+ glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+ }
+
+ if (!vertex_buffer) {
+ glGenBuffers(1, &vertex_buffer);
+ }
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+ /* invalidate old contents -
+ * avoids stalling if buffer is still waiting in queue to be rendered */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+ if (vpointer) {
+ /* texture coordinate - vertex pair */
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = dx;
+ vpointer[3] = dy;
+
+ vpointer[4] = (float)w / (float)pmem.w;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)width + dx;
+ vpointer[7] = dy;
+
+ vpointer[8] = (float)w / (float)pmem.w;
+ vpointer[9] = (float)h / (float)pmem.h;
+ vpointer[10] = (float)width + dx;
+ vpointer[11] = (float)height + dy;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = (float)h / (float)pmem.h;
+ vpointer[14] = dx;
+ vpointer[15] = (float)height + dy;
+
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+ }
+
+ GLuint vertex_array_object;
+ GLuint position_attribute, texcoord_attribute;
+
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+ position_attribute = glGetAttribLocation(shader_program, "pos");
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ if (use_fallback_shader) {
+ glUseProgram(0);
+ }
+ else {
+ draw_params.unbind_display_space_shader_cb();
+ }
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
+
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ return;
+ }
+
+ Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
+}
+
+void CUDADevice::thread_run(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ DeviceRequestedFeatures requested_features;
+ if (use_split_kernel()) {
+ if (split_kernel == NULL) {
+ split_kernel = new CUDASplitKernel(this);
+ split_kernel->load_kernels(requested_features);
+ }
+ }
+
+ device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ if (use_split_kernel()) {
+ device_only_memory<uchar> void_buffer(this, "void_buffer");
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ render(task, tile, work_tiles);
+ }
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ denoise(tile, denoising);
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+ else if (task.type == DeviceTask::SHADER) {
+ shader(task);
+
+ cuda_assert(cuCtxSynchronize());
+ }
+ else if (task.type == DeviceTask::DENOISE_BUFFER) {
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ DenoisingTask denoising(this, task);
+ denoise(tile, denoising);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+}
+
+void CUDADevice::task_add(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ /* Load texture info. */
+ load_texture_info();
+
+ /* Synchronize all memory copies before executing task. */
+ cuda_assert(cuCtxSynchronize());
+
+ if (task.type == DeviceTask::FILM_CONVERT) {
+ /* must be done in main thread due to opengl access */
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+ }
+ else {
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy);
+ });
+ }
+}
+
+void CUDADevice::task_wait()
+{
+ task_pool.wait();
+}
+
+void CUDADevice::task_cancel()
+{
+ task_pool.cancel();
+}
+
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+# undef cuda_assert
+# define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ device->set_error( \
+ string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+/* CUDA context scope. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_assert(cuCtxPopCurrent(NULL));
+}
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction {
+ CUDADevice *device;
+ CUfunction func;
+
+ public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
+ {
+ }
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
+ {
+ return enqueue(dim, NULL);
+ }
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, void *args[])
+ {
+ if (device->have_error())
+ return false;
+
+ CUDAContextScope scope(device);
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
+ threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks,
+ 1,
+ 1, /* blocks */
+ threads_per_block,
+ 1,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ return !device->have_error();
+ }
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
+ device_memory & /*data*/,
+ size_t num_threads)
+{
+ CUDAContextScope scope(device);
+
+ device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+ size_buffer.alloc(1);
+ size_buffer.zero_to_device();
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
+
+ struct args_t {
+ uint *num_threads;
+ CUdeviceptr *size;
+ };
+
+ args_t args = {&threads, &d_size};
+
+ CUfunction state_buffer_size;
+ cuda_assert(
+ cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
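+  /* Single-thread launch that writes the split state size required for
+   * num_threads into size_buffer. */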
+ cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
+
+ size_buffer.copy_from_device(0, 1, 1);
+ size_t size = size_buffer[0];
+ size_buffer.free();
+
+ return size;
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory & /*kernel_globals*/,
+ device_memory & /*kernel_data*/,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs)
+{
+ CUDAContextScope scope(device);
+
+ CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
+ CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
+ CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
+ CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
+ CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
+
+ CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
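+  /* Argument struct matching the parameter order of
+   * kernel_cuda_path_trace_data_init; each field points at the corresponding
+   * launch argument. */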
+ struct args_t {
+ CUdeviceptr *split_data_buffer;
+ int *num_elements;
+ CUdeviceptr *ray_state;
+ int *start_sample;
+ int *end_sample;
+ int *sx;
+ int *sy;
+ int *sw;
+ int *sh;
+ int *offset;
+ int *stride;
+ CUdeviceptr *queue_index;
+ int *queuesize;
+ CUdeviceptr *use_queues_flag;
+ CUdeviceptr *work_pool_wgs;
+ int *num_samples;
+ CUdeviceptr *buffer;
+ };
+
+ args_t args = {&d_split_data,
+ &num_global_elements,
+ &d_ray_state,
+ &rtile.start_sample,
+ &end_sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ &d_queue_index,
+ &queue_size,
+ &d_use_queues_flag,
+ &d_work_pool_wgs,
+ &rtile.num_samples,
+ &d_buffer};
+
+ CUfunction data_init;
+ cuda_assert(
+ cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+ if (device->have_error()) {
+ return false;
+ }
+
+ CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
+
+ return !device->have_error();
+}
+
+SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &)
+{
+ const CUDAContextScope scope(device);
+
+ CUfunction func;
+ const CUresult result = cuModuleGetFunction(
+ &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
+ if (result != CUDA_SUCCESS) {
+ device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
+ kernel_name.data(),
+ cuewErrorString(result)));
+ return NULL;
+ }
+
+ return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+ return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
+ device_memory &data,
+ DeviceTask & /*task*/)
+{
+ CUDAContextScope scope(device);
+ size_t free;
+ size_t total;
+
+ cuda_assert(cuMemGetInfo(&free, &total));
+
+ VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
+ << " bytes. (" << string_human_readable_size(free) << ").";
+
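+  /* Use at most half of the free device memory for the split state, and round
+   * the launch dimensions down to multiples of 32 and 16. */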
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+ size_t side = round_down((int)sqrt(num_elements), 32);
+ int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+ VLOG(1) << "Global size: " << global_size << ".";
+ return global_size;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 16a68e8b855..407f73e8451 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -25,11 +25,11 @@
#include "util/util_logging.h"
#include "util/util_math.h"
#include "util/util_opengl.h"
-#include "util/util_time.h"
+#include "util/util_string.h"
#include "util/util_system.h"
+#include "util/util_time.h"
#include "util/util_types.h"
#include "util/util_vector.h"
-#include "util/util_string.h"
CCL_NAMESPACE_BEGIN
@@ -38,6 +38,7 @@ bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
+vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
@@ -76,7 +77,7 @@ std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &reques
/* Device */
-Device::~Device()
+Device::~Device() noexcept(false)
{
if (!background) {
if (vertex_buffer != 0) {
@@ -208,13 +209,13 @@ bool Device::bind_fallback_display_space_shader(const float width, const float h
glUseProgram(fallback_shader_program);
image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
if (image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform.";
+ LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
return false;
}
fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
if (fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform.";
+ LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
return false;
}
@@ -287,7 +288,8 @@ void Device::draw_pixels(device_memory &rgba,
}
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
+ /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered
+ */
glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
@@ -364,6 +366,15 @@ void Device::draw_pixels(device_memory &rgba,
Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
+#ifdef WITH_MULTI
+ if (!info.multi_devices.empty()) {
+ /* Always create a multi device when info contains multiple devices.
+ * This is done so that the type can still be e.g. DEVICE_CPU to indicate
+ * that it is a homogeneous collection of devices, which simplifies checks. */
+ return device_multi_create(info, stats, profiler, background);
+ }
+#endif
+
Device *device;
switch (info.type) {
@@ -378,9 +389,12 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
device = NULL;
break;
#endif
-#ifdef WITH_MULTI
- case DEVICE_MULTI:
- device = device_multi_create(info, stats, profiler, background);
+#ifdef WITH_OPTIX
+ case DEVICE_OPTIX:
+ if (device_optix_init())
+ device = device_optix_create(info, stats, profiler, background);
+ else
+ device = NULL;
break;
#endif
#ifdef WITH_NETWORK
@@ -409,6 +423,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CPU;
else if (strcmp(name, "CUDA") == 0)
return DEVICE_CUDA;
+ else if (strcmp(name, "OPTIX") == 0)
+ return DEVICE_OPTIX;
else if (strcmp(name, "OPENCL") == 0)
return DEVICE_OPENCL;
else if (strcmp(name, "NETWORK") == 0)
@@ -425,6 +441,8 @@ string Device::string_from_type(DeviceType type)
return "CPU";
else if (type == DEVICE_CUDA)
return "CUDA";
+ else if (type == DEVICE_OPTIX)
+ return "OPTIX";
else if (type == DEVICE_OPENCL)
return "OPENCL";
else if (type == DEVICE_NETWORK)
@@ -442,6 +460,9 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_CUDA
types.push_back(DEVICE_CUDA);
#endif
+#ifdef WITH_OPTIX
+ types.push_back(DEVICE_OPTIX);
+#endif
#ifdef WITH_OPENCL
types.push_back(DEVICE_OPENCL);
#endif
@@ -473,15 +494,31 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
#endif
-#ifdef WITH_CUDA
- if (mask & DEVICE_MASK_CUDA) {
+#if defined(WITH_CUDA) || defined(WITH_OPTIX)
+ if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) {
if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
if (device_cuda_init()) {
device_cuda_info(cuda_devices);
}
devices_initialized_mask |= DEVICE_MASK_CUDA;
}
- foreach (DeviceInfo &info, cuda_devices) {
+ if (mask & DEVICE_MASK_CUDA) {
+ foreach (DeviceInfo &info, cuda_devices) {
+ devices.push_back(info);
+ }
+ }
+ }
+#endif
+
+#ifdef WITH_OPTIX
+ if (mask & DEVICE_MASK_OPTIX) {
+ if (!(devices_initialized_mask & DEVICE_MASK_OPTIX)) {
+ if (device_optix_init()) {
+ device_optix_info(cuda_devices, optix_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_OPTIX;
+ }
+ foreach (DeviceInfo &info, optix_devices) {
devices.push_back(info);
}
}
@@ -555,15 +592,18 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
DeviceInfo info;
- info.type = DEVICE_MULTI;
+ info.type = subdevices.front().type;
info.id = "MULTI";
info.description = "Multi Device";
info.num = 0;
info.has_half_images = true;
info.has_volume_decoupled = true;
+ info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
+ info.has_peer_memory = false;
+ info.denoisers = DENOISER_ALL;
foreach (const DeviceInfo &device, subdevices) {
/* Ensure CPU device does not slow down GPU. */
@@ -593,11 +633,22 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.multi_devices.push_back(device);
}
+ /* Create unique ID for this combination of devices. */
+ info.id += device.id;
+
+ /* Set device type to MULTI if subdevices are not of a common type. */
+ if (device.type != info.type) {
+ info.type = DEVICE_MULTI;
+ }
+
/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_volume_decoupled &= device.has_volume_decoupled;
+ info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
+ info.has_peer_memory |= device.has_peer_memory;
+ info.denoisers &= device.denoisers;
}
return info;
@@ -612,9 +663,61 @@ void Device::free_memory()
{
devices_initialized_mask = 0;
cuda_devices.free_memory();
+ optix_devices.free_memory();
opencl_devices.free_memory();
cpu_devices.free_memory();
network_devices.free_memory();
}
+/* DeviceInfo */
+
+void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+{
+ assert(denoising_devices.empty());
+
+ if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
+ vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
+ if (!optix_devices.empty()) {
+ /* Convert to a special multi device with separate denoising devices. */
+ if (multi_devices.empty()) {
+ multi_devices.push_back(*this);
+ }
+
+ /* Try to use the same physical devices for denoising. */
+ for (const DeviceInfo &cuda_device : multi_devices) {
+ if (cuda_device.type == DEVICE_CUDA) {
+ for (const DeviceInfo &optix_device : optix_devices) {
+ if (cuda_device.num == optix_device.num) {
+ id += optix_device.id;
+ denoising_devices.push_back(optix_device);
+ break;
+ }
+ }
+ }
+ }
+
+ if (denoising_devices.empty()) {
+ /* Simply use the first available OptiX device. */
+ const DeviceInfo optix_device = optix_devices.front();
+ id += optix_device.id; /* Uniquely identify this special multi device. */
+ denoising_devices.push_back(optix_device);
+ }
+
+ denoisers = denoiser_type;
+ }
+ }
+ else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
+ /* Convert to a special multi device with separate denoising devices. */
+ if (multi_devices.empty()) {
+ multi_devices.push_back(*this);
+ }
+
+ /* Add CPU denoising devices. */
+ DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
+ denoising_devices.push_back(cpu_device);
+
+ denoisers = denoiser_type;
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 15a0ceb4a19..115b05e3911 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -27,13 +27,14 @@
#include "util/util_list.h"
#include "util/util_stats.h"
#include "util/util_string.h"
-#include "util/util_thread.h"
#include "util/util_texture.h"
+#include "util/util_thread.h"
#include "util/util_types.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
+class BVH;
class Progress;
class RenderTile;
@@ -45,13 +46,15 @@ enum DeviceType {
DEVICE_OPENCL,
DEVICE_CUDA,
DEVICE_NETWORK,
- DEVICE_MULTI
+ DEVICE_MULTI,
+ DEVICE_OPTIX,
};
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
+ DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
@@ -72,14 +75,18 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool display_device; /* GPU is used as a display device. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_volume_decoupled; /* Decoupled volume shading. */
+ bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool use_split_kernel; /* Use split or mega kernel. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
+ vector<DeviceInfo> denoising_devices;
DeviceInfo()
{
@@ -90,9 +97,12 @@ class DeviceInfo {
display_device = false;
has_half_images = false;
has_volume_decoupled = false;
+ has_adaptive_stop_per_sample = false;
has_osl = false;
use_split_kernel = false;
has_profiling = false;
+ has_peer_memory = false;
+ denoisers = DENOISER_NONE;
}
bool operator==(const DeviceInfo &info)
@@ -102,6 +112,9 @@ class DeviceInfo {
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
+
+ /* Add additional devices needed for the specified denoiser. */
+ void add_denoising_devices(DenoiserType denoiser_type);
};
class DeviceRequestedFeatures {
@@ -124,6 +137,7 @@ class DeviceRequestedFeatures {
/* BVH/sampling kernel features. */
bool use_hair;
+ bool use_hair_thick;
bool use_object_motion;
bool use_camera_motion;
@@ -170,6 +184,7 @@ class DeviceRequestedFeatures {
max_nodes_group = 0;
nodes_features = 0;
use_hair = false;
+ use_hair_thick = false;
use_object_motion = false;
use_camera_motion = false;
use_baking = false;
@@ -192,6 +207,7 @@ class DeviceRequestedFeatures {
max_nodes_group == requested_features.max_nodes_group &&
nodes_features == requested_features.nodes_features &&
use_hair == requested_features.use_hair &&
+ use_hair_thick == requested_features.use_hair_thick &&
use_object_motion == requested_features.use_object_motion &&
use_camera_motion == requested_features.use_camera_motion &&
use_baking == requested_features.use_baking &&
@@ -311,7 +327,8 @@ class Device {
virtual void mem_free_sub_ptr(device_ptr /*ptr*/){};
public:
- virtual ~Device();
+ /* noexcept needed to silence TBB warning. */
+ virtual ~Device() noexcept(false);
/* info */
DeviceInfo info;
@@ -380,7 +397,11 @@ class Device {
}
/* tasks */
- virtual int get_split_task_count(DeviceTask &task) = 0;
+ virtual int get_split_task_count(DeviceTask &)
+ {
+ return 1;
+ }
+
virtual void task_add(DeviceTask &task) = 0;
virtual void task_wait() = 0;
virtual void task_cancel() = 0;
@@ -399,6 +420,12 @@ class Device {
bool transparent,
const DeviceDrawParams &draw_params);
+ /* acceleration structure building */
+ virtual bool build_optix_bvh(BVH *)
+ {
+ return false;
+ }
+
#ifdef WITH_NETWORK
/* networking */
void server_run();
@@ -412,11 +439,22 @@ class Device {
{
return 0;
}
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/)
+ virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
+ {
+ }
+ virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
{
}
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/)
+
+ virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
+ {
+ /* Memory is always resident if this is not a multi device, regardless of whether the pointer
+ * is valid or not (since it may not have been allocated yet). */
+ return sub_device == this;
+ }
+ virtual bool check_peer_access(Device * /*peer_device*/)
{
+ return false;
}
/* static */
@@ -456,6 +494,7 @@ class Device {
static bool need_types_update, need_devices_update;
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
+ static vector<DeviceInfo> optix_devices;
static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
static vector<DeviceInfo> network_devices;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 837a8186064..ee3a3ddea64 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -29,16 +29,19 @@
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
+// clang-format off
#include "kernel/kernel.h"
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/kernel_adaptive_sampling.h"
#include "kernel/filter/filter.h"
#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"
+// clang-format on
#include "render/buffers.h"
#include "render/coverage.h"
@@ -48,10 +51,12 @@
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_map.h"
+#include "util/util_openimagedenoise.h"
#include "util/util_opengl.h"
#include "util/util_optimization.h"
#include "util/util_progress.h"
#include "util/util_system.h"
+#include "util/util_task.h"
#include "util/util_thread.h"
CCL_NAMESPACE_BEGIN
@@ -114,6 +119,12 @@ template<typename F> class KernelFunctions {
architecture_name = "SSE2";
kernel = kernel_sse2;
}
+#else
+ {
+    /* Dummy block to prevent the `if` below from becoming part of the
+     * conditional chain when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+     * is not defined. */
+ }
#endif
if (strcmp(architecture_name, logged_architecture) != 0) {
@@ -152,7 +163,7 @@ class CPUSplitKernel : public DeviceSplitKernel {
virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
const DeviceRequestedFeatures &);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
};
@@ -167,6 +178,11 @@ class CPUDevice : public Device {
#ifdef WITH_OSL
OSLGlobals osl_globals;
#endif
+#ifdef WITH_OPENIMAGEDENOISE
+ oidn::DeviceRef oidn_device;
+ oidn::FilterRef oidn_filter;
+#endif
+ thread_spin_lock oidn_task_lock;
bool use_split_kernel;
@@ -179,6 +195,7 @@ class CPUDevice : public Device {
convert_to_byte_kernel;
KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
shader_kernel;
+ KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
KernelFunctions<void (*)(
int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
@@ -255,12 +272,13 @@ class CPUDevice : public Device {
CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
: Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_TEXTURE),
+ texture_info(this, "__texture_info", MEM_GLOBAL),
#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
REGISTER_KERNEL(path_trace),
REGISTER_KERNEL(convert_to_half_float),
REGISTER_KERNEL(convert_to_byte),
REGISTER_KERNEL(shader),
+ REGISTER_KERNEL(bake),
REGISTER_KERNEL(filter_divide_shadow),
REGISTER_KERNEL(filter_get_feature),
REGISTER_KERNEL(filter_write_feature),
@@ -311,13 +329,17 @@ class CPUDevice : public Device {
REGISTER_SPLIT_KERNEL(next_iteration_setup);
REGISTER_SPLIT_KERNEL(indirect_subsurface);
REGISTER_SPLIT_KERNEL(buffer_update);
+ REGISTER_SPLIT_KERNEL(adaptive_stopping);
+ REGISTER_SPLIT_KERNEL(adaptive_filter_x);
+ REGISTER_SPLIT_KERNEL(adaptive_filter_y);
+ REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
}
~CPUDevice()
{
- task_pool.stop();
+ task_pool.cancel();
texture_info.free();
}
@@ -329,12 +351,6 @@ class CPUDevice : public Device {
virtual BVHLayoutMask get_bvh_layout_mask() const
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- bvh_layout_mask |= BVH_LAYOUT_BVH4;
- }
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- bvh_layout_mask |= BVH_LAYOUT_BVH8;
- }
#ifdef WITH_EMBREE
bvh_layout_mask |= BVH_LAYOUT_EMBREE;
#endif /* WITH_EMBREE */
@@ -354,6 +370,9 @@ class CPUDevice : public Device {
if (mem.type == MEM_TEXTURE) {
assert(!"mem_alloc not supported for textures.");
}
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
else {
if (mem.name) {
VLOG(1) << "Buffer allocate: " << mem.name << ", "
@@ -378,9 +397,13 @@ class CPUDevice : public Device {
void mem_copy_to(device_memory &mem)
{
- if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
}
else if (mem.type == MEM_PIXELS) {
assert(!"mem_copy_to not supported for pixels.");
@@ -412,8 +435,11 @@ class CPUDevice : public Device {
void mem_free(device_memory &mem)
{
- if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
}
else if (mem.device_pointer) {
if (mem.type == MEM_DEVICE_ONLY) {
@@ -435,51 +461,50 @@ class CPUDevice : public Device {
kernel_const_copy(&kernel_globals, name, host, size);
}
- void tex_alloc(device_memory &mem)
+ void global_alloc(device_memory &mem)
{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
- if (mem.interpolation == INTERPOLATION_NONE) {
- /* Data texture. */
- kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
- }
- else {
- /* Image Texture. */
- int flat_slot = 0;
- if (string_startswith(mem.name, "__tex_image")) {
- int pos = string(mem.name).rfind("_");
- flat_slot = atoi(mem.name + pos + 1);
- }
- else {
- assert(0);
- }
+ kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
- if (flat_slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount
- * of re-allocations. */
- texture_info.resize(flat_slot + 128);
- }
-
- TextureInfo &info = texture_info[flat_slot];
- info.data = (uint64_t)mem.host_pointer;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
- need_texture_info = true;
+ void global_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
}
+ }
+
+ void tex_alloc(device_texture &mem)
+ {
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
mem.device_pointer = (device_ptr)mem.host_pointer;
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.device_size);
+
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce amount of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)mem.host_pointer;
+ need_texture_info = true;
}
- void tex_free(device_memory &mem)
+ void tex_free(device_texture &mem)
{
if (mem.device_pointer) {
mem.device_pointer = 0;
@@ -498,25 +523,18 @@ class CPUDevice : public Device {
#endif
}
- void thread_run(DeviceTask *task)
+ void thread_run(DeviceTask &task)
{
- if (task->type == DeviceTask::RENDER) {
- thread_render(*task);
- }
- else if (task->type == DeviceTask::FILM_CONVERT)
- thread_film_convert(*task);
- else if (task->type == DeviceTask::SHADER)
- thread_shader(*task);
+ if (task.type == DeviceTask::RENDER)
+ thread_render(task);
+ else if (task.type == DeviceTask::SHADER)
+ thread_shader(task);
+ else if (task.type == DeviceTask::FILM_CONVERT)
+ thread_film_convert(task);
+ else if (task.type == DeviceTask::DENOISE_BUFFER)
+ thread_denoise(task);
}
- class CPUDeviceTask : public DeviceTask {
- public:
- CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&CPUDevice::thread_run, device, this);
- }
- };
-
bool denoising_non_local_means(device_ptr image_ptr,
device_ptr guide_ptr,
device_ptr variance_ptr,
@@ -811,7 +829,63 @@ class CPUDevice : public Device {
return true;
}
- void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+ bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
+ {
+ WorkTile wtile;
+ wtile.x = tile.x;
+ wtile.y = tile.y;
+ wtile.w = tile.w;
+ wtile.h = tile.h;
+ wtile.offset = tile.offset;
+ wtile.stride = tile.stride;
+ wtile.buffer = (float *)tile.buffer;
+
+ /* For CPU we do adaptive stopping per sample so we can stop earlier, but
+ * for combined CPU + GPU rendering we match the GPU and do it per tile
+ * after a given number of sample steps. */
+ if (!kernel_data.integrator.adaptive_stop_per_sample) {
+ for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
+ for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
+ const int index = wtile.offset + x + y * wtile.stride;
+ float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
+ kernel_do_adaptive_stopping(kg, buffer, sample);
+ }
+ }
+ }
+
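+    /* Run the row and column filter kernels; if none of them report any
+     * remaining work, the whole tile has converged and sampling can stop. */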
+ bool any = false;
+ for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
+ any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
+ }
+ for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
+ any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
+ }
+ return (!any);
+ }
+
+ void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float *)tile.buffer;
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ int index = tile.offset + x + y * tile.stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
+ buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
+ }
+ }
+ }
+ }
+
+ void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
{
const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
@@ -835,25 +909,262 @@ class CPUDevice : public Device {
break;
}
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- if (use_coverage) {
- coverage.init_pixel(x, y);
+ if (tile.task == RenderTile::PATH_TRACE) {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ else {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
}
- path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
}
}
-
tile.sample = sample + 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ const int num_progress_samples = end_sample - sample;
+ tile.sample = end_sample;
+ task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+ break;
+ }
+ }
+
task.update_progress(&tile, tile.w * tile.h);
}
if (use_coverage) {
coverage.finalize();
}
+
+ if (task.adaptive_sampling.use) {
+ adaptive_sampling_post(tile, kg);
+ }
+ }
+
+ void denoise_openimagedenoise_buffer(DeviceTask &task,
+ float *buffer,
+ const size_t offset,
+ const size_t stride,
+ const size_t x,
+ const size_t y,
+ const size_t w,
+ const size_t h,
+ const float scale)
+ {
+#ifdef WITH_OPENIMAGEDENOISE
+ assert(openimagedenoise_supported());
+
+  /* Only run one denoise at a time: OpenImageDenoise is already multithreaded
+   * for full buffers, and for tiled rendering creating multiple devices and
+   * filters would be slow and memory hungry as well.
+ *
+ * TODO: optimize tiled rendering case, by batching together denoising of many
+ * tiles somehow? */
+ static thread_mutex mutex;
+ thread_scoped_lock lock(mutex);
+
+ /* Create device and filter, cached for reuse. */
+ if (!oidn_device) {
+ oidn_device = oidn::newDevice();
+ oidn_device.commit();
+ }
+ if (!oidn_filter) {
+ oidn_filter = oidn_device.newFilter("RT");
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ }
+
+ /* Set images with appropriate stride for our interleaved pass storage. */
+ struct {
+ const char *name;
+ const int offset;
+ const bool scale;
+ const bool use;
+ array<float> scaled_buffer;
+ } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
+ {"albedo",
+ task.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ true,
+ task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
+ {"normal",
+ task.pass_denoising_data + DENOISING_PASS_NORMAL,
+ true,
+ task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
+ {"output", 0, false, true},
+ { NULL,
+ 0 }};
+
+ for (int i = 0; passes[i].name; i++) {
+ if (!passes[i].use) {
+ continue;
+ }
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
+ const int64_t pixel_stride = task.pass_stride;
+ const int64_t row_stride = stride * pixel_stride;
+
+ if (passes[i].scale && scale != 1.0f) {
+ /* Normalize albedo and normal passes as they are scaled by the number of samples.
+ * For the color passes OIDN will perform auto-exposure making it unnecessary. */
+ array<float> &scaled_buffer = passes[i].scaled_buffer;
+ scaled_buffer.resize(w * h * 3);
+
+ for (int y = 0; y < h; y++) {
+ const float *pass_row = buffer + buffer_offset + y * row_stride;
+ float *scaled_row = scaled_buffer.data() + y * w * 3;
+
+ for (int x = 0; x < w; x++) {
+ scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
+ scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
+ scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
+ }
+ }
+
+ oidn_filter.setImage(
+ passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
+ }
+ else {
+ oidn_filter.setImage(passes[i].name,
+ buffer + buffer_offset,
+ oidn::Format::Float3,
+ w,
+ h,
+ 0,
+ pixel_stride * sizeof(float),
+ row_stride * sizeof(float));
+ }
+ }
+
+ /* Execute filter. */
+ oidn_filter.commit();
+ oidn_filter.execute();
+#else
+ (void)task;
+ (void)buffer;
+ (void)offset;
+ (void)stride;
+ (void)x;
+ (void)y;
+ (void)w;
+ (void)h;
+ (void)scale;
+#endif
}
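For reference, a standalone sketch of the OpenImageDenoise calls made above, reduced to a single interleaved RGB float buffer; it assumes OIDN 1.x headers and omits the albedo/normal passes, pass strides and the Cycles buffer layout.

    // Not Cycles code: a bare-bones OIDN "RT" filter invocation.
    #include <OpenImageDenoise/oidn.hpp>
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int width = 64, height = 64;
      std::vector<float> color(width * height * 3, 0.5f);
      std::vector<float> output(width * height * 3, 0.0f);

      /* Device and filter are expensive to create, which is why the code above
       * caches them in oidn_device / oidn_filter. */
      oidn::DeviceRef device = oidn::newDevice();
      device.commit();

      oidn::FilterRef filter = device.newFilter("RT");
      filter.setImage("color", color.data(), oidn::Format::Float3, width, height);
      filter.setImage("output", output.data(), oidn::Format::Float3, width, height);
      filter.set("hdr", true); /* input is linear HDR radiance */
      filter.commit();
      filter.execute();

      const char *error_message;
      if (device.getError(error_message) != oidn::Error::None) {
        fprintf(stderr, "OIDN error: %s\n", error_message);
        return 1;
      }
      return 0;
    }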
- void denoise(DenoisingTask &denoising, RenderTile &tile)
+ void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+ {
+ if (task.type == DeviceTask::DENOISE_BUFFER) {
+ /* Copy pixels from compute device to CPU (no-op for CPU device). */
+ rtile.buffers->buffer.copy_from_device();
+
+ denoise_openimagedenoise_buffer(task,
+ (float *)rtile.buffer,
+ rtile.offset,
+ rtile.stride,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ 1.0f / rtile.sample);
+
+    /* TODO: it may be possible to avoid this copy, but we have to ensure that
+     * when other code copies data from the device it doesn't overwrite the
+     * denoiser buffers. */
+ rtile.buffers->buffer.copy_to_device();
+ }
+ else {
+ /* Per-tile denoising. */
+ rtile.sample = rtile.start_sample + rtile.num_samples;
+ const float scale = 1.0f / rtile.sample;
+ const float invscale = rtile.sample;
+ const size_t pass_stride = task.pass_stride;
+
+ /* Map neighboring tiles into one buffer for denoising. */
+ RenderTileNeighbors neighbors(rtile);
+ task.map_neighbor_tiles(neighbors, this);
+ RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+ rtile = center_tile;
+
+      /* Calculate the size of the tile to denoise (including overlap). The
+       * overlap size was chosen empirically: OpenImageDenoise specifies an
+       * overlap of 128 pixels, but that is significantly bigger than the
+       * typical tile size. */
+ const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
+ const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
+
+ /* Adjacent tiles are in separate memory regions, copy into single buffer. */
+ array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
+
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ RenderTile &ntile = neighbors.tiles[i];
+ if (!ntile.buffer) {
+ continue;
+ }
+
+ const int xmin = max(ntile.x, rect.x);
+ const int ymin = max(ntile.y, rect.y);
+ const int xmax = min(ntile.x + ntile.w, rect.z);
+ const int ymax = min(ntile.y + ntile.h, rect.w);
+
+ const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+ const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
+
+ const size_t merged_stride = rect_size.x;
+ const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+ float *merged_buffer = merged.data() + merged_offset * pass_stride;
+
+ for (int y = ymin; y < ymax; y++) {
+ for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
+ merged_buffer[x] = tile_buffer[x] * scale;
+ }
+ tile_buffer += ntile.stride * pass_stride;
+ merged_buffer += merged_stride * pass_stride;
+ }
+ }
+
+ /* Denoise */
+ denoise_openimagedenoise_buffer(
+ task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
+
+ /* Copy back result from merged buffer. */
+ RenderTile &ntile = neighbors.target;
+ if (ntile.buffer) {
+ const int xmin = max(ntile.x, rect.x);
+ const int ymin = max(ntile.y, rect.y);
+ const int xmax = min(ntile.x + ntile.w, rect.z);
+ const int ymax = min(ntile.y + ntile.h, rect.w);
+
+ const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+ float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
+
+ const size_t merged_stride = rect_size.x;
+ const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+ const float *merged_buffer = merged.data() + merged_offset * pass_stride;
+
+ for (int y = ymin; y < ymax; y++) {
+ for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
+ tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
+ tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
+ tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
+ }
+ tile_buffer += ntile.stride * pass_stride;
+ merged_buffer += merged_stride * pass_stride;
+ }
+ }
+
+ task.unmap_neighbor_tiles(neighbors, this);
+ }
+ }
+
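The overlap computation above relies on Cycles' int4 rectangle helpers; the sketch below restates the same geometry with an explicit struct (xmin, ymin, xmax, ymax) and is not the actual util_rect.h implementation.

    // Stand-in rectangle helpers matching the semantics used above.
    #include <algorithm>
    #include <cstdio>

    struct Rect {
      int xmin, ymin, xmax, ymax;
    };

    // Grow a rectangle by `d` pixels on every side (the denoising overlap).
    static Rect rect_expand(const Rect &r, int d)
    {
      return {r.xmin - d, r.ymin - d, r.xmax + d, r.ymax + d};
    }

    // Clip a rectangle against the bounds of the mapped neighbor tiles.
    static Rect rect_clip(const Rect &r, const Rect &bounds)
    {
      return {std::max(r.xmin, bounds.xmin),
              std::max(r.ymin, bounds.ymin),
              std::min(r.xmax, bounds.xmax),
              std::min(r.ymax, bounds.ymax)};
    }

    int main()
    {
      const Rect center_tile = {128, 128, 192, 192};  /* 64x64 tile */
      const Rect neighbor_bounds = {64, 64, 256, 256};

      /* 64 pixel overlap, clipped to what the neighbors actually provide. */
      const Rect rect = rect_clip(rect_expand(center_tile, 64), neighbor_bounds);
      printf("denoise region: (%d,%d)-(%d,%d)\n", rect.xmin, rect.ymin, rect.xmax, rect.ymax);
      return 0;
    }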
+ void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
{
ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
@@ -881,7 +1192,7 @@ class CPUDevice : public Device {
denoising.render_buffer.samples = tile.sample;
denoising.buffer.gpu_temporary_mem = false;
- denoising.run_denoising(&tile);
+ denoising.run_denoising(tile);
}
void thread_render(DeviceTask &task)
@@ -911,22 +1222,46 @@ class CPUDevice : public Device {
}
}
- RenderTile tile;
- DenoisingTask denoising(this, task);
- denoising.profiler = &kg->profiler;
+ /* NLM denoiser. */
+ DenoisingTask *denoising = NULL;
+
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+      if (oidn_task_lock.try_lock()) {
+        hold_denoise_lock = true;
+      }
+      else {
+        tile_types &= ~RenderTile::DENOISE;
+      }
+ }
- while (task.acquire_tile(this, tile)) {
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, tile_types)) {
if (tile.task == RenderTile::PATH_TRACE) {
if (use_split_kernel) {
device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
+ split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
}
else {
- path_trace(task, tile, kg);
+ render(task, tile, kg);
}
}
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, kg);
+ }
else if (tile.task == RenderTile::DENOISE) {
- denoise(denoising, tile);
+ if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ denoise_openimagedenoise(task, tile);
+ }
+ else if (task.denoising.type == DENOISER_NLM) {
+ if (denoising == NULL) {
+ denoising = new DenoisingTask(this, task);
+ denoising->profiler = &kg->profiler;
+ }
+ denoise_nlm(*denoising, tile);
+ }
task.update_progress(&tile, tile.w * tile.h);
}
@@ -938,12 +1273,50 @@ class CPUDevice : public Device {
}
}
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
profiler.remove_state(&kg->profiler);
thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
kg->~KernelGlobals();
kgbuffer.free();
delete split_kernel;
+ delete denoising;
+ }
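A small sketch of the single-denoiser-thread pattern in thread_render() above, using a plain std::mutex and a simplified tile-type bitmask; the worker function and tile types here are invented for illustration.

    // Only one worker may hold the denoise lock; the others drop the DENOISE
    // bit instead of blocking, and keep path tracing.
    #include <cstdio>
    #include <mutex>

    enum TileType { PATH_TRACE = 1 << 0, DENOISE = 1 << 1 };

    static std::mutex oidn_task_lock;

    static void worker(int thread_id, unsigned tile_types)
    {
      bool hold_denoise_lock = false;

      if (tile_types & DENOISE) {
        if (oidn_task_lock.try_lock()) {
          hold_denoise_lock = true;
        }
        else {
          tile_types &= ~DENOISE;
        }
      }

      printf("thread %d will %sdenoise\n", thread_id, (tile_types & DENOISE) ? "" : "not ");

      /* ... acquire and process tiles matching tile_types ... */

      if (hold_denoise_lock) {
        oidn_task_lock.unlock();
      }
    }

    int main()
    {
      worker(0, PATH_TRACE | DENOISE);
      worker(1, PATH_TRACE | DENOISE);
      return 0;
    }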
+
+ void thread_denoise(DeviceTask &task)
+ {
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ denoise_openimagedenoise(task, tile);
+ }
+ else {
+ DenoisingTask denoising(this, task);
+
+ ProfilingState denoising_profiler_state;
+ profiler.add_state(&denoising_profiler_state);
+ denoising.profiler = &denoising_profiler_state;
+
+ denoise_nlm(denoising, tile);
+
+ profiler.remove_state(&denoising_profiler_state);
+ }
+
+ task.update_progress(&tile, tile.w * tile.h);
}
void thread_film_convert(DeviceTask &task)
@@ -978,14 +1351,11 @@ class CPUDevice : public Device {
void thread_shader(DeviceTask &task)
{
- KernelGlobals kg = kernel_globals;
+ KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
-#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
for (int sample = 0; sample < task.num_samples; sample++) {
for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(&kg,
+ shader_kernel()(kg,
(uint4 *)task.shader_input,
(float4 *)task.shader_output,
task.shader_eval_type,
@@ -1000,9 +1370,8 @@ class CPUDevice : public Device {
task.update_progress(NULL);
}
-#ifdef WITH_OSL
- OSLShader::thread_free(&kg);
-#endif
+ thread_kernel_globals_free(kg);
+ delete kg;
}
int get_split_task_count(DeviceTask &task)
@@ -1021,13 +1390,24 @@ class CPUDevice : public Device {
/* split task into smaller ones */
list<DeviceTask> tasks;
- if (task.type == DeviceTask::SHADER)
+ if (task.type == DeviceTask::DENOISE_BUFFER &&
+ task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+    /* Denoise the entire buffer at once with OIDN, which has its own threading. */
+ tasks.push_back(task);
+ }
+ else if (task.type == DeviceTask::SHADER) {
task.split(tasks, info.cpu_threads, 256);
- else
+ }
+ else {
task.split(tasks, info.cpu_threads);
+ }
- foreach (DeviceTask &task, tasks)
- task_pool.push(new CPUDeviceTask(this, task));
+ foreach (DeviceTask &task, tasks) {
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy);
+ });
+ }
}
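The lambda in task_add() above captures the split task by value so it stays valid after the local tasks list is destroyed, and hands a mutable copy to thread_run(). A sketch of the same pattern with a hypothetical std::function-based pool:

    // TaskPool::push here is a std::function stand-in, not the Cycles TaskPool API.
    #include <cstdio>
    #include <functional>
    #include <list>
    #include <vector>

    struct Task {
      int first_sample, num_samples;
    };

    struct TaskPool {
      std::vector<std::function<void()>> queue;

      void push(std::function<void()> fn)
      {
        queue.push_back(std::move(fn));
      }

      void wait_work()
      {
        /* A real pool would run these on worker threads. */
        for (auto &fn : queue) {
          fn();
        }
        queue.clear();
      }
    };

    static void thread_run(Task &task)
    {
      printf("rendering samples %d..%d\n", task.first_sample, task.first_sample + task.num_samples - 1);
    }

    int main()
    {
      TaskPool pool;
      std::list<Task> tasks = {{0, 16}, {16, 16}};

      for (const Task &task : tasks) {
        /* Capture by value: the copy outlives `tasks`, and a mutable local copy
         * is handed to thread_run(), which takes a non-const reference. */
        pool.push([=] {
          Task task_copy = task;
          thread_run(task_copy);
        });
      }

      pool.wait_work();
      return 0;
    }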
void task_wait()
@@ -1192,7 +1572,7 @@ int2 CPUSplitKernel::split_kernel_local_size()
int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
device_memory & /*data*/,
- DeviceTask * /*task*/)
+ DeviceTask & /*task*/)
{
return make_int2(1, 1);
}
@@ -1220,9 +1600,14 @@ void device_cpu_info(vector<DeviceInfo> &devices)
info.id = "CPU";
info.num = 0;
info.has_volume_decoupled = true;
+ info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_half_images = true;
info.has_profiling = true;
+ info.denoisers = DENOISER_NLM;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
devices.insert(devices.begin(), info);
}
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 68bc3bd4045..d9ffcceb06e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -14,2530 +14,21 @@
* limitations under the License.
*/
-#include <climits>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#ifdef WITH_CUDA
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
+# include "device/cuda/device_cuda.h"
+# include "device/device.h"
+# include "device/device_intern.h"
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-#else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-#endif
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_md5.h"
-#include "util/util_opengl.h"
-#include "util/util_path.h"
-#include "util/util_string.h"
-#include "util/util_system.h"
-#include "util/util_types.h"
-#include "util/util_time.h"
-
-#include "kernel/split/kernel_split_data_types.h"
+# include "util/util_logging.h"
+# include "util/util_string.h"
+# include "util/util_windows.h"
CCL_NAMESPACE_BEGIN
-#ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so the majority of the file does not
- * need to worry about the difference between dynamically loaded and linked CUDA
- * at all. */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give error code here without major code duplication, that
- * should be enough since dynamic loading is only being disabled by folks
- * who knows what they're doing anyway.
- *
- * NOTE: Avoid call from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-#endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-class CUDADevice : public Device {
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
- void *map_host_pointer;
- bool free_map_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- CUdeviceptr cuda_device_ptr(device_ptr mem)
- {
- return (CUdeviceptr)mem;
- }
-
- static bool have_precompiled_kernels()
- {
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
- }
-
- virtual bool show_samples() const
- {
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- /*#ifdef NDEBUG
-#define cuda_abort()
-#else
-#define cuda_abort() abort()
-#endif*/
- void cuda_error_documentation()
- {
- if (first_error) {
- fprintf(stderr,
- "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
- }
-
-#define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
-\
- if (result != CUDA_SUCCESS) { \
- string message = string_printf( \
- "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
- if (error_msg == "") \
- error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- /*cuda_abort();*/ \
- cuda_error_documentation(); \
- } \
- } \
- (void)0
-
- bool cuda_error_(CUresult result, const string &stmt)
- {
- if (result == CUDA_SUCCESS)
- return false;
-
- string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- cuda_error_documentation();
- return true;
- }
-
-#define cuda_error(stmt) cuda_error_(stmt, #stmt)
-
- void cuda_error_message(const string &message)
- {
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- cuda_error_documentation();
- }
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- texture_info(this, "__texture_info", MEM_TEXTURE)
- {
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
-
-    /* Initialize CUDA. */
- if (cuda_error(cuInit(0)))
- return;
-
- /* Setup device and context. */
- if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
- return;
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- CUresult result;
-
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (cuda_error_(result, "cuCtxCreate"))
- return;
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
- }
-
- ~CUDADevice()
- {
- task_pool.stop();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
- }
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/)
- {
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
-    /* We only support sm_30 and above. */
- if (major < 3) {
- cuda_error_message(string_printf(
- "CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
- major,
- minor));
- return false;
- }
-
- return true;
- }
-
- bool use_adaptive_compilation()
- {
- return DebugFlags().cuda.adaptive_compile;
- }
-
- bool use_split_kernel()
- {
- return DebugFlags().cuda.split_kernel;
- }
-
-  /* Common NVCC flags which stay the same regardless of shading model or
-   * kernel sources MD5, and only depend on the compiler or compilation settings.
- */
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter = false,
- bool split = false)
- {
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-#ifdef WITH_CYCLES_DEBUG
- cflags += " -D__KERNEL_DEBUG__";
-#endif
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
- return cflags;
- }
-
- bool compile_check_compiler()
- {
- const char *nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- cuda_error_message(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return false;
- }
- const int cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << ".";
- const int major = cuda_version / 10, minor = cuda_version % 10;
- if (cuda_version == 0) {
- cuda_error_message("CUDA nvcc compiler version could not be parsed.");
- return false;
- }
- if (cuda_version < 80) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 8.0 or newer.\n",
- major,
- minor);
- return false;
- }
- else if (cuda_version != 101) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 is officially supported.\n",
- major,
- minor);
- }
- return true;
- }
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- bool filter = false,
- bool split = false)
- {
- const char *name, *source;
- if (filter) {
- name = "filter";
- source = "filter.cu";
- }
- else if (split) {
- name = "kernel_split";
- source = "kernel_split.cu";
- }
- else {
- name = "kernel";
- source = "kernel.cu";
- }
- /* Compute cubin name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- const string common_cflags = compile_kernel_get_common_cflags(
- requested_features, filter, split);
-
- /* Try to use locally compiled kernel. */
- const string source_path = path_get("source");
- const string kernel_md5 = path_files_md5_hash(source_path);
-
-    /* We include cflags in the MD5, so that changing the CUDA toolkit or other
-     * compiler command line arguments makes sure the cubin gets re-built.
- */
- const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
-
- const string cubin_file = string_printf(
- "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str());
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-#ifdef _WIN32
- if (have_precompiled_kernels()) {
- if (major < 3) {
- cuda_error_message(
- string_printf("CUDA device requires compute capability 3.0 or up, "
- "found %d.%d. Your GPU is not supported.",
- major,
- minor));
- }
- else {
- cuda_error_message(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return "";
- }
-#endif
-
- /* Compile. */
- if (!compile_check_compiler()) {
- return "";
- }
- const char *nvcc = cuewCompilerPath();
- const string kernel = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join("cuda", source)));
- double starttime = time_dt();
- printf("Compiling CUDA kernel ...\n");
-
- path_create_directories(cubin);
-
- string command = string_printf(
- "\"%s\" "
- "-arch=sm_%d%d "
- "--cubin \"%s\" "
- "-o \"%s\" "
- "%s ",
- nvcc,
- major,
- minor,
- kernel.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("%s\n", command.c_str());
-
- if (system(command.c_str()) == -1) {
- cuda_error_message(
- "Failed to execute compilation command, "
- "see console for details.");
- return "";
- }
-
-    /* Verify that compilation succeeded. */
- if (!path_exists(cubin)) {
- cuda_error_message(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return "";
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- /* TODO(sergey): Support kernels re-load for CUDA devices.
- *
- * Currently re-loading kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* check if cuda init succeeded */
- if (cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if (!support_device(requested_features))
- return false;
-
- /* get kernel */
- string cubin = compile_kernel(requested_features, false, use_split_kernel());
- if (cubin == "")
- return false;
-
- string filter_cubin = compile_kernel(requested_features, true, false);
- if (filter_cubin == "")
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (cuda_error_(result, "cuModuleLoad"))
- cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (cuda_error_(result, "cuModuleLoad"))
- cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- return (result == CUDA_SUCCESS);
- }
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features)
- {
- if (use_split_kernel()) {
-      /* The split kernel mostly uses global memory and adaptive compilation,
-       * so it is currently difficult to predict how much is needed. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuPathTrace;
-
- if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(
- &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-
-    /* Launch the kernel; using just 1 block appears sufficient to reserve
-     * memory for all multiprocessors. It would still be good to do this in
-     * parallel for the multi-GPU case to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-#if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while(free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-#endif
- }
-
- void init_host_memory()
- {
-    /* Limit the amount of host mapped memory, because allocating too much can
-     * cause system instability. Leave at least half of system memory or 4 GB
-     * free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
-    /* Amount of device memory to keep free after texture memory
-     * and working memory allocations respectively. We set the working
-     * memory headroom lower so that some space is left after all
-     * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- void move_textures_to_host(size_t size, bool for_texture)
- {
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* Already in host memory. */
- if (cmem->map_host_pointer) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
-        /* Preserve the original device pointer; in the multi-device case we
-         * can't change it because the pointer mapping would break. */
- device_ptr prev_pointer = max_mem->device_pointer;
- size_t prev_size = max_mem->device_size;
-
- tex_free(*max_mem);
- tex_alloc(*max_mem);
- size = (max_size >= size) ? 0 : size - max_size;
-
- max_mem->device_pointer = prev_pointer;
- max_mem->device_size = prev_size;
- }
- else {
- break;
- }
- }
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-
- move_texture_to_host = false;
- }
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
- {
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
- void *map_host_pointer = 0;
- bool free_map_host = false;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
- map_host_used + size < map_host_limit) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- map_host_pointer = mem.shared_pointer;
- }
- else {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
- mem.shared_pointer = map_host_pointer;
- free_map_host = true;
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
-
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != mem.shared_pointer) {
- memcpy(mem.shared_pointer, mem.host_pointer, size);
- mem.host_free();
- mem.host_pointer = mem.shared_pointer;
- }
- }
- else {
- status = " failed, out of host memory";
- }
- }
- else if (mem_alloc_result != CUDA_SUCCESS) {
- status = " failed, out of device and host memory";
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- cuda_assert(mem_alloc_result);
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- CUDAMem *cmem = &cuda_mem_map[&mem];
- cmem->map_host_pointer = map_host_pointer;
- cmem->free_map_host = free_map_host;
- return cmem;
- }
-
- void generic_copy_to(device_memory &mem)
- {
- if (mem.host_pointer && mem.device_pointer) {
- CUDAContextScope scope(this);
-
- if (mem.host_pointer != mem.shared_pointer) {
- cuda_assert(cuMemcpyHtoD(
- cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
- }
- }
- }
-
- void generic_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.map_host_pointer) {
- /* Free host memory. */
- if (cmem.free_map_host) {
- cuMemFreeHost(cmem.map_host_pointer);
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- mem.shared_pointer = 0;
- }
-
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuMemFree(mem.device_pointer);
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else {
- generic_alloc(mem);
- }
- }
-
- void mem_copy_to(device_memory &mem)
- {
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
-
- generic_copy_to(mem);
- }
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else {
- CUDAContextScope scope(this);
- size_t offset = elem * y * w;
- size_t size = elem * w * h;
-
- if (mem.host_pointer && mem.device_pointer) {
- cuda_assert(cuMemcpyDtoH(
- (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size));
- }
- else if (mem.host_pointer) {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
- }
-
- void mem_zero(device_memory &mem)
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
- CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
- }
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- }
- else {
- generic_free(mem);
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- //assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
- }
-
- void tex_alloc(device_memory &mem)
- {
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Data Storage */
- if (mem.interpolation == INTERPOLATION_NONE) {
- generic_alloc(mem);
- generic_copy_to(mem);
-
- CUdeviceptr cumem;
- size_t cubytes;
-
- cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
- if (cubytes == 8) {
- /* 64 bit device pointer */
- uint64_t ptr = mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
- }
- else {
- /* 32 bit device pointer */
- uint32_t ptr = (uint32_t)mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
- }
- return;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (mem.data_depth > 1) {
- /* 3D texture using array, there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- int alignment = 0;
- cuda_assert(
- cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
- dst_pitch = align_up(src_pitch, alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Kepler+, bindless textures. */
- int flat_slot = 0;
- if (string_startswith(mem.name, "__tex_image")) {
- int pos = string(mem.name).rfind("_");
- flat_slot = atoi(mem.name + pos + 1);
- }
- else {
- assert(0);
- }
-
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- /* Resize once */
- if (flat_slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount
- * of re-allocations. */
- texture_info.resize(flat_slot + 128);
- }
-
-    /* Set mapping and tag that we need to (re-)upload to the device. */
- TextureInfo &info = texture_info[flat_slot];
- info.data = (uint64_t)cmem->texobject;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
- need_texture_info = true;
- }
-
- void tex_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- generic_free(mem);
- }
- }
- }
-
-#define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-#define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks. */
-#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-#define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
-
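The CUDA_GET_BLOCKSIZE macro above boils down to the usual square-block grid computation; written out as plain arithmetic (with threads_per_block hard-coded instead of queried via cuFuncGetAttribute) it looks like this.

    // Grid/block sizing for a 2D launch covering a w x h image.
    #include <cmath>
    #include <cstdio>

    int main()
    {
      const int threads_per_block = 1024; /* typical CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK */
      const int w = 1920, h = 1080;

      /* Square 2D blocks: 32x32 threads for a 1024-thread limit. */
      const int threads = (int)sqrt((float)threads_per_block);
      const int xblocks = (w + threads - 1) / threads;
      const int yblocks = (h + threads - 1) / threads;

      printf("launch %dx%d blocks of %dx%d threads\n", xblocks, yblocks, threads, threads);
      return 0;
    }

The 1D variant skips the square root and simply divides the work size by threads_per_block.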
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(cuModuleGetFunction(
- &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w *
- task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&image_ptr,
- &variance_ptr,
- &depth_ptr,
- &output_ptr,
- &task->rect,
- &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising)
- {
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(&rtile);
- }
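
The table above routes each generic denoising step to a CUDA implementation via function_bind, which behaves like std::bind: the device and task pointers are bound up front and the placeholders forward the per-call arguments. A minimal standalone sketch of that pattern follows, using illustrative names (ToyDevice, ToyTask) rather than the real Cycles types.

#include <cstdio>
#include <functional>

struct ToyTask; /* forward declaration so the device method can take a task pointer */

struct ToyDevice {
  bool get_feature(int mean_offset, int variance_offset, ToyTask *task);
};

struct ToyTask {
  /* Per-task function table, filled in by the device before the task runs. */
  std::function<bool(int, int)> get_feature;
};

bool ToyDevice::get_feature(int mean_offset, int variance_offset, ToyTask * /*task*/)
{
  printf("get_feature(%d, %d)\n", mean_offset, variance_offset);
  return true;
}

int main()
{
  ToyDevice device;
  ToyTask task;

  using namespace std::placeholders;
  /* Placeholders _1/_2 forward the per-call arguments; the task pointer is bound up front,
   * mirroring function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, ..., &denoising). */
  task.get_feature = std::bind(&ToyDevice::get_feature, &device, _1, _2, &task);

  return task.get_feature(3, 4) ? 0 : 1;
}
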
-
- void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
- {
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuPathTrace;
-
- /* Get kernel function. */
- if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(
- &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample += step_samples) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = min(step_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(
- cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- rtile.sample = sample + wtile->num_samples;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
- }
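
The step-sample heuristic above sizes each launch so the GPU is saturated at least once per kernel invocation, scaled up eightfold when no display is attached (longer launches are safe without a watchdog). Below is a standalone restatement with hypothetical occupancy and tile numbers; the real values come from cuOccupancyMaxPotentialBlockSize at runtime.

#include <cstdio>

static unsigned divide_up(unsigned x, unsigned y)
{
  return (x + y - 1) / y;
}

int main()
{
  const bool display_device = false;
  unsigned min_blocks = 40;             /* hypothetical result of cuOccupancyMaxPotentialBlockSize */
  const unsigned threads_per_block = 256;
  const unsigned tile_w = 256, tile_h = 256;

  if (!display_device) {
    min_blocks *= 8; /* allow longer launches when no display is attached */
  }

  const unsigned step_samples = divide_up(min_blocks * threads_per_block, tile_w * tile_h);
  printf("samples per kernel launch: %u\n", step_samples); /* 2 for these numbers */
  return 0;
}
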
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
- {
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = cuda_device_ptr(buffer);
-
- /* get kernel function */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
- }
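
The launch configuration above turns the kernel's per-block thread limit into a near-square 2D block and covers the tile with ceil-divided blocks. A worked example with a hypothetical 1024-thread limit and a 1920x1080 region:

#include <cmath>
#include <cstdio>

int main()
{
  const int threads_per_block = 1024; /* hypothetical CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK */
  const int w = 1920, h = 1080;       /* region to convert */

  const int xthreads = (int)sqrt((double)threads_per_block); /* 32 */
  const int ythreads = xthreads;
  const int xblocks = (w + xthreads - 1) / xthreads; /* 60 */
  const int yblocks = (h + ythreads - 1) / ythreads; /* 34 */

  printf("grid %dx%d of %dx%d threads\n", xblocks, yblocks, xthreads, ythreads);
  return 0;
}
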
-
- void shader(DeviceTask &task)
- {
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
- CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-
- /* get kernel function */
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
- }
- else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* do tasks in smaller chunks, so we can cancel it */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
- }
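
shader() above walks the evaluation range in 65536-element chunks so cancellation is only checked between kernel launches, keeping each launch short. A device-agnostic sketch of that loop shape, with illustrative sizes and a stand-in cancel flag:

#include <algorithm>
#include <atomic>
#include <cstdio>

int main()
{
  const int chunk_size = 65536;
  const int start = 0, width = 200000;
  std::atomic<bool> cancel(false);

  for (int x = start; x < start + width; x += chunk_size) {
    const int w = std::min(chunk_size, start + width - x);
    printf("evaluate shaders for [%d, %d)\n", x, x + w); /* kernel launch would go here */

    if (cancel.load()) {
      break; /* checked between chunks, like task.get_cancel() above */
    }
  }
  return 0;
}
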
-
- CUdeviceptr map_pixels(device_ptr mem)
- {
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return cuda_device_ptr(mem);
- }
-
- void unmap_pixels(device_ptr mem)
- {
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
- }
-
- void pixels_alloc(device_memory &mem)
- {
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fallback to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
- }
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h)
- {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
- }
-
- void pixels_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
- {
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
- /* for multi devices, this assumes the inefficient method that we allocate
- * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
- }
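
For reference, the interleaved vertex buffer filled above is a single triangle-fan quad of four (u, v, x, y) vertices with a 4-float stride: texCoord reads floats 0-1 and pos reads floats 2-3. The snippet below restates the same values outside of GL, with placeholder numbers and no GL calls.

#include <cstdio>

int main()
{
  /* Placeholder values mirroring draw_pixels(): dx/dy/width/height describe the draw
   * rectangle, u and v are w / pmem.w and h / pmem.h (used fraction of the PBO texture). */
  const float dx = 0.0f, dy = 0.0f, width = 960.0f, height = 540.0f;
  const float u = 1.0f, v = 1.0f;

  const float quad[4][4] = {
      /*  u     v     x            y */
      {0.0f, 0.0f, dx, dy},
      {u, 0.0f, dx + width, dy},
      {u, v, dx + width, dy + height},
      {0.0f, v, dx, dy + height},
  };

  for (int i = 0; i < 4; i++) {
    printf("uv=(%g, %g) pos=(%g, %g)\n", quad[i][0], quad[i][1], quad[i][2], quad[i][3]);
  }
  return 0;
}
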
-
- void thread_run(DeviceTask *task)
- {
- CUDAContextScope scope(this);
-
- if (task->type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, *task);
-
- while (task->acquire_tile(this, tile)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- path_trace(*task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task->update_progress(&tile, tile.w * tile.h);
- }
-
- task->release_tile(tile);
-
- if (task->get_cancel()) {
- if (task->need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task->type == DeviceTask::SHADER) {
- shader(*task);
-
- cuda_assert(cuCtxSynchronize());
- }
- }
-
- class CUDADeviceTask : public DeviceTask {
- public:
- CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&CUDADevice::thread_run, device, this);
- }
- };
-
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask &task)
- {
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push(new CUDADeviceTask(this, task));
- }
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-};
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-#undef cuda_assert
-#define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
-\
- if (result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
- if (device->error_msg == "") \
- device->error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- /*cuda_abort();*/ \
- device->cuda_error_documentation(); \
- } \
- } \
- (void)0
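
The redefined macro above follows the usual driver-API error-check shape: evaluate the statement once, translate the CUresult, then record and log the error without aborting so rendering can continue in a degraded state. A simplified generic sketch (not the Cycles macro; it uses cuGetErrorName instead of cuew and needs the CUDA driver library to build):

#include <cstdio>
#include <cuda.h>

#define CU_CHECK(stmt) \
  do { \
    CUresult result_ = (stmt); \
    if (result_ != CUDA_SUCCESS) { \
      const char *name_ = NULL; \
      cuGetErrorName(result_, &name_); \
      fprintf(stderr, "CUDA error: %s in %s\n", name_ ? name_ : "unknown", #stmt); \
    } \
  } while (0)

int main()
{
  CU_CHECK(cuInit(0)); /* example use */
  return 0;
}
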
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
- CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
- CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
- CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
- CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
-
- CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CUDAContextScope scope(device);
- CUfunction func;
-
- cuda_assert(
- cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
- if (device->have_error()) {
- device->cuda_error_message(
- string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask * /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
bool device_cuda_init()
{
-#ifdef WITH_CUDA_DYNLOAD
+# ifdef WITH_CUDA_DYNLOAD
static bool initialized = false;
static bool result = false;
@@ -2552,7 +43,6 @@ bool device_cuda_init()
VLOG(1) << "Found precompiled kernels";
result = true;
}
-# ifndef _WIN32
else if (cuewCompilerPath() != NULL) {
VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
result = true;
@@ -2561,7 +51,6 @@ bool device_cuda_init()
VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
<< " unable to use CUDA";
}
-# endif
}
else {
VLOG(1) << "CUEW initialization failed: "
@@ -2570,9 +59,9 @@ bool device_cuda_init()
}
return result;
-#else /* WITH_CUDA_DYNLOAD */
+# else /* WITH_CUDA_DYNLOAD */
return true;
-#endif /* WITH_CUDA_DYNLOAD */
+# endif /* WITH_CUDA_DYNLOAD */
}
Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
@@ -2582,7 +71,7 @@ Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, b
static CUresult device_cuda_safe_init()
{
-#ifdef _WIN32
+# ifdef _WIN32
__try {
return cuInit(0);
}
@@ -2593,9 +82,9 @@ static CUresult device_cuda_safe_init()
}
return CUDA_ERROR_NO_DEVICE;
-#else
+# else
return cuInit(0);
-#endif
+# endif
}
void device_cuda_info(vector<DeviceInfo> &devices)
@@ -2640,6 +129,17 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_volume_decoupled = false;
+ info.has_adaptive_stop_per_sample = false;
+ info.denoisers = DENOISER_NLM;
+
+ /* Check if the device has P2P access to any other device in the system. */
+ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
+ if (num != peer_num) {
+ int can_access = 0;
+ cuDeviceCanAccessPeer(&can_access, num, peer_num);
+ info.has_peer_memory = (can_access != 0);
+ }
+ }
int pci_location[3] = {0, 0, 0};
cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
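
The peer-memory check added above asks the driver whether any other device in the system can access this device's memory. A standalone sketch of the same query using plain CUDA driver-API calls (error handling mostly omitted for brevity):

#include <cstdio>
#include <cuda.h>

int main()
{
  if (cuInit(0) != CUDA_SUCCESS) {
    return 1;
  }

  int count = 0;
  cuDeviceGetCount(&count);

  for (int num = 0; num < count; num++) {
    CUdevice device;
    cuDeviceGet(&device, num);

    bool has_peer_memory = false;
    for (int peer_num = 0; peer_num < count && !has_peer_memory; peer_num++) {
      if (num == peer_num) {
        continue;
      }
      CUdevice peer;
      cuDeviceGet(&peer, peer_num);

      int can_access = 0;
      cuDeviceCanAccessPeer(&can_access, device, peer);
      has_peer_memory = (can_access != 0);
    }
    printf("device %d: peer memory %s\n", num, has_peer_memory ? "available" : "not available");
  }
  return 0;
}
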
@@ -2658,6 +158,14 @@ void device_cuda_info(vector<DeviceInfo> &devices)
cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+ /* The CUDA driver reports compute preemption as not being available on
+ * Windows 10 even when it is, due to an issue in application profiles.
+ * Detect case where we expect it to be available and override. */
+ if (preempt_attr == 0 && (major >= 6) && system_windows_version_at_least(10, 17134)) {
+ VLOG(1) << "Assuming device has compute preemption on Windows 10.";
+ preempt_attr = 1;
+ }
+
if (timeout_attr && !preempt_attr) {
VLOG(1) << "Device is recognized as display.";
info.description += " (Display)";
@@ -2665,6 +173,7 @@ void device_cuda_info(vector<DeviceInfo> &devices)
display_devices.push_back(info);
}
else {
+ VLOG(1) << "Device has compute preemption or is not used for display.";
devices.push_back(info);
}
VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
@@ -2698,13 +207,13 @@ string device_cuda_capabilities()
}
capabilities += string("\t") + name + "\n";
int value;
-#define GET_ATTR(attr) \
- { \
- if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \
- capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \
+# define GET_ATTR(attr) \
+ { \
+ if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \
+ capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \
+ } \
} \
- } \
- (void)0
+ (void)0
/* TODO(sergey): Strip all attributes which are not useful for us
* or does not depend on the driver.
*/
@@ -2795,7 +304,7 @@ string device_cuda_capabilities()
GET_ATTR(MANAGED_MEMORY);
GET_ATTR(MULTI_GPU_BOARD);
GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
-#undef GET_ATTR
+# undef GET_ATTR
capabilities += "\n";
}
@@ -2803,3 +312,5 @@ string device_cuda_capabilities()
}
CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 05a7fb8ae4d..38c42d15cab 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -56,8 +56,8 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
tile_info->frames[i] = task.denoising_frames[i - 1];
}
- write_passes = task.denoising_write_passes;
- do_filter = task.denoising_do_filter;
+ do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
+ do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
}
DenoisingTask::~DenoisingTask()
@@ -71,29 +71,30 @@ DenoisingTask::~DenoisingTask()
tile_info_mem.free();
}
-void DenoisingTask::set_render_buffer(RenderTile *rtiles)
+void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
{
- for (int i = 0; i < 9; i++) {
- tile_info->offsets[i] = rtiles[i].offset;
- tile_info->strides[i] = rtiles[i].stride;
- tile_info->buffers[i] = rtiles[i].buffer;
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ RenderTile &rtile = neighbors.tiles[i];
+ tile_info->offsets[i] = rtile.offset;
+ tile_info->strides[i] = rtile.stride;
+ tile_info->buffers[i] = rtile.buffer;
}
- tile_info->x[0] = rtiles[3].x;
- tile_info->x[1] = rtiles[4].x;
- tile_info->x[2] = rtiles[5].x;
- tile_info->x[3] = rtiles[5].x + rtiles[5].w;
- tile_info->y[0] = rtiles[1].y;
- tile_info->y[1] = rtiles[4].y;
- tile_info->y[2] = rtiles[7].y;
- tile_info->y[3] = rtiles[7].y + rtiles[7].h;
-
- target_buffer.offset = rtiles[9].offset;
- target_buffer.stride = rtiles[9].stride;
- target_buffer.ptr = rtiles[9].buffer;
-
- if (write_passes && rtiles[9].buffers) {
+ tile_info->x[0] = neighbors.tiles[3].x;
+ tile_info->x[1] = neighbors.tiles[4].x;
+ tile_info->x[2] = neighbors.tiles[5].x;
+ tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
+ tile_info->y[0] = neighbors.tiles[1].y;
+ tile_info->y[1] = neighbors.tiles[4].y;
+ tile_info->y[2] = neighbors.tiles[7].y;
+ tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
+
+ target_buffer.offset = neighbors.target.offset;
+ target_buffer.stride = neighbors.target.stride;
+ target_buffer.ptr = neighbors.target.buffer;
+
+ if (do_prefilter && neighbors.target.buffers) {
target_buffer.denoising_output_offset =
- rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+ neighbors.target.buffers->params.get_denoising_prefiltered_offset();
}
else {
target_buffer.denoising_output_offset = 0;
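
The index arithmetic above implies a fixed 3x3 neighbor layout around the tile being denoised, plus a separate target tile for the output; restated as a comment for readability (inferred from the bounds taken from tiles 1, 3, 5 and 7):

/* Neighbor layout implied by set_render_buffer():
 *
 *   0 1 2      x bounds span tiles 3..5 (left edge of 3 to right edge of 5),
 *   3 4 5      y bounds span tiles 1..7 (top edge of 1 to bottom edge of 7),
 *   6 7 8      tile 4 is the tile being denoised, neighbors.target gets the output.
 */
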
@@ -104,13 +105,14 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
void DenoisingTask::setup_denoising_buffer()
{
- /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+ /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
+ * tiles */
rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
rect = rect_expand(rect, radius);
rect = rect_clip(rect,
make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
- buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
+ buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
buffer.passes = buffer.use_intensity ? 15 : 14;
buffer.width = rect.z - rect.x;
buffer.stride = align_up(buffer.width, 4);
@@ -149,16 +151,19 @@ void DenoisingTask::prefilter_shadowing()
device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
+ * sample variance and the buffer variance. */
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
+ * sample variance. */
nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
/* Reuse memory, the previous data isn't needed anymore. */
device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+ /* Use the smoothed variance to filter the two shadow half images using each other for weight
+ * calculation. */
nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
@@ -210,12 +215,12 @@ void DenoisingTask::prefilter_color()
int num_color_passes = 3;
device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(3 * buffer.pass_stride, false);
+ temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
for (int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
device_sub_ptr color_var_pass(
- buffer.mem, variance_to[pass] * buffer.pass_stride, buffer.pass_stride);
+ temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
functions.get_feature(mean_from[pass],
variance_from[pass],
*color_pass,
@@ -316,12 +321,11 @@ void DenoisingTask::reconstruct()
functions.solve(target_buffer.ptr);
}
-void DenoisingTask::run_denoising(RenderTile *tile)
+void DenoisingTask::run_denoising(RenderTile &tile)
{
- RenderTile rtiles[10];
- rtiles[4] = *tile;
- functions.map_neighbor_tiles(rtiles);
- set_render_buffer(rtiles);
+ RenderTileNeighbors neighbors(tile);
+ functions.map_neighbor_tiles(neighbors);
+ set_render_buffer(neighbors);
setup_denoising_buffer();
@@ -339,11 +343,11 @@ void DenoisingTask::run_denoising(RenderTile *tile)
reconstruct();
}
- if (write_passes) {
+ if (do_prefilter) {
write_buffer();
}
- functions.unmap_neighbor_tiles(rtiles);
+ functions.unmap_neighbor_tiles(neighbors);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index bd1d0193dbd..2c0dc23b44a 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -60,7 +60,7 @@ class DenoisingTask {
int4 rect;
int4 filter_area;
- bool write_passes;
+ bool do_prefilter;
bool do_filter;
struct DeviceFunctions {
@@ -102,8 +102,8 @@ class DenoisingTask {
device_ptr output_ptr)>
detect_outliers;
function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature;
- function<void(RenderTile *rtiles)> map_neighbor_tiles;
- function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
+ function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
+ function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
} functions;
/* Stores state of the current Reconstruction operation,
@@ -154,7 +154,7 @@ class DenoisingTask {
DenoisingTask(Device *device, const DeviceTask &task);
~DenoisingTask();
- void run_denoising(RenderTile *tile);
+ void run_denoising(RenderTile &tile);
struct DenoiseBuffers {
int pass_stride;
@@ -179,7 +179,7 @@ class DenoisingTask {
protected:
Device *device;
- void set_render_buffer(RenderTile *rtiles);
+ void set_render_buffer(RenderTileNeighbors &neighbors);
void setup_denoising_buffer();
void prefilter_shadowing();
void prefilter_features();
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index c393a3f9cda..94d63e8f333 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -17,9 +17,15 @@
#ifndef __DEVICE_INTERN_H__
#define __DEVICE_INTERN_H__
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
CCL_NAMESPACE_BEGIN
class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
bool device_opencl_init();
@@ -27,6 +33,9 @@ Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler,
bool device_opencl_compile_kernel(const vector<string> &parameters);
bool device_cuda_init();
Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+bool device_optix_init();
+Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+
Device *device_network_create(DeviceInfo &info,
Stats &stats,
Profiler &profiler,
@@ -36,6 +45,7 @@ Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler,
void device_cpu_info(vector<DeviceInfo> &devices);
void device_opencl_info(vector<DeviceInfo> &devices);
void device_cuda_info(vector<DeviceInfo> &devices);
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
void device_network_info(vector<DeviceInfo> &devices);
string device_cpu_capabilities();
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 859535307f4..8064d50d31f 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "device/device.h"
#include "device/device_memory.h"
+#include "device/device.h"
CCL_NAMESPACE_BEGIN
@@ -31,17 +31,18 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
data_depth(0),
type(type),
name(name),
- interpolation(INTERPOLATION_NONE),
- extension(EXTENSION_REPEAT),
device(device),
device_pointer(0),
host_pointer(0),
- shared_pointer(0)
+ shared_pointer(0),
+ shared_counter(0)
{
}
device_memory::~device_memory()
{
+ assert(shared_pointer == 0);
+ assert(shared_counter == 0);
}
void *device_memory::host_alloc(size_t size)
@@ -73,7 +74,7 @@ void device_memory::host_free()
void device_memory::device_alloc()
{
- assert(!device_pointer && type != MEM_TEXTURE);
+ assert(!device_pointer && type != MEM_TEXTURE && type != MEM_GLOBAL);
device->mem_alloc(*this);
}
@@ -93,7 +94,7 @@ void device_memory::device_copy_to()
void device_memory::device_copy_from(int y, int w, int h, int elem)
{
- assert(type != MEM_TEXTURE && type != MEM_READ_ONLY);
+ assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
device->mem_copy_from(*this, y, w, h, elem);
}
@@ -124,6 +125,11 @@ void device_memory::restore_device()
device_pointer = original_device_ptr;
}
+bool device_memory::is_resident(Device *sub_device) const
+{
+ return device->is_resident(device_pointer, sub_device);
+}
+
/* Device Sub Ptr */
device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
@@ -136,4 +142,93 @@ device_sub_ptr::~device_sub_ptr()
device->mem_free_sub_ptr(ptr);
}
+/* Device Texture */
+
+device_texture::device_texture(Device *device,
+ const char *name,
+ const uint slot,
+ ImageDataType image_data_type,
+ InterpolationType interpolation,
+ ExtensionType extension)
+ : device_memory(device, name, MEM_TEXTURE), slot(slot)
+{
+ switch (image_data_type) {
+ case IMAGE_DATA_TYPE_FLOAT4:
+ data_type = TYPE_FLOAT;
+ data_elements = 4;
+ break;
+ case IMAGE_DATA_TYPE_FLOAT:
+ data_type = TYPE_FLOAT;
+ data_elements = 1;
+ break;
+ case IMAGE_DATA_TYPE_BYTE4:
+ data_type = TYPE_UCHAR;
+ data_elements = 4;
+ break;
+ case IMAGE_DATA_TYPE_BYTE:
+ data_type = TYPE_UCHAR;
+ data_elements = 1;
+ break;
+ case IMAGE_DATA_TYPE_HALF4:
+ data_type = TYPE_HALF;
+ data_elements = 4;
+ break;
+ case IMAGE_DATA_TYPE_HALF:
+ data_type = TYPE_HALF;
+ data_elements = 1;
+ break;
+ case IMAGE_DATA_TYPE_USHORT4:
+ data_type = TYPE_UINT16;
+ data_elements = 4;
+ break;
+ case IMAGE_DATA_TYPE_USHORT:
+ data_type = TYPE_UINT16;
+ data_elements = 1;
+ break;
+ case IMAGE_DATA_NUM_TYPES:
+ assert(0);
+ return;
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.data_type = image_data_type;
+ info.interpolation = interpolation;
+ info.extension = extension;
+}
+
+device_texture::~device_texture()
+{
+ device_free();
+ host_free();
+}
+
+/* Host memory allocation. */
+void *device_texture::alloc(const size_t width, const size_t height, const size_t depth)
+{
+ const size_t new_size = size(width, height, depth);
+
+ if (new_size != data_size) {
+ device_free();
+ host_free();
+ host_pointer = host_alloc(data_elements * datatype_size(data_type) * new_size);
+ assert(device_pointer == 0);
+ }
+
+ data_size = new_size;
+ data_width = width;
+ data_height = height;
+ data_depth = depth;
+
+ info.width = width;
+ info.height = height;
+ info.depth = depth;
+
+ return host_pointer;
+}
+
+void device_texture::copy_to_device()
+{
+ device_copy_to();
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index f50184efba7..32654e62a6f 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -23,6 +23,7 @@
#include "util/util_array.h"
#include "util/util_half.h"
+#include "util/util_string.h"
#include "util/util_texture.h"
#include "util/util_types.h"
#include "util/util_vector.h"
@@ -31,7 +32,14 @@ CCL_NAMESPACE_BEGIN
class Device;
-enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE, MEM_DEVICE_ONLY, MEM_TEXTURE, MEM_PIXELS };
+enum MemoryType {
+ MEM_READ_ONLY,
+ MEM_READ_WRITE,
+ MEM_DEVICE_ONLY,
+ MEM_GLOBAL,
+ MEM_TEXTURE,
+ MEM_PIXELS
+};
/* Supported Data Types */
@@ -208,29 +216,32 @@ class device_memory {
size_t data_depth;
MemoryType type;
const char *name;
- InterpolationType interpolation;
- ExtensionType extension;
/* Pointers. */
Device *device;
device_ptr device_pointer;
void *host_pointer;
void *shared_pointer;
+ /* reference counter for shared_pointer */
+ int shared_counter;
virtual ~device_memory();
void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
void restore_device();
+ bool is_resident(Device *sub_device) const;
+
protected:
friend class CUDADevice;
+ friend class OptiXDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);
/* No copying allowed. */
- device_memory(const device_memory &);
- device_memory &operator=(const device_memory &);
+ device_memory(const device_memory &) = delete;
+ device_memory &operator=(const device_memory &) = delete;
/* Host allocation on the device. All host_pointer memory should be
* allocated with these functions, for devices that support using
@@ -307,7 +318,7 @@ template<typename T> class device_only_memory : public device_memory {
* in and copied to the device with copy_to_device(). Or alternatively
* allocated and set to zero on the device with zero_to_device().
*
- * When using memory type MEM_TEXTURE, a pointer to this memory will be
+ * When using memory type MEM_GLOBAL, a pointer to this memory will be
* automatically attached to kernel globals, using the provided name
* matching an entry in kernel_textures.h. */
@@ -424,6 +435,11 @@ template<typename T> class device_vector : public device_memory {
device_copy_to();
}
+ void copy_from_device()
+ {
+ device_copy_from(0, data_width, data_height, sizeof(T));
+ }
+
void copy_from_device(int y, int w, int h)
{
device_copy_from(y, w, h, sizeof(T));
@@ -495,6 +511,33 @@ class device_sub_ptr {
device_ptr ptr;
};
+/* Device Texture
+ *
+ * 2D or 3D image texture memory. */
+
+class device_texture : public device_memory {
+ public:
+ device_texture(Device *device,
+ const char *name,
+ const uint slot,
+ ImageDataType image_data_type,
+ InterpolationType interpolation,
+ ExtensionType extension);
+ ~device_texture();
+
+ void *alloc(const size_t width, const size_t height, const size_t depth = 0);
+ void copy_to_device();
+
+ uint slot;
+ TextureInfo info;
+
+ protected:
+ size_t size(const size_t width, const size_t height, const size_t depth)
+ {
+ return width * ((height == 0) ? 1 : height) * ((depth == 0) ? 1 : depth);
+ }
+};
+
CCL_NAMESPACE_END
#endif /* __DEVICE_MEMORY_H__ */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 4a40e106115..9ea8782d0f0 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include <stdlib.h>
#include <sstream>
+#include <stdlib.h>
#include "device/device.h"
#include "device/device_intern.h"
@@ -34,30 +34,87 @@ CCL_NAMESPACE_BEGIN
class MultiDevice : public Device {
public:
struct SubDevice {
- explicit SubDevice(Device *device_) : device(device_)
- {
- }
-
+ Stats stats;
Device *device;
map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
};
- list<SubDevice> devices;
+ list<SubDevice> devices, denoising_devices;
device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
+ bool matching_rendering_and_denoising_devices;
MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
: Device(info, stats, profiler, background_), unique_key(1)
{
foreach (DeviceInfo &subinfo, info.multi_devices) {
- Device *device = Device::create(subinfo, sub_stats_, profiler, background);
-
/* Always add CPU devices at the back since GPU devices can change
* host memory pointers, which CPU uses as device pointer. */
+ SubDevice *sub;
if (subinfo.type == DEVICE_CPU) {
- devices.push_back(SubDevice(device));
+ devices.emplace_back();
+ sub = &devices.back();
}
else {
- devices.push_front(SubDevice(device));
+ devices.emplace_front();
+ sub = &devices.front();
+ }
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler, background);
+ }
+
+ foreach (DeviceInfo &subinfo, info.denoising_devices) {
+ denoising_devices.emplace_front();
+ SubDevice *sub = &denoising_devices.front();
+
+ sub->device = Device::create(subinfo, sub->stats, profiler, background);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+      /* First ensure that every device is in at least one peer island */

+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
+
+ /* Second check peer access between devices and fill up the islands accordingly */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
+ }
+
+ /* Try to re-use memory when denoising and render devices use the same physical devices
+ * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
+     * Ordering has to match as well, so that 'DeviceTask::split' behaves consistently. */
+ matching_rendering_and_denoising_devices = denoising_devices.empty() ||
+ (devices.size() == denoising_devices.size());
+ if (matching_rendering_and_denoising_devices) {
+ for (list<SubDevice>::iterator device_it = devices.begin(),
+ denoising_device_it = denoising_devices.begin();
+ device_it != devices.end() && denoising_device_it != denoising_devices.end();
+ ++device_it, ++denoising_device_it) {
+ const DeviceInfo &info = device_it->device->info;
+ const DeviceInfo &denoising_info = denoising_device_it->device->info;
+ if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
+ (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
+ info.num != denoising_info.num) {
+ matching_rendering_and_denoising_devices = false;
+ break;
+ }
}
}
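
The island construction above first gives every device its own island and then merges devices that report peer access into the first compatible island. A minimal standalone sketch of that grouping logic, with an illustrative can_access() stand-in for Device::check_peer_access():

#include <cstdio>
#include <vector>

struct Dev {
  int type;
  int island = -1;
};

static bool can_access(const Dev &a, const Dev &b)
{
  return a.type == b.type; /* stand-in for the real peer-access check */
}

int main()
{
  std::vector<Dev> devs = {{0}, {0}, {1}}; /* two devices of one type, one of another */
  std::vector<std::vector<int>> islands;

  for (size_t i = 0; i < devs.size(); i++) {
    /* Every device ends up in at least one island. */
    if (devs[i].island < 0) {
      devs[i].island = (int)islands.size();
      islands.push_back({(int)i});
    }
    /* Pull unassigned peers into this device's island. */
    for (size_t j = 0; j < devs.size(); j++) {
      if (devs[j].island < 0 && can_access(devs[i], devs[j])) {
        devs[j].island = devs[i].island;
        islands[devs[i].island].push_back((int)j);
      }
    }
  }

  printf("%zu islands for %zu devices\n", islands.size(), devs.size()); /* 2 for 3 */
  return 0;
}
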
@@ -80,17 +137,18 @@ class MultiDevice : public Device {
{
foreach (SubDevice &sub, devices)
delete sub.device;
+ foreach (SubDevice &sub, denoising_devices)
+ delete sub.device;
}
const string &error_message()
{
- foreach (SubDevice &sub, devices) {
- if (sub.device->error_message() != "") {
- if (error_msg == "")
- error_msg = sub.device->error_message();
- break;
- }
- }
+ error_msg.clear();
+
+ foreach (SubDevice &sub, devices)
+ error_msg += sub.device->error_message();
+ foreach (SubDevice &sub, denoising_devices)
+ error_msg += sub.device->error_message();
return error_msg;
}
@@ -118,6 +176,15 @@ class MultiDevice : public Device {
if (!sub.device->load_kernels(requested_features))
return false;
+ if (requested_features.use_denoising) {
+ /* Only need denoising feature, everything else is unused. */
+ DeviceRequestedFeatures denoising_features;
+ denoising_features.use_denoising = true;
+ foreach (SubDevice &sub, denoising_devices)
+ if (!sub.device->load_kernels(denoising_features))
+ return false;
+ }
+
return true;
}
@@ -127,6 +194,12 @@ class MultiDevice : public Device {
if (!sub.device->wait_for_availability(requested_features))
return false;
+ if (requested_features.use_denoising) {
+ foreach (SubDevice &sub, denoising_devices)
+ if (!sub.device->wait_for_availability(requested_features))
+ return false;
+ }
+
return true;
}
@@ -150,20 +223,104 @@ class MultiDevice : public Device {
break;
}
}
+
return result;
}
+ bool build_optix_bvh(BVH *bvh)
+ {
+ /* Broadcast acceleration structure build to all render devices */
+ foreach (SubDevice &sub, devices) {
+ if (!sub.device->build_optix_bvh(bvh))
+ return false;
+ }
+ return true;
+ }
+
+ virtual void *osl_memory()
+ {
+ if (devices.size() > 1) {
+ return NULL;
+ }
+ return devices.front().device->osl_memory();
+ }
+
+ bool is_resident(device_ptr key, Device *sub_device)
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key or the device with the lowest memory usage when new */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
+
void mem_alloc(device_memory &mem)
{
device_ptr key = unique_key++;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
+ if (mem.type == MEM_PIXELS) {
+ /* Always allocate pixels memory on all devices
+ * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
+ sub.device->mem_alloc(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+ }
+ else {
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
+ mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
}
mem.device = this;
@@ -177,13 +334,36 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ if (strcmp(mem.name, "RenderBuffers") == 0) {
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
+ sub.device->mem_copy_to(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+ }
+ else {
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
}
mem.device = this;
@@ -200,10 +380,11 @@ class MultiDevice : public Device {
int sy = y + i * sub_h;
int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
- sub.device->mem_copy_from(mem, sy, w, sh, elem);
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
i++;
}
@@ -217,13 +398,48 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
+ /* This is a hack to only allocate the tile buffers on denoising devices
+ * Similarly the tile buffers also need to be allocated separately on all devices so any
+ * overlap rendered for denoising does not interfere with each other */
+ if (strcmp(mem.name, "RenderBuffers") == 0) {
+ vector<device_ptr> device_pointers;
+ device_pointers.reserve(devices.size());
+
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ sub.device->mem_zero(mem);
+ sub.ptr_map[key] = mem.device_pointer;
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
+ device_pointers.push_back(mem.device_pointer);
+ }
+ foreach (SubDevice &sub, denoising_devices) {
+ if (matching_rendering_and_denoising_devices) {
+ sub.ptr_map[key] = device_pointers.front();
+ device_pointers.erase(device_pointers.begin());
+ }
+ else {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ sub.device->mem_zero(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+ }
+ }
+ else {
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
}
mem.device = this;
@@ -236,13 +452,49 @@ class MultiDevice : public Device {
device_ptr key = mem.device_pointer;
size_t existing_size = mem.device_size;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
+ /* Free memory that was allocated for all devices (see above) on each device */
+ if (strcmp(mem.name, "RenderBuffers") == 0 || mem.type == MEM_PIXELS) {
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = sub.ptr_map[key];
+ mem.device_size = existing_size;
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
+ sub.device->mem_free(mem);
+ sub.ptr_map.erase(sub.ptr_map.find(key));
+ }
+ foreach (SubDevice &sub, denoising_devices) {
+ if (matching_rendering_and_denoising_devices) {
+ sub.ptr_map.erase(key);
+ }
+ else {
+ mem.device = sub.device;
+ mem.device_pointer = sub.ptr_map[key];
+ mem.device_size = existing_size;
+
+ sub.device->mem_free(mem);
+ sub.ptr_map.erase(sub.ptr_map.find(key));
+ }
+ }
+ }
+ else {
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
}
mem.device = this;
@@ -270,6 +522,8 @@ class MultiDevice : public Device {
bool transparent,
const DeviceDrawParams &draw_params)
{
+ assert(rgba.type == MEM_PIXELS);
+
device_ptr key = rgba.device_pointer;
int i = 0, sub_h = h / devices.size();
int sub_height = height / devices.size();
@@ -292,10 +546,21 @@ class MultiDevice : public Device {
void map_tile(Device *sub_device, RenderTile &tile)
{
+ if (!tile.buffer) {
+ return;
+ }
+
foreach (SubDevice &sub, devices) {
if (sub.device == sub_device) {
- if (tile.buffer)
- tile.buffer = sub.ptr_map[tile.buffer];
+ tile.buffer = find_matching_mem(tile.buffer, sub);
+ return;
+ }
+ }
+
+ foreach (SubDevice &sub, denoising_devices) {
+ if (sub.device == sub_device) {
+ tile.buffer = sub.ptr_map[tile.buffer];
+ return;
}
}
}
@@ -310,13 +575,31 @@ class MultiDevice : public Device {
i++;
}
+ foreach (SubDevice &sub, denoising_devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
return -1;
}
- void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors)
{
- for (int i = 0; i < 9; i++) {
- if (!tiles[i].buffers) {
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ RenderTile &tile = neighbors.tiles[i];
+
+ if (!tile.buffers) {
+ continue;
+ }
+
+ device_vector<float> &mem = tile.buffers->buffer;
+ tile.buffer = mem.device_pointer;
+
+ if (mem.device == this && matching_rendering_and_denoising_devices) {
+ /* Skip unnecessary copies in viewport mode (buffer covers the
+ * whole image), but still need to fix up the tile device pointer. */
+ map_tile(sub_device, tile);
continue;
}
@@ -324,46 +607,63 @@ class MultiDevice : public Device {
* to the current device now, for the duration of the denoising task.
* Note that this temporarily modifies the RenderBuffers and calls
* the device, so this function is not thread safe. */
- device_vector<float> &mem = tiles[i].buffers->buffer;
if (mem.device != sub_device) {
/* Only copy from device to host once. This is faster, but
* also required for the case where a CPU thread is denoising
* a tile rendered on the GPU. In that case we have to avoid
- * overwriting the buffer being denoised by the CPU thread. */
- if (!tiles[i].buffers->map_neighbor_copied) {
- tiles[i].buffers->map_neighbor_copied = true;
- mem.copy_from_device(0, mem.data_size, 1);
+ * overwriting the buffer being denoised by the CPU thread. */
+ if (!tile.buffers->map_neighbor_copied) {
+ tile.buffers->map_neighbor_copied = true;
+ mem.copy_from_device();
}
- mem.swap_device(sub_device, 0, 0);
+ if (mem.device == this) {
+ /* Can re-use memory if tile is already allocated on the sub device. */
+ map_tile(sub_device, tile);
+ mem.swap_device(sub_device, mem.device_size, tile.buffer);
+ }
+ else {
+ mem.swap_device(sub_device, 0, 0);
+ }
mem.copy_to_device();
- tiles[i].buffer = mem.device_pointer;
- tiles[i].device_size = mem.device_size;
+
+ tile.buffer = mem.device_pointer;
+ tile.device_size = mem.device_size;
mem.restore_device();
}
}
}
- void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors)
{
+ RenderTile &target_tile = neighbors.target;
+ device_vector<float> &mem = target_tile.buffers->buffer;
+
+ if (mem.device == this && matching_rendering_and_denoising_devices) {
+ return;
+ }
+
/* Copy denoised result back to the host. */
- device_vector<float> &mem = tiles[9].buffers->buffer;
- mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer);
- mem.copy_from_device(0, mem.data_size, 1);
+ mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
+ mem.copy_from_device();
mem.restore_device();
+
/* Copy denoised result to the original device. */
mem.copy_to_device();
- for (int i = 0; i < 9; i++) {
- if (!tiles[i].buffers) {
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ RenderTile &tile = neighbors.tiles[i];
+ if (!tile.buffers) {
continue;
}
- device_vector<float> &mem = tiles[i].buffers->buffer;
- if (mem.device != sub_device) {
- mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer);
+ device_vector<float> &mem = tile.buffers->buffer;
+
+ if (mem.device != sub_device && mem.device != this) {
+ /* Free up memory again if it was allocated for the copy above. */
+ mem.swap_device(sub_device, tile.device_size, tile.buffer);
sub_device->mem_free(mem);
mem.restore_device();
}
@@ -388,26 +688,50 @@ class MultiDevice : public Device {
void task_add(DeviceTask &task)
{
+ list<SubDevice> task_devices = devices;
+ if (!denoising_devices.empty()) {
+ if (task.type == DeviceTask::DENOISE_BUFFER) {
+ /* Denoising tasks should be redirected to the denoising devices entirely. */
+ task_devices = denoising_devices;
+ }
+ else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
+ const uint tile_types = task.tile_types;
+ /* For normal rendering tasks, only redirect the denoising part to the denoising devices.
+ * There is no need to split the task here, since all devices pull their work through 'acquire_tile'. */
+ task.tile_types = RenderTile::DENOISE;
+ foreach (SubDevice &sub, denoising_devices) {
+ sub.device->task_add(task);
+ }
+ /* Rendering itself should still be executed on the rendering devices. */
+ task.tile_types = tile_types ^ RenderTile::DENOISE;
+ }
+ }
+
list<DeviceTask> tasks;
- task.split(tasks, devices.size());
+ task.split(tasks, task_devices.size());
- foreach (SubDevice &sub, devices) {
+ foreach (SubDevice &sub, task_devices) {
if (!tasks.empty()) {
DeviceTask subtask = tasks.front();
tasks.pop_front();
if (task.buffer)
- subtask.buffer = sub.ptr_map[task.buffer];
+ subtask.buffer = find_matching_mem(task.buffer, sub);
if (task.rgba_byte)
subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
if (task.rgba_half)
subtask.rgba_half = sub.ptr_map[task.rgba_half];
if (task.shader_input)
- subtask.shader_input = sub.ptr_map[task.shader_input];
+ subtask.shader_input = find_matching_mem(task.shader_input, sub);
if (task.shader_output)
- subtask.shader_output = sub.ptr_map[task.shader_output];
+ subtask.shader_output = find_matching_mem(task.shader_output, sub);
sub.device->task_add(subtask);
+
+ if (task.buffers && task.buffers->buffer.device == this) {
+ /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
+ sub.device->task_wait();
+ }
}
}
}
@@ -416,16 +740,17 @@ class MultiDevice : public Device {
{
foreach (SubDevice &sub, devices)
sub.device->task_wait();
+ foreach (SubDevice &sub, denoising_devices)
+ sub.device->task_wait();
}
void task_cancel()
{
foreach (SubDevice &sub, devices)
sub.device->task_cancel();
+ foreach (SubDevice &sub, denoising_devices)
+ sub.device->task_cancel();
}
-
- protected:
- Stats sub_stats_;
};
Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 80334ad8f22..8904b517e92 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
+#include "device/device_network.h"
#include "device/device.h"
#include "device/device_intern.h"
-#include "device/device_network.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
@@ -311,7 +311,9 @@ void device_network_info(vector<DeviceInfo> &devices)
/* todo: get this info from device */
info.has_volume_decoupled = false;
+ info.has_adaptive_stop_per_sample = false;
info.has_osl = false;
+ info.denoisers = DENOISER_NONE;
devices.push_back(info);
}
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 5b69b815cc6..e74c4508ab6 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -19,19 +19,19 @@
#ifdef WITH_NETWORK
-# include <boost/archive/text_iarchive.hpp>
-# include <boost/archive/text_oarchive.hpp>
# include <boost/archive/binary_iarchive.hpp>
# include <boost/archive/binary_oarchive.hpp>
+# include <boost/archive/text_iarchive.hpp>
+# include <boost/archive/text_oarchive.hpp>
# include <boost/array.hpp>
# include <boost/asio.hpp>
# include <boost/bind.hpp>
# include <boost/serialization/vector.hpp>
# include <boost/thread.hpp>
+# include <deque>
# include <iostream>
# include <sstream>
-# include <deque>
# include "render/buffers.h"
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 99a8d2438d6..39b9ef70192 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,8 +16,8 @@
#ifdef WITH_OPENCL
-# include "device/opencl/opencl.h"
-
+# include "device/opencl/device_opencl.h"
+# include "device/device.h"
# include "device/device_intern.h"
# include "util/util_foreach.h"
@@ -119,6 +119,8 @@ void device_opencl_info(vector<DeviceInfo> &devices)
info.display_device = true;
info.use_split_kernel = true;
info.has_volume_decoupled = false;
+ info.has_adaptive_stop_per_sample = false;
+ info.denoisers = DENOISER_NLM;
info.id = id;
/* Check OpenCL extensions */
@@ -136,8 +138,8 @@ string device_opencl_capabilities()
}
string result = "";
string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
+ * it could also be nicely reported to the console.
+ */
cl_uint num_platforms = 0;
opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
if (num_platforms == 0) {
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
new file mode 100644
index 00000000000..1cc45983565
--- /dev/null
+++ b/intern/cycles/device/device_optix.cpp
@@ -0,0 +1,1770 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "bvh/bvh.h"
+# include "device/cuda/device_cuda.h"
+# include "device/device_denoising.h"
+# include "device/device_intern.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/scene.h"
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_time.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+// Do not use CUDA SDK headers when using CUEW
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+# include <optix_function_table_definition.h>
+# include <optix_stubs.h>
+
+// TODO(pmours): Disable this once drivers have native support
+# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
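+// While this is enabled, the selected input passes are packed into a tightly strided float3
+// buffer (see kernel_cuda_filter_convert_to_rgb below) and unpacked again after denoising.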
+
+CCL_NAMESPACE_BEGIN
+
+/* Make sure this stays in sync with kernel_globals.h */
+struct ShaderParams {
+ uint4 *input;
+ float4 *output;
+ int type;
+ int filter;
+ int sx;
+ int offset;
+ int sample;
+};
+struct KernelParams {
+ WorkTile tile;
+ KernelData data;
+ ShaderParams shader;
+# define KERNEL_TEX(type, name) const type *name;
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+};
+
+# define check_result_cuda(stmt) \
+ { \
+ CUresult res = stmt; \
+ if (res != CUDA_SUCCESS) { \
+ const char *name; \
+ cuGetErrorName(res, &name); \
+ set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
+ return; \
+ } \
+ } \
+ (void)0
+# define check_result_cuda_ret(stmt) \
+ { \
+ CUresult res = stmt; \
+ if (res != CUDA_SUCCESS) { \
+ const char *name; \
+ cuGetErrorName(res, &name); \
+ set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
+ return false; \
+ } \
+ } \
+ (void)0
+
+# define check_result_optix(stmt) \
+ { \
+ enum OptixResult res = stmt; \
+ if (res != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(res); \
+ set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
+ return; \
+ } \
+ } \
+ (void)0
+# define check_result_optix_ret(stmt) \
+ { \
+ enum OptixResult res = stmt; \
+ if (res != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(res); \
+ set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
+ return false; \
+ } \
+ } \
+ (void)0
+
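+// Helper to launch one of the CUDA filter kernels over a w x h domain: the function is looked
+// up in cuFilterModule, a roughly square thread block is derived from the kernel's maximum
+// threads per block, and the grid size is computed from the requested width and height.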
+# define launch_filter_kernel(func_name, w, h, args) \
+ { \
+ CUfunction func; \
+ check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
+ check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
+ int threads; \
+ check_result_cuda_ret( \
+ cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ threads = (int)sqrt((float)threads); \
+ int xblocks = ((w) + threads - 1) / threads; \
+ int yblocks = ((h) + threads - 1) / threads; \
+ check_result_cuda_ret( \
+ cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
+ } \
+ (void)0
+
+class OptiXDevice : public CUDADevice {
+
+ // List of OptiX program groups
+ enum {
+ PG_RGEN,
+ PG_MISS,
+ PG_HITD, // Default hit group
+ PG_HITS, // __SHADOW_RECORD_ALL__ hit group
+ PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
+# if OPTIX_ABI_VERSION >= 36
+ PG_HITD_MOTION,
+ PG_HITS_MOTION,
+# endif
+# ifdef WITH_CYCLES_DEBUG
+ PG_EXCP,
+# endif
+ PG_BAKE, // kernel_bake_evaluate
+ PG_DISP, // kernel_displace_evaluate
+ PG_BACK, // kernel_background_evaluate
+ NUM_PROGRAM_GROUPS
+ };
+
+ // List of OptiX pipelines
+ enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
+
+ // A single shader binding table entry
+ struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+ };
+
+ // Information stored about CUDA memory allocations
+ struct CUDAMem {
+ bool free_map_host = false;
+ CUarray array = NULL;
+ CUtexObject texobject = 0;
+ bool use_mapped_host = false;
+ };
+
+ // Helper class to manage current CUDA context
+ struct CUDAContextScope {
+ CUDAContextScope(CUcontext ctx)
+ {
+ cuCtxPushCurrent(ctx);
+ }
+ ~CUDAContextScope()
+ {
+ cuCtxPopCurrent(NULL);
+ }
+ };
+
+ // Use a pool with multiple threads to support launches with multiple CUDA streams
+ TaskPool task_pool;
+
+ vector<CUstream> cuda_stream;
+ OptixDeviceContext context = NULL;
+
+ OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
+ OptixModule builtin_modules[2] = {};
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool motion_blur = false;
+ device_vector<SbtRecord> sbt_data;
+ device_only_memory<KernelParams> launch_params;
+ vector<CUdeviceptr> as_mem;
+ OptixTraversableHandle tlas_handle = 0;
+
+ OptixDenoiser denoiser = NULL;
+ device_only_memory<unsigned char> denoiser_state;
+ int denoiser_input_passes = 0;
+
+ public:
+ OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
+ : CUDADevice(info_, stats_, profiler_, background_),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_state(this, "__denoiser_state")
+ {
+ // Store number of CUDA streams in device info
+ info.cpu_threads = DebugFlags().optix.cuda_streams;
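+ // One CUDA stream and launch-parameter slot is created per thread below, so this value also
+ // determines how many tiles can be rendered in parallel on this device.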
+
+ // Make the CUDA context current
+ if (!cuContext) {
+ return; // Do not initialize if CUDA context creation failed already
+ }
+ const CUDAContextScope scope(cuContext);
+
+ // Create OptiX context for this device
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
+ options.logCallbackFunction =
+ [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ check_result_optix(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ // Create launch streams
+ cuda_stream.resize(info.cpu_threads);
+ for (int i = 0; i < info.cpu_threads; ++i)
+ check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
+
+ // Fix weird compiler bug that assigns wrong size
+ launch_params.data_elements = sizeof(KernelParams);
+ // Allocate launch parameter buffer memory on device
+ launch_params.alloc_to_device(info.cpu_threads);
+ }
+ ~OptiXDevice()
+ {
+ // Stop processing any more tasks
+ task_pool.cancel();
+
+ // Make CUDA context current
+ const CUDAContextScope scope(cuContext);
+
+ // Free all acceleration structures
+ for (CUdeviceptr mem : as_mem) {
+ cuMemFree(mem);
+ }
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+ denoiser_state.free();
+
+ // Unload modules
+ if (optix_module != NULL)
+ optixModuleDestroy(optix_module);
+ for (unsigned int i = 0; i < 2; ++i)
+ if (builtin_modules[i] != NULL)
+ optixModuleDestroy(builtin_modules[i]);
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
+ if (pipelines[i] != NULL)
+ optixPipelineDestroy(pipelines[i]);
+
+ // Destroy launch streams
+ for (CUstream stream : cuda_stream)
+ cuStreamDestroy(stream);
+
+ if (denoiser != NULL)
+ optixDenoiserDestroy(denoiser);
+
+ optixDeviceContextDestroy(context);
+ }
+
+ private:
+ bool show_samples() const override
+ {
+ // Only show samples if not rendering multiple tiles in parallel
+ return info.cpu_threads == 1;
+ }
+
+ BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ // OptiX has its own internal acceleration structure format
+ return BVH_LAYOUT_OPTIX;
+ }
+
+ string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
+ bool filter,
+ bool /*split*/) override
+ {
+ // Split kernel is not supported in OptiX
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
+ requested_features, filter, false);
+
+ // Add OptiX SDK include directory to include paths
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ return common_cflags;
+ }
+
+ bool load_kernels(const DeviceRequestedFeatures &requested_features) override
+ {
+ if (have_error()) {
+ // Abort early if context creation failed already
+ return false;
+ }
+
+ // Load CUDA modules because we need some of the utility kernels
+ if (!CUDADevice::load_kernels(requested_features)) {
+ return false;
+ }
+
+ // Disable baking for now, since its kernel is not well-suited for inlining and is very slow
+ if (requested_features.use_baking) {
+ set_error("OptiX backend does not support baking yet");
+ return false;
+ }
+ // Disable shader raytracing support for now, since continuation callables are slow
+ if (requested_features.use_shader_raytrace) {
+ set_error("OptiX backend does not support 'Ambient Occlusion' and 'Bevel' shader nodes yet");
+ return false;
+ }
+
+ const CUDAContextScope scope(cuContext);
+
+ // Unload existing OptiX module and pipelines first
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options;
+ module_options.maxRegisterCount = 0; // Do not set an explicit register limit
+# ifdef WITH_CYCLES_DEBUG
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+# else
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+# endif
+ OptixPipelineCompileOptions pipeline_options;
+ // Default to no motion blur and two-level graph, since it is the fastest option
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; // u, v
+# ifdef WITH_CYCLES_DEBUG
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW |
+ OPTIX_EXCEPTION_FLAG_TRACE_DEPTH;
+# else
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+# endif
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
+
+# if OPTIX_ABI_VERSION >= 36
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (requested_features.use_hair) {
+ if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+ else {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+ }
+ }
+# endif
+
+ // Keep track of whether motion blur is enabled, so motion can be enabled/disabled in BVH builds.
+ // This is necessary since objects may be reported to have motion if the Vector pass is
+ // active, but may still need to be rendered without motion blur if motion blur itself is off.
+ motion_blur = requested_features.use_object_motion;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+ // Motion blur can insert motion transforms into the traversal graph
+ // It is no longer a two-level graph then, so need to set flags to allow any configuration
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { // Load and compile PTX module with OptiX kernels
+ string ptx_data, ptx_filename = path_get("lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+ set_error(
+ "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+ "the Optix SDK to be able to compile Optix kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
+ return false;
+ }
+
+ check_result_optix_ret(optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module));
+ }
+
+ // Create program groups
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; // There are no options currently
+ group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN].raygen.module = optix_module;
+ // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
+ group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (requested_features.use_hair) {
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+
+ // Add curve intersection programs
+ if (requested_features.use_hair_thick) {
+ // The intersection programs needed for thick hair are slower, and using them also slows
+ // down ribbons. Ideally this should not be needed.
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
+ }
+ else {
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+
+# if OPTIX_ABI_VERSION >= 36
+ if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+ OptixBuiltinISOptions builtin_options;
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ check_result_optix_ret(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ check_result_optix_ret(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+# endif
+ }
+
+ if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
+ // Add hit group for local intersections
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+# ifdef WITH_CYCLES_DEBUG
+ group_descs[PG_EXCP].kind = OPTIX_PROGRAM_GROUP_KIND_EXCEPTION;
+ group_descs[PG_EXCP].exception.module = optix_module;
+ group_descs[PG_EXCP].exception.entryFunctionName = "__exception__kernel_optix_exception";
+# endif
+
+ if (requested_features.use_baking) {
+ group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_BAKE].raygen.module = optix_module;
+ group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
+ }
+
+ if (requested_features.use_true_displacement) {
+ group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_DISP].raygen.module = optix_module;
+ group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
+ }
+
+ if (requested_features.use_background_light) {
+ group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_BACK].raygen.module = optix_module;
+ group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
+ }
+
+ check_result_optix_ret(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ // Get program stack sizes
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ // Set up SBT, which in this case is used only to select between different programs
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); // Upload SBT to device
+
+ // Calculate maximum trace continuation stack size
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ // This is based on the maximum of closest-hit and any-hit/intersection programs
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+# if OPTIX_ABI_VERSION >= 36
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+# endif
+
+ OptixPipelineLinkOptions link_options;
+ link_options.maxTraceDepth = 1;
+# ifdef WITH_CYCLES_DEBUG
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+# else
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+# endif
+# if OPTIX_ABI_VERSION < 24
+ link_options.overrideUsesMotionBlur = motion_blur;
+# endif
+
+ { // Create path tracing pipeline
+ OptixProgramGroup pipeline_groups[] = {
+ groups[PG_RGEN],
+ groups[PG_MISS],
+ groups[PG_HITD],
+ groups[PG_HITS],
+ groups[PG_HITL],
+# if OPTIX_ABI_VERSION >= 36
+ groups[PG_HITD_MOTION],
+ groups[PG_HITS_MOTION],
+# endif
+# ifdef WITH_CYCLES_DEBUG
+ groups[PG_EXCP],
+# endif
+ };
+ check_result_optix_ret(
+ optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups,
+ (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
+ nullptr,
+ 0,
+ &pipelines[PIP_PATH_TRACE]));
+
+ // Combine ray generation and trace continuation stack size
+ const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
+
+ // Set stack size depending on pipeline options
+ check_result_optix_ret(
+ optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2)));
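+ // The last argument is the maximum traversable graph depth: instance and geometry levels,
+ // plus one extra level for motion transforms when motion blur is enabled.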
+ }
+
+ // Only need to create shader evaluation pipeline if one of these features is used:
+ const bool use_shader_eval_pipeline = requested_features.use_baking ||
+ requested_features.use_background_light ||
+ requested_features.use_true_displacement;
+
+ if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
+ OptixProgramGroup pipeline_groups[] = {
+ groups[PG_BAKE],
+ groups[PG_DISP],
+ groups[PG_BACK],
+ groups[PG_MISS],
+ groups[PG_HITD],
+ groups[PG_HITS],
+ groups[PG_HITL],
+# if OPTIX_ABI_VERSION >= 36
+ groups[PG_HITD_MOTION],
+ groups[PG_HITS_MOTION],
+# endif
+# ifdef WITH_CYCLES_DEBUG
+ groups[PG_EXCP],
+# endif
+ };
+ check_result_optix_ret(
+ optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups,
+ (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADER_EVAL]));
+
+ // Calculate continuation stack size based on the maximum of all ray generation stack sizes
+ const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
+ std::max(stack_size[PG_DISP].cssRG,
+ stack_size[PG_BACK].cssRG)) +
+ link_options.maxTraceDepth * trace_css;
+
+ check_result_optix_ret(optixPipelineSetStackSize(
+ pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+ }
+
+ // Clean up program group objects
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+ }
+
+ void thread_run(DeviceTask &task, int thread_index) // Main task entry point
+ {
+ if (have_error())
+ return; // Abort early if there was an error previously
+
+ if (task.type == DeviceTask::RENDER) {
+ if (thread_index != 0) {
+ // Only execute denoising in a single thread (see also 'task_add')
+ task.tile_types &= ~RenderTile::DENOISE;
+ }
+
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE)
+ launch_render(task, tile, thread_index);
+ else if (tile.task == RenderTile::DENOISE)
+ launch_denoise(task, tile);
+ task.release_tile(tile);
+ if (task.get_cancel() && !task.need_finish_queue)
+ break; // User requested cancellation
+ else if (have_error())
+ break; // Abort rendering when encountering an error
+ }
+ }
+ else if (task.type == DeviceTask::SHADER) {
+ launch_shader_eval(task, thread_index);
+ }
+ else if (task.type == DeviceTask::DENOISE_BUFFER) {
+ // Set up a single tile that covers the whole task and denoise it
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ launch_denoise(task, tile);
+ }
+ }
+
+ void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
+ {
+ assert(thread_index < launch_params.data_size);
+
+ // Keep track of total render time of this tile
+ const scoped_timer timer(&rtile.buffers->render_time);
+
+ WorkTile wtile;
+ wtile.x = rtile.x;
+ wtile.y = rtile.y;
+ wtile.w = rtile.w;
+ wtile.h = rtile.h;
+ wtile.offset = rtile.offset;
+ wtile.stride = rtile.stride;
+ wtile.buffer = (float *)rtile.buffer;
+
+ const int end_sample = rtile.start_sample + rtile.num_samples;
+ // Keep this number reasonable to avoid running into TDRs (GPU driver timeouts and resets)
+ int step_samples = (info.display_device ? 8 : 32);
+ if (task.adaptive_sampling.use) {
+ step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+ }
+
+ // Offset into launch params buffer so that streams use separate data
+ device_ptr launch_params_ptr = launch_params.device_pointer +
+ thread_index * launch_params.data_elements;
+
+ const CUDAContextScope scope(cuContext);
+
+ for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
+ // Copy work tile information to device
+ wtile.num_samples = min(step_samples, end_sample - sample);
+ wtile.start_sample = sample;
+ device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+ check_result_cuda(
+ cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
+
+ OptixShaderBindingTable sbt_params = {};
+ sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
+# ifdef WITH_CYCLES_DEBUG
+ sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
+# endif
+ sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = 1;
+ sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+# if OPTIX_ABI_VERSION >= 36
+ sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+# else
+ sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
+# endif
+
+ // Launch the ray generation program
+ check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
+ cuda_stream[thread_index],
+ launch_params_ptr,
+ launch_params.data_elements,
+ &sbt_params,
+ // Launch with samples close to each other for better locality
+ wtile.w * wtile.num_samples,
+ wtile.h,
+ 1));
+
+ // Run the adaptive sampling kernels at selected samples aligned to step samples.
+ uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+ }
+
+ // Wait for launch to finish
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ // Update current sample, so it is displayed correctly
+ rtile.sample = wtile.start_sample + wtile.num_samples;
+ // Update task progress after the kernel completed rendering
+ task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
+
+ if (task.get_cancel() && !task.need_finish_queue)
+ return; // Cancel rendering
+ }
+
+ // Finalize adaptive sampling
+ if (task.adaptive_sampling.use) {
+ device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+ adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
+ }
+ }
+
+ bool launch_denoise(DeviceTask &task, RenderTile &rtile)
+ {
+ // Update current sample (for display and NLM denoising task)
+ rtile.sample = rtile.start_sample + rtile.num_samples;
+
+ // Make CUDA context current now, since it is used for both denoising tasks
+ const CUDAContextScope scope(cuContext);
+
+ // Choose between OptiX and NLM denoising
+ if (task.denoising.type == DENOISER_OPTIX) {
+ // Map neighboring tiles onto this device, with indices laid out as follows,
+ // where index 4 is the center tile and index 9 is the target for the result:
+ // 0 1 2
+ // 3 4 5
+ // 6 7 8 9
+ RenderTileNeighbors neighbors(rtile);
+ task.map_neighbor_tiles(neighbors, this);
+ RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+ RenderTile &target_tile = neighbors.target;
+ rtile = center_tile; // Tile may have been modified by mapping code
+
+ // Calculate size of the tile to denoise (including overlap)
+ int4 rect = center_tile.bounds();
+ // Overlap between tiles has to be at least 64 pixels
+ // TODO(pmours): Query this value from OptiX
+ rect = rect_expand(rect, 64);
+ int4 clip_rect = neighbors.bounds();
+ rect = rect_clip(rect, clip_rect);
+ int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
+ int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
+
+ // Calculate byte offsets and strides
+ int pixel_stride = task.pass_stride * (int)sizeof(float);
+ int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
+ const int pass_offset[3] = {
+ (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
+ (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
+ (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
+
+ // Start with the current tile pointer offset
+ int input_stride = pixel_stride;
+ device_ptr input_ptr = rtile.buffer + pixel_offset;
+
+ // Copy tile data into a common buffer if necessary
+ device_only_memory<float> input(this, "denoiser input");
+ device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);
+
+ bool contiguous_memory = true;
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
+ contiguous_memory = false;
+ }
+ }
+
+ if (contiguous_memory) {
+ // Tiles are in continous memory, so can just subtract overlap offset
+ input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
+ // Stride covers the whole width of the image and not just a single tile
+ input_stride *= rtile.stride;
+ }
+ else {
+ // Adjacent tiles are in separate memory regions, so need to copy them into a single one
+ input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
+ // Start with the new input buffer
+ input_ptr = input.device_pointer;
+ // Stride covers the width of the new input buffer, which includes tile width and overlap
+ input_stride *= rect_size.x;
+
+ TileInfo *tile_info = tile_info_mem.alloc(1);
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ tile_info->offsets[i] = neighbors.tiles[i].offset;
+ tile_info->strides[i] = neighbors.tiles[i].stride;
+ tile_info->buffers[i] = neighbors.tiles[i].buffer;
+ }
+ tile_info->x[0] = neighbors.tiles[3].x;
+ tile_info->x[1] = neighbors.tiles[4].x;
+ tile_info->x[2] = neighbors.tiles[5].x;
+ tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
+ tile_info->y[0] = neighbors.tiles[1].y;
+ tile_info->y[1] = neighbors.tiles[4].y;
+ tile_info->y[2] = neighbors.tiles[7].y;
+ tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
+ tile_info_mem.copy_to_device();
+
+ void *args[] = {
+ &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
+ launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
+ }
+
+# if OPTIX_DENOISER_NO_PIXEL_STRIDE
+ device_only_memory<float> input_rgb(this, "denoiser input rgb");
+ input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
+
+ void *input_args[] = {&input_rgb.device_pointer,
+ &input_ptr,
+ &rect_size.x,
+ &rect_size.y,
+ &input_stride,
+ &task.pass_stride,
+ const_cast<int *>(pass_offset),
+ &task.denoising.input_passes,
+ &rtile.sample};
+ launch_filter_kernel(
+ "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
+
+ input_ptr = input_rgb.device_pointer;
+ pixel_stride = 3 * sizeof(float);
+ input_stride = rect_size.x * pixel_stride;
+# endif
+
+ const bool recreate_denoiser = (denoiser == NULL) ||
+ (task.denoising.input_passes != denoiser_input_passes);
+ if (recreate_denoiser) {
+ // Destroy existing handle before creating new one
+ if (denoiser != NULL) {
+ optixDenoiserDestroy(denoiser);
+ }
+
+ // Create OptiX denoiser handle on demand when it is first used
+ OptixDenoiserOptions denoiser_options;
+ assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
+ denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
+ OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
+# if OPTIX_ABI_VERSION < 28
+ denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
+# endif
+ check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
+ check_result_optix_ret(
+ optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
+
+ // OptiX denoiser handle was created with the requested number of input passes
+ denoiser_input_passes = task.denoising.input_passes;
+ }
+
+ OptixDenoiserSizes sizes = {};
+ check_result_optix_ret(
+ optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
+
+# if OPTIX_ABI_VERSION < 28
+ const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
+# else
+ const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
+# endif
+ const size_t scratch_offset = sizes.stateSizeInBytes;
+
+ // Allocate denoiser state if tile size has changed since last setup
+ if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
+ denoiser_state.data_height != rect_size.y)) {
+ denoiser_state.alloc_to_device(scratch_offset + scratch_size);
+
+ // Initialize denoiser state for the current tile size
+ check_result_optix_ret(optixDenoiserSetup(denoiser,
+ 0,
+ rect_size.x,
+ rect_size.y,
+ denoiser_state.device_pointer,
+ scratch_offset,
+ denoiser_state.device_pointer + scratch_offset,
+ scratch_size));
+
+ denoiser_state.data_width = rect_size.x;
+ denoiser_state.data_height = rect_size.y;
+ }
+
+ // Set up input and output layer information
+ OptixImage2D input_layers[3] = {};
+ OptixImage2D output_layers[1] = {};
+
+ for (int i = 0; i < 3; ++i) {
+# if OPTIX_DENOISER_NO_PIXEL_STRIDE
+ input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
+# else
+ input_layers[i].data = input_ptr + pass_offset[i];
+# endif
+ input_layers[i].width = rect_size.x;
+ input_layers[i].height = rect_size.y;
+ input_layers[i].rowStrideInBytes = input_stride;
+ input_layers[i].pixelStrideInBytes = pixel_stride;
+ input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+# if OPTIX_DENOISER_NO_PIXEL_STRIDE
+ output_layers[0].data = input_ptr;
+ output_layers[0].width = rect_size.x;
+ output_layers[0].height = rect_size.y;
+ output_layers[0].rowStrideInBytes = input_stride;
+ output_layers[0].pixelStrideInBytes = pixel_stride;
+ int2 output_offset = overlap_offset;
+ overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
+# else
+ output_layers[0].data = target_tile.buffer + pixel_offset;
+ output_layers[0].width = target_tile.w;
+ output_layers[0].height = target_tile.h;
+ output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
+ output_layers[0].pixelStrideInBytes = pixel_stride;
+# endif
+ output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
+
+ // Finally run the denoiser
+ OptixDenoiserParams params = {}; // All parameters are disabled/zero
+ check_result_optix_ret(optixDenoiserInvoke(denoiser,
+ 0,
+ &params,
+ denoiser_state.device_pointer,
+ scratch_offset,
+ input_layers,
+ task.denoising.input_passes,
+ overlap_offset.x,
+ overlap_offset.y,
+ output_layers,
+ denoiser_state.device_pointer + scratch_offset,
+ scratch_size));
+
+# if OPTIX_DENOISER_NO_PIXEL_STRIDE
+ void *output_args[] = {&input_ptr,
+ &target_tile.buffer,
+ &output_offset.x,
+ &output_offset.y,
+ &rect_size.x,
+ &rect_size.y,
+ &target_tile.x,
+ &target_tile.y,
+ &target_tile.w,
+ &target_tile.h,
+ &target_tile.offset,
+ &target_tile.stride,
+ &task.pass_stride,
+ &rtile.sample};
+ launch_filter_kernel(
+ "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
+# endif
+
+ check_result_cuda_ret(cuStreamSynchronize(0));
+
+ task.unmap_neighbor_tiles(neighbors, this);
+ }
+ else {
+ // Run CUDA denoising kernels
+ DenoisingTask denoising(this, task);
+ CUDADevice::denoise(rtile, denoising);
+ }
+
+ // Update task progress after the denoiser completed processing
+ task.update_progress(&rtile, rtile.w * rtile.h);
+
+ return true;
+ }
+
+ void launch_shader_eval(DeviceTask &task, int thread_index)
+ {
+ unsigned int rgen_index = PG_BACK;
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE)
+ rgen_index = PG_BAKE;
+ if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
+ rgen_index = PG_DISP;
+
+ const CUDAContextScope scope(cuContext);
+
+ device_ptr launch_params_ptr = launch_params.device_pointer +
+ thread_index * launch_params.data_elements;
+
+ for (int sample = 0; sample < task.num_samples; ++sample) {
+ ShaderParams params;
+ params.input = (uint4 *)task.shader_input;
+ params.output = (float4 *)task.shader_output;
+ params.type = task.shader_eval_type;
+ params.filter = task.shader_filter;
+ params.sx = task.shader_x;
+ params.offset = task.offset;
+ params.sample = sample;
+
+ check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
+ &params,
+ sizeof(params),
+ cuda_stream[thread_index]));
+
+ OptixShaderBindingTable sbt_params = {};
+ sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
+# ifdef WITH_CYCLES_DEBUG
+ sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
+# endif
+ sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = 1;
+ sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+# if OPTIX_ABI_VERSION >= 36
+ sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+# else
+ sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
+# endif
+
+ check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
+ cuda_stream[thread_index],
+ launch_params_ptr,
+ launch_params.data_elements,
+ &sbt_params,
+ task.shader_w,
+ 1,
+ 1));
+
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+
+ task.update_progress(NULL);
+ }
+ }
+
+ bool build_optix_bvh(const OptixBuildInput &build_input,
+ uint16_t num_motion_steps,
+ OptixTraversableHandle &out_handle)
+ {
+ out_handle = 0;
+
+ const CUDAContextScope scope(cuContext);
+
+ // Compute memory usage
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options;
+ options.operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (background) {
+ // Prefer best performance and lowest memory consumption in background
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+ // Prefer fast updates in viewport
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ check_result_optix_ret(
+ optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ // Allocate required output buffers
+ device_only_memory<char> temp_mem(this, "temp_build_mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
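+ // The extra 8 bytes at the end of the temporary buffer hold the emitted compacted-size
+ // property (see 'compacted_size_prop' below).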
+ if (!temp_mem.device_pointer)
+ return false; // Make sure temporary memory allocation succeeded
+
+ // Move textures to host memory if there is not enough room
+ size_t size = 0, free = 0;
+ cuMemGetInfo(&free, &size);
+ size = sizes.outputSizeInBytes + device_working_headroom;
+ if (size >= free && can_map_host) {
+ move_textures_to_host(size - free, false);
+ }
+
+ CUdeviceptr out_data = 0;
+ check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+ as_mem.push_back(out_data);
+
+ // Finally build the acceleration structure
+ OptixAccelEmitDesc compacted_size_prop;
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ // A tiny space was allocated for this property at the end of the temporary buffer above
+ // Make sure this pointer is 8-byte aligned
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ check_result_optix_ret(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ background ? &compacted_size_prop : NULL,
+ background ? 1 : 0));
+
+ // Wait for all operations to finish
+ check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+ // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
+ if (background) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ check_result_cuda_ret(
+ cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ // Temporary memory is no longer needed, so free it now to make space
+ temp_mem.free();
+
+ // There is no point compacting if the size does not change
+ if (compacted_size < sizes.outputSizeInBytes) {
+ CUdeviceptr compacted_data = 0;
+ if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+ // Do not compact if memory allocation for compacted acceleration structure fails
+ // Can just use the uncompacted one then, so succeed here regardless
+ return true;
+ as_mem.push_back(compacted_data);
+
+ check_result_optix_ret(optixAccelCompact(
+ context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+
+ // Wait for compaction to finish
+ check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+ // Free uncompacted acceleration structure
+ cuMemFree(out_data);
+ as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+ }
+ }
+
+ return true;
+ }
+
+ bool build_optix_bvh(BVH *bvh) override
+ {
+ assert(bvh->params.top_level);
+
+ unsigned int num_instances = 0;
+ unordered_map<Geometry *, OptixTraversableHandle> geometry;
+ geometry.reserve(bvh->geometry.size());
+
+ // Free all previous acceleration structures
+ for (CUdeviceptr mem : as_mem) {
+ cuMemFree(mem);
+ }
+ as_mem.clear();
+
+ // Build bottom level acceleration structures (BLAS)
+ // Note: Always keep this logic in sync with bvh_optix.cpp!
+ for (Object *ob : bvh->objects) {
+ // Skip geometry for which acceleration structure already exists
+ Geometry *geom = ob->geometry;
+ if (geometry.find(geom) != geometry.end())
+ continue;
+
+ if (geom->type == Geometry::HAIR) {
+ // Build BLAS for curve primitives
+ Hair *const hair = static_cast<Hair *const>(ob->geometry);
+ if (hair->num_curves() == 0) {
+ continue;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->use_motion_blur && motion_keys) {
+ num_motion_steps = hair->motion_steps;
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
+# if OPTIX_ABI_VERSION >= 36
+ device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ // Four control points for each curve segment
+ const size_t num_vertices = num_segments * 4;
+ if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+ else
+# endif
+ aabb_data.alloc(num_segments * num_motion_steps);
+
+ // Get AABBs for each motion step
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ // The center step for motion vertices is not stored in the attribute
+ const float3 *keys = hair->curve_keys.data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
+ keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+# if OPTIX_ABI_VERSION >= 36
+ if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(hair->curve_radius[ka],
+ hair->curve_radius[k0],
+ hair->curve_radius[k1],
+ hair->curve_radius[kb]);
+
+ // Convert Catmull-Rom data to B-spline control points
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
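+ // ka/k0/k1/kb act as Catmull-Rom control points; the cr2bsp rows convert them to the
+ // equivalent cubic B-spline control points expected by the ROUND_CUBIC_BSPLINE primitive.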
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else
+# endif
+ {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ // Upload AABB data to GPU
+ aabb_data.copy_to_device();
+# if OPTIX_ABI_VERSION >= 36
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+# endif
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+# if OPTIX_ABI_VERSION >= 36
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+# endif
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+# if OPTIX_ABI_VERSION >= 36
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+ width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset to the w (radius) component
+ vertex_ptrs.push_back(base_ptr);
+# endif
+ }
+
+ // Force a single any-hit call, so shadow record-all behavior works correctly
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+# if OPTIX_ABI_VERSION >= 36
+ if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else
+# endif
+ {
+ // Disable visibility test any-hit program, since it is already checked during
+ // intersection. Those trace calls that require anyhit can force it with a ray flag.
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+# if OPTIX_ABI_VERSION < 23
+ build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.aabbArray.numPrimitives = num_segments;
+ build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+ build_input.aabbArray.flags = &build_flags;
+ build_input.aabbArray.numSbtRecords = 1;
+ build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+# else
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+# endif
+ }
+
+ // Allocate memory for new BLAS and build it
+ OptixTraversableHandle handle;
+ if (build_optix_bvh(build_input, num_motion_steps, handle)) {
+ geometry.insert({ob->geometry, handle});
+ }
+ else {
+ return false;
+ }
+ }
+ else if (geom->type == Geometry::MESH) {
+ // Build BLAS for triangle primitives
+ Mesh *const mesh = static_cast<Mesh *const>(ob->geometry);
+ if (mesh->num_triangles() == 0) {
+ continue;
+ }
+
+ const size_t num_verts = mesh->verts.size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->use_motion_blur && motion_keys) {
+ num_motion_steps = mesh->motion_steps;
+ }
+
+ device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+ index_data.alloc(mesh->triangles.size());
+ memcpy(index_data.data(), mesh->triangles.data(), mesh->triangles.size() * sizeof(int));
+ device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->verts.data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ // The center step for motion vertices is not stored in the attribute
+ if (step != center_step) {
+ verts = motion_keys->data_float3() +
+ (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ // Upload triangle data to GPU
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ // Force a single any-hit call, so shadow record-all behavior works correctly
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ // The SBT does not store per primitive data since Cycles already allocates separate
+ // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
+ // one and rely on that having the same meaning in this case.
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ // Allocate memory for new BLAS and build it
+ OptixTraversableHandle handle;
+ if (build_optix_bvh(build_input, num_motion_steps, handle)) {
+ geometry.insert({ob->geometry, handle});
+ }
+ else {
+ return false;
+ }
+ }
+ }
+
+ // Fill instance descriptions
+ device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
+ aabbs.alloc(bvh->objects.size());
+ device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ for (Object *ob : bvh->objects) {
+ // Skip non-traceable objects
+ if (!ob->is_traceable())
+ continue;
+
+ // Create separate instance for triangle/curve meshes of an object
+ auto handle_it = geometry.find(ob->geometry);
+ if (handle_it == geometry.end()) {
+ continue;
+ }
+ OptixTraversableHandle handle = handle_it->second;
+
+ OptixAabb &aabb = aabbs[num_instances];
+ aabb.minX = ob->bounds.min.x;
+ aabb.minY = ob->bounds.min.y;
+ aabb.minZ = ob->bounds.min.z;
+ aabb.maxX = ob->bounds.max.x;
+ aabb.maxY = ob->bounds.max.y;
+ aabb.maxZ = ob->bounds.max.z;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ // Clear transform to identity matrix
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ // Set user instance ID to object index
+ instance.instanceId = ob->get_device_index();
+
+ // Have to have at least one bit set in the mask, or else the instance would always be culled
+ instance.visibilityMask = 1;
+
+ if (ob->geometry->has_volume) {
+ // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->geometry->type == Geometry::HAIR) {
+ // Same applies to curves (so they can be skipped in local trace calls)
+ instance.visibilityMask |= 4;
+
+# if OPTIX_ABI_VERSION >= 36
+ if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api &&
+ static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) {
+ // Select between motion blur and non-motion blur built-in intersection module
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+# endif
+ }
+
+ // Insert motion traversable if object has motion
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->motion.size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(cuContext);
+
+ CUdeviceptr motion_transform_gpu = 0;
+ check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
+ as_mem.push_back(motion_transform_gpu);
+
+ // Allocate host side memory for motion transform and fill it with transform data
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->motion.size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->motion.size());
+ transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
+
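+ // DecomposedTransform layout, as consumed below: x holds the rotation
+ // quaternion, y.xyz the translation, y.w/z.w/w.w the scale diagonal and
+ // z.x/z.y/w.x the shear terms.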
+ for (size_t i = 0; i < ob->motion.size(); ++i) {
+ // Scale
+ srt_data[i].sx = decomp[i].y.w; // scale.x.x
+ srt_data[i].sy = decomp[i].z.w; // scale.y.y
+ srt_data[i].sz = decomp[i].w.w; // scale.z.z
+
+ // Shear
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+ assert(decomp[i].z.z == 0.0f); // scale.y.x
+ assert(decomp[i].w.y == 0.0f); // scale.z.x
+ assert(decomp[i].w.z == 0.0f); // scale.z.y
+
+ // Pivot point
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ // Rotation
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ // Translation
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ // Upload motion transform to GPU
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+ // Disable instance transform if object uses motion transform already
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ // Get traversable handle to motion transform
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->geometry->is_instanced()) {
+ // Set transform matrix
+ memcpy(instance.transform, &ob->tfm, sizeof(instance.transform));
+ }
+ else {
+ // Disable instance transform if geometry already has it applied to vertex data
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Non-instanced objects read ID from prim_object, so
+ // distinguish them from instanced objects with high bit set
+ instance.instanceId |= 0x800000;
+ }
+ }
+ }
+
+ // Upload instance descriptions
+ aabbs.resize(num_instances);
+ aabbs.copy_to_device();
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ // Build top-level acceleration structure (TLAS)
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+ build_input.instanceArray.aabbs = aabbs.device_pointer;
+ build_input.instanceArray.numAabbs = num_instances;
+
+ return build_optix_bvh(build_input, 0, tlas_handle);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ // Set constant memory for CUDA module
+ // TODO(pmours): This is only used for tonemapping (see 'film_convert').
+ // Could be removed by moving those functions to filter CUDA module.
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Fix traversable handle on multi-device setups
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(offsetof(KernelParams, data), host, size);
+ return;
+ }
+
+ // Update data storage pointers in launch parameters
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParams, tex_name), host, size); \
+ return; \
+ }
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+ }
+
+ void update_launch_params(size_t offset, void *data, size_t data_size)
+ {
+ const CUDAContextScope scope(cuContext);
+
+ for (int i = 0; i < info.cpu_threads; ++i)
+ check_result_cuda(
+ cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
+ data,
+ data_size));
+ }
+
+ void task_add(DeviceTask &task) override
+ {
+ // Upload texture information to device if it has changed since last launch
+ load_texture_info();
+
+ if (task.type == DeviceTask::FILM_CONVERT) {
+ // Execute in main thread because of OpenGL access
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+ return;
+ }
+
+ if (task.type == DeviceTask::DENOISE_BUFFER) {
+ // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy, 0);
+ });
+ return;
+ }
+
+ // Split task into smaller ones
+ list<DeviceTask> tasks;
+ task.split(tasks, info.cpu_threads);
+
+ // Queue tasks in internal task pool
+ int task_index = 0;
+ for (DeviceTask &task : tasks) {
+ task_pool.push([=] {
+ // Using task index parameter instead of thread index, since number of CUDA streams may
+ // differ from number of threads
+ DeviceTask task_copy = task;
+ thread_run(task_copy, task_index);
+ });
+ task_index++;
+ }
+ }
+
+ void task_wait() override
+ {
+ // Wait for all queued tasks to finish
+ task_pool.wait_work();
+ }
+
+ void task_cancel() override
+ {
+ // Cancel any remaining tasks in the internal pool
+ task_pool.cancel();
+ }
+};
+
+bool device_optix_init()
+{
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
+ return true; // Already initialized function table
+
+ // Need to initialize CUDA as well
+ if (!device_cuda_init())
+ return false;
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1) << "OptiX initialization failed because driver does not support ABI version "
+ << OPTIX_ABI_VERSION;
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ // Loaded OptiX successfully!
+ return true;
+}
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
+{
+ devices.reserve(cuda_devices.size());
+
+ // Simply add all supported CUDA devices as OptiX devices again
+ for (DeviceInfo info : cuda_devices) {
+ assert(info.type == DEVICE_CUDA);
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ if (major < 5) {
+ continue; // Only Maxwell and up are supported by OptiX
+ }
+
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
+
+ devices.push_back(info);
+ }
+}
+
+Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+{
+ return new OptiXDevice(info, stats, profiler, background);
+}
+
+CCL_NAMESPACE_END
+
+#endif
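The curve path above converts each Catmull-Rom hair segment into B-spline control points before handing it to OptiX's built-in curve primitive. The following standalone sketch mirrors that conversion outside of Cycles; float4 and dot() are simplified stand-ins for the engine's own math types, and the key layout follows the code above (xyz = position, w = radius).

// Standalone sketch of the Catmull-Rom -> B-spline conversion used for the
// OptiX curve build input above. Types are simplified stand-ins.
#include <cstdio>

struct float4 {
  float x, y, z, w;
};

static float dot(const float4 &a, const float4 &b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

// key[0..3] = ka, k0, k1, kb with xyz = position and w = radius.
// out[0..3] are the four B-spline control points for the segment k0..k1.
static void catmull_rom_to_bspline(const float4 key[4], float4 out[4])
{
  static const float4 cr2bsp[4] = {
      {+7 / 6.0f, -4 / 6.0f, +5 / 6.0f, -2 / 6.0f},
      {-2 / 6.0f, 11 / 6.0f, -4 / 6.0f, +1 / 6.0f},
      {+1 / 6.0f, -4 / 6.0f, 11 / 6.0f, -2 / 6.0f},
      {-2 / 6.0f, +5 / 6.0f, -4 / 6.0f, +7 / 6.0f},
  };

  // Gather each component across the four keys, then take one dot product
  // per output control point, exactly as the device code does per segment.
  const float4 px = {key[0].x, key[1].x, key[2].x, key[3].x};
  const float4 py = {key[0].y, key[1].y, key[2].y, key[3].y};
  const float4 pz = {key[0].z, key[1].z, key[2].z, key[3].z};
  const float4 pw = {key[0].w, key[1].w, key[2].w, key[3].w};

  for (int i = 0; i < 4; i++) {
    out[i] = {dot(cr2bsp[i], px), dot(cr2bsp[i], py), dot(cr2bsp[i], pz), dot(cr2bsp[i], pw)};
  }
}

int main()
{
  // A straight segment with constant radius stays straight with the same radius.
  const float4 keys[4] = {{0, 0, 0, 1}, {1, 0, 0, 1}, {2, 0, 0, 1}, {3, 0, 0, 1}};
  float4 cp[4];
  catmull_rom_to_bspline(keys, cp);
  for (int i = 0; i < 4; i++) {
    printf("cp[%d] = (%g, %g, %g) radius %g\n", i, cp[i].x, cp[i].y, cp[i].z, cp[i].w);
  }
  return 0;
}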
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 42e597a34d7..4c288f60c16 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device)
kernel_next_iteration_setup = NULL;
kernel_indirect_subsurface = NULL;
kernel_buffer_update = NULL;
+ kernel_adaptive_stopping = NULL;
+ kernel_adaptive_filter_x = NULL;
+ kernel_adaptive_filter_y = NULL;
+ kernel_adaptive_adjust_samples = NULL;
}
DeviceSplitKernel::~DeviceSplitKernel()
@@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel()
delete kernel_next_iteration_setup;
delete kernel_indirect_subsurface;
delete kernel_buffer_update;
+ delete kernel_adaptive_stopping;
+ delete kernel_adaptive_filter_x;
+ delete kernel_adaptive_filter_y;
+ delete kernel_adaptive_adjust_samples;
}
bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
@@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe
LOAD_KERNEL(next_iteration_setup);
LOAD_KERNEL(indirect_subsurface);
LOAD_KERNEL(buffer_update);
+ LOAD_KERNEL(adaptive_stopping);
+ LOAD_KERNEL(adaptive_filter_x);
+ LOAD_KERNEL(adaptive_filter_y);
+ LOAD_KERNEL(adaptive_adjust_samples);
#undef LOAD_KERNEL
@@ -133,7 +145,7 @@ size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
return max_buffer_size / size_per_element;
}
-bool DeviceSplitKernel::path_trace(DeviceTask *task,
+bool DeviceSplitKernel::path_trace(DeviceTask &task,
RenderTile &tile,
device_memory &kgbuffer,
device_memory &kernel_data)
@@ -202,13 +214,21 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
/* initial guess to start rolling average */
const int initial_num_samples = 1;
/* approx number of samples per second */
- int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
+ const int samples_per_second = (avg_time_per_sample > 0.0) ?
+ int(double(time_multiplier) / avg_time_per_sample) + 1 :
+ initial_num_samples;
RenderTile subtile = tile;
subtile.start_sample = tile.sample;
- subtile.num_samples = min(samples_per_second,
+ subtile.num_samples = samples_per_second;
+
+ if (task.adaptive_sampling.use) {
+ subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample,
+ subtile.num_samples);
+ }
+
+ /* Don't go beyond requested number of samples. */
+ subtile.num_samples = min(subtile.num_samples,
tile.start_sample + tile.num_samples - tile.sample);
if (device->have_error()) {
@@ -266,7 +286,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
- if (task->get_cancel() && cancel_time == DBL_MAX) {
+ if (task.get_cancel() && cancel_time == DBL_MAX) {
/* Wait up to twice as many seconds for current samples to finish
* to avoid artifacts in render result from ending too soon.
*/
@@ -302,6 +322,23 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
+ int filter_sample = tile.sample + subtile.num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ size_t buffer_size[2];
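+ /* Launch dimensions: adaptive_stopping covers the whole tile (w x h), while
+  * filter_x is launched over tile.h work items and filter_y over tile.w,
+  * presumably sweeping a full row (or column) per work item. */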
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(tile.h, local_size[1]);
+ kernel_adaptive_stopping->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ buffer_size[0] = round_up(tile.h, local_size[0]);
+ buffer_size[1] = round_up(1, local_size[1]);
+ kernel_adaptive_filter_x->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(1, local_size[1]);
+ kernel_adaptive_filter_y->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ }
+
double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
if (avg_time_per_sample == 0.0) {
@@ -315,15 +352,37 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
#undef ENQUEUE_SPLIT_KERNEL
tile.sample += subtile.num_samples;
- task->update_progress(&tile, tile.w * tile.h * subtile.num_samples);
+ task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
time_multiplier = min(time_multiplier << 1, 10);
- if (task->get_cancel()) {
+ if (task.get_cancel()) {
return true;
}
}
+ if (task.adaptive_sampling.use) {
+ /* Reset the start samples. */
+ RenderTile subtile = tile;
+ subtile.start_sample = tile.start_sample;
+ subtile.num_samples = tile.sample - tile.start_sample;
+ enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+ subtile,
+ num_global_elements,
+ kgbuffer,
+ kernel_data,
+ split_data,
+ ray_state,
+ queue_index,
+ use_queues_flag,
+ work_pool_wgs);
+ size_t buffer_size[2];
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(tile.h, local_size[1]);
+ kernel_adaptive_adjust_samples->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ }
+
return true;
}
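For orientation, the sample budgeting the reworked path_trace() loop performs on each iteration can be summarized outside the split-kernel machinery as in the sketch below; the Budget struct is a simplified stand-in for the RenderTile/AdaptiveSampling state, while the arithmetic mirrors the hunk above (rolling samples-per-second estimate, alignment to the adaptive filter interval, then clamping to the remaining samples).

// Sketch of the per-iteration sample budget used by the split-kernel path tracer.
#include <algorithm>
#include <cstdio>

// Simplified stand-in for the RenderTile / AdaptiveSampling state involved.
struct Budget {
  int tile_start_sample;  // first sample of the tile
  int tile_num_samples;   // total samples requested for the tile
  int tile_sample;        // absolute index of the next sample to render
  bool adaptive_use;
  int adaptive_step;      // adaptive filtering interval
};

// Mirrors AdaptiveSampling::align_dynamic_samples() from device_task.cpp.
static int align_dynamic_samples(int adaptive_step, int offset, int samples)
{
  samples += offset;
  if (samples > adaptive_step) {
    while (samples % adaptive_step != 0) {
      samples--;
    }
  }
  return std::max(samples - offset, 1);
}

static int samples_this_iteration(const Budget &b, double avg_time_per_sample, int time_multiplier)
{
  // Rolling estimate of how many samples fit into the current time slice
  // (initial guess of 1 before any timing data exists).
  int num = (avg_time_per_sample > 0.0) ?
                int(double(time_multiplier) / avg_time_per_sample) + 1 :
                1;

  // Align so that the slice ends on an adaptive-filtering boundary.
  if (b.adaptive_use) {
    num = align_dynamic_samples(b.adaptive_step, b.tile_sample, num);
  }

  // Never go beyond the number of samples requested for this tile.
  return std::min(num, b.tile_start_sample + b.tile_num_samples - b.tile_sample);
}

int main()
{
  const Budget b = {0, 16, 5, true, 4};
  // Estimate of 9 samples, aligned down to 7 so that sample index 11 ends a
  // filter interval, well within the 11 samples still remaining for the tile.
  printf("%d\n", samples_this_iteration(b, 0.25, 2));  // prints 7
  return 0;
}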
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index c9fb2ac844f..07a21b10299 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -27,7 +27,7 @@ CCL_NAMESPACE_BEGIN
* Since some bytes may be needed for aligning chunks of memory;
* This is the amount of memory that we dedicate for that purpose.
*/
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
/* Types used for split kernel */
@@ -75,6 +75,10 @@ class DeviceSplitKernel {
SplitKernelFunction *kernel_next_iteration_setup;
SplitKernelFunction *kernel_indirect_subsurface;
SplitKernelFunction *kernel_buffer_update;
+ SplitKernelFunction *kernel_adaptive_stopping;
+ SplitKernelFunction *kernel_adaptive_filter_x;
+ SplitKernelFunction *kernel_adaptive_filter_y;
+ SplitKernelFunction *kernel_adaptive_adjust_samples;
/* Global memory variables [porting]; this memory is used for
* co-operation between different kernels; data written by one
@@ -105,7 +109,7 @@ class DeviceSplitKernel {
virtual ~DeviceSplitKernel();
bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask *task,
+ bool path_trace(DeviceTask &task,
RenderTile &rtile,
device_memory &kgbuffer,
device_memory &kernel_data);
@@ -133,7 +137,7 @@ class DeviceSplitKernel {
virtual int2 split_kernel_local_size() = 0;
virtual int2 split_kernel_global_size(device_memory &kg,
device_memory &data,
- DeviceTask *task) = 0;
+ DeviceTask &task) = 0;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 376ad06a734..6e7c184c6c9 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -44,12 +44,13 @@ DeviceTask::DeviceTask(Type type_)
shader_eval_type(0),
shader_filter(0),
shader_x(0),
- shader_w(0)
+ shader_w(0),
+ buffers(nullptr)
{
last_update_time = time_dt();
}
-int DeviceTask::get_subtask_count(int num, int max_size)
+int DeviceTask::get_subtask_count(int num, int max_size) const
{
if (max_size != 0) {
int max_size_num;
@@ -77,7 +78,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
return num;
}
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size)
+void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
{
num = get_subtask_count(num, max_size);
@@ -115,7 +116,7 @@ void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size)
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
- if ((type != RENDER) && (type != SHADER))
+ if (type == FILM_CONVERT)
return;
if (update_progress_sample) {
@@ -136,4 +137,58 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
}
}
+/* Adaptive Sampling */
+
+AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
+{
+}
+
+/* Render samples in steps that align with the adaptive filtering. */
+int AdaptiveSampling::align_static_samples(int samples) const
+{
+ if (samples > adaptive_step) {
+ /* Make multiple of adaptive_step. */
+ while (samples % adaptive_step != 0) {
+ samples--;
+ }
+ }
+ else if (samples < adaptive_step) {
+ /* Make divisor of adaptive_step. */
+ while (adaptive_step % samples != 0) {
+ samples--;
+ }
+ }
+
+ return max(samples, 1);
+}
+
+/* Render samples in steps that align with the adaptive filtering, with the
+ * suggested number of samples dynamically changing. */
+int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
+{
+ /* Round down so that we end up on a multiple of adaptive_step. */
+ samples += offset;
+
+ if (samples > adaptive_step) {
+ /* Make multiple of adaptive_step. */
+ while (samples % adaptive_step != 0) {
+ samples--;
+ }
+ }
+
+ samples -= offset;
+
+ return max(samples, 1);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (sample > min_samples) {
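+ /* Bit-mask test, equivalent to (sample % adaptive_step) == adaptive_step - 1,
+  * as long as adaptive_step is a power of two. */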
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+ }
+ else {
+ return false;
+ }
+}
+
CCL_NAMESPACE_END
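A short worked example of the alignment helpers defined above, as a standalone sketch; adaptive_step = 4 is an illustrative value that also satisfies the power-of-two assumption behind the bit mask in need_filter().

// Worked example for AdaptiveSampling::align_static_samples() / need_filter().
#include <algorithm>
#include <cstdio>

static int align_static_samples(int adaptive_step, int samples)
{
  if (samples > adaptive_step) {
    while (samples % adaptive_step != 0) {
      samples--;  // round down to a multiple of adaptive_step
    }
  }
  else if (samples < adaptive_step) {
    while (adaptive_step % samples != 0) {
      samples--;  // round down to a divisor of adaptive_step
    }
  }
  return std::max(samples, 1);
}

static bool need_filter(int adaptive_step, int min_samples, int sample)
{
  // Matches every adaptive_step-th sample, assuming adaptive_step is a power of two.
  return sample > min_samples && (sample & (adaptive_step - 1)) == (adaptive_step - 1);
}

int main()
{
  const int step = 4, min_samples = 0;

  printf("%d\n", align_static_samples(step, 10));     // 8: largest multiple of 4 <= 10
  printf("%d\n", align_static_samples(step, 3));      // 2: largest divisor of 4 <= 3
  printf("%d\n", need_filter(step, min_samples, 7));  // 1: 7 & 3 == 3, filter runs
  printf("%d\n", need_filter(step, min_samples, 8));  // 0: 8 & 3 == 0, no filtering
  return 0;
}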
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 5cc2e5e25db..fd380788282 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -21,7 +21,6 @@
#include "util/util_function.h"
#include "util/util_list.h"
-#include "util/util_task.h"
CCL_NAMESPACE_BEGIN
@@ -30,37 +29,106 @@ CCL_NAMESPACE_BEGIN
class Device;
class RenderBuffers;
class RenderTile;
+class RenderTileNeighbors;
class Tile;
+enum DenoiserType {
+ DENOISER_NLM = 1,
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
+
+enum DenoiserInput {
+ DENOISER_INPUT_RGB = 1,
+ DENOISER_INPUT_RGB_ALBEDO = 2,
+ DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
+
+ DENOISER_INPUT_NUM,
+};
+
+typedef int DenoiserTypeMask;
+
class DenoiseParams {
public:
- /* Pixel radius for neighbouring pixels to take into account. */
+ /* Apply denoiser to image. */
+ bool use;
+ /* Output denoising data passes (possibly without applying the denoiser). */
+ bool store_passes;
+
+ /* Denoiser type. */
+ DenoiserType type;
+
+ /* Viewport start sample. */
+ int start_sample;
+
+ /** Native Denoiser **/
+
+ /* Pixel radius for neighboring pixels to take into account. */
int radius;
/* Controls neighbor pixel weighting for the denoising filter. */
float strength;
/* Preserve more or less detail based on feature passes. */
float feature_strength;
- /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */
+ /* When removing pixels that don't carry information,
+ * use a relative threshold instead of an absolute one. */
bool relative_pca;
/* How many frames before and after the current center frame are included. */
int neighbor_frames;
/* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
bool clamp_input;
+ /** OIDN/OptiX Denoiser **/
+
+ /* Passes handed over to the OIDN/OptiX denoiser (default is color + albedo + normal). */
+ DenoiserInput input_passes;
+
DenoiseParams()
{
+ use = false;
+ store_passes = false;
+
+ type = DENOISER_NLM;
+
radius = 8;
strength = 0.5f;
feature_strength = 0.5f;
relative_pca = false;
neighbor_frames = 2;
clamp_input = true;
+
+ input_passes = DENOISER_INPUT_RGB_ALBEDO_NORMAL;
+
+ start_sample = 0;
+ }
+
+ /* Test if a denoising task needs to run, also to prefilter passes for the native
+ * denoiser when we are not applying denoising to the combined image. */
+ bool need_denoising_task() const
+ {
+ return (use || (store_passes && type == DENOISER_NLM));
}
};
-class DeviceTask : public Task {
+class AdaptiveSampling {
public:
- typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
+ AdaptiveSampling();
+
+ int align_static_samples(int samples) const;
+ int align_dynamic_samples(int offset, int samples) const;
+ bool need_filter(int sample) const;
+
+ bool use;
+ int adaptive_step;
+ int min_samples;
+};
+
+class DeviceTask {
+ public:
+ typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
Type type;
int x, y, w, h;
@@ -77,30 +145,28 @@ class DeviceTask : public Task {
int shader_filter;
int shader_x, shader_w;
- int passes_size;
+ RenderBuffers *buffers;
explicit DeviceTask(Type type = RENDER);
- int get_subtask_count(int num, int max_size = 0);
- void split(list<DeviceTask> &tasks, int num, int max_size = 0);
+ int get_subtask_count(int num, int max_size = 0) const;
+ void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
void update_progress(RenderTile *rtile, int pixel_samples = -1);
- function<bool(Device *device, RenderTile &)> acquire_tile;
+ function<bool(Device *device, RenderTile &, uint)> acquire_tile;
function<void(long, int)> update_progress_sample;
function<void(RenderTile &)> update_tile_sample;
function<void(RenderTile &)> release_tile;
function<bool()> get_cancel;
- function<void(RenderTile *, Device *)> map_neighbor_tiles;
- function<void(RenderTile *, Device *)> unmap_neighbor_tiles;
+ function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
+ function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
+ uint tile_types;
DenoiseParams denoising;
bool denoising_from_render;
vector<int> denoising_frames;
- bool denoising_do_filter;
- bool denoising_write_passes;
-
int pass_stride;
int frame_stride;
int target_pass_stride;
@@ -109,7 +175,7 @@ class DeviceTask : public Task {
bool need_finish_queue;
bool integrator_branched;
- int2 requested_tile_size;
+ AdaptiveSampling adaptive_sampling;
protected:
double last_update_time;
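Since the DenoiserType values above are distinct bits, a device's supported denoisers fit into a single DenoiserTypeMask. A minimal illustrative sketch follows; the enum values are copied from the header, the rest is hypothetical usage.

// Minimal sketch of DenoiserType used as a bit mask (values copied from the header).
#include <cstdio>

enum DenoiserType {
  DENOISER_NLM = 1,
  DENOISER_OPTIX = 2,
  DENOISER_OPENIMAGEDENOISE = 4,

  DENOISER_NONE = 0,
  DENOISER_ALL = ~0,
};

typedef int DenoiserTypeMask;

int main()
{
  DenoiserTypeMask supported = DENOISER_NLM;
  supported |= DENOISER_OPTIX;  // e.g. what device_optix_info() adds per device

  printf("OptiX denoising available: %d\n", (supported & DENOISER_OPTIX) != 0);
  printf("OIDN denoising available:  %d\n", (supported & DENOISER_OPENIMAGEDENOISE) != 0);
  return 0;
}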
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/device_opencl.h
index e7bafa0b8a8..e0140996cf0 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
@@ -23,6 +23,7 @@
# include "util/util_map.h"
# include "util/util_param.h"
# include "util/util_string.h"
+# include "util/util_task.h"
# include "clew.h"
@@ -88,9 +89,12 @@ class OpenCLInfo {
static bool device_supported(const string &platform_name, const cl_device_id device_id);
static bool platform_version_check(cl_platform_id platform, string *error = NULL);
static bool device_version_check(cl_device_id device, string *error = NULL);
+ static bool get_device_version(cl_device_id device,
+ int *r_major,
+ int *r_minor,
+ string *error = NULL);
static string get_hardware_id(const string &platform_name, cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
- bool force_all = false);
+ static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
@@ -255,6 +259,8 @@ class OpenCLDevice : public Device {
TaskPool load_required_kernel_task_pool;
/* Task pool for optional kernels (feature kernels during foreground rendering) */
TaskPool load_kernel_task_pool;
+ std::atomic<int> load_kernel_num_compiling;
+
cl_context cxContext;
cl_command_queue cqCommandQueue;
cl_platform_id cpPlatform;
@@ -358,8 +364,8 @@ class OpenCLDevice : public Device {
OpenCLSplitPrograms(OpenCLDevice *device);
~OpenCLSplitPrograms();
- /* Load the kernels and put the created kernels in the given `programs`
- * paramter. */
+ /* Load the kernels and put the created kernels in the given
+ * `programs` parameter. */
void load_kernels(vector<OpenCLProgram *> &programs,
const DeviceRequestedFeatures &requested_features,
bool is_preview = false);
@@ -381,7 +387,6 @@ class OpenCLDevice : public Device {
ConstMemMap const_mem_map;
MemMap mem_map;
- device_ptr null_mem;
bool device_initialized;
string platform_name;
@@ -429,8 +434,10 @@ class OpenCLDevice : public Device {
int mem_sub_ptr_alignment();
void const_copy_to(const char *name, void *host, size_t size);
- void tex_alloc(device_memory &mem);
- void tex_free(device_memory &mem);
+ void global_alloc(device_memory &mem);
+ void global_free(device_memory &mem);
+ void tex_alloc(device_texture &mem);
+ void tex_free(device_texture &mem);
size_t global_size_round_up(int group_size, int global_size);
void enqueue_kernel(cl_kernel kernel,
@@ -446,17 +453,11 @@ class OpenCLDevice : public Device {
device_ptr rgba_byte,
device_ptr rgba_half);
void shader(DeviceTask &task);
+ void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
+ void bake(DeviceTask &task, RenderTile &tile);
void denoise(RenderTile &tile, DenoisingTask &denoising);
- class OpenCLDeviceTask : public DeviceTask {
- public:
- OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&OpenCLDevice::thread_run, device, this);
- }
- };
-
int get_split_task_count(DeviceTask & /*task*/)
{
return 1;
@@ -464,7 +465,10 @@ class OpenCLDevice : public Device {
void task_add(DeviceTask &task)
{
- task_pool.push(new OpenCLDeviceTask(this, task));
+ task_pool.push([=] {
+ DeviceTask task_copy = task;
+ thread_run(task_copy);
+ });
}
void task_wait()
@@ -477,7 +481,7 @@ class OpenCLDevice : public Device {
task_pool.cancel();
}
- void thread_run(DeviceTask *task);
+ void thread_run(DeviceTask &task);
virtual BVHLayoutMask get_bvh_layout_mask() const
{
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index 70b1a643044..e851749949d 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -16,7 +16,7 @@
#ifdef WITH_OPENCL
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
# include "kernel/kernel_types.h"
# include "kernel/split/kernel_split_data_types.h"
@@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS =
"enqueue_inactive "
"next_iteration_setup "
"indirect_subsurface "
- "buffer_update";
+ "buffer_update "
+ "adaptive_stopping "
+ "adaptive_filter_x "
+ "adaptive_filter_y "
+ "adaptive_adjust_samples";
const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
{
@@ -194,7 +198,7 @@ string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_
DeviceRequestedFeatures features(requested_features);
enable_default_features(features);
- /* Always turn off baking at this point. Baking is only usefull when building the bake kernel.
+ /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
* This also makes sure that the kernels that are built during baking can be reused
* when not doing any baking. */
features.use_baking = false;
@@ -253,19 +257,19 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
/* Ordered with most complex kernels first, to reduce overall compile time. */
ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
+ ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
+ ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
if (requested_features.use_volume || is_preview) {
ADD_SPLIT_KERNEL_PROGRAM(do_volume);
}
+ ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
+ ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
+ ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
/* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
+ * Blender processes. */
program_split = OpenCLDevice::OpenCLProgram(
device,
"split_bundle",
@@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
programs.push_back(&program_split);
# undef ADD_SPLIT_KERNEL_PROGRAM
@@ -534,7 +542,7 @@ class OpenCLSplitKernel : public DeviceSplitKernel {
virtual int2 split_kernel_global_size(device_memory &kg,
device_memory &data,
- DeviceTask * /*task*/)
+ DeviceTask & /*task*/)
{
cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
/* Use small global size on CPU devices as it seems to be much faster. */
@@ -602,16 +610,16 @@ void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
: Device(info, stats, profiler, background),
+ load_kernel_num_compiling(0),
kernel_programs(this),
preview_programs(this),
memory_manager(this),
- texture_info(this, "__texture_info", MEM_TEXTURE)
+ texture_info(this, "__texture_info", MEM_GLOBAL)
{
cpPlatform = NULL;
cdDevice = NULL;
cxContext = NULL;
cqCommandQueue = NULL;
- null_mem = 0;
device_initialized = false;
textures_need_update = true;
use_preview_kernels = !background;
@@ -662,35 +670,27 @@ OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, b
return;
}
- null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating memory buffer for NULL");
- return;
- }
-
- /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
+ /* Allocate this right away so that texture_info
+ * is placed at offset 0 in the device memory buffers. */
texture_info.resize(1);
memory_manager.alloc("texture_info", texture_info);
device_initialized = true;
split_kernel = new OpenCLSplitKernel(this);
- if (!background) {
+ if (use_preview_kernels) {
load_preview_kernels();
}
}
OpenCLDevice::~OpenCLDevice()
{
- task_pool.stop();
- load_required_kernel_task_pool.stop();
- load_kernel_task_pool.stop();
+ task_pool.cancel();
+ load_required_kernel_task_pool.cancel();
+ load_kernel_task_pool.cancel();
memory_manager.free();
- if (null_mem)
- clReleaseMemObject(CL_MEM_PTR(null_mem));
-
ConstMemMap::iterator mt;
for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
delete mt->second;
@@ -799,7 +799,11 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_feature
* internally within a single process. */
foreach (OpenCLProgram *program, programs) {
if (!program->load()) {
- load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+ load_kernel_num_compiling++;
+ load_kernel_task_pool.push([=] {
+ program->compile();
+ load_kernel_num_compiling--;
+ });
}
}
return true;
@@ -869,7 +873,7 @@ bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requeste
* Better to check on device level than per kernel as mixing preview and
* non-preview kernels does not work due to different data types */
if (use_preview_kernels) {
- use_preview_kernels = !load_kernel_task_pool.finished();
+ use_preview_kernels = load_kernel_num_compiling.load() > 0;
}
}
return split_kernel->load_kernels(requested_features);
@@ -896,7 +900,7 @@ DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
return DEVICE_KERNEL_USING_FEATURE_KERNEL;
}
- bool other_kernels_finished = load_kernel_task_pool.finished();
+ bool other_kernels_finished = load_kernel_num_compiling.load() == 0;
if (use_preview_kernels) {
if (other_kernels_finished) {
return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
@@ -946,7 +950,7 @@ void OpenCLDevice::mem_alloc(device_memory &mem)
cl_mem_flags mem_flag;
void *mem_ptr = NULL;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+ if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
mem_flag = CL_MEM_READ_ONLY;
else
mem_flag = CL_MEM_READ_WRITE;
@@ -961,7 +965,7 @@ void OpenCLDevice::mem_alloc(device_memory &mem)
opencl_assert_err(ciErr, "clCreateBuffer");
}
else {
- mem.device_pointer = null_mem;
+ mem.device_pointer = 0;
}
stats.mem_alloc(size);
@@ -970,9 +974,13 @@ void OpenCLDevice::mem_alloc(device_memory &mem)
void OpenCLDevice::mem_copy_to(device_memory &mem)
{
- if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
@@ -1078,12 +1086,15 @@ void OpenCLDevice::mem_zero(device_memory &mem)
void OpenCLDevice::mem_free(device_memory &mem)
{
- if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
}
else {
if (mem.device_pointer) {
- if (mem.device_pointer != null_mem) {
+ if (mem.device_pointer != 0) {
opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
}
mem.device_pointer = 0;
@@ -1102,7 +1113,7 @@ int OpenCLDevice::mem_sub_ptr_alignment()
device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
{
cl_mem_flags mem_flag;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+ if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
mem_flag = CL_MEM_READ_ONLY;
else
mem_flag = CL_MEM_READ_WRITE;
@@ -1119,7 +1130,7 @@ device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int s
void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
{
- if (device_pointer && device_pointer != null_mem) {
+ if (device_pointer != 0) {
opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
}
}
@@ -1142,20 +1153,21 @@ void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
data->copy_to_device();
}
-void OpenCLDevice::tex_alloc(device_memory &mem)
+void OpenCLDevice::global_alloc(device_memory &mem)
{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
+ /* Set the pointer to non-null to keep code that inspects its value from thinking it is
+ * unallocated. */
mem.device_pointer = 1;
textures[mem.name] = &mem;
textures_need_update = true;
}
-void OpenCLDevice::tex_free(device_memory &mem)
+void OpenCLDevice::global_free(device_memory &mem)
{
if (mem.device_pointer) {
mem.device_pointer = 0;
@@ -1173,6 +1185,25 @@ void OpenCLDevice::tex_free(device_memory &mem)
}
}
+void OpenCLDevice::tex_alloc(device_texture &mem)
+{
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ memory_manager.alloc(mem.name, mem);
+ /* Set the pointer to non-null to keep code that inspects its value from thinking it is
+ * unallocated. */
+ mem.device_pointer = 1;
+ textures[mem.name] = &mem;
+ textures_need_update = true;
+}
+
+void OpenCLDevice::tex_free(device_texture &mem)
+{
+ global_free(mem);
+}
+
size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
{
int r = global_size % group_size;
@@ -1237,8 +1268,7 @@ void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const cha
ptr = CL_MEM_PTR(i->second);
}
else {
- /* work around NULL not working, even though the spec says otherwise */
- ptr = CL_MEM_PTR(null_mem);
+ ptr = 0;
}
opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
@@ -1274,10 +1304,10 @@ void OpenCLDevice::flush_texture_buffers()
foreach (TexturesMap::value_type &tex, textures) {
string name = tex.first;
+ device_memory *mem = tex.second;
- if (string_startswith(name, "__tex_image")) {
- int pos = name.rfind("_");
- int id = atoi(name.data() + pos + 1);
+ if (mem->type == MEM_TEXTURE) {
+ const uint id = ((device_texture *)mem)->slot;
texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
num_slots = max(num_slots, num_data_slots + id + 1);
}
@@ -1290,22 +1320,20 @@ void OpenCLDevice::flush_texture_buffers()
/* Fill in descriptors */
foreach (texture_slot_t &slot, texture_slots) {
+ device_memory *mem = textures[slot.name];
TextureInfo &info = texture_info[slot.slot];
MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
-
- if (string_startswith(slot.name, "__tex_image")) {
- device_memory *mem = textures[slot.name];
- info.width = mem->data_width;
- info.height = mem->data_height;
- info.depth = mem->data_depth;
-
- info.interpolation = mem->interpolation;
- info.extension = mem->extension;
+ if (mem->type == MEM_TEXTURE) {
+ info = ((device_texture *)mem)->info;
+ }
+ else {
+ memset(&info, 0, sizeof(TextureInfo));
}
+
+ info.data = desc.offset;
+ info.cl_buffer = desc.device_buffer;
}
/* Force write of descriptors. */
@@ -1313,26 +1341,20 @@ void OpenCLDevice::flush_texture_buffers()
memory_manager.alloc("texture_info", texture_info);
}
-void OpenCLDevice::thread_run(DeviceTask *task)
+void OpenCLDevice::thread_run(DeviceTask &task)
{
flush_texture_buffers();
- if (task->type == DeviceTask::FILM_CONVERT) {
- film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
- }
- else if (task->type == DeviceTask::SHADER) {
- shader(*task);
- }
- else if (task->type == DeviceTask::RENDER) {
+ if (task.type == DeviceTask::RENDER) {
RenderTile tile;
- DenoisingTask denoising(this, *task);
+ DenoisingTask denoising(this, task);
/* Allocate buffer for kernel globals */
device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
kgbuffer.alloc_to_device(1);
/* Keep rendering tiles until done. */
- while (task->acquire_tile(this, tile)) {
+ while (task.acquire_tile(this, tile, task.tile_types)) {
if (tile.task == RenderTile::PATH_TRACE) {
assert(tile.task == RenderTile::PATH_TRACE);
scoped_timer timer(&tile.buffers->render_time);
@@ -1350,17 +1372,44 @@ void OpenCLDevice::thread_run(DeviceTask *task)
*/
clFinish(cqCommandQueue);
}
+ else if (tile.task == RenderTile::BAKE) {
+ bake(task, tile);
+ }
else if (tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, denoising);
- task->update_progress(&tile, tile.w * tile.h);
+ task.update_progress(&tile, tile.w * tile.h);
}
- task->release_tile(tile);
+ task.release_tile(tile);
}
kgbuffer.free();
}
+ else if (task.type == DeviceTask::SHADER) {
+ shader(task);
+ }
+ else if (task.type == DeviceTask::FILM_CONVERT) {
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+ }
+ else if (task.type == DeviceTask::DENOISE_BUFFER) {
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ DenoisingTask denoising(this, task);
+ denoise(tile, denoising);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
}
void OpenCLDevice::film_convert(DeviceTask &task,
@@ -1801,7 +1850,7 @@ void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
denoising.render_buffer.samples = rtile.sample;
denoising.buffer.gpu_temporary_mem = true;
- denoising.run_denoising(&rtile);
+ denoising.run_denoising(rtile);
}
void OpenCLDevice::shader(DeviceTask &task)
@@ -1817,10 +1866,7 @@ void OpenCLDevice::shader(DeviceTask &task)
cl_int d_offset = task.offset;
OpenCLDevice::OpenCLProgram *program = &background_program;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- program = &bake_program;
- }
- else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+ if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
program = &displace_program;
}
program->wait_for_availability();
@@ -1851,10 +1897,90 @@ void OpenCLDevice::shader(DeviceTask &task)
}
}
+void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ /* Cast arguments to cl types. */
+ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+ cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+ cl_int d_x = rtile.x;
+ cl_int d_y = rtile.y;
+ cl_int d_w = rtile.w;
+ cl_int d_h = rtile.h;
+ cl_int d_offset = rtile.offset;
+ cl_int d_stride = rtile.stride;
+
+ bake_program.wait_for_availability();
+ cl_kernel kernel = bake_program();
+
+ cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
+
+ set_kernel_arg_buffers(kernel, &start_arg_index);
+
+ start_arg_index += kernel_set_args(
+ kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
+
+ int start_sample = rtile.start_sample;
+ int end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ kernel_set_args(kernel, start_arg_index, sample);
+
+ enqueue_kernel(kernel, d_w, d_h);
+
+ rtile.sample = sample + 1;
+
+ task.update_progress(&rtile, rtile.w * rtile.h);
+ }
+
+ clFinish(cqCommandQueue);
+}
+
+static bool kernel_build_opencl_2(cl_device_id cdDevice)
+{
+ /* Build with OpenCL 2.0 if available; this improves performance
+ * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
+ * Note that OpenCL selects the highest 1.x version by default,
+ * only for 2.0 do we need the explicit compiler flag. */
+ int version_major, version_minor;
+ if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
+ if (version_major >= 2) {
+ /* This appears to trigger a driver bug in Radeon RX cards with certain
+ * driver versions, so don't use OpenCL 2.0 for those. */
+ string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
+ if (string_startswith(device_name, "Radeon RX 4") ||
+ string_startswith(device_name, "Radeon (TM) RX 4") ||
+ string_startswith(device_name, "Radeon RX 5") ||
+ string_startswith(device_name, "Radeon (TM) RX 5")) {
+ char version[256] = "";
+ int driver_major, driver_minor;
+ clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+ if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
+ return !(driver_major == 3075 && driver_minor <= 12);
+ }
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
string OpenCLDevice::kernel_build_options(const string *debug_src)
{
string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
+ if (kernel_build_opencl_2(cdDevice)) {
+ build_options += "-cl-std=CL2.0 ";
+ }
+
if (platform_name == "NVIDIA CUDA") {
build_options +=
"-D__KERNEL_OPENCL_NVIDIA__ "
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
index f85aadce1c2..0285dc969ec 100644
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -18,7 +18,7 @@
# include "util/util_foreach.h"
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
# include "device/opencl/memory_manager.h"
CCL_NAMESPACE_BEGIN
@@ -64,6 +64,9 @@ void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
total_size += alloc_size;
}
+ /* Always allocate non-empty buffer, NULL pointers cause problems with some drivers. */
+ total_size = max(total_size, 16);
+
if (need_realloc) {
cl_ulong max_buffer_size;
clGetDeviceInfo(
@@ -251,7 +254,7 @@ void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
}
else {
- device->kernel_set_args(kernel, (*narg)++, device->null_mem);
+ device->kernel_set_args(kernel, (*narg)++);
}
}
}
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
index 2fbc97a0756..23624f837a6 100644
--- a/intern/cycles/device/opencl/memory_manager.h
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -19,8 +19,8 @@
#include "device/device.h"
#include "util/util_map.h"
-#include "util/util_vector.h"
#include "util/util_string.h"
+#include "util/util_vector.h"
#include "clew.h"
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index cc40ad42b06..b8b07cf2947 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,15 +16,16 @@
#ifdef WITH_OPENCL
-# include "device/opencl/opencl.h"
# include "device/device_intern.h"
+# include "device/opencl/device_opencl.h"
# include "util/util_debug.h"
# include "util/util_logging.h"
# include "util/util_md5.h"
# include "util/util_path.h"
-# include "util/util_time.h"
+# include "util/util_semaphore.h"
# include "util/util_system.h"
+# include "util/util_time.h"
using std::cerr;
using std::endl;
@@ -390,8 +391,27 @@ static void escape_python_string(string &str)
string_replace(str, "'", "\'");
}
+static int opencl_compile_process_limit()
+{
+ /* Limit the number of concurrent compile processes, with a heuristic based
+ * on total physical RAM and an estimate of the memory needed when compiling
+ * with all Cycles features enabled.
+ *
+ * This is somewhat arbitrary as we don't know the actual available RAM or
+ * how much memory the kernel compilation will need depending on the features,
+ * but it is better than not limiting at all. */
+ static const int64_t GB = 1024LL * 1024LL * 1024LL;
+ static const int64_t process_memory = 2 * GB;
+ static const int64_t base_memory = 2 * GB;
+ static const int64_t system_memory = system_physical_ram();
+ static const int64_t process_limit = (system_memory - base_memory) / process_memory;
+
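+  /* For example, 32 GB of physical RAM yields (32 - 2) / 2 = 15 concurrent
+   * compile processes, while small machines fall back to the minimum of 1. */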
+ return max((int)process_limit, 1);
+}
+
bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
{
+ /* Construct arguments. */
vector<string> args;
args.push_back("--background");
args.push_back("--factory-startup");
@@ -419,14 +439,23 @@ bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
kernel_file_escaped.c_str(),
clbin_escaped.c_str()));
- double starttime = time_dt();
+ /* Limit number of concurrent processes compiling. */
+ static thread_counting_semaphore semaphore(opencl_compile_process_limit());
+ semaphore.acquire();
+
+ /* Compile. */
+ const double starttime = time_dt();
add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
add_log(string("Build flags: ") + kernel_build_options, true);
- if (!system_call_self(args) || !path_exists(clbin)) {
+ const bool success = system_call_self(args);
+ const double elapsed = time_dt() - starttime;
+
+ semaphore.release();
+
+ if (!success || !path_exists(clbin)) {
return false;
}
- double elapsed = time_dt() - starttime;
add_log(
string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
false);
@@ -619,15 +648,16 @@ void OpenCLDevice::OpenCLProgram::compile()
debug_src = &clsrc;
}
- /* If binary kernel exists already, try use it. */
- if (compile_separate(clbin)) {
+ if (DebugFlags().running_inside_blender && compile_separate(clbin)) {
add_log(string("Built and loaded program from ") + clbin + ".", true);
loaded = true;
}
else {
- add_log(string("Separate-process building of ") + clbin +
- " failed, will fall back to regular building.",
- true);
+ if (DebugFlags().running_inside_blender) {
+ add_log(string("Separate-process building of ") + clbin +
+ " failed, will fall back to regular building.",
+ true);
+ }
/* If does not exist or loading binary failed, compile kernel. */
if (!compile_kernel(debug_src)) {
@@ -746,7 +776,11 @@ bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_i
}
VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
- /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework
+ if (getenv("CYCLES_OPENCL_TEST")) {
+ return true;
+ }
+
+ /* It is possible to have Iris GPU on AMD/Apple OpenCL framework
* (aka, it will not be on Intel framework). This isn't supported
* and needs an explicit blacklist.
*/
@@ -805,18 +839,30 @@ bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
return true;
}
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
+bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
{
- const int req_major = 1, req_minor = 1;
- int major, minor;
char version[256];
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+ if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
if (error != NULL) {
*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
}
return false;
}
+ if (error != NULL) {
+ *error = "";
+ }
+ return true;
+}
+
+bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
+{
+ const int req_major = 1, req_minor = 1;
+ int major, minor;
+ if (!get_device_version(device, &major, &minor, error)) {
+ return false;
+ }
+
if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
if (error != NULL) {
*error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
@@ -857,7 +903,7 @@ string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id dev
return "";
}
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all)
+void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
{
const cl_device_type device_type = OpenCLInfo::device_type();
static bool first_time = true;
@@ -923,7 +969,7 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
continue;
}
- if (force_all || device_supported(platform_name, device_id)) {
+ if (device_supported(platform_name, device_id)) {
cl_device_type device_type;
if (!get_device_type(device_id, &device_type, &error)) {
FIRST_VLOG(2) << "Ignoring device " << device_name
diff --git a/intern/cycles/graph/CMakeLists.txt b/intern/cycles/graph/CMakeLists.txt
index c6c46941598..9ff1c5b98c6 100644
--- a/intern/cycles/graph/CMakeLists.txt
+++ b/intern/cycles/graph/CMakeLists.txt
@@ -17,7 +17,7 @@ set(SRC_HEADERS
)
set(LIB
-
+ cycles_util
)
include_directories(${INC})
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index fc7daaeeaa6..c437c6fda1e 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -133,7 +133,7 @@ void Node::set(const SocketType &input, const Transform &value)
void Node::set(const SocketType &input, Node *value)
{
- assert(input.type == SocketType::TRANSFORM);
+ assert(input.type == SocketType::NODE);
get_socket_value<Node *>(this, input) = value;
}
@@ -213,7 +213,7 @@ float Node::get_float(const SocketType &input) const
float2 Node::get_float2(const SocketType &input) const
{
- assert(input.type == SocketType::FLOAT);
+ assert(input.type == SocketType::POINT2);
return get_socket_value<float2>(this, input);
}
@@ -272,7 +272,7 @@ const array<float> &Node::get_float_array(const SocketType &input) const
const array<float2> &Node::get_float2_array(const SocketType &input) const
{
- assert(input.type == SocketType::FLOAT_ARRAY);
+ assert(input.type == SocketType::POINT2_ARRAY);
return get_socket_value<array<float2>>(this, input);
}
@@ -313,7 +313,9 @@ void Node::set_default_value(const SocketType &socket)
{
const void *src = socket.default_value;
void *dst = ((char *)this) + socket.struct_offset;
- memcpy(dst, src, socket.size());
+ if (socket.size() > 0) {
+ memcpy(dst, src, socket.size());
+ }
}
template<typename T>
@@ -667,4 +669,14 @@ size_t Node::get_total_size_in_bytes() const
return total_size;
}
+bool Node::is_a(const NodeType *type_)
+{
+ for (const NodeType *base = type; base; base = base->base) {
+ if (base == type_) {
+ return true;
+ }
+ }
+ return false;
+}
+
CCL_NAMESPACE_END
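
As a quick illustration of the new Node::is_a() base-chain test, a sketch only (the node class and cast below are hypothetical, not taken from this patch):

  /* Assuming a node class registered through the usual NODE_DECLARE/NODE_DEFINE
   * macros, exposing its registered type as MyNode::node_type. */
  if (node->is_a(MyNode::node_type)) {
    /* Matches nodes of type MyNode and of any type derived from it, since
     * is_a() walks the NodeType::base chain added in node_type.h below. */
    MyNode *my_node = static_cast<MyNode *>(node);
    (void)my_node;
  }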
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index 226c49b387a..4473b8aca28 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -33,7 +33,7 @@ struct Transform;
struct Node {
explicit Node(const NodeType *type, ustring name = ustring());
- virtual ~Node();
+ virtual ~Node() = 0;
/* set values */
void set(const SocketType &input, bool value);
@@ -94,6 +94,9 @@ struct Node {
/* Get total size of this node. */
size_t get_total_size_in_bytes() const;
+ /* Type testing, taking into account base classes. */
+ bool is_a(const NodeType *type);
+
ustring name;
const NodeType *type;
};
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index f46d4e48026..0283ed7c817 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -135,8 +135,13 @@ bool SocketType::is_float3(Type type)
/* Node Type */
-NodeType::NodeType(Type type_) : type(type_)
+NodeType::NodeType(Type type, const NodeType *base) : type(type), base(base)
{
+ if (base) {
+ /* Inherit sockets. */
+ inputs = base->inputs;
+ outputs = base->outputs;
+ }
}
NodeType::~NodeType()
@@ -209,7 +214,7 @@ unordered_map<ustring, NodeType, ustringHash> &NodeType::types()
return _types;
}
-NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_)
+NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_, const NodeType *base_)
{
ustring name(name_);
@@ -219,7 +224,7 @@ NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_)
return NULL;
}
- types()[name] = NodeType(type_);
+ types()[name] = NodeType(type_, base_);
NodeType *type = &types()[name];
type->name = name;
diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h
index e9496a42658..a79d44b82f3 100644
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -103,7 +103,7 @@ struct SocketType {
struct NodeType {
enum Type { NONE, SHADER };
- explicit NodeType(Type type = NONE);
+ explicit NodeType(Type type = NONE, const NodeType *base = NULL);
~NodeType();
void register_input(ustring name,
@@ -124,11 +124,15 @@ struct NodeType {
ustring name;
Type type;
+ const NodeType *base;
vector<SocketType, std::allocator<SocketType>> inputs;
vector<SocketType, std::allocator<SocketType>> outputs;
CreateFunc create;
- static NodeType *add(const char *name, CreateFunc create, Type type = NONE);
+ static NodeType *add(const char *name,
+ CreateFunc create,
+ Type type = NONE,
+ const NodeType *base = NULL);
static const NodeType *find(ustring name);
static unordered_map<ustring, NodeType, ustringHash> &types();
};
@@ -148,6 +152,14 @@ struct NodeType {
} \
template<typename T> const NodeType *structname::register_type()
+#define NODE_ABSTRACT_DECLARE \
+ template<typename T> static const NodeType *register_base_type(); \
+ static const NodeType *node_base_type;
+
+#define NODE_ABSTRACT_DEFINE(structname) \
+ const NodeType *structname::node_base_type = structname::register_base_type<structname>(); \
+ template<typename T> const NodeType *structname::register_base_type()
+
/* Sock Definition Macros */
#define SOCKET_OFFSETOF(T, name) (((char *)&(((T *)1)->name)) - (char *)1)
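
A condensed sketch of what the new base parameter enables when registering types directly through NodeType::add() (the names, the NULL create callback, and the factory function are hypothetical; real node types go through the NODE_*_DEFINE macros above):

  static Node *create_my_node(const NodeType *type); /* hypothetical factory */

  /* Abstract base type, registered here with no create callback. */
  static const NodeType *my_base_type = NodeType::add("my_base", NULL, NodeType::NONE);

  /* Derived type: the NodeType constructor copies my_base_type's inputs and
   * outputs, and Node::is_a(my_base_type) is true for nodes of this type. */
  static NodeType *my_node_type = NodeType::add(
      "my_node", create_my_node, NodeType::NONE, my_base_type);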
diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp
index a96970cc904..d333400cc4a 100644
--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -200,7 +200,7 @@ void xml_read_node(XMLReader &reader, Node *node, xml_node xml_node)
map<ustring, Node *>::iterator it = reader.node_map.find(value);
if (it != reader.node_map.end()) {
Node *value_node = it->second;
- if (value_node->type == *(socket.node_type))
+ if (value_node->is_a(*(socket.node_type)))
node->set(socket, it->second);
}
break;
@@ -215,7 +215,7 @@ void xml_read_node(XMLReader &reader, Node *node, xml_node xml_node)
map<ustring, Node *>::iterator it = reader.node_map.find(ustring(tokens[i]));
if (it != reader.node_map.end()) {
Node *value_node = it->second;
- value[i] = (value_node->type == *(socket.node_type)) ? value_node : NULL;
+ value[i] = (value_node->is_a(*(socket.node_type))) ? value_node : NULL;
}
else {
value[i] = NULL;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 8a8fee108ae..5533eeb006d 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS
)
set(SRC_OPENCL_KERNELS
+ kernels/opencl/kernel_adaptive_stopping.cl
+ kernels/opencl/kernel_adaptive_filter_x.cl
+ kernels/opencl/kernel_adaptive_filter_y.cl
+ kernels/opencl/kernel_adaptive_adjust_samples.cl
kernels/opencl/kernel_bake.cl
kernels/opencl/kernel_base.cl
kernels/opencl/kernel_displace.cl
@@ -64,6 +68,10 @@ set(SRC_OPENCL_KERNELS
kernels/opencl/filter.cl
)
+set(SRC_OPTIX_KERNELS
+ kernels/optix/kernel_optix.cu
+)
+
set(SRC_BVH_HEADERS
bvh/bvh.h
bvh/bvh_nodes.h
@@ -73,28 +81,18 @@ set(SRC_BVH_HEADERS
bvh/bvh_types.h
bvh/bvh_volume.h
bvh/bvh_volume_all.h
- bvh/qbvh_nodes.h
- bvh/qbvh_shadow_all.h
- bvh/qbvh_local.h
- bvh/qbvh_traversal.h
- bvh/qbvh_volume.h
- bvh/qbvh_volume_all.h
- bvh/obvh_nodes.h
- bvh/obvh_shadow_all.h
- bvh/obvh_local.h
- bvh/obvh_traversal.h
- bvh/obvh_volume.h
- bvh/obvh_volume_all.h
bvh/bvh_embree.h
)
set(SRC_HEADERS
kernel_accumulate.h
+ kernel_adaptive_sampling.h
kernel_bake.h
kernel_camera.h
kernel_color.h
kernel_compat_cpu.h
kernel_compat_cuda.h
+ kernel_compat_optix.h
kernel_compat_opencl.h
kernel_differential.h
kernel_emission.h
@@ -103,6 +101,8 @@ set(SRC_HEADERS
kernel_id_passes.h
kernel_jitter.h
kernel_light.h
+ kernel_light_background.h
+ kernel_light_common.h
kernel_math.h
kernel_montecarlo.h
kernel_passes.h
@@ -124,6 +124,7 @@ set(SRC_HEADERS
kernel_types.h
kernel_volume.h
kernel_work_stealing.h
+ kernel_write_passes.h
)
set(SRC_KERNELS_CPU_HEADERS
@@ -140,6 +141,9 @@ set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_cuda_image.h
)
+set(SRC_KERNELS_OPTIX_HEADERS
+)
+
set(SRC_KERNELS_OPENCL_HEADERS
kernels/opencl/kernel_split_function.h
kernels/opencl/kernel_opencl_image.h
@@ -168,17 +172,19 @@ set(SRC_CLOSURE_HEADERS
closure/volume.h
closure/bsdf_principled_diffuse.h
closure/bsdf_principled_sheen.h
- closure/bsdf_hair_principled.h
+ closure/bsdf_hair_principled.h
)
set(SRC_SVM_HEADERS
svm/svm.h
svm/svm_ao.h
+ svm/svm_aov.h
svm/svm_attribute.h
svm/svm_bevel.h
svm/svm_blackbody.h
svm/svm_bump.h
svm/svm_camera.h
+ svm/svm_clamp.h
svm/svm_closure.h
svm/svm_convert.h
svm/svm_checker.h
@@ -198,7 +204,9 @@ set(SRC_SVM_HEADERS
svm/svm_invert.h
svm/svm_light_path.h
svm/svm_magic.h
+ svm/svm_map_range.h
svm/svm_mapping.h
+ svm/svm_mapping_util.h
svm/svm_math.h
svm/svm_math_util.h
svm/svm_mix.h
@@ -212,13 +220,16 @@ set(SRC_SVM_HEADERS
svm/svm_sepcomb_vector.h
svm/svm_sky.h
svm/svm_tex_coord.h
- svm/svm_texture.h
+ svm/svm_fractal_noise.h
svm/svm_types.h
svm/svm_value.h
+ svm/svm_vector_rotate.h
svm/svm_vector_transform.h
svm/svm_voronoi.h
svm/svm_voxel.h
svm/svm_wave.h
+ svm/svm_white_noise.h
+ svm/svm_vertex_color.h
)
set(SRC_GEOM_HEADERS
@@ -308,6 +319,10 @@ set(SRC_UTIL_HEADERS
)
set(SRC_SPLIT_HEADERS
+ split/kernel_adaptive_adjust_samples.h
+ split/kernel_adaptive_filter_x.h
+ split/kernel_adaptive_filter_y.h
+ split/kernel_adaptive_stopping.h
split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
@@ -350,11 +365,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
# warn for other versions
- if(CUDA_VERSION MATCHES "101")
+ if((CUDA_VERSION MATCHES "101") OR (CUDA_VERSION MATCHES "102"))
else()
message(WARNING
"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
- "build may succeed but only CUDA 10.1 is officially supported")
+ "build may succeed but only CUDA 10.1 and 10.2 are officially supported")
endif()
# build for each arch
@@ -376,11 +391,20 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental)
- set(cuda_cubin ${name}_${arch}.cubin)
+ if(${arch} MATCHES "compute_.*")
+ set(format "ptx")
+ else()
+ set(format "cubin")
+ endif()
+ set(cuda_file ${name}_${arch}.${format})
set(kernel_sources ${sources})
if(NOT ${prev_arch} STREQUAL "none")
- set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin)
+ if(${prev_arch} MATCHES "compute_.*")
+ set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
+ else()
+ set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin)
+ endif()
endif()
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
@@ -393,7 +417,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
-I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
--use_fast_math
- -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin})
+ -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
if(${experimental})
set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
@@ -418,7 +442,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
add_custom_command(
- OUTPUT ${cuda_cubin}
+ OUTPUT ${cuda_file}
COMMAND ${CUBIN_CC_ENV}
"$<TARGET_FILE:cycles_cubin_cc>"
-target ${CUDA_ARCH}
@@ -429,18 +453,18 @@ if(WITH_CYCLES_CUDA_BINARIES)
DEPENDS ${kernel_sources} cycles_cubin_cc)
else()
add_custom_command(
- OUTPUT ${cuda_cubin}
+ OUTPUT ${cuda_file}
COMMAND ${CUDA_NVCC_EXECUTABLE}
-arch=${arch}
${CUDA_NVCC_FLAGS}
- --cubin
+ --${format}
${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
--ptxas-options="-v"
${cuda_flags}
DEPENDS ${kernel_sources})
endif()
- delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
- list(APPEND cuda_cubins ${cuda_cubin})
+ delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib)
+ list(APPEND cuda_cubins ${cuda_file})
unset(cuda_debug_flags)
endmacro()
@@ -471,6 +495,89 @@ if(WITH_CYCLES_CUDA_BINARIES)
cycles_set_solution_folder(cycles_kernel_cuda)
endif()
+# OptiX PTX modules
+
+if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
+ foreach(input ${SRC_OPTIX_KERNELS})
+ get_filename_component(input_we ${input} NAME_WE)
+
+ set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx")
+ set(cuda_flags
+ -I "${OPTIX_INCLUDE_DIR}"
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
+ --use_fast_math
+ -o ${output})
+
+ if(WITH_CYCLES_DEBUG)
+ set(cuda_flags ${cuda_flags}
+ -D __KERNEL_DEBUG__)
+ endif()
+ if(WITH_CYCLES_CUBIN_COMPILER)
+
+ # Needed to find libnvrtc-builtins.so. Can't do it from inside
+ # cycles_cubin_cc since the env variable is read before main()
+ if(APPLE)
+ set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+ -E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
+ elseif(UNIX)
+ set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+ -E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+ endif()
+
+ add_custom_command(
+ OUTPUT ${output}
+ DEPENDS
+ ${input}
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_BVH_HEADERS}
+ ${SRC_SVM_HEADERS}
+ ${SRC_GEOM_HEADERS}
+ ${SRC_CLOSURE_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ COMMAND ${CUBIN_CC_ENV}
+ "$<TARGET_FILE:cycles_cubin_cc>"
+ -target 52
+ -ptx
+ -i ${CMAKE_CURRENT_SOURCE_DIR}/${input}
+ ${cuda_flags}
+ -v
+ -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
+ DEPENDS ${kernel_sources} cycles_cubin_cc)
+ else()
+ add_custom_command(
+ OUTPUT
+ ${output}
+ DEPENDS
+ ${input}
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_BVH_HEADERS}
+ ${SRC_SVM_HEADERS}
+ ${SRC_GEOM_HEADERS}
+ ${SRC_CLOSURE_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ COMMAND
+ ${CUDA_NVCC_EXECUTABLE}
+ --ptx
+ -arch=sm_52
+ ${cuda_flags}
+ ${input}
+ WORKING_DIRECTORY
+ "${CMAKE_CURRENT_SOURCE_DIR}")
+ endif()
+ list(APPEND optix_ptx ${output})
+
+ delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
+ endforeach()
+
+ add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
+ cycles_set_solution_folder(cycles_kernel_optix)
+endif()
+
# OSL module
if(WITH_CYCLES_OSL)
@@ -486,6 +593,19 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
+if(WITH_COMPILER_ASAN)
+ if(CMAKE_COMPILER_IS_GNUCC AND (NOT WITH_CYCLES_KERNEL_ASAN))
+ # GCC hangs compiling the big kernel files with asan and release, so disable by default.
+ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-sanitize=all")
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-sanitize=vptr")
+ elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ # With OSL, Cycles disables RTTI in some modules, which then breaks at linking
+ # when trying to use the vptr sanitizer (included in the general 'undefined' option).
+ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-sanitize=vptr")
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-sanitize=vptr")
+ endif()
+endif()
+
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
@@ -517,10 +637,12 @@ endif()
cycles_add_library(cycles_kernel "${LIB}"
${SRC_CPU_KERNELS}
${SRC_CUDA_KERNELS}
+ ${SRC_OPTIX_KERNELS}
${SRC_OPENCL_KERNELS}
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPTIX_HEADERS}
${SRC_KERNELS_OPENCL_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
@@ -530,25 +652,42 @@ cycles_add_library(cycles_kernel "${LIB}"
${SRC_SPLIT_HEADERS}
)
+source_group("bvh" FILES ${SRC_BVH_HEADERS})
+source_group("closure" FILES ${SRC_CLOSURE_HEADERS})
+source_group("filter" FILES ${SRC_FILTER_HEADERS})
+source_group("geom" FILES ${SRC_GEOM_HEADERS})
+source_group("kernel" FILES ${SRC_HEADERS})
+source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS})
+source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS})
+source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS})
+source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS})
+source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS})
+source_group("svm" FILES ${SRC_SVM_HEADERS})
+
if(WITH_CYCLES_CUDA)
add_dependencies(cycles_kernel cycles_kernel_cuda)
endif()
+if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
+ add_dependencies(cycles_kernel cycles_kernel_optix)
+endif()
# OpenCL kernel
-#set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
-#add_custom_command(
-# OUTPUT ${KERNEL_PREPROCESSED}
-# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
-# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
-#add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
-#delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
+# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
+# add_custom_command(
+# OUTPUT ${KERNEL_PREPROCESSED}
+# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
+# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
+# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
+# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 13e72ed299f..3049f243ae9 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -33,136 +33,108 @@ CCL_NAMESPACE_BEGIN
#include "kernel/bvh/bvh_types.h"
-/* Common QBVH functions. */
-#ifdef __QBVH__
-# include "kernel/bvh/qbvh_nodes.h"
-# ifdef __KERNEL_AVX2__
-# include "kernel/bvh/obvh_nodes.h"
-# endif
-#endif
+#ifndef __KERNEL_OPTIX__
/* Regular BVH traversal */
-#include "kernel/bvh/bvh_nodes.h"
+# include "kernel/bvh/bvh_nodes.h"
-#define BVH_FUNCTION_NAME bvh_intersect
-#define BVH_FUNCTION_FEATURES 0
-#include "kernel/bvh/bvh_traversal.h"
-
-#if defined(__INSTANCING__)
-# define BVH_FUNCTION_NAME bvh_intersect_instancing
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING
+# define BVH_FUNCTION_NAME bvh_intersect
+# define BVH_FUNCTION_FEATURES 0
# include "kernel/bvh/bvh_traversal.h"
-#endif
-#if defined(__HAIR__)
-# define BVH_FUNCTION_NAME bvh_intersect_hair
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_HAIR_MINIMUM_WIDTH
-# include "kernel/bvh/bvh_traversal.h"
-#endif
+# if defined(__HAIR__)
+# define BVH_FUNCTION_NAME bvh_intersect_hair
+# define BVH_FUNCTION_FEATURES BVH_HAIR
+# include "kernel/bvh/bvh_traversal.h"
+# endif
-#if defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION
-# include "kernel/bvh/bvh_traversal.h"
-#endif
+# if defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_motion
+# define BVH_FUNCTION_FEATURES BVH_MOTION
+# include "kernel/bvh/bvh_traversal.h"
+# endif
-#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_hair_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_HAIR_MINIMUM_WIDTH | BVH_MOTION
-# include "kernel/bvh/bvh_traversal.h"
-#endif
+# if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_hair_motion
+# define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION
+# include "kernel/bvh/bvh_traversal.h"
+# endif
/* Subsurface scattering BVH traversal */
-#if defined(__BVH_LOCAL__)
-# define BVH_FUNCTION_NAME bvh_intersect_local
-# define BVH_FUNCTION_FEATURES BVH_HAIR
-# include "kernel/bvh/bvh_local.h"
-
-# if defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_local_motion
-# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+# if defined(__BVH_LOCAL__)
+# define BVH_FUNCTION_NAME bvh_intersect_local
+# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "kernel/bvh/bvh_local.h"
-# endif
-#endif /* __BVH_LOCAL__ */
-/* Volume BVH traversal */
+# if defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_local_motion
+# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+# include "kernel/bvh/bvh_local.h"
+# endif
+# endif /* __BVH_LOCAL__ */
-#if defined(__VOLUME__)
-# define BVH_FUNCTION_NAME bvh_intersect_volume
-# define BVH_FUNCTION_FEATURES BVH_HAIR
-# include "kernel/bvh/bvh_volume.h"
+/* Volume BVH traversal */
-# if defined(__INSTANCING__)
-# define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
+# if defined(__VOLUME__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume
+# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "kernel/bvh/bvh_volume.h"
-# endif
-# if defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR
-# include "kernel/bvh/bvh_volume.h"
-# endif
-#endif /* __VOLUME__ */
+# if defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+# include "kernel/bvh/bvh_volume.h"
+# endif
+# endif /* __VOLUME__ */
/* Record all intersections - Shadow BVH traversal */
-#if defined(__SHADOW_RECORD_ALL__)
-# define BVH_FUNCTION_NAME bvh_intersect_shadow_all
-# define BVH_FUNCTION_FEATURES 0
-# include "kernel/bvh/bvh_shadow_all.h"
-
-# if defined(__INSTANCING__)
-# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING
+# if defined(__SHADOW_RECORD_ALL__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all
+# define BVH_FUNCTION_FEATURES 0
# include "kernel/bvh/bvh_shadow_all.h"
-# endif
-
-# if defined(__HAIR__)
-# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
-# include "kernel/bvh/bvh_shadow_all.h"
-# endif
-# if defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION
-# include "kernel/bvh/bvh_shadow_all.h"
-# endif
-
-# if defined(__HAIR__) && defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION
-# include "kernel/bvh/bvh_shadow_all.h"
-# endif
-#endif /* __SHADOW_RECORD_ALL__ */
+# if defined(__HAIR__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+# define BVH_FUNCTION_FEATURES BVH_HAIR
+# include "kernel/bvh/bvh_shadow_all.h"
+# endif
+
+# if defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+# define BVH_FUNCTION_FEATURES BVH_MOTION
+# include "kernel/bvh/bvh_shadow_all.h"
+# endif
+
+# if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+# define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION
+# include "kernel/bvh/bvh_shadow_all.h"
+# endif
+# endif /* __SHADOW_RECORD_ALL__ */
/* Record all intersections - Volume BVH traversal */
-#if defined(__VOLUME_RECORD_ALL__)
-# define BVH_FUNCTION_NAME bvh_intersect_volume_all
-# define BVH_FUNCTION_FEATURES BVH_HAIR
-# include "kernel/bvh/bvh_volume_all.h"
-
-# if defined(__INSTANCING__)
-# define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR
+# if defined(__VOLUME_RECORD_ALL__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_all
+# define BVH_FUNCTION_FEATURES BVH_HAIR
# include "kernel/bvh/bvh_volume_all.h"
-# endif
-# if defined(__OBJECT_MOTION__)
-# define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR
-# include "kernel/bvh/bvh_volume_all.h"
-# endif
-#endif /* __VOLUME_RECORD_ALL__ */
+# if defined(__OBJECT_MOTION__)
+# define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+# include "kernel/bvh/bvh_volume_all.h"
+# endif
+# endif /* __VOLUME_RECORD_ALL__ */
-#undef BVH_FEATURE
-#undef BVH_NAME_JOIN
-#undef BVH_NAME_EVAL
-#undef BVH_FUNCTION_FULL_NAME
+# undef BVH_FEATURE
+# undef BVH_NAME_JOIN
+# undef BVH_NAME_EVAL
+# undef BVH_FUNCTION_FULL_NAME
+
+#endif /* __KERNEL_OPTIX__ */
ccl_device_inline bool scene_intersect_valid(const Ray *ray)
{
@@ -173,31 +145,65 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray)
* such cases.
* From production scenes so far it seems it's enough to test first element
* only.
+ * Scene intersection may also be called with empty rays for conditional trace
+ * calls that evaluate to false, so filter those out.
*/
- return isfinite(ray->P.x);
+ return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
}
-/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
- const Ray ray,
+ const Ray *ray,
const uint visibility,
- Intersection *isect,
- uint *lcg_state,
- float difl,
- float extmax)
+ Intersection *isect)
{
PROFILING_INIT(kg, PROFILING_INTERSECT);
- if (!scene_intersect_valid(&ray)) {
+#ifdef __KERNEL_OPTIX__
+ uint p0 = 0;
+ uint p1 = 0;
+ uint p2 = 0;
+ uint p3 = 0;
+ uint p4 = visibility;
+ uint p5 = PRIMITIVE_NONE;
+
+ optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+ ray->P,
+ ray->D,
+ 0.0f,
+ ray->t,
+ ray->time,
+ 0xF,
+ OPTIX_RAY_FLAG_NONE,
+ 0, // SBT offset for PG_HITD
+ 0,
+ 0,
+ p0,
+ p1,
+ p2,
+ p3,
+ p4,
+ p5);
+
+ isect->t = __uint_as_float(p0);
+ isect->u = __uint_as_float(p1);
+ isect->v = __uint_as_float(p2);
+ isect->prim = p3;
+ isect->object = p4;
+ isect->type = p5;
+
+ return p5 != PRIMITIVE_NONE;
+#else /* __KERNEL_OPTIX__ */
+ if (!scene_intersect_valid(ray)) {
return false;
}
-#ifdef __EMBREE__
+
+# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
- isect->t = ray.t;
+ isect->t = ray->t;
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
IntersectContext rtc_ctx(&ctx);
RTCRayHit ray_hit;
- kernel_embree_setup_rayhit(ray, ray_hit, visibility);
+ kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit);
if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID &&
ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) {
@@ -206,46 +212,33 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
}
return false;
}
-#endif /* __EMBREE__ */
-#ifdef __OBJECT_MOTION__
+# endif /* __EMBREE__ */
+
+# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
-# ifdef __HAIR__
- if (kernel_data.bvh.have_curves)
- return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
-# endif /* __HAIR__ */
+# ifdef __HAIR__
+ if (kernel_data.bvh.have_curves) {
+ return bvh_intersect_hair_motion(kg, ray, isect, visibility);
+ }
+# endif /* __HAIR__ */
- return bvh_intersect_motion(kg, &ray, isect, visibility);
+ return bvh_intersect_motion(kg, ray, isect, visibility);
}
-#endif /* __OBJECT_MOTION__ */
-
-#ifdef __HAIR__
- if (kernel_data.bvh.have_curves)
- return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_CPU__
-
-# ifdef __INSTANCING__
- if (kernel_data.bvh.have_instancing)
- return bvh_intersect_instancing(kg, &ray, isect, visibility);
-# endif /* __INSTANCING__ */
+# endif /* __OBJECT_MOTION__ */
- return bvh_intersect(kg, &ray, isect, visibility);
-#else /* __KERNEL_CPU__ */
-
-# ifdef __INSTANCING__
- return bvh_intersect_instancing(kg, &ray, isect, visibility);
-# else
- return bvh_intersect(kg, &ray, isect, visibility);
-# endif /* __INSTANCING__ */
+# ifdef __HAIR__
+ if (kernel_data.bvh.have_curves) {
+ return bvh_intersect_hair(kg, ray, isect, visibility);
+ }
+# endif /* __HAIR__ */
-#endif /* __KERNEL_CPU__ */
+ return bvh_intersect(kg, ray, isect, visibility);
+#endif /* __KERNEL_OPTIX__ */
}
#ifdef __BVH_LOCAL__
-/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
- const Ray ray,
+ const Ray *ray,
LocalIntersection *local_isect,
int local_object,
uint *lcg_state,
@@ -253,33 +246,74 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
- if (!scene_intersect_valid(&ray)) {
- local_isect->num_hits = 0;
+# ifdef __KERNEL_OPTIX__
+ uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF;
+ uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF;
+ uint p2 = ((uint64_t)local_isect) & 0xFFFFFFFF;
+ uint p3 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF;
+ uint p4 = local_object;
+ // Set to zero on miss or if the ray is aborted, so it can be used as the return value
+ uint p5 = max_hits;
+
+ if (local_isect) {
+ local_isect->num_hits = 0; // Initialize hit count to zero
+ }
+ optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+ ray->P,
+ ray->D,
+ 0.0f,
+ ray->t,
+ ray->time,
+ // Skip curves
+ 0x3,
+ // Need to always call into __anyhit__kernel_optix_local_hit
+ OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+ 2, // SBT offset for PG_HITL
+ 0,
+ 0,
+ p0,
+ p1,
+ p2,
+ p3,
+ p4,
+ p5);
+
+ return p5;
+# else /* __KERNEL_OPTIX__ */
+ if (!scene_intersect_valid(ray)) {
+ if (local_isect) {
+ local_isect->num_hits = 0;
+ }
return false;
}
-# ifdef __EMBREE__
+
+# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
- CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS);
+ const bool has_bvh = !(kernel_tex_fetch(__object_flag, local_object) &
+ SD_OBJECT_TRANSFORM_APPLIED);
+ CCLIntersectContext ctx(
+ kg, has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
ctx.lcg_state = lcg_state;
ctx.max_hits = max_hits;
- ctx.ss_isect = local_isect;
- local_isect->num_hits = 0;
- ctx.sss_object_id = local_object;
+ ctx.local_isect = local_isect;
+ if (local_isect) {
+ local_isect->num_hits = 0;
+ }
+ ctx.local_object_id = local_object;
IntersectContext rtc_ctx(&ctx);
RTCRay rtc_ray;
- kernel_embree_setup_ray(ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
-
- /* Get the Embree scene for this intersection. */
- RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2);
- if (geom) {
- float3 P = ray.P;
- float3 dir = ray.D;
- float3 idir = ray.D;
- const int object_flag = kernel_tex_fetch(__object_flag, local_object);
- if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
+
+ /* If this object has its own BVH, use it. */
+ if (has_bvh) {
+ RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2);
+ if (geom) {
+ float3 P = ray->P;
+ float3 dir = ray->D;
+ float3 idir = ray->D;
Transform ob_itfm;
rtc_ray.tfar = bvh_instance_motion_push(
- kg, local_object, &ray, &P, &dir, &idir, ray.t, &ob_itfm);
+ kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm);
/* bvh_instance_motion_push() returns the inverse transform but
* it's not needed here. */
(void)ob_itfm;
@@ -290,22 +324,30 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
rtc_ray.dir_x = dir.x;
rtc_ray.dir_y = dir.y;
rtc_ray.dir_z = dir.z;
+ RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
+ kernel_assert(scene);
+ if (scene) {
+ rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
+ }
}
- RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
- if (scene) {
- rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
- }
+ }
+ else {
+ rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
}
- return local_isect->num_hits > 0;
+ /* rtcOccluded1 sets tfar to -inf if a hit was found. */
+ return (local_isect && local_isect->num_hits > 0) || (rtc_ray.tfar < 0);
}
-# endif /* __EMBREE__ */
-# ifdef __OBJECT_MOTION__
+# endif /* __EMBREE__ */
+
+# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
- return bvh_intersect_local_motion(kg, &ray, local_isect, local_object, lcg_state, max_hits);
+ return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
}
-# endif /* __OBJECT_MOTION__ */
- return bvh_intersect_local(kg, &ray, local_isect, local_object, lcg_state, max_hits);
+# endif /* __OBJECT_MOTION__ */
+ return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
+# endif /* __KERNEL_OPTIX__ */
}
#endif
@@ -319,11 +361,41 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
+# ifdef __KERNEL_OPTIX__
+ uint p0 = ((uint64_t)isect) & 0xFFFFFFFF;
+ uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF;
+ uint p3 = max_hits;
+ uint p4 = visibility;
+ uint p5 = false;
+
+ *num_hits = 0; // Initialize hit count to zero
+ optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+ ray->P,
+ ray->D,
+ 0.0f,
+ ray->t,
+ ray->time,
+ 0xF,
+ // Need to always call into __anyhit__kernel_optix_shadow_all_hit
+ OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+ 1, // SBT offset for PG_HITS
+ 0,
+ 0,
+ p0,
+ p1,
+ *num_hits,
+ p3,
+ p4,
+ p5);
+
+ return p5;
+# else /* __KERNEL_OPTIX__ */
if (!scene_intersect_valid(ray)) {
*num_hits = 0;
return false;
}
-# ifdef __EMBREE__
+
+# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
ctx.isect_s = isect;
@@ -331,7 +403,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
ctx.num_hits = 0;
IntersectContext rtc_ctx(&ctx);
RTCRay rtc_ray;
- kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW);
+ kernel_embree_setup_ray(*ray, rtc_ray, visibility);
rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
if (ctx.num_hits > max_hits) {
@@ -340,32 +412,28 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
*num_hits = ctx.num_hits;
return rtc_ray.tfar == -INFINITY;
}
-# endif
-# ifdef __OBJECT_MOTION__
+# endif /* __EMBREE__ */
+
+# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
-# ifdef __HAIR__
+# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, visibility, max_hits, num_hits);
}
-# endif /* __HAIR__ */
+# endif /* __HAIR__ */
return bvh_intersect_shadow_all_motion(kg, ray, isect, visibility, max_hits, num_hits);
}
-# endif /* __OBJECT_MOTION__ */
+# endif /* __OBJECT_MOTION__ */
-# ifdef __HAIR__
+# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_shadow_all_hair(kg, ray, isect, visibility, max_hits, num_hits);
}
-# endif /* __HAIR__ */
-
-# ifdef __INSTANCING__
- if (kernel_data.bvh.have_instancing) {
- return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits);
- }
-# endif /* __INSTANCING__ */
+# endif /* __HAIR__ */
return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits);
+# endif /* __KERNEL_OPTIX__ */
}
#endif /* __SHADOW_RECORD_ALL__ */
@@ -377,27 +445,54 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
{
PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
+# ifdef __KERNEL_OPTIX__
+ uint p0 = 0;
+ uint p1 = 0;
+ uint p2 = 0;
+ uint p3 = 0;
+ uint p4 = visibility;
+ uint p5 = PRIMITIVE_NONE;
+
+ optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+ ray->P,
+ ray->D,
+ 0.0f,
+ ray->t,
+ ray->time,
+ // Skip everything but volumes
+ 0x2,
+ OPTIX_RAY_FLAG_NONE,
+ 0, // SBT offset for PG_HITD
+ 0,
+ 0,
+ p0,
+ p1,
+ p2,
+ p3,
+ p4,
+ p5);
+
+ isect->t = __uint_as_float(p0);
+ isect->u = __uint_as_float(p1);
+ isect->v = __uint_as_float(p2);
+ isect->prim = p3;
+ isect->object = p4;
+ isect->type = p5;
+
+ return p5 != PRIMITIVE_NONE;
+# else /* __KERNEL_OPTIX__ */
if (!scene_intersect_valid(ray)) {
return false;
}
-# ifdef __OBJECT_MOTION__
+
+# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
return bvh_intersect_volume_motion(kg, ray, isect, visibility);
}
-# endif /* __OBJECT_MOTION__ */
-# ifdef __KERNEL_CPU__
-# ifdef __INSTANCING__
- if (kernel_data.bvh.have_instancing)
- return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-# endif /* __INSTANCING__ */
- return bvh_intersect_volume(kg, ray, isect, visibility);
-# else /* __KERNEL_CPU__ */
-# ifdef __INSTANCING__
- return bvh_intersect_volume_instancing(kg, ray, isect, visibility);
-# else
+# endif /* __OBJECT_MOTION__ */
+
return bvh_intersect_volume(kg, ray, isect, visibility);
-# endif /* __INSTANCING__ */
-# endif /* __KERNEL_CPU__ */
+# endif /* __KERNEL_OPTIX__ */
}
#endif /* __VOLUME__ */
@@ -413,6 +508,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
if (!scene_intersect_valid(ray)) {
return false;
}
+
# ifdef __EMBREE__
if (kernel_data.bvh.scene) {
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
@@ -423,18 +519,16 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
RTCRay rtc_ray;
kernel_embree_setup_ray(*ray, rtc_ray, visibility);
rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
- return rtc_ray.tfar == -INFINITY;
+ return ctx.num_hits;
}
-# endif
+# endif /* __EMBREE__ */
+
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
}
# endif /* __OBJECT_MOTION__ */
-# ifdef __INSTANCING__
- if (kernel_data.bvh.have_instancing)
- return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility);
-# endif /* __INSTANCING__ */
+
return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
}
#endif /* __VOLUME_RECORD_ALL__ */
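
The OptiX paths above pass 64-bit pointers to optixTrace() by splitting them across pairs of 32-bit payload registers; a condensed sketch of the round trip (the device-side unpacking shown here is illustrative, not the actual kernel_optix.cu code):

  /* Trace side (see scene_intersect_local above): split the pointer. */
  uint p0 = ((uint64_t)local_isect) & 0xFFFFFFFF;
  uint p1 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF;
  /* ...p0 and p1 are passed as the first two payload arguments of optixTrace()... */

  /* Hit-program side: reassemble the pointer from the two payload registers. */
  LocalIntersection *local_isect = (LocalIntersection *)(
      ((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0());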
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
index 661bba54fd4..ca637288bee 100644
--- a/intern/cycles/kernel/bvh/bvh_embree.h
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -17,9 +17,12 @@
#include <embree3/rtcore_ray.h>
#include <embree3/rtcore_scene.h>
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#include "kernel/split/kernel_split_data_types.h"
#include "kernel/kernel_globals.h"
+// clang-format on
+
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -28,9 +31,9 @@ struct CCLIntersectContext {
typedef enum {
RAY_REGULAR = 0,
RAY_SHADOW_ALL = 1,
- RAY_SSS = 2,
- RAY_VOLUME_ALL = 3,
-
+ RAY_LOCAL = 2,
+ RAY_SSS = 3,
+ RAY_VOLUME_ALL = 4,
} RayType;
KernelGlobals *kg;
@@ -42,8 +45,8 @@ struct CCLIntersectContext {
int num_hits;
/* for SSS Rays: */
- LocalIntersection *ss_isect;
- int sss_object_id;
+ LocalIntersection *local_isect;
+ int local_object_id;
uint *lcg_state;
CCLIntersectContext(KernelGlobals *kg_, RayType type_)
@@ -53,8 +56,8 @@ struct CCLIntersectContext {
max_hits = 1;
num_hits = 0;
isect_s = NULL;
- ss_isect = NULL;
- sss_object_id = -1;
+ local_isect = NULL;
+ local_object_id = -1;
lcg_state = NULL;
}
};
@@ -121,11 +124,11 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
isect->type = kernel_tex_fetch(__prim_type, isect->prim);
}
-ccl_device_inline void kernel_embree_convert_local_hit(KernelGlobals *kg,
- const RTCRay *ray,
- const RTCHit *hit,
- Intersection *isect,
- int local_object_id)
+ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg,
+ const RTCRay *ray,
+ const RTCHit *hit,
+ Intersection *isect,
+ int local_object_id)
{
isect->u = 1.0f - hit->v - hit->u;
isect->v = hit->u;
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 7a069ef1108..4006c9c1632 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -17,13 +17,6 @@
* limitations under the License.
*/
-#ifdef __QBVH__
-# include "kernel/bvh/qbvh_local.h"
-# ifdef __KERNEL_AVX2__
-# include "kernel/bvh/obvh_local.h"
-# endif
-#endif
-
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
@@ -88,26 +81,6 @@ ccl_device_inline
object = local_object;
}
-#if defined(__KERNEL_SSE2__)
- const shuffle_swap_t shuf_identity = shuffle_swap_identity();
- const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
- ssef Psplat[3], idirsplat[3];
-# if BVH_FEATURE(BVH_HAIR)
- ssef tnear(0.0f), tfar(isect_t);
-# endif
- shuffle_swap_t shufflexyz[3];
-
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
/* traversal loop */
do {
do {
@@ -117,33 +90,16 @@ ccl_device_inline
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#if !defined(__KERNEL_SSE2__)
traverse_mask = NODE_INTERSECT(kg,
P,
-# if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
dir,
-# endif
+#endif
idir,
isect_t,
node_addr,
PATH_RAY_ALL_VISIBILITY,
dist);
-#else // __KERNEL_SSE2__
- traverse_mask = NODE_INTERSECT(kg,
- P,
- dir,
-# if BVH_FEATURE(BVH_HAIR)
- tnear,
- tfar,
-# endif
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- node_addr,
- PATH_RAY_ALL_VISIBILITY,
- dist);
-#endif // __KERNEL_SSE2__
node_addr = __float_as_int(cnodes.z);
node_addr_child1 = __float_as_int(cnodes.w);
@@ -247,20 +203,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
uint *lcg_state,
int max_hits)
{
- switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
- case BVH_LAYOUT_BVH8:
- return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
-#endif
-#ifdef __QBVH__
- case BVH_LAYOUT_BVH4:
- return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
-#endif
- case BVH_LAYOUT_BVH2:
- return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
- }
- kernel_assert(!"Should not happen");
- return false;
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits);
}
#undef BVH_FUNCTION_NAME
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 042630121c8..5367bdb633c 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -28,7 +28,6 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
return space;
}
-#if !defined(__KERNEL_SSE2__)
ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 idir,
@@ -39,7 +38,9 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
{
/* fetch node data */
+#ifdef __VISIBILITY_FLAG__
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+#endif
float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
@@ -66,74 +67,13 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
dist[0] = c0min;
dist[1] = c1min;
-# ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
(((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
-# else
+#else
return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0);
-# endif
-}
-
-ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
- const float3 P,
- const float3 idir,
- const float t,
- const float difl,
- const float extmax,
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
-
- /* fetch node data */
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
- float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
- float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
-
- /* intersect ray against child nodes */
- float c0lox = (node0.x - P.x) * idir.x;
- float c0hix = (node0.z - P.x) * idir.x;
- float c0loy = (node1.x - P.y) * idir.y;
- float c0hiy = (node1.z - P.y) * idir.y;
- float c0loz = (node2.x - P.z) * idir.z;
- float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
- float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
-
- float c1lox = (node0.y - P.x) * idir.x;
- float c1hix = (node0.w - P.x) * idir.x;
- float c1loy = (node1.y - P.y) * idir.y;
- float c1hiy = (node1.w - P.y) * idir.y;
- float c1loz = (node2.y - P.z) * idir.z;
- float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
- float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
-
- if (difl != 0.0f) {
- float hdiff = 1.0f + difl;
- float ldiff = 1.0f - difl;
- if (__float_as_int(cnodes.z) & PATH_RAY_CURVE) {
- c0min = max(ldiff * c0min, c0min - extmax);
- c0max = min(hdiff * c0max, c0max + extmax);
- }
- if (__float_as_int(cnodes.w) & PATH_RAY_CURVE) {
- c1min = max(ldiff * c1min, c1min - extmax);
- c1max = min(hdiff * c1max, c1max + extmax);
- }
- }
-
- dist[0] = c0min;
- dist[1] = c1min;
-
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
- (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
-# else
- return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0);
-# endif
+#endif
}
ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
@@ -162,41 +102,6 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg
return tnear <= tfar;
}
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(KernelGlobals *kg,
- const float3 P,
- const float3 dir,
- const float t,
- const float difl,
- int node_addr,
- int child,
- float dist[2])
-{
- Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child);
- float3 aligned_dir = transform_direction(&space, dir);
- float3 aligned_P = transform_point(&space, P);
- float3 nrdir = -bvh_inverse_direction(aligned_dir);
- float3 tLowerXYZ = aligned_P * nrdir;
- float3 tUpperXYZ = tLowerXYZ - nrdir;
- const float near_x = min(tLowerXYZ.x, tUpperXYZ.x);
- const float near_y = min(tLowerXYZ.y, tUpperXYZ.y);
- const float near_z = min(tLowerXYZ.z, tUpperXYZ.z);
- const float far_x = max(tLowerXYZ.x, tUpperXYZ.x);
- const float far_y = max(tLowerXYZ.y, tUpperXYZ.y);
- const float far_z = max(tLowerXYZ.z, tUpperXYZ.z);
- const float tnear = max4(0.0f, near_x, near_y, near_z);
- const float tfar = min4(t, far_x, far_y, far_z);
- *dist = tnear;
- if (difl != 0.0f) {
- /* TODO(sergey): Same as for QBVH, needs a proper use. */
- const float round_down = 1.0f - difl;
- const float round_up = 1.0f + difl;
- return round_down * tnear <= round_up * tfar;
- }
- else {
- return tnear <= tfar;
- }
-}
-
ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
@@ -207,51 +112,21 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
float dist[2])
{
int mask = 0;
+#ifdef __VISIBILITY_FLAG__
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+#endif
if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
-# ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
if ((__float_as_uint(cnodes.x) & visibility))
-# endif
+#endif
{
mask |= 1;
}
}
if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
-# ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
if ((__float_as_uint(cnodes.y) & visibility))
-# endif
- {
- mask |= 2;
- }
- }
- return mask;
-}
-
-ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
- const float3 P,
- const float3 dir,
- const float3 idir,
- const float t,
- const float difl,
- const float extmax,
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- int mask = 0;
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) {
-# ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(cnodes.x) & visibility))
-# endif
- {
- mask |= 1;
- }
- }
- if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) {
-# ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(cnodes.y) & visibility))
-# endif
+#endif
{
mask |= 2;
}
@@ -276,307 +151,3 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist);
}
}
-
-ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
- const float3 P,
- const float3 dir,
- const float3 idir,
- const float t,
- const float difl,
- const float extmax,
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return bvh_unaligned_node_intersect_robust(
- kg, P, dir, idir, t, difl, extmax, node_addr, visibility, dist);
- }
- else {
- return bvh_aligned_node_intersect_robust(
- kg, P, idir, t, difl, extmax, node_addr, visibility, dist);
- }
-}
-#else /* !defined(__KERNEL_SSE2__) */
-
-int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg,
- const float3 &P,
- const float3 &dir,
- const ssef &tsplat,
- const ssef Psplat[3],
- const ssef idirsplat[3],
- const shuffle_swap_t shufflexyz[3],
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr;
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- dist[0] = tminmax[0];
- dist[1] = tminmax[1];
-
- int mask = movemask(lrhit);
-
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
- (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
- return cmask;
-# else
- return mask & 3;
-# endif
-}
-
-ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
- const float3 &P,
- const float3 &dir,
- const ssef &tsplat,
- const ssef Psplat[3],
- const ssef idirsplat[3],
- const shuffle_swap_t shufflexyz[3],
- const float difl,
- const float extmax,
- const int nodeAddr,
- const uint visibility,
- float dist[2])
-{
- /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
-
- /* fetch node data */
- const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + nodeAddr;
-
- /* intersect ray against child nodes */
- const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
- const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
- const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
-
- /* calculate { c0min, c1min, -c0max, -c1max} */
- ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
- const ssef tminmax = minmax ^ pn;
-
- if (difl != 0.0f) {
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0);
- float4 *tminmaxview = (float4 *)&tminmax;
- float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
- float &c0max = tminmaxview->z, &c1max = tminmaxview->w;
- float hdiff = 1.0f + difl;
- float ldiff = 1.0f - difl;
- if (__float_as_int(cnodes.x) & PATH_RAY_CURVE) {
- c0min = max(ldiff * c0min, c0min - extmax);
- c0max = min(hdiff * c0max, c0max + extmax);
- }
- if (__float_as_int(cnodes.y) & PATH_RAY_CURVE) {
- c1min = max(ldiff * c1min, c1min - extmax);
- c1max = min(hdiff * c1max, c1max + extmax);
- }
- }
-
- const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
-
- dist[0] = tminmax[0];
- dist[1] = tminmax[1];
-
- int mask = movemask(lrhit);
-
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0);
- int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
- (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
- return cmask;
-# else
- return mask & 3;
-# endif
-}
-
-ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
- const float3 P,
- const float3 dir,
- const ssef &isect_near,
- const ssef &isect_far,
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
- Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
-
- float3 aligned_dir0 = transform_direction(&space0, dir),
- aligned_dir1 = transform_direction(&space1, dir);
- float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P);
- float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
- nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
- ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f),
- lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f),
- lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f);
-
- ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
- upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
- upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
- ssef tnear_x = min(lower_x, upper_x);
- ssef tnear_y = min(lower_y, upper_y);
- ssef tnear_z = min(lower_z, upper_z);
- ssef tfar_x = max(lower_x, upper_x);
- ssef tfar_y = max(lower_y, upper_y);
- ssef tfar_z = max(lower_z, upper_z);
-
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- sseb vmask = tnear <= tfar;
- dist[0] = tnear.f[0];
- dist[1] = tnear.f[1];
-
- int mask = (int)movemask(vmask);
-
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
- (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
- return cmask;
-# else
- return mask & 3;
-# endif
-}
-
-ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
- const float3 P,
- const float3 dir,
- const ssef &isect_near,
- const ssef &isect_far,
- const float difl,
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
- Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
-
- float3 aligned_dir0 = transform_direction(&space0, dir),
- aligned_dir1 = transform_direction(&space1, dir);
- float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P);
- float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
- nrdir1 = -bvh_inverse_direction(aligned_dir1);
-
- ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f),
- lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f),
- lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f);
-
- ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f),
- upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f),
- upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f);
-
- ssef tnear_x = min(lower_x, upper_x);
- ssef tnear_y = min(lower_y, upper_y);
- ssef tnear_z = min(lower_z, upper_z);
- ssef tfar_x = max(lower_x, upper_x);
- ssef tfar_y = max(lower_y, upper_y);
- ssef tfar_z = max(lower_z, upper_z);
-
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- sseb vmask;
- if (difl != 0.0f) {
- const float round_down = 1.0f - difl;
- const float round_up = 1.0f + difl;
- vmask = round_down * tnear <= round_up * tfar;
- }
- else {
- vmask = tnear <= tfar;
- }
-
- dist[0] = tnear.f[0];
- dist[1] = tnear.f[1];
-
- int mask = (int)movemask(vmask);
-
-# ifdef __VISIBILITY_FLAG__
- /* this visibility test gives a 5% performance hit, how to solve? */
- float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) |
- (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0);
- return cmask;
-# else
- return mask & 3;
-# endif
-}
-
-ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
- const float3 &P,
- const float3 &dir,
- const ssef &isect_near,
- const ssef &isect_far,
- const ssef &tsplat,
- const ssef Psplat[3],
- const ssef idirsplat[3],
- const shuffle_swap_t shufflexyz[3],
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return bvh_unaligned_node_intersect(
- kg, P, dir, isect_near, isect_far, node_addr, visibility, dist);
- }
- else {
- return bvh_aligned_node_intersect(
- kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist);
- }
-}
-
-ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
- const float3 &P,
- const float3 &dir,
- const ssef &isect_near,
- const ssef &isect_far,
- const ssef &tsplat,
- const ssef Psplat[3],
- const ssef idirsplat[3],
- const shuffle_swap_t shufflexyz[3],
- const float difl,
- const float extmax,
- const int node_addr,
- const uint visibility,
- float dist[2])
-{
- float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return bvh_unaligned_node_intersect_robust(
- kg, P, dir, isect_near, isect_far, difl, node_addr, visibility, dist);
- }
- else {
- return bvh_aligned_node_intersect_robust(kg,
- P,
- dir,
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- difl,
- extmax,
- node_addr,
- visibility,
- dist);
- }
-}
-#endif /* !defined(__KERNEL_SSE2__) */
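Note: the SSE node-intersection helpers removed above all reduce to the same two-child slab test (the unaligned variants simply transform the ray into each child's space first). The following is a minimal standalone sketch of that test, written scalar for clarity; `Aabb`, `ChildPair` and `node_intersect` are invented names, not Cycles API.

#include <algorithm>
#include <cstdint>

struct Aabb {
  float lo[3], hi[3];
};

struct ChildPair {
  Aabb bounds[2];
  uint32_t visibility[2]; /* per-child visibility bits, as in __VISIBILITY_FLAG__ */
};

/* Returns a 2-bit mask: bit 0 set if the ray hits child 0, bit 1 for child 1.
 * P is the ray origin, idir the per-axis inverse direction, [t_near, t_far]
 * the active ray interval, `visibility` the ray's visibility flags. */
static int node_intersect(const ChildPair &node,
                          const float P[3],
                          const float idir[3],
                          float t_near,
                          float t_far,
                          uint32_t visibility)
{
  int mask = 0;
  for (int c = 0; c < 2; c++) {
    float tn = t_near, tf = t_far;
    for (int a = 0; a < 3; a++) {
      /* Slab test: entry/exit distances along this axis. */
      const float t0 = (node.bounds[c].lo[a] - P[a]) * idir[a];
      const float t1 = (node.bounds[c].hi[a] - P[a]) * idir[a];
      tn = std::max(tn, std::min(t0, t1));
      tf = std::min(tf, std::max(t0, t1));
    }
    if (tn <= tf && (node.visibility[c] & visibility)) {
      mask |= 1 << c;
    }
  }
  return mask;
}

The SSE version evaluates both children in one set of vector operations; the scalar loop above is the same arithmetic, one child at a time.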
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index b362779549c..dccd257d2de 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -17,13 +17,6 @@
* limitations under the License.
*/
-#ifdef __QBVH__
-# include "kernel/bvh/qbvh_shadow_all.h"
-# ifdef __KERNEL_AVX2__
-# include "kernel/bvh/obvh_shadow_all.h"
-# endif
-#endif
-
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
@@ -34,7 +27,6 @@
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
*
- * BVH_INSTANCING: object instancing
* BVH_HAIR: hair curve rendering
* BVH_MOTION: motion blur rendering
*/
@@ -76,33 +68,11 @@ ccl_device_inline
Transform ob_itfm;
#endif
-#if BVH_FEATURE(BVH_INSTANCING)
int num_hits_in_instance = 0;
-#endif
*num_hits = 0;
isect_array->t = tmax;
-#if defined(__KERNEL_SSE2__)
- const shuffle_swap_t shuf_identity = shuffle_swap_identity();
- const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
- ssef Psplat[3], idirsplat[3];
-# if BVH_FEATURE(BVH_HAIR)
- ssef tnear(0.0f), tfar(isect_t);
-# endif
- shuffle_swap_t shufflexyz[3];
-
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif /* __KERNEL_SSE2__ */
-
/* traversal loop */
do {
do {
@@ -112,33 +82,16 @@ ccl_device_inline
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#if !defined(__KERNEL_SSE2__)
traverse_mask = NODE_INTERSECT(kg,
P,
-# if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
dir,
-# endif
+#endif
idir,
isect_t,
node_addr,
visibility,
dist);
-#else // __KERNEL_SSE2__
- traverse_mask = NODE_INTERSECT(kg,
- P,
- dir,
-# if BVH_FEATURE(BVH_HAIR)
- tnear,
- tfar,
-# endif
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- node_addr,
- visibility,
- dist);
-#endif // __KERNEL_SSE2__
node_addr = __float_as_int(cnodes.z);
node_addr_child1 = __float_as_int(cnodes.w);
@@ -174,9 +127,7 @@ ccl_device_inline
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
int prim_addr = __float_as_int(leaf.x);
-#if BVH_FEATURE(BVH_INSTANCING)
if (prim_addr >= 0) {
-#endif
const int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
const uint p_type = type & PRIMITIVE_ALL;
@@ -207,37 +158,13 @@ ccl_device_inline
}
#endif
#if BVH_FEATURE(BVH_HAIR)
- case PRIMITIVE_CURVE:
- case PRIMITIVE_MOTION_CURVE: {
+ case PRIMITIVE_CURVE_THICK:
+ case PRIMITIVE_MOTION_CURVE_THICK:
+ case PRIMITIVE_CURVE_RIBBON:
+ case PRIMITIVE_MOTION_CURVE_RIBBON: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = cardinal_curve_intersect(kg,
- isect_array,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0,
- 0);
- }
- else {
- hit = curve_intersect(kg,
- isect_array,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0,
- 0);
- }
+ hit = curve_intersect(
+ kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type);
break;
}
#endif
@@ -282,9 +209,7 @@ ccl_device_inline
/* move on to next entry in intersections array */
isect_array++;
(*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
-#endif
isect_array->t = isect_t;
}
@@ -292,32 +217,19 @@ ccl_device_inline
prim_addr++;
}
}
-#if BVH_FEATURE(BVH_INSTANCING)
else {
/* instance push */
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-# else
+#else
isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-# endif
+#endif
num_hits_in_instance = 0;
isect_array->t = isect_t;
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect_t);
-# endif
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
-
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -325,10 +237,8 @@ ccl_device_inline
node_addr = kernel_tex_fetch(__object_node, object);
}
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
-#if BVH_FEATURE(BVH_INSTANCING)
if (stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
@@ -336,11 +246,11 @@ ccl_device_inline
if (num_hits_in_instance) {
float t_fac;
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-# else
+#else
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-# endif
+#endif
/* scale isect->t to adjust for instancing */
for (int i = 0; i < num_hits_in_instance; i++) {
@@ -348,33 +258,20 @@ ccl_device_inline
}
}
else {
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-# else
+#else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-# endif
+#endif
}
isect_t = tmax;
isect_array->t = isect_t;
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect_t);
-# endif
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
-
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
return false;
@@ -387,20 +284,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint max_hits,
uint *num_hits)
{
- switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
- case BVH_LAYOUT_BVH8:
- return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
-#endif
-#ifdef __QBVH__
- case BVH_LAYOUT_BVH4:
- return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
-#endif
- case BVH_LAYOUT_BVH2:
- return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
- }
- kernel_assert(!"Should not happen");
- return false;
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits);
}
#undef BVH_FUNCTION_NAME
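Note: the shadow-all traversal kept by this hunk records hits while inside an instance and then rescales their distances on instance pop, because those distances were measured along the object-space ray. A minimal sketch of that rescale follows; the `Transform` struct and helper names are stand-ins for the Cycles types, and `hit_t` is assumed to point just past the most recently recorded hit (mirroring `isect_array`).

#include <cmath>

struct Transform {
  float m[3][4]; /* row-major 3x4 affine matrix, world-to-object */
};

static void transform_direction(const Transform &t, const float d[3], float out[3])
{
  for (int i = 0; i < 3; i++) {
    out[i] = t.m[i][0] * d[0] + t.m[i][1] * d[1] + t.m[i][2] * d[2];
  }
}

/* Scale the last `num_hits_in_instance` recorded hit distances from object
 * space back to world space, as the traversal above does after popping. */
static void rescale_instance_hits(float *hit_t,
                                  int num_hits_in_instance,
                                  const Transform &world_to_object,
                                  const float world_dir[3])
{
  float object_dir[3];
  transform_direction(world_to_object, world_dir, object_dir);
  const float len = std::sqrt(object_dir[0] * object_dir[0] +
                              object_dir[1] * object_dir[1] +
                              object_dir[2] * object_dir[2]);
  const float t_fac = 1.0f / len;
  for (int i = 0; i < num_hits_in_instance; i++) {
    hit_t[-i - 1] *= t_fac; /* hits were appended just before the cursor */
  }
}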
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 34a06d003bb..8b2699ab807 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -17,42 +17,24 @@
* limitations under the License.
*/
-#ifdef __QBVH__
-# include "kernel/bvh/qbvh_traversal.h"
-#endif
-#ifdef __KERNEL_AVX2__
-# include "kernel/bvh/obvh_traversal.h"
-#endif
-
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
-# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust
#else
# define NODE_INTERSECT bvh_aligned_node_intersect
-# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust
#endif
/* This is a template BVH traversal function, where various features can be
* enabled/disabled. This way we can compile optimized versions for each case
* without new features slowing things down.
*
- * BVH_INSTANCING: object instancing
* BVH_HAIR: hair curve rendering
- * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
* BVH_MOTION: motion blur rendering
*/
ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
- const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- uint *lcg_state,
- float difl,
- float extmax
-#endif
-)
+ const uint visibility)
{
/* todo:
* - test if pushing distance on the stack helps (for non shadow rays)
@@ -87,26 +69,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
BVH_DEBUG_INIT();
-#if defined(__KERNEL_SSE2__)
- const shuffle_swap_t shuf_identity = shuffle_swap_identity();
- const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
- ssef Psplat[3], idirsplat[3];
-# if BVH_FEATURE(BVH_HAIR)
- ssef tnear(0.0f), tfar(isect->t);
-# endif
- shuffle_swap_t shufflexyz[3];
-
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
/* traversal loop */
do {
do {
@@ -116,75 +78,18 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#if !defined(__KERNEL_SSE2__)
-# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- if (difl != 0.0f) {
- traverse_mask = NODE_INTERSECT_ROBUST(kg,
- P,
-# if BVH_FEATURE(BVH_HAIR)
- dir,
-# endif
- idir,
- isect->t,
- difl,
- extmax,
- node_addr,
- visibility,
- dist);
- }
- else
-# endif
{
traverse_mask = NODE_INTERSECT(kg,
P,
-# if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
dir,
-# endif
+#endif
idir,
isect->t,
node_addr,
visibility,
dist);
}
-#else // __KERNEL_SSE2__
-# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- if (difl != 0.0f) {
- traverse_mask = NODE_INTERSECT_ROBUST(kg,
- P,
- dir,
-# if BVH_FEATURE(BVH_HAIR)
- tnear,
- tfar,
-# endif
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- difl,
- extmax,
- node_addr,
- visibility,
- dist);
- }
- else
-# endif
- {
- traverse_mask = NODE_INTERSECT(kg,
- P,
- dir,
-# if BVH_FEATURE(BVH_HAIR)
- tnear,
- tfar,
-# endif
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- node_addr,
- visibility,
- dist);
- }
-#endif // __KERNEL_SSE2__
node_addr = __float_as_int(cnodes.z);
node_addr_child1 = __float_as_int(cnodes.w);
@@ -221,9 +126,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
int prim_addr = __float_as_int(leaf.x);
-#if BVH_FEATURE(BVH_INSTANCING)
if (prim_addr >= 0) {
-#endif
const int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
@@ -239,17 +142,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
/* shadow ray early termination */
-#if defined(__KERNEL_SSE2__)
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-#else
- if (visibility & PATH_RAY_SHADOW_OPAQUE)
- return true;
-#endif
}
}
break;
@@ -262,71 +156,28 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
if (motion_triangle_intersect(
kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
/* shadow ray early termination */
-# if defined(__KERNEL_SSE2__)
- if (visibility & PATH_RAY_SHADOW_OPAQUE)
- return true;
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-# else
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
-# endif
}
}
break;
}
#endif /* BVH_FEATURE(BVH_MOTION) */
#if BVH_FEATURE(BVH_HAIR)
- case PRIMITIVE_CURVE:
- case PRIMITIVE_MOTION_CURVE: {
+ case PRIMITIVE_CURVE_THICK:
+ case PRIMITIVE_MOTION_CURVE_THICK:
+ case PRIMITIVE_CURVE_RIBBON:
+ case PRIMITIVE_MOTION_CURVE_RIBBON: {
for (; prim_addr < prim_addr2; prim_addr++) {
BVH_DEBUG_NEXT_INTERSECTION();
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- bool hit;
- if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = cardinal_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
- }
- else {
- hit = curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
- }
+ const bool hit = curve_intersect(
+ kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
if (hit) {
/* shadow ray early termination */
-# if defined(__KERNEL_SSE2__)
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-# else
- if (visibility & PATH_RAY_SHADOW_OPAQUE)
- return true;
-# endif
}
}
break;
@@ -334,30 +185,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
#endif /* BVH_FEATURE(BVH_HAIR) */
}
}
-#if BVH_FEATURE(BVH_INSTANCING)
else {
/* instance push */
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_push(
kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
+#else
isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
+#endif
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -368,38 +205,22 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
BVH_DEBUG_NEXT_INSTANCE();
}
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
-#if BVH_FEATURE(BVH_INSTANCING)
if (stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* instance pop */
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
+#else
isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
+#endif
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
return (isect->prim != PRIM_NONE);
@@ -408,62 +229,11 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
- const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- uint *lcg_state,
- float difl,
- float extmax
-#endif
-)
+ const uint visibility)
{
- switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
- case BVH_LAYOUT_BVH8:
- return BVH_FUNCTION_FULL_NAME(OBVH)(kg,
- ray,
- isect,
- visibility
-# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- lcg_state,
- difl,
- extmax
-# endif
- );
-#endif
-#ifdef __QBVH__
- case BVH_LAYOUT_BVH4:
- return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
- ray,
- isect,
- visibility
-# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- lcg_state,
- difl,
- extmax
-# endif
- );
-#endif /* __QBVH__ */
- case BVH_LAYOUT_BVH2:
- return BVH_FUNCTION_FULL_NAME(BVH)(kg,
- ray,
- isect,
- visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- lcg_state,
- difl,
- extmax
-#endif
- );
- }
- kernel_assert(!"Should not happen");
- return false;
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
}
#undef BVH_FUNCTION_NAME
#undef BVH_FUNCTION_FEATURES
#undef NODE_INTERSECT
-#undef NODE_INTERSECT_ROBUST
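Note: the comment retained in this file ("various features can be enabled/disabled... compile optimized versions for each case") describes the feature-template pattern that survives the patch. Cycles does this at preprocessor level by re-including the traversal body under different `BVH_FUNCTION_NAME` / feature defines; the sketch below is an illustrative single-file approximation that generates variants with a macro and lets the compiler fold the constant feature test, not the kernel's actual mechanism.

#include <cstdio>

enum { BVH_MOTION_BIT = 1, BVH_HAIR_BIT = 2 };

/* Each expansion compiles a dedicated variant; a disabled feature's branch
 * is dead code the compiler removes, so it costs nothing at run time. */
#define DEFINE_TRAVERSAL_VARIANT(name, features) \
  static const char *name(void) \
  { \
    if ((features) & BVH_HAIR_BIT) { \
      return #name ": curve intersection compiled in"; \
    } \
    return #name ": triangles only"; \
  }

DEFINE_TRAVERSAL_VARIANT(bvh_intersect, 0)
DEFINE_TRAVERSAL_VARIANT(bvh_intersect_hair, BVH_HAIR_BIT)
DEFINE_TRAVERSAL_VARIANT(bvh_intersect_hair_motion, BVH_HAIR_BIT | BVH_MOTION_BIT)

int main(void)
{
  std::printf("%s\n%s\n%s\n",
              bvh_intersect(),
              bvh_intersect_hair(),
              bvh_intersect_hair_motion());
  return 0;
}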
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 16f3b03f842..b173568266b 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -31,14 +31,10 @@ CCL_NAMESPACE_BEGIN
/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
#define BVH_STACK_SIZE 192
-#define BVH_QSTACK_SIZE 384
-#define BVH_OSTACK_SIZE 768
/* BVH intersection function variations */
-#define BVH_INSTANCING 1
-#define BVH_MOTION 2
-#define BVH_HAIR 4
-#define BVH_HAIR_MINIMUM_WIDTH 8
+#define BVH_MOTION 1
+#define BVH_HAIR 2
#define BVH_NAME_JOIN(x, y) x##_##y
#define BVH_NAME_EVAL(x, y) BVH_NAME_JOIN(x, y)
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index c83b0d783f4..1f2ea47269b 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -17,13 +17,6 @@
* limitations under the License.
*/
-#ifdef __QBVH__
-# include "kernel/bvh/qbvh_volume.h"
-# ifdef __KERNEL_AVX2__
-# include "kernel/bvh/obvh_volume.h"
-# endif
-#endif
-
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
@@ -34,7 +27,6 @@
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
*
- * BVH_INSTANCING: object instancing
* BVH_MOTION: motion blur rendering
*/
@@ -79,26 +71,6 @@ ccl_device_inline
isect->prim = PRIM_NONE;
isect->object = OBJECT_NONE;
-#if defined(__KERNEL_SSE2__)
- const shuffle_swap_t shuf_identity = shuffle_swap_identity();
- const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
- ssef Psplat[3], idirsplat[3];
-# if BVH_FEATURE(BVH_HAIR)
- ssef tnear(0.0f), tfar(isect->t);
-# endif
- shuffle_swap_t shufflexyz[3];
-
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
-
/* traversal loop */
do {
do {
@@ -108,33 +80,16 @@ ccl_device_inline
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#if !defined(__KERNEL_SSE2__)
traverse_mask = NODE_INTERSECT(kg,
P,
-# if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
dir,
-# endif
+#endif
idir,
isect->t,
node_addr,
visibility,
dist);
-#else // __KERNEL_SSE2__
- traverse_mask = NODE_INTERSECT(kg,
- P,
- dir,
-# if BVH_FEATURE(BVH_HAIR)
- tnear,
- tfar,
-# endif
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- node_addr,
- visibility,
- dist);
-#endif // __KERNEL_SSE2__
node_addr = __float_as_int(cnodes.z);
node_addr_child1 = __float_as_int(cnodes.w);
@@ -170,9 +125,7 @@ ccl_device_inline
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
int prim_addr = __float_as_int(leaf.x);
-#if BVH_FEATURE(BVH_INSTANCING)
if (prim_addr >= 0) {
-#endif
const int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
@@ -222,31 +175,17 @@ ccl_device_inline
}
}
}
-#if BVH_FEATURE(BVH_INSTANCING)
else {
/* instance push */
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_push(
kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
+#else
isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
+#endif
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -262,38 +201,22 @@ ccl_device_inline
}
}
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
-#if BVH_FEATURE(BVH_INSTANCING)
if (stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* instance pop */
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
+#else
isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect->t);
-# endif
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
+#endif
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
-#endif /* FEATURE(BVH_MOTION) */
} while (node_addr != ENTRYPOINT_SENTINEL);
return (isect->prim != PRIM_NONE);
@@ -304,20 +227,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
Intersection *isect,
const uint visibility)
{
- switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
- case BVH_LAYOUT_BVH8:
- return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect, visibility);
-#endif
-#ifdef __QBVH__
- case BVH_LAYOUT_BVH4:
- return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility);
-#endif
- case BVH_LAYOUT_BVH2:
- return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
- }
- kernel_assert(!"Should not happen");
- return false;
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility);
}
#undef BVH_FUNCTION_NAME
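Note: the volume traversal kept by this hunk only descends into an instance when the object is flagged as containing a volume. A short sketch of that filter, with invented container types (the kernel reads the flag from the `__object_flag` texture instead):

#include <cstdint>
#include <vector>

enum : uint32_t { HAS_VOLUME_FLAG = 1u << 0 }; /* stand-in for SD_OBJECT_HAS_VOLUME */

struct InstanceLeaf {
  int object;    /* object index */
  int root_node; /* BVH root of the instanced geometry */
};

/* Returns the node to continue traversal with, or -1 to skip the instance. */
static int maybe_push_volume_instance(const InstanceLeaf &leaf,
                                      const std::vector<uint32_t> &object_flags,
                                      std::vector<int> &traversal_stack)
{
  if (!(object_flags[leaf.object] & HAS_VOLUME_FLAG)) {
    return -1; /* purely-surface object: volume rays do not descend */
  }
  traversal_stack.push_back(-1); /* sentinel, like ENTRYPOINT_SENTINEL above */
  return leaf.root_node;
}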
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index ae8c4d12e8a..a8664cc4331 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -17,13 +17,6 @@
* limitations under the License.
*/
-#ifdef __QBVH__
-# include "kernel/bvh/qbvh_volume_all.h"
-# ifdef __KERNEL_AVX2__
-# include "kernel/bvh/obvh_volume_all.h"
-# endif
-#endif
-
#if BVH_FEATURE(BVH_HAIR)
# define NODE_INTERSECT bvh_node_intersect
#else
@@ -34,7 +27,6 @@
* various features can be enabled/disabled. This way we can compile optimized
* versions for each case without new features slowing things down.
*
- * BVH_INSTANCING: object instancing
* BVH_MOTION: motion blur rendering
*/
@@ -76,33 +68,11 @@ ccl_device_inline
Transform ob_itfm;
#endif
-#if BVH_FEATURE(BVH_INSTANCING)
int num_hits_in_instance = 0;
-#endif
uint num_hits = 0;
isect_array->t = tmax;
-#if defined(__KERNEL_SSE2__)
- const shuffle_swap_t shuf_identity = shuffle_swap_identity();
- const shuffle_swap_t shuf_swap = shuffle_swap_swap();
-
- const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
- ssef Psplat[3], idirsplat[3];
-# if BVH_FEATURE(BVH_HAIR)
- ssef tnear(0.0f), tfar(isect_t);
-# endif
- shuffle_swap_t shufflexyz[3];
-
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif /* __KERNEL_SSE2__ */
-
/* traversal loop */
do {
do {
@@ -112,33 +82,16 @@ ccl_device_inline
float dist[2];
float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-#if !defined(__KERNEL_SSE2__)
traverse_mask = NODE_INTERSECT(kg,
P,
-# if BVH_FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
dir,
-# endif
+#endif
idir,
isect_t,
node_addr,
visibility,
dist);
-#else // __KERNEL_SSE2__
- traverse_mask = NODE_INTERSECT(kg,
- P,
- dir,
-# if BVH_FEATURE(BVH_HAIR)
- tnear,
- tfar,
-# endif
- tsplat,
- Psplat,
- idirsplat,
- shufflexyz,
- node_addr,
- visibility,
- dist);
-#endif // __KERNEL_SSE2__
node_addr = __float_as_int(cnodes.z);
node_addr_child1 = __float_as_int(cnodes.w);
@@ -174,9 +127,7 @@ ccl_device_inline
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
int prim_addr = __float_as_int(leaf.x);
-#if BVH_FEATURE(BVH_INSTANCING)
if (prim_addr >= 0) {
-#endif
const int prim_addr2 = __float_as_int(leaf.y);
const uint type = __float_as_int(leaf.w);
bool hit;
@@ -204,25 +155,21 @@ ccl_device_inline
/* Move on to next entry in intersections array. */
isect_array++;
num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
-#endif
isect_array->t = isect_t;
if (num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
if (object != OBJECT_NONE) {
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-# else
+#else
Transform itfm = object_fetch_transform(
kg, object, OBJECT_INVERSE_TRANSFORM);
float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-# endif
+#endif
for (int i = 0; i < num_hits_in_instance; i++) {
(isect_array - i - 1)->t *= t_fac;
}
}
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
return num_hits;
}
}
@@ -248,25 +195,21 @@ ccl_device_inline
/* Move on to next entry in intersections array. */
isect_array++;
num_hits++;
-# if BVH_FEATURE(BVH_INSTANCING)
num_hits_in_instance++;
-# endif
isect_array->t = isect_t;
if (num_hits == max_hits) {
-# if BVH_FEATURE(BVH_INSTANCING)
if (object != OBJECT_NONE) {
-# if BVH_FEATURE(BVH_MOTION)
+# if BVH_FEATURE(BVH_MOTION)
float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-# else
+# else
Transform itfm = object_fetch_transform(
kg, object, OBJECT_INVERSE_TRANSFORM);
float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-# endif
+# endif
for (int i = 0; i < num_hits_in_instance; i++) {
(isect_array - i - 1)->t *= t_fac;
}
}
-# endif /* BVH_FEATURE(BVH_INSTANCING) */
return num_hits;
}
}
@@ -279,35 +222,21 @@ ccl_device_inline
}
}
}
-#if BVH_FEATURE(BVH_INSTANCING)
else {
/* instance push */
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
isect_t = bvh_instance_motion_push(
kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-# else
+#else
isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-# endif
+#endif
num_hits_in_instance = 0;
isect_array->t = isect_t;
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect_t);
-# endif
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
-
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -322,55 +251,39 @@ ccl_device_inline
}
}
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
-#if BVH_FEATURE(BVH_INSTANCING)
if (stack_ptr >= 0) {
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
if (num_hits_in_instance) {
float t_fac;
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-# else
+#else
bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-# endif
+#endif
/* Scale isect->t to adjust for instancing. */
for (int i = 0; i < num_hits_in_instance; i++) {
(isect_array - i - 1)->t *= t_fac;
}
}
else {
-# if BVH_FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-# else
+#else
bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-# endif
+#endif
}
isect_t = tmax;
isect_array->t = isect_t;
-# if defined(__KERNEL_SSE2__)
- Psplat[0] = ssef(P.x);
- Psplat[1] = ssef(P.y);
- Psplat[2] = ssef(P.z);
-
- tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- tfar = ssef(isect_t);
-# endif
-
- gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-# endif
-
object = OBJECT_NONE;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
-#endif /* FEATURE(BVH_INSTANCING) */
} while (node_addr != ENTRYPOINT_SENTINEL);
return num_hits;
@@ -382,20 +295,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
const uint max_hits,
const uint visibility)
{
- switch (kernel_data.bvh.bvh_layout) {
-#ifdef __KERNEL_AVX2__
- case BVH_LAYOUT_BVH8:
- return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, max_hits, visibility);
-#endif
-#ifdef __QBVH__
- case BVH_LAYOUT_BVH4:
- return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, max_hits, visibility);
-#endif
- case BVH_LAYOUT_BVH2:
- return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility);
- }
- kernel_assert(!"Should not happen");
- return 0;
+ return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility);
}
#undef BVH_FUNCTION_NAME
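Note: bvh_volume_all accumulates every volume hit into a caller-provided array and stops once `max_hits` is reached, as seen in the retained code above. A trivial standalone sketch of that accumulation (the `Hit` struct is illustrative):

#include <cstdint>

struct Hit {
  float t;
  int prim;
  int object;
};

/* Append one hit; returns false once traversal should stop because the
 * output array is full (mirrors the `num_hits == max_hits` check). */
static bool record_hit(Hit *hits, uint32_t &num_hits, uint32_t max_hits,
                       float t, int prim, int object)
{
  hits[num_hits] = {t, prim, object};
  num_hits++;
  return num_hits < max_hits;
}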
diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h
deleted file mode 100644
index e6bb548bc5b..00000000000
--- a/intern/cycles/kernel/bvh/obvh_local.h
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for subsurface scattering, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT obvh_node_intersect
-#else
-# define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
- const Ray *ray,
- LocalIntersection *local_isect,
- int local_object,
- uint *lcg_state,
- int max_hits)
-{
- /* Traversal stack in CUDA thread-local memory. */
- OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_tex_fetch(__object_node, local_object);
-
- /* Ray parameters in registers. */
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
- float isect_t = ray->t;
-
- if (local_isect != NULL) {
- local_isect->num_hits = 0;
- }
- kernel_assert((local_isect == NULL) == (max_hits == 0));
-
- const int object_flag = kernel_tex_fetch(__object_flag, local_object);
- if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
- isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#else
- isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
-#endif
- object = local_object;
- }
-
- avxf tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
- avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
- avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- avxf dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for 3 or 4 hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
-
- /* Five children are hit, push all onto stack and sort 5
- * stack items, continue with closest child
- */
- r = __bscf(child_mask);
- int c4 = __float_as_int(cnodes[r]);
- float d4 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
- /* Six children are hit, push all onto stack and sort 6
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c5 = __float_as_int(cnodes[r]);
- float d5 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
-
- /* Seven children are hit, push all onto stack and sort 7
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c6 = __float_as_int(cnodes[r]);
- float d6 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
- /* Eight children are hit, push all onto stack and sort 8
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c7 = __float_as_int(cnodes[r]);
- float d7 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c7;
- traversal_stack[stack_ptr].dist = d7;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6],
- &traversal_stack[stack_ptr - 7]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
- int prim_addr = __float_as_int(leaf.x);
-
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (type & PRIMITIVE_ALL) {
- case PRIMITIVE_TRIANGLE: {
- /* Intersect ray against primitive, */
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect_local(kg,
- local_isect,
- P,
- dir,
- object,
- local_object,
- prim_addr,
- isect_t,
- lcg_state,
- max_hits)) {
- return true;
- }
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- /* Intersect ray against primitive. */
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (motion_triangle_intersect_local(kg,
- local_isect,
- P,
- dir,
- ray->time,
- object,
- local_object,
- prim_addr,
- isect_t,
- lcg_state,
- max_hits)) {
- return true;
- }
- }
- break;
- }
-#endif
- default:
- break;
- }
- }
- } while (node_addr != ENTRYPOINT_SENTINEL);
- } while (node_addr != ENTRYPOINT_SENTINEL);
- return false;
-}
-
-#undef NODE_INTERSECT
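Note: the deleted OBVH traversal above walks an 8-wide node by pulling child indices out of a hit mask with `__bscf`, continuing with the nearest child and pushing the rest (sorted) onto the stack. The sketch below shows the same selection logic with a portable bit scan; `StackItem` and `select_child` are invented names, and `__builtin_ctz` is a GCC/Clang builtin used as a stand-in for `__bscf`.

#include <algorithm>
#include <cassert>
#include <utility>

struct StackItem {
  int addr;
  float dist;
};

static int pop_lowest_bit(int &mask)
{
  const int r = __builtin_ctz(mask); /* index of the lowest set bit */
  mask &= mask - 1;                  /* clear it */
  return r;
}

/* child_addr/child_dist hold one entry per wide-node child (up to 8).
 * Returns the address to continue with; the remaining hit children are
 * pushed on the stack so the nearest of them is popped first. */
static int select_child(int child_mask,
                        const int child_addr[8],
                        const float child_dist[8],
                        StackItem *stack,
                        int &stack_ptr)
{
  assert(child_mask != 0);
  int r = pop_lowest_bit(child_mask);
  int best_addr = child_addr[r];
  float best_dist = child_dist[r];
  const int first_pushed = stack_ptr + 1;
  while (child_mask != 0) {
    r = pop_lowest_bit(child_mask);
    int addr = child_addr[r];
    float dist = child_dist[r];
    if (dist < best_dist) {
      std::swap(addr, best_addr);
      std::swap(dist, best_dist);
    }
    stack[++stack_ptr] = {addr, dist};
  }
  /* Sort only the children pushed by this node, farthest first, which is
   * what the unrolled obvh_stack_sort calls achieve for 3..8 hits. */
  std::sort(stack + first_pushed, stack + stack_ptr + 1,
            [](const StackItem &a, const StackItem &b) { return a.dist > b.dist; });
  return best_addr;
}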
diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h
deleted file mode 100644
index 6831562cade..00000000000
--- a/intern/cycles/kernel/bvh/obvh_nodes.h
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Aligned nodes intersection AVX code is adopted from Embree,
- */
-
-struct OBVHStackItem {
- int addr;
- float dist;
-};
-
-ccl_device_inline void obvh_near_far_idx_calc(const float3 &idir,
- int *ccl_restrict near_x,
- int *ccl_restrict near_y,
- int *ccl_restrict near_z,
- int *ccl_restrict far_x,
- int *ccl_restrict far_y,
- int *ccl_restrict far_z)
-
-{
-#ifdef __KERNEL_SSE__
- *near_x = 0;
- *far_x = 1;
- *near_y = 2;
- *far_y = 3;
- *near_z = 4;
- *far_z = 5;
-
- const size_t mask = movemask(ssef(idir.m128));
-
- const int mask_x = mask & 1;
- const int mask_y = (mask & 2) >> 1;
- const int mask_z = (mask & 4) >> 2;
-
- *near_x += mask_x;
- *far_x -= mask_x;
- *near_y += mask_y;
- *far_y -= mask_y;
- *near_z += mask_z;
- *far_z -= mask_z;
-#else
- if (idir.x >= 0.0f) {
- *near_x = 0;
- *far_x = 1;
- }
- else {
- *near_x = 1;
- *far_x = 0;
- }
- if (idir.y >= 0.0f) {
- *near_y = 2;
- *far_y = 3;
- }
- else {
- *near_y = 3;
- *far_y = 2;
- }
- if (idir.z >= 0.0f) {
- *near_z = 4;
- *far_z = 5;
- }
- else {
- *near_z = 5;
- *far_z = 4;
- }
-#endif
-}
-
-ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, OBVHStackItem *ccl_restrict b)
-{
- OBVHStackItem tmp = *a;
- *a = *b;
- *b = tmp;
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
- OBVHStackItem *ccl_restrict s2,
- OBVHStackItem *ccl_restrict s3)
-{
- if (s2->dist < s1->dist) {
- obvh_item_swap(s2, s1);
- }
- if (s3->dist < s2->dist) {
- obvh_item_swap(s3, s2);
- }
- if (s2->dist < s1->dist) {
- obvh_item_swap(s2, s1);
- }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
- OBVHStackItem *ccl_restrict s2,
- OBVHStackItem *ccl_restrict s3,
- OBVHStackItem *ccl_restrict s4)
-{
- if (s2->dist < s1->dist) {
- obvh_item_swap(s2, s1);
- }
- if (s4->dist < s3->dist) {
- obvh_item_swap(s4, s3);
- }
- if (s3->dist < s1->dist) {
- obvh_item_swap(s3, s1);
- }
- if (s4->dist < s2->dist) {
- obvh_item_swap(s4, s2);
- }
- if (s3->dist < s2->dist) {
- obvh_item_swap(s3, s2);
- }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
- OBVHStackItem *ccl_restrict s2,
- OBVHStackItem *ccl_restrict s3,
- OBVHStackItem *ccl_restrict s4,
- OBVHStackItem *ccl_restrict s5)
-{
- obvh_stack_sort(s1, s2, s3, s4);
- if (s5->dist < s4->dist) {
- obvh_item_swap(s4, s5);
- if (s4->dist < s3->dist) {
- obvh_item_swap(s3, s4);
- if (s3->dist < s2->dist) {
- obvh_item_swap(s2, s3);
- if (s2->dist < s1->dist) {
- obvh_item_swap(s1, s2);
- }
- }
- }
- }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
- OBVHStackItem *ccl_restrict s2,
- OBVHStackItem *ccl_restrict s3,
- OBVHStackItem *ccl_restrict s4,
- OBVHStackItem *ccl_restrict s5,
- OBVHStackItem *ccl_restrict s6)
-{
- obvh_stack_sort(s1, s2, s3, s4, s5);
- if (s6->dist < s5->dist) {
- obvh_item_swap(s5, s6);
- if (s5->dist < s4->dist) {
- obvh_item_swap(s4, s5);
- if (s4->dist < s3->dist) {
- obvh_item_swap(s3, s4);
- if (s3->dist < s2->dist) {
- obvh_item_swap(s2, s3);
- if (s2->dist < s1->dist) {
- obvh_item_swap(s1, s2);
- }
- }
- }
- }
- }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
- OBVHStackItem *ccl_restrict s2,
- OBVHStackItem *ccl_restrict s3,
- OBVHStackItem *ccl_restrict s4,
- OBVHStackItem *ccl_restrict s5,
- OBVHStackItem *ccl_restrict s6,
- OBVHStackItem *ccl_restrict s7)
-{
- obvh_stack_sort(s1, s2, s3, s4, s5, s6);
- if (s7->dist < s6->dist) {
- obvh_item_swap(s6, s7);
- if (s6->dist < s5->dist) {
- obvh_item_swap(s5, s6);
- if (s5->dist < s4->dist) {
- obvh_item_swap(s4, s5);
- if (s4->dist < s3->dist) {
- obvh_item_swap(s3, s4);
- if (s3->dist < s2->dist) {
- obvh_item_swap(s2, s3);
- if (s2->dist < s1->dist) {
- obvh_item_swap(s1, s2);
- }
- }
- }
- }
- }
- }
-}
-
-ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1,
- OBVHStackItem *ccl_restrict s2,
- OBVHStackItem *ccl_restrict s3,
- OBVHStackItem *ccl_restrict s4,
- OBVHStackItem *ccl_restrict s5,
- OBVHStackItem *ccl_restrict s6,
- OBVHStackItem *ccl_restrict s7,
- OBVHStackItem *ccl_restrict s8)
-{
- obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7);
- if (s8->dist < s7->dist) {
- obvh_item_swap(s7, s8);
- if (s7->dist < s6->dist) {
- obvh_item_swap(s6, s7);
- if (s6->dist < s5->dist) {
- obvh_item_swap(s5, s6);
- if (s5->dist < s4->dist) {
- obvh_item_swap(s4, s5);
- if (s4->dist < s3->dist) {
- obvh_item_swap(s3, s4);
- if (s3->dist < s2->dist) {
- obvh_item_swap(s2, s3);
- if (s2->dist < s1->dist) {
- obvh_item_swap(s1, s2);
- }
- }
- }
- }
- }
- }
- }
-}
-
-/* Axis-aligned nodes intersection */
-
-ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
- const avxf &isect_near,
- const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
- const avx3f &org_idir,
-#else
- const avx3f &org,
-#endif
- const avx3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- avxf *ccl_restrict dist)
-{
- const int offset = node_addr + 2;
-#ifdef __KERNEL_AVX2__
- const avxf tnear_x = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, org_idir.x);
- const avxf tnear_y = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, org_idir.y);
- const avxf tnear_z = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, org_idir.z);
- const avxf tfar_x = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, org_idir.x);
- const avxf tfar_y = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, org_idir.y);
- const avxf tfar_z = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, org_idir.z);
-
- const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
- const avxb vmask = tnear <= tfar;
- int mask = (int)movemask(vmask);
- *dist = tnear;
- return mask;
-#else
- return 0;
-#endif
-}
-
-ccl_device_inline int obvh_aligned_node_intersect_robust(KernelGlobals *ccl_restrict kg,
- const avxf &isect_near,
- const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
- const avx3f &P_idir,
-#else
- const avx3f &P,
-#endif
- const avx3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- const float difl,
- avxf *ccl_restrict dist)
-{
- const int offset = node_addr + 2;
-#ifdef __KERNEL_AVX2__
- const avxf tnear_x = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x);
- const avxf tfar_x = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x);
- const avxf tnear_y = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y);
- const avxf tfar_y = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y);
- const avxf tnear_z = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z);
- const avxf tfar_z = msub(
- kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z);
-
- const float round_down = 1.0f - difl;
- const float round_up = 1.0f + difl;
- const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
- const avxb vmask = round_down * tnear <= round_up * tfar;
- int mask = (int)movemask(vmask);
- *dist = tnear;
- return mask;
-#else
- return 0;
-#endif
-}
-
-/* Unaligned nodes intersection */
-
-ccl_device_inline int obvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg,
- const avxf &isect_near,
- const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
- const avx3f &org_idir,
-#endif
- const avx3f &org,
- const avx3f &dir,
- const avx3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- avxf *ccl_restrict dist)
-{
- const int offset = node_addr;
- const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2);
- const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4);
- const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6);
-
- const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8);
- const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10);
- const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12);
-
- const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14);
- const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16);
- const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18);
-
- const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20);
- const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22);
- const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24);
-
- const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
- aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
- aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;
-
- const avxf aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x,
- aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y,
- aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z;
-
- const avxf neg_one(-1.0f);
- const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
- nrdir_z = neg_one / aligned_dir_z;
-
- const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
- tlower_z = aligned_P_z * nrdir_z;
-
- const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
- tupper_z = tlower_z - nrdir_z;
-
- const avxf tnear_x = min(tlower_x, tupper_x);
- const avxf tnear_y = min(tlower_y, tupper_y);
- const avxf tnear_z = min(tlower_z, tupper_z);
- const avxf tfar_x = max(tlower_x, tupper_x);
- const avxf tfar_y = max(tlower_y, tupper_y);
- const avxf tfar_z = max(tlower_z, tupper_z);
- const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const avxb vmask = tnear <= tfar;
- *dist = tnear;
- return movemask(vmask);
-}
-
-ccl_device_inline int obvh_unaligned_node_intersect_robust(KernelGlobals *ccl_restrict kg,
- const avxf &isect_near,
- const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
- const avx3f &P_idir,
-#endif
- const avx3f &P,
- const avx3f &dir,
- const avx3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- const float difl,
- avxf *ccl_restrict dist)
-{
- const int offset = node_addr;
- const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2);
- const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4);
- const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6);
-
- const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8);
- const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10);
- const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12);
-
- const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14);
- const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16);
- const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18);
-
- const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20);
- const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22);
- const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24);
-
- const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
- aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
- aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;
-
- const avxf aligned_P_x = P.x * tfm_x_x + P.y * tfm_x_y + P.z * tfm_x_z + tfm_t_x,
- aligned_P_y = P.x * tfm_y_x + P.y * tfm_y_y + P.z * tfm_y_z + tfm_t_y,
- aligned_P_z = P.x * tfm_z_x + P.y * tfm_z_y + P.z * tfm_z_z + tfm_t_z;
-
- const avxf neg_one(-1.0f);
- const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
- nrdir_z = neg_one / aligned_dir_z;
-
- const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
- tlower_z = aligned_P_z * nrdir_z;
-
- const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
- tupper_z = tlower_z - nrdir_z;
-
- const float round_down = 1.0f - difl;
- const float round_up = 1.0f + difl;
-
- const avxf tnear_x = min(tlower_x, tupper_x);
- const avxf tnear_y = min(tlower_y, tupper_y);
- const avxf tnear_z = min(tlower_z, tupper_z);
- const avxf tfar_x = max(tlower_x, tupper_x);
- const avxf tfar_y = max(tlower_y, tupper_y);
- const avxf tfar_z = max(tlower_z, tupper_z);
-
- const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const avxb vmask = round_down * tnear <= round_up * tfar;
- *dist = tnear;
- return movemask(vmask);
-}
-
-/* Intersectors wrappers.
- *
- * They'll check node type and call appropriate intersection code.
- */
-
-ccl_device_inline int obvh_node_intersect(KernelGlobals *ccl_restrict kg,
- const avxf &isect_near,
- const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
- const avx3f &org_idir,
-#endif
- const avx3f &org,
- const avx3f &dir,
- const avx3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- avxf *ccl_restrict dist)
-{
- const int offset = node_addr;
- const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return obvh_unaligned_node_intersect(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- org_idir,
-#endif
- org,
- dir,
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- dist);
- }
- else {
- return obvh_aligned_node_intersect(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- org_idir,
-#else
- org,
-#endif
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- dist);
- }
-}
-
-ccl_device_inline int obvh_node_intersect_robust(KernelGlobals *ccl_restrict kg,
- const avxf &isect_near,
- const avxf &isect_far,
-#ifdef __KERNEL_AVX2__
- const avx3f &P_idir,
-#endif
- const avx3f &P,
- const avx3f &dir,
- const avx3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- const float difl,
- avxf *ccl_restrict dist)
-{
- const int offset = node_addr;
- const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return obvh_unaligned_node_intersect_robust(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- P_idir,
-#endif
- P,
- dir,
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- difl,
- dist);
- }
- else {
- return obvh_aligned_node_intersect_robust(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- P_idir,
-#else
- P,
-#endif
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- difl,
- dist);
- }
-}
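
The eight-wide intersectors above evaluate a slab test for all children of a node at once. As a point of reference, here is a minimal scalar sketch of that test; the `Bounds` and `slab_hit` names are hypothetical and not part of the Cycles sources, but the per-axis near/far selection by direction sign is the same idea the `near_x`/`far_x` offsets encode above.

/* Minimal scalar sketch of the slab test (illustrative only). */
#include <algorithm>

struct Bounds {
  float lower[3];
  float upper[3];
};

/* Per axis, the sign of the inverse direction decides which bound acts as the
 * near plane; the box is hit when the accumulated near distance does not
 * exceed the accumulated far distance. */
static bool slab_hit(const float P[3], const float idir[3],
                     const Bounds &b, float t_min, float t_max)
{
  for (int axis = 0; axis < 3; ++axis) {
    const bool negative = (idir[axis] < 0.0f);
    const float near_bound = negative ? b.upper[axis] : b.lower[axis];
    const float far_bound = negative ? b.lower[axis] : b.upper[axis];
    t_min = std::max(t_min, (near_bound - P[axis]) * idir[axis]);
    t_max = std::min(t_max, (far_bound - P[axis]) * idir[axis]);
  }
  return t_min <= t_max;
}
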
diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h
deleted file mode 100644
index 98efb003788..00000000000
--- a/intern/cycles/kernel/bvh/obvh_shadow_all.h
+++ /dev/null
@@ -1,670 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT obvh_node_intersect
-#else
-# define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect_array,
- const int skip_object,
- const uint max_hits,
- uint *num_hits)
-{
- /* TODO(sergey):
- * - Test if pushing distance on the stack helps.
- * - Likely and unlikely for if() statements.
- * - Test restrict attribute for pointers.
- */
-
- /* Traversal stack in CUDA thread-local memory. */
- OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
-
- /* Ray parameters in registers. */
- const float tmax = ray->t;
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
- float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- *num_hits = 0;
- isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
- int num_hits_in_instance = 0;
-#endif
-
- avxf tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
- avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
- avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- (void)inodes;
-
- if (false
-#ifdef __VISIBILITY_FLAG__
- || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
-#endif
-#if BVH_FEATURE(BVH_MOTION)
- || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
- ) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- avxf dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for three or more hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
-
- /* Five children are hit, push all onto stack and sort 5
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c4 = __float_as_int(cnodes[r]);
- float d4 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Six children are hit, push all onto stack and sort 6
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c5 = __float_as_int(cnodes[r]);
- float d5 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
-
- /* Seven children are hit, push all onto stack and sort 7
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c6 = __float_as_int(cnodes[r]);
- float d6 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Eight children are hit, push all onto stack and sort 8
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c7 = __float_as_int(cnodes[r]);
- float d7 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c7;
- traversal_stack[stack_ptr].dist = d7;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6],
- &traversal_stack[stack_ptr - 7]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-#ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-#endif
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
- const uint p_type = type & PRIMITIVE_ALL;
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- if (p_type == PRIMITIVE_TRIANGLE) {
- int prim_count = prim_addr2 - prim_addr;
- if (prim_count < 3) {
- while (prim_addr < prim_addr2) {
- kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) ==
- p_type);
- int hit = triangle_intersect(
- kg, isect_array, P, dir, PATH_RAY_SHADOW, object, prim_addr);
- /* Shadow ray early termination. */
- if (hit) {
- /* detect if this surface has a shader with transparent shadows */
-
- /* todo: optimize so primitive visibility flag indicates if
- * the primitive has a transparent shadow shader? */
- int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
- int shader = 0;
-
-#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
- {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
-#ifdef __HAIR__
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-#endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- /* if no transparent shadows, all light is blocked */
- if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
- return true;
- }
-
- /* move on to next entry in intersections array */
- isect_array++;
- (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-#endif
-
- isect_array->t = isect_t;
- }
-
- prim_addr++;
- } //while
- }
- else {
- kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) ==
- p_type);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- int *nhiptr = &num_hits_in_instance;
-#else
- int nhi = 0;
- int *nhiptr = &nhi;
-#endif
-
- int result = triangle_intersect8(kg,
- &isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- prim_count,
- num_hits,
- max_hits,
- nhiptr,
- isect_t);
- if (result == 2) {
- return true;
- }
- } // prim_count
- } // PRIMITIVE_TRIANGLE
- else {
- while (prim_addr < prim_addr2) {
- kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
-#ifdef __SHADOW_TRICKS__
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- if (tri_object == skip_object) {
- ++prim_addr;
- continue;
- }
-#endif
-
- bool hit;
-
- /* todo: specialized intersect functions which don't fill in
- * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
- * might give a few % performance improvement */
-
- switch (p_type) {
-
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, prim_addr);
- break;
- }
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- case PRIMITIVE_CURVE:
- case PRIMITIVE_MOTION_CURVE: {
- const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = cardinal_curve_intersect(kg,
- isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0,
- 0);
- }
- else {
- hit = curve_intersect(kg,
- isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0,
- 0);
- }
- break;
- }
-#endif
- default: {
- hit = false;
- break;
- }
- }
-
- /* Shadow ray early termination. */
- if (hit) {
- /* detect if this surface has a shader with transparent shadows */
-
- /* todo: optimize so primitive visibility flag indicates if
- * the primitive has a transparent shadow shader? */
- int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
- int shader = 0;
-
-#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
- {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
-#ifdef __HAIR__
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-#endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- /* if no transparent shadows, all light is blocked */
- if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
- return true;
- }
-
- /* move on to next entry in intersections array */
- isect_array++;
- (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-#endif
-
- isect_array->t = isect_t;
- }
-
- prim_addr++;
- } //while prim
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-# if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-# else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-# endif
-
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
- node_addr = kernel_tex_fetch(__object_node, object);
- }
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-# else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-# endif
- /* Scale isect->t to adjust for instancing. */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-# else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-# endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return false;
-}
-
-#undef NODE_INTERSECT
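
The transparent-shadow handling in the function above applies the same decision in both the triangle and the curve paths: terminate as soon as an opaque surface is hit or the hit limit is reached, otherwise record the hit and keep traversing. A condensed sketch of that policy follows, using hypothetical names rather than the Cycles API.

/* Sketch of the record-or-terminate policy for shadow rays (illustrative only). */
enum ShadowResult { SHADOW_BLOCKED, SHADOW_RECORDED };

static ShadowResult record_shadow_hit(bool has_transparent_shadow,
                                      unsigned num_hits, unsigned max_hits)
{
  if (!has_transparent_shadow) {
    return SHADOW_BLOCKED; /* opaque surface: all light is blocked */
  }
  if (num_hits == max_hits) {
    return SHADOW_BLOCKED; /* hit limit reached: treat as fully blocked */
  }
  return SHADOW_RECORDED; /* keep the hit and continue traversal */
}
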
diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h
deleted file mode 100644
index 86b1de48aaa..00000000000
--- a/intern/cycles/kernel/bvh/obvh_traversal.h
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT obvh_node_intersect
-# define NODE_INTERSECT_ROBUST obvh_node_intersect_robust
-#else
-# define NODE_INTERSECT obvh_aligned_node_intersect
-# define NODE_INTERSECT_ROBUST obvh_aligned_node_intersect_robust
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect,
- const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- uint *lcg_state,
- float difl,
- float extmax
-#endif
-)
-{
- /* Traversal stack in CUDA thread-local memory. */
- OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
- traversal_stack[0].dist = -FLT_MAX;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
- float node_dist = -FLT_MAX;
-
- /* Ray parameters in registers. */
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- isect->t = ray->t;
- isect->u = 0.0f;
- isect->v = 0.0f;
- isect->prim = PRIM_NONE;
- isect->object = OBJECT_NONE;
-
- BVH_DEBUG_INIT();
- avxf tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
- avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
- avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- (void)inodes;
-
- if (UNLIKELY(node_dist > isect->t)
-#if BVH_FEATURE(BVH_MOTION)
- || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
-#ifdef __VISIBILITY_FLAG__
- || (__float_as_uint(inodes.x) & visibility) == 0
-#endif
- ) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- int child_mask;
- avxf dist;
-
- BVH_DEBUG_NEXT_NODE();
-
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- if (difl != 0.0f) {
- /* NOTE: We extend all the child bounding boxes instead of fetching
- * and checking visibility flags for each of them.
- *
- * Need to test if doing the opposite would be any faster.
- */
- child_mask = NODE_INTERSECT_ROBUST(kg,
- tnear,
- tfar,
-# ifdef __KERNEL_AVX2__
- P_idir4,
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-# endif
-# if BVH_FEATURE(BVH_HAIR)
- dir4,
-# endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- difl,
- &dist);
- }
- else
-#endif /* BVH_HAIR_MINIMUM_WIDTH */
- {
- child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
- }
-
- if (child_mask != 0) {
- avxf cnodes;
- /* TODO(sergey): Investigate whether moving cnodes upwards
- * gives a speedup (will be different cache pattern but will
- * avoid extra check here).
- */
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- float d0 = ((float *)&dist)[r];
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- node_dist = d0;
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- node_dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- node_dist = d0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for three or more hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
-
- /* Five children are hit, push all onto stack and sort 5
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c4 = __float_as_int(cnodes[r]);
- float d4 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- /* Six children are hit, push all onto stack and sort 6
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c5 = __float_as_int(cnodes[r]);
- float d5 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
-
- /* Seven children are hit, push all onto stack and sort 7
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c6 = __float_as_int(cnodes[r]);
- float d6 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- /* Eight children are hit, push all onto stack and sort 8
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c7 = __float_as_int(cnodes[r]);
- float d7 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c7;
- traversal_stack[stack_ptr].dist = d7;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6],
- &traversal_stack[stack_ptr - 7]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-#ifdef __VISIBILITY_FLAG__
- if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
-#else
- if (UNLIKELY((node_dist > isect->t)))
-#endif
- {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (type & PRIMITIVE_ALL) {
- case PRIMITIVE_TRIANGLE: {
- int prim_count = prim_addr2 - prim_addr;
- if (prim_count < 3) {
- for (; prim_addr < prim_addr2; prim_addr++) {
- BVH_DEBUG_NEXT_INTERSECTION();
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
- tfar = avxf(isect->t);
- /* Shadow ray early termination. */
- if (visibility == PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- } //for
- }
- else {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect8(kg,
- &isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- prim_count,
- 0,
- 0,
- NULL,
- 0.0f)) {
- tfar = avxf(isect->t);
- if (visibility == PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- } //prim count
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- BVH_DEBUG_NEXT_INTERSECTION();
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
- tfar = avxf(isect->t);
- /* Shadow ray early termination. */
- if (visibility == PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- }
- break;
- }
-#endif /* BVH_FEATURE(BVH_MOTION) */
-#if BVH_FEATURE(BVH_HAIR)
- case PRIMITIVE_CURVE:
- case PRIMITIVE_MOTION_CURVE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- BVH_DEBUG_NEXT_INTERSECTION();
- const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- bool hit;
- if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = cardinal_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
- }
- else {
- hit = curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
- }
- if (hit) {
- tfar = avxf(isect->t);
- /* Shadow ray early termination. */
- if (visibility == PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- }
- break;
- }
-#endif /* BVH_FEATURE(BVH_HAIR) */
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-# if BVH_FEATURE(BVH_MOTION)
- qbvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
-# else
- qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
-# endif
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
- traversal_stack[stack_ptr].dist = -FLT_MAX;
-
- node_addr = kernel_tex_fetch(__object_node, object);
-
- BVH_DEBUG_NEXT_INSTANCE();
- }
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
-# if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
- isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
-#undef NODE_INTERSECT_ROBUST
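
The `__bscf` calls in the traversal above peel one set bit at a time off `child_mask` to obtain the index of the next intersected child. Below is a portable sketch of the same idiom; it assumes the GCC/Clang `__builtin_ctz` builtin rather than the Cycles macro.

/* Sketch: iterate over the set bits of an 8-wide hit mask (illustrative only). */
static void visit_hit_children(unsigned child_mask, const float child_dist[8])
{
  while (child_mask != 0) {
    const int r = __builtin_ctz(child_mask); /* index of the lowest set bit */
    child_mask &= child_mask - 1u;           /* clear that bit */
    /* A real traversal would now descend into or push child r, keyed on
     * child_dist[r]; here we only demonstrate the mask iteration. */
    (void)child_dist[r];
  }
}
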
diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h
deleted file mode 100644
index fb41ae783ab..00000000000
--- a/intern/cycles/kernel/bvh/obvh_volume.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT obvh_node_intersect
-#else
-# define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect,
- const uint visibility)
-{
- /* Traversal stack in CUDA thread-local memory. */
- OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
-
- /* Ray parameters in registers. */
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- isect->t = ray->t;
- isect->u = 0.0f;
- isect->v = 0.0f;
- isect->prim = PRIM_NONE;
- isect->object = OBJECT_NONE;
-
- avxf tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
- avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
- avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(inodes.x) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-#endif
-
- avxf dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for three or more hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
-
- /* Five children are hit, push all onto stack and sort 5
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c4 = __float_as_int(cnodes[r]);
- float d4 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Six children are hit, push all onto stack and sort 6
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c5 = __float_as_int(cnodes[r]);
- float d5 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
-
- /* Seven children are hit, push all onto stack and sort 7
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c6 = __float_as_int(cnodes[r]);
- float d6 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Eight children are hit, push all onto stack and sort 8
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c7 = __float_as_int(cnodes[r]);
- float d7 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c7;
- traversal_stack[stack_ptr].dist = d7;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6],
- &traversal_stack[stack_ptr - 7]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
- if ((__float_as_uint(leaf.z) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
- const uint p_type = type & PRIMITIVE_ALL;
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (p_type) {
- case PRIMITIVE_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- /* Only primitives from volume object. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- /* Only primitives from volume object. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr);
- }
- break;
- }
-#endif
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME) {
-# if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
- node_addr = kernel_tex_fetch(__object_node, object);
- }
- else {
- /* Pop. */
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
- }
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
-# if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
- isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
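
`obvh_near_far_idx_calc`, used throughout the traversals above, precomputes once per ray (and again after every instance push or pop) which packed bound row acts as the near or far plane for each axis. The sketch below shows that selection with an assumed 0..5 row layout (x/y/z lower bounds followed by x/y/z upper bounds); the actual node layout in Cycles differs, so treat the offsets as purely illustrative.

/* Sketch of near/far row selection from the inverse ray direction. */
static void near_far_idx(const float idir[3], int near_idx[3], int far_idx[3])
{
  for (int axis = 0; axis < 3; ++axis) {
    if (idir[axis] >= 0.0f) {
      near_idx[axis] = axis;     /* lower-bound row becomes the near plane */
      far_idx[axis] = axis + 3;  /* upper-bound row becomes the far plane */
    }
    else {
      near_idx[axis] = axis + 3; /* signs flipped: upper bound is nearer */
      far_idx[axis] = axis;
    }
  }
}
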
diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h
deleted file mode 100644
index 56e2afd4a11..00000000000
--- a/intern/cycles/kernel/bvh/obvh_volume_all.h
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT obvh_node_intersect
-#else
-# define NODE_INTERSECT obvh_aligned_node_intersect
-#endif
-
-ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect_array,
- const uint max_hits,
- const uint visibility)
-{
- /* Traversal stack in CUDA thread-local memory. */
- OBVHStackItem traversal_stack[BVH_OSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
-
- /* Ray parameters in registers. */
- const float tmax = ray->t;
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
- float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- uint num_hits = 0;
- isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
- int num_hits_in_instance = 0;
-#endif
-
- avxf tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
- avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-#endif
- avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(inodes.x) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-#endif
-
- avxf dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- avxf cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for three or more hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
-
- /* Five children are hit, push all onto stack and sort 5
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c4 = __float_as_int(cnodes[r]);
- float d4 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Six children are hit, push all onto stack and sort 6
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c5 = __float_as_int(cnodes[r]);
- float d5 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c5;
- traversal_stack[stack_ptr].dist = d5;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c4;
- traversal_stack[stack_ptr].dist = d4;
-
- /* Seven children are hit, push all onto stack and sort 7
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c6 = __float_as_int(cnodes[r]);
- float d6 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Eight children are hit, push all onto stack and sort 8
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c7 = __float_as_int(cnodes[r]);
- float d7 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c7;
- traversal_stack[stack_ptr].dist = d7;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c6;
- traversal_stack[stack_ptr].dist = d6;
- obvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3],
- &traversal_stack[stack_ptr - 4],
- &traversal_stack[stack_ptr - 5],
- &traversal_stack[stack_ptr - 6],
- &traversal_stack[stack_ptr - 7]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
- if ((__float_as_uint(leaf.z) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
- const uint p_type = type & PRIMITIVE_ALL;
- bool hit;
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (p_type) {
- case PRIMITIVE_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-              /* Only consider primitives from volume objects. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
- if (hit) {
- /* Move on to next entry in intersections array. */
- isect_array++;
- num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-#endif
- isect_array->t = isect_t;
- if (num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
-# if BVH_FEATURE(BVH_MOTION)
- float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-# else
- Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
- float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-# endif
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
- return num_hits;
- }
- }
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-              /* Only consider primitives from volume objects. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
- if (hit) {
- /* Move on to next entry in intersections array. */
- isect_array++;
- num_hits++;
-# if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-# endif
- isect_array->t = isect_t;
- if (num_hits == max_hits) {
-# if BVH_FEATURE(BVH_INSTANCING)
-# if BVH_FEATURE(BVH_MOTION)
- float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-# else
- Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
- float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-# endif
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
-# endif /* BVH_FEATURE(BVH_INSTANCING) */
- return num_hits;
- }
- }
- }
- break;
- }
-#endif
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME) {
-# if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-# else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-# endif
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect_t);
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_OSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
- node_addr = kernel_tex_fetch(__object_node, object);
- }
- else {
- /* Pop. */
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
- }
- }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-# else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-# endif
- /* Scale isect->t to adjust for instancing. */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-# else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-# endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
-
- obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = avxf(isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z));
-# endif
- idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return num_hits;
-}
-
-#undef NODE_INTERSECT
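
The early return above, taken when max_hits is reached while inside an instance, multiplies every hit recorded since the instance push by a common factor t_fac before returning. A minimal standalone sketch of that back-scaling step follows (plain C++, not Cycles code; the Intersection struct and the t_fac value are made-up stand-ins, and the real factor comes from how the instance transform scales the ray direction). It illustrates why the (isect_array - i - 1) indexing touches only the hits from the current instance: isect_array is advanced once per recorded hit, so it always points at the next free slot.

#include <cstdio>

struct Intersection {
  float t;
};

int main()
{
  /* isect_array points at the next free slot, as it does in the traversal
   * loop after the last "isect_array++". */
  Intersection hits[8] = {{1.0f}, {3.0f}, {4.5f}, {0.0f}};
  Intersection *isect_array = hits + 3;

  const int num_hits_in_instance = 2; /* The two most recent hits were recorded inside an instance. */
  const float t_fac = 0.5f;           /* Stand-in for 1 / len(transformed dir). */

  /* Walk backwards over just those hits and rescale their distances. */
  for (int i = 0; i < num_hits_in_instance; i++) {
    (isect_array - i - 1)->t *= t_fac;
  }

  for (int i = 0; i < 3; i++) {
    std::printf("hit %d: t = %g\n", i, hits[i].t);
  }
  return 0;
}
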
diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h
deleted file mode 100644
index b21f79bd3a0..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_local.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for finding local intersections
- * around the shading point, for subsurface scattering and bevel. We disable
- * various features for performance, and for instanced objects avoid traversing
- * other parts of the scene.
- *
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT qbvh_node_intersect
-#else
-# define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
- const Ray *ray,
- LocalIntersection *local_isect,
- int local_object,
- uint *lcg_state,
- int max_hits)
-{
- /* TODO(sergey):
- * - Test if pushing distance on the stack helps (for non shadow rays).
- * - Separate version for shadow rays.
- * - Likely and unlikely for if() statements.
- * - SSE for hair.
- * - Test restrict attribute for pointers.
- */
-
- /* Traversal stack in CUDA thread-local memory. */
- QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_tex_fetch(__object_node, local_object);
-
- /* Ray parameters in registers. */
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
- float isect_t = ray->t;
-
- if (local_isect != NULL) {
- local_isect->num_hits = 0;
- }
- kernel_assert((local_isect == NULL) == (max_hits == 0));
-
- const int object_flag = kernel_tex_fetch(__object_flag, local_object);
- if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
- isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-#else
- isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
-#endif
- object = local_object;
- }
-
- ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
- sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
- sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- ssef dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for 3 or 4 hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
- int prim_addr = __float_as_int(leaf.x);
-
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (type & PRIMITIVE_ALL) {
- case PRIMITIVE_TRIANGLE: {
-            /* Intersect ray against primitive. */
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect_local(kg,
- local_isect,
- P,
- dir,
- object,
- local_object,
- prim_addr,
- isect_t,
- lcg_state,
- max_hits)) {
- return true;
- }
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- /* Intersect ray against primitive. */
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (motion_triangle_intersect_local(kg,
- local_isect,
- P,
- dir,
- ray->time,
- object,
- local_object,
- prim_addr,
- isect_t,
- lcg_state,
- max_hits)) {
- return true;
- }
- }
- break;
- }
-#endif
- default:
- break;
- }
- }
- } while (node_addr != ENTRYPOINT_SENTINEL);
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return false;
-}
-
-#undef NODE_INTERSECT
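
All of these traversal templates extract hit children from child_mask one slot at a time with __bscf, which returns the index of the lowest set bit and clears it. The short standalone sketch below (plain C++, not Cycles code; it assumes a C++20 compiler for std::countr_zero) splits that intrinsic into a bit scan plus an explicit clear, which is all it does.

#include <bit>
#include <cstdio>

int main()
{
  unsigned child_mask = 0b1011u; /* Children 0, 1 and 3 were hit. */
  while (child_mask != 0) {
    const int r = std::countr_zero(child_mask); /* Index of the lowest set bit. */
    child_mask &= child_mask - 1u;              /* Clear that bit. */
    std::printf("visit child slot %d\n", r);
  }
  return 0;
}

Because the mask comes from a per-lane vector comparison, the returned slot r also indexes the per-child distance lane, which is why the traversal code reads ((float *)&dist)[r].
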
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
deleted file mode 100644
index 7c1d8c8c72e..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * Copyright 2011-2014, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
 - * Aligned node intersection SSE code is adapted from Embree.
- */
-
-struct QBVHStackItem {
- int addr;
- float dist;
-};
-
-ccl_device_inline void qbvh_near_far_idx_calc(const float3 &idir,
- int *ccl_restrict near_x,
- int *ccl_restrict near_y,
- int *ccl_restrict near_z,
- int *ccl_restrict far_x,
- int *ccl_restrict far_y,
- int *ccl_restrict far_z)
-
-{
-#ifdef __KERNEL_SSE__
- *near_x = 0;
- *far_x = 1;
- *near_y = 2;
- *far_y = 3;
- *near_z = 4;
- *far_z = 5;
-
- const size_t mask = movemask(ssef(idir.m128));
-
- const int mask_x = mask & 1;
- const int mask_y = (mask & 2) >> 1;
- const int mask_z = (mask & 4) >> 2;
-
- *near_x += mask_x;
- *far_x -= mask_x;
- *near_y += mask_y;
- *far_y -= mask_y;
- *near_z += mask_z;
- *far_z -= mask_z;
-#else
- if (idir.x >= 0.0f) {
- *near_x = 0;
- *far_x = 1;
- }
- else {
- *near_x = 1;
- *far_x = 0;
- }
- if (idir.y >= 0.0f) {
- *near_y = 2;
- *far_y = 3;
- }
- else {
- *near_y = 3;
- *far_y = 2;
- }
- if (idir.z >= 0.0f) {
- *near_z = 4;
- *far_z = 5;
- }
- else {
- *near_z = 5;
- *far_z = 4;
- }
-#endif
-}
-
-/* TODO(sergey): Investigate if using intrinsics helps for both
- * stack item swap and float comparison.
- */
-ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, QBVHStackItem *ccl_restrict b)
-{
- QBVHStackItem tmp = *a;
- *a = *b;
- *b = tmp;
-}
-
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
- QBVHStackItem *ccl_restrict s2,
- QBVHStackItem *ccl_restrict s3)
-{
- if (s2->dist < s1->dist) {
- qbvh_item_swap(s2, s1);
- }
- if (s3->dist < s2->dist) {
- qbvh_item_swap(s3, s2);
- }
- if (s2->dist < s1->dist) {
- qbvh_item_swap(s2, s1);
- }
-}
-
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
- QBVHStackItem *ccl_restrict s2,
- QBVHStackItem *ccl_restrict s3,
- QBVHStackItem *ccl_restrict s4)
-{
- if (s2->dist < s1->dist) {
- qbvh_item_swap(s2, s1);
- }
- if (s4->dist < s3->dist) {
- qbvh_item_swap(s4, s3);
- }
- if (s3->dist < s1->dist) {
- qbvh_item_swap(s3, s1);
- }
- if (s4->dist < s2->dist) {
- qbvh_item_swap(s4, s2);
- }
- if (s3->dist < s2->dist) {
- qbvh_item_swap(s3, s2);
- }
-}
-
-/* Axis-aligned nodes intersection */
-
-//ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
-static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
- const ssef &isect_near,
- const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
- const sse3f &org_idir,
-#else
- const sse3f &org,
-#endif
- const sse3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- ssef *ccl_restrict dist)
-{
- const int offset = node_addr + 1;
-#ifdef __KERNEL_AVX2__
- const ssef tnear_x = msub(
- kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, org_idir.x);
- const ssef tnear_y = msub(
- kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, org_idir.y);
- const ssef tnear_z = msub(
- kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, org_idir.z);
- const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, org_idir.x);
- const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, org_idir.y);
- const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, org_idir.z);
-#else
- const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - org.x) * idir.x;
- const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - org.y) * idir.y;
- const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - org.z) * idir.z;
- const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - org.x) * idir.x;
- const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - org.y) * idir.y;
- const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - org.z) * idir.z;
-#endif
-
-#ifdef __KERNEL_SSE41__
- const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near));
- const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far));
- const sseb vmask = cast(tnear) > cast(tfar);
- int mask = (int)movemask(vmask) ^ 0xf;
-#else
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const sseb vmask = tnear <= tfar;
- int mask = (int)movemask(vmask);
-#endif
- *dist = tnear;
- return mask;
-}
-
-ccl_device_inline int qbvh_aligned_node_intersect_robust(KernelGlobals *ccl_restrict kg,
- const ssef &isect_near,
- const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
- const sse3f &P_idir,
-#else
- const sse3f &P,
-#endif
- const sse3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- const float difl,
- ssef *ccl_restrict dist)
-{
- const int offset = node_addr + 1;
-#ifdef __KERNEL_AVX2__
- const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, P_idir.x);
- const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, P_idir.y);
- const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, P_idir.z);
- const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, P_idir.x);
- const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, P_idir.y);
- const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, P_idir.z);
-#else
- const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - P.x) * idir.x;
- const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - P.y) * idir.y;
- const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - P.z) * idir.z;
- const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - P.x) * idir.x;
- const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - P.y) * idir.y;
- const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - P.z) * idir.z;
-#endif
-
- const float round_down = 1.0f - difl;
- const float round_up = 1.0f + difl;
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const sseb vmask = round_down * tnear <= round_up * tfar;
- *dist = tnear;
- return (int)movemask(vmask);
-}
-
-/* Unaligned nodes intersection */
-
-ccl_device_inline int qbvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg,
- const ssef &isect_near,
- const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
- const sse3f &org_idir,
-#endif
- const sse3f &org,
- const sse3f &dir,
- const sse3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- ssef *ccl_restrict dist)
-{
- const int offset = node_addr;
- const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1);
- const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2);
- const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3);
-
- const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4);
- const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5);
- const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6);
-
- const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7);
- const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8);
- const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9);
-
- const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10);
- const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11);
- const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12);
-
- const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
- aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
- aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;
-
- const ssef aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x,
- aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y,
- aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z;
-
- const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
- const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
- nrdir_z = neg_one / aligned_dir_z;
-
- const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
- tlower_z = aligned_P_z * nrdir_z;
-
- const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
- tupper_z = tlower_z - nrdir_z;
-
-#ifdef __KERNEL_SSE41__
- const ssef tnear_x = mini(tlower_x, tupper_x);
- const ssef tnear_y = mini(tlower_y, tupper_y);
- const ssef tnear_z = mini(tlower_z, tupper_z);
- const ssef tfar_x = maxi(tlower_x, tupper_x);
- const ssef tfar_y = maxi(tlower_y, tupper_y);
- const ssef tfar_z = maxi(tlower_z, tupper_z);
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const sseb vmask = tnear <= tfar;
- *dist = tnear;
- return movemask(vmask);
-#else
- const ssef tnear_x = min(tlower_x, tupper_x);
- const ssef tnear_y = min(tlower_y, tupper_y);
- const ssef tnear_z = min(tlower_z, tupper_z);
- const ssef tfar_x = max(tlower_x, tupper_x);
- const ssef tfar_y = max(tlower_y, tupper_y);
- const ssef tfar_z = max(tlower_z, tupper_z);
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const sseb vmask = tnear <= tfar;
- *dist = tnear;
- return movemask(vmask);
-#endif
-}
-
-ccl_device_inline int qbvh_unaligned_node_intersect_robust(KernelGlobals *ccl_restrict kg,
- const ssef &isect_near,
- const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
- const sse3f &P_idir,
-#endif
- const sse3f &P,
- const sse3f &dir,
- const sse3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- const float difl,
- ssef *ccl_restrict dist)
-{
- const int offset = node_addr;
- const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1);
- const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2);
- const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3);
-
- const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4);
- const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5);
- const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6);
-
- const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7);
- const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8);
- const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9);
-
- const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10);
- const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11);
- const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12);
-
- const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z,
- aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z,
- aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z;
-
- const ssef aligned_P_x = P.x * tfm_x_x + P.y * tfm_x_y + P.z * tfm_x_z + tfm_t_x,
- aligned_P_y = P.x * tfm_y_x + P.y * tfm_y_y + P.z * tfm_y_z + tfm_t_y,
- aligned_P_z = P.x * tfm_z_x + P.y * tfm_z_y + P.z * tfm_z_z + tfm_t_z;
-
- const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f);
- const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y,
- nrdir_z = neg_one / aligned_dir_z;
-
- const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y,
- tlower_z = aligned_P_z * nrdir_z;
-
- const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y,
- tupper_z = tlower_z - nrdir_z;
-
- const float round_down = 1.0f - difl;
- const float round_up = 1.0f + difl;
-
-#ifdef __KERNEL_SSE41__
- const ssef tnear_x = mini(tlower_x, tupper_x);
- const ssef tnear_y = mini(tlower_y, tupper_y);
- const ssef tnear_z = mini(tlower_z, tupper_z);
- const ssef tfar_x = maxi(tlower_x, tupper_x);
- const ssef tfar_y = maxi(tlower_y, tupper_y);
- const ssef tfar_z = maxi(tlower_z, tupper_z);
-#else
- const ssef tnear_x = min(tlower_x, tupper_x);
- const ssef tnear_y = min(tlower_y, tupper_y);
- const ssef tnear_z = min(tlower_z, tupper_z);
- const ssef tfar_x = max(tlower_x, tupper_x);
- const ssef tfar_y = max(tlower_y, tupper_y);
- const ssef tfar_z = max(tlower_z, tupper_z);
-#endif
- const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
- const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
- const sseb vmask = round_down * tnear <= round_up * tfar;
- *dist = tnear;
- return movemask(vmask);
-}
-
-/* Intersector wrappers.
- *
- * They check the node type and call the appropriate intersection code.
- */
-
-ccl_device_inline int qbvh_node_intersect(KernelGlobals *ccl_restrict kg,
- const ssef &isect_near,
- const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
- const sse3f &org_idir,
-#endif
- const sse3f &org,
- const sse3f &dir,
- const sse3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- ssef *ccl_restrict dist)
-{
- const int offset = node_addr;
- const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return qbvh_unaligned_node_intersect(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- org_idir,
-#endif
- org,
- dir,
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- dist);
- }
- else {
- return qbvh_aligned_node_intersect(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- org_idir,
-#else
- org,
-#endif
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- dist);
- }
-}
-
-ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *ccl_restrict kg,
- const ssef &isect_near,
- const ssef &isect_far,
-#ifdef __KERNEL_AVX2__
- const sse3f &P_idir,
-#endif
- const sse3f &P,
- const sse3f &dir,
- const sse3f &idir,
- const int near_x,
- const int near_y,
- const int near_z,
- const int far_x,
- const int far_y,
- const int far_z,
- const int node_addr,
- const float difl,
- ssef *ccl_restrict dist)
-{
- const int offset = node_addr;
- const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
- if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
- return qbvh_unaligned_node_intersect_robust(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- P_idir,
-#endif
- P,
- dir,
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- difl,
- dist);
- }
- else {
- return qbvh_aligned_node_intersect_robust(kg,
- isect_near,
- isect_far,
-#ifdef __KERNEL_AVX2__
- P_idir,
-#else
- P,
-#endif
- idir,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- difl,
- dist);
- }
-}
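
qbvh_near_far_idx_calc and qbvh_aligned_node_intersect together are a four-wide version of the classic slab test: the sign of each direction component decides which of the two stored bounds acts as the near plane, then the entry and exit distances are clamped against the current ray interval. Below is a scalar, single-box sketch of the same idea (plain C++, not Cycles code; the Aabb layout is a simplified stand-in for the node's SoA child bounds).

#include <algorithm>
#include <cstdio>

struct Aabb {
  float lo[3];
  float hi[3];
};

static bool intersect_aabb(const float P[3],
                           const float idir[3],
                           float tmin,
                           float tmax,
                           const Aabb &box,
                           float *t_near_out)
{
  float tnear = tmin, tfar = tmax;
  for (int axis = 0; axis < 3; axis++) {
    /* A negative direction swaps which bound acts as the near plane,
     * mirroring qbvh_near_far_idx_calc. */
    const float lo = (idir[axis] >= 0.0f) ? box.lo[axis] : box.hi[axis];
    const float hi = (idir[axis] >= 0.0f) ? box.hi[axis] : box.lo[axis];
    tnear = std::max(tnear, (lo - P[axis]) * idir[axis]);
    tfar = std::min(tfar, (hi - P[axis]) * idir[axis]);
  }
  *t_near_out = tnear;
  return tnear <= tfar;
}

int main()
{
  const float P[3] = {0.0f, 0.0f, 0.0f};
  const float idir[3] = {1.0f, 1.0f, 1.0f}; /* 1 / direction, for direction (1, 1, 1). */
  const Aabb box = {{1.0f, 1.0f, 1.0f}, {2.0f, 2.0f, 2.0f}};
  float t_near = 0.0f;
  if (intersect_aabb(P, idir, 0.0f, 1e30f, box, &t_near)) {
    std::printf("hit at t = %g\n", t_near);
  }
  return 0;
}

The SSE code performs this same computation with each ssef lane holding one of the four children, so a single comparison yields the child hit mask.
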
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
deleted file mode 100644
index 49e607bfbd0..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT qbvh_node_intersect
-#else
-# define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect_array,
- const uint visibility,
- const uint max_hits,
- uint *num_hits)
-{
- /* TODO(sergey):
- * - Test if pushing distance on the stack helps.
- * - Likely and unlikely for if() statements.
- * - Test restrict attribute for pointers.
- */
-
- /* Traversal stack in CUDA thread-local memory. */
- QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
-
- /* Ray parameters in registers. */
- const float tmax = ray->t;
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
- float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- *num_hits = 0;
- isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
- int num_hits_in_instance = 0;
-#endif
-
- ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
- sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
- sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- (void)inodes;
-
- if (false
-#ifdef __VISIBILITY_FLAG__
- || ((__float_as_uint(inodes.x) & visibility) == 0)
-#endif
-#if BVH_FEATURE(BVH_MOTION)
- || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
- ) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- ssef dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for 3 or 4 hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-#ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(leaf.z) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-#endif
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
- const uint p_type = type & PRIMITIVE_ALL;
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- while (prim_addr < prim_addr2) {
- kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
- bool hit;
-
-                /* TODO: specialized intersect functions which don't fill in
-                 * isect unless needed, and which check SD_HAS_TRANSPARENT_SHADOW,
-                 * might give a few % performance improvement. */
-
- switch (p_type) {
- case PRIMITIVE_TRIANGLE: {
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
- break;
- }
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- case PRIMITIVE_CURVE:
- case PRIMITIVE_MOTION_CURVE: {
- const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = cardinal_curve_intersect(kg,
- isect_array,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0,
- 0);
- }
- else {
- hit = curve_intersect(kg,
- isect_array,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0,
- 0);
- }
- break;
- }
-#endif
- default: {
- hit = false;
- break;
- }
- }
-
- /* Shadow ray early termination. */
- if (hit) {
-                  /* Detect if this surface has a shader with transparent shadows. */
-
-                  /* TODO: optimize so the primitive visibility flag indicates
-                   * whether the primitive has a transparent shadow shader? */
- int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
- int shader = 0;
-
-#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
- {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
-#ifdef __HAIR__
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-#endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- /* if no transparent shadows, all light is blocked */
- if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
- return true;
- }
-
- /* move on to next entry in intersections array */
- isect_array++;
- (*num_hits)++;
-#if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-#endif
-
- isect_array->t = isect_t;
- }
-
- prim_addr++;
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-# if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-# else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-# endif
-
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
- node_addr = kernel_tex_fetch(__object_node, object);
- }
- }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-# else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-# endif
- /* Scale isect->t to adjust for instancing. */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-# else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-# endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return false;
-}
-
-#undef NODE_INTERSECT
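
The leaf loop above records transparent hits into isect_array and stops early either when a hit's shader has no transparent shadow or when max_hits entries have been collected. That policy can be summarized with a small standalone sketch (plain C++, not Cycles code; the Hit struct and its transparent_shadow flag are made-up stand-ins for the SD_HAS_TRANSPARENT_SHADOW shader-flag lookup).

#include <cstdio>
#include <vector>

struct Hit {
  float t;
  bool transparent_shadow;
};

/* Returns true if all light is blocked, false if the caller should shade the
 * recorded hits to accumulate transparency. */
static bool shadow_blocked_all(const std::vector<Hit> &hits_along_ray,
                               unsigned max_hits,
                               std::vector<Hit> *recorded)
{
  for (const Hit &hit : hits_along_ray) {
    if (!hit.transparent_shadow) {
      return true; /* Opaque surface: light fully blocked. */
    }
    if (recorded->size() == max_hits) {
      return true; /* Too many transparent hits: give up and block. */
    }
    recorded->push_back(hit);
  }
  return false;
}

int main()
{
  std::vector<Hit> hits = {{0.5f, true}, {1.2f, true}, {2.0f, false}};
  std::vector<Hit> recorded;
  const bool blocked = shadow_blocked_all(hits, 8, &recorded);
  std::printf("blocked=%d, recorded=%zu transparent hits\n", blocked, recorded.size());
  return 0;
}
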
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
deleted file mode 100644
index 9ee0f7b5933..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function, where various features can be
- * enabled/disabled. This way we can compile optimized versions for each case
- * without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_HAIR: hair curve rendering
- * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT qbvh_node_intersect
-# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust
-#else
-# define NODE_INTERSECT qbvh_aligned_node_intersect
-# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect,
- const uint visibility
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- ,
- uint *lcg_state,
- float difl,
- float extmax
-#endif
-)
-{
- /* TODO(sergey):
- * - Test if pushing distance on the stack helps (for non shadow rays).
- * - Separate version for shadow rays.
- * - Likely and unlikely for if() statements.
- * - Test restrict attribute for pointers.
- */
-
- /* Traversal stack in CUDA thread-local memory. */
- QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
- traversal_stack[0].dist = -FLT_MAX;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
- float node_dist = -FLT_MAX;
-
- /* Ray parameters in registers. */
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- isect->t = ray->t;
- isect->u = 0.0f;
- isect->v = 0.0f;
- isect->prim = PRIM_NONE;
- isect->object = OBJECT_NONE;
-
- BVH_DEBUG_INIT();
-
- ssef tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
- sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
- sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
- (void)inodes;
-
- if (UNLIKELY(node_dist > isect->t)
-#if BVH_FEATURE(BVH_MOTION)
- || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z)
-#endif
-#ifdef __VISIBILITY_FLAG__
- || (__float_as_uint(inodes.x) & visibility) == 0
-#endif
- ) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- int child_mask;
- ssef dist;
-
- BVH_DEBUG_NEXT_NODE();
-
-#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- if (difl != 0.0f) {
-          /* NOTE: We extend all the child bounding boxes instead of fetching
-           * and checking visibility flags for each of them.
-           *
-           * Need to test if doing the opposite would be any faster.
-           */
- child_mask = NODE_INTERSECT_ROBUST(kg,
- tnear,
- tfar,
-# ifdef __KERNEL_AVX2__
- P_idir4,
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-# endif
-# if BVH_FEATURE(BVH_HAIR)
- dir4,
-# endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- difl,
- &dist);
- }
- else
-#endif /* BVH_HAIR_MINIMUM_WIDTH */
- {
- child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
- }
-
- if (child_mask != 0) {
- float4 cnodes;
- /* TODO(sergey): Investigate whether moving cnodes upwards
- * gives a speedup (will be different cache pattern but will
- * avoid extra check here).
- */
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- float d0 = ((float *)&dist)[r];
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- node_dist = d0;
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- node_dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- node_dist = d0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for 3 or 4 hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
-#ifdef __VISIBILITY_FLAG__
- if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
-#else
- if (UNLIKELY((node_dist > isect->t)))
-#endif
- {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- continue;
- }
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (type & PRIMITIVE_ALL) {
- case PRIMITIVE_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- BVH_DEBUG_NEXT_INTERSECTION();
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
- tfar = ssef(isect->t);
- /* Shadow ray early termination. */
- if (visibility & PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- BVH_DEBUG_NEXT_INTERSECTION();
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
- tfar = ssef(isect->t);
- /* Shadow ray early termination. */
- if (visibility & PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- }
- break;
- }
-#endif /* BVH_FEATURE(BVH_MOTION) */
-#if BVH_FEATURE(BVH_HAIR)
- case PRIMITIVE_CURVE:
- case PRIMITIVE_MOTION_CURVE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- BVH_DEBUG_NEXT_INTERSECTION();
- const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- bool hit;
- if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = cardinal_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
- }
- else {
- hit = curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
- }
- if (hit) {
- tfar = ssef(isect->t);
- /* Shadow ray early termination. */
- if (visibility & PATH_RAY_SHADOW_OPAQUE) {
- return true;
- }
- }
- }
- break;
- }
-#endif /* BVH_FEATURE(BVH_HAIR) */
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-
-# if BVH_FEATURE(BVH_MOTION)
- qbvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm);
-# else
- qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
-# endif
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
- traversal_stack[stack_ptr].dist = -FLT_MAX;
-
- node_addr = kernel_tex_fetch(__object_node, object);
-
- BVH_DEBUG_NEXT_INSTANCE();
- }
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
-# if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
- isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- node_dist = traversal_stack[stack_ptr].dist;
- --stack_ptr;
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
-#undef NODE_INTERSECT_ROBUST
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
deleted file mode 100644
index e4eaed04467..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT qbvh_node_intersect
-#else
-# define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect,
- const uint visibility)
-{
- /* TODO(sergey):
- * - Test if pushing distance on the stack helps.
- * - Likely and unlikely for if() statements.
- * - Test restrict attribute for pointers.
- */
-
- /* Traversal stack in CUDA thread-local memory. */
- QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
-
- /* Ray parameters in registers. */
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- isect->t = ray->t;
- isect->u = 0.0f;
- isect->v = 0.0f;
- isect->prim = PRIM_NONE;
- isect->object = OBJECT_NONE;
-
- ssef tnear(0.0f), tfar(ray->t);
-#if BVH_FEATURE(BVH_HAIR)
- sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
- sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(inodes.x) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-#endif
-
- ssef dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for 3 or 4 hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
- if ((__float_as_uint(leaf.z) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
- const uint p_type = type & PRIMITIVE_ALL;
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (p_type) {
- case PRIMITIVE_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- /* Only primitives from volume object. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- /* Only primitives from volume object. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr);
- }
- break;
- }
-#endif
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME) {
-# if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
- node_addr = kernel_tex_fetch(__object_node, object);
- }
- else {
- /* Pop. */
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
- }
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
-# if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-# else
- isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-# endif
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect->t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return (isect->prim != PRIM_NONE);
-}
-
-#undef NODE_INTERSECT
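
A standalone sketch (hypothetical names, not part of the kernel or of the diff above) of the nearest-first stack ordering that the deleted 4-wide traversal relies on: when three or four children of a node are hit, they are all pushed and the top stack entries are reordered by distance so the closest child is popped and visited first. The real qbvh_stack_sort() serves the same purpose (plus a 4-item variant); this is only an illustration of the idea.

    #include <algorithm>
    #include <cstdio>

    struct DemoStackItem {
      int addr;   /* node address */
      float dist; /* entry distance along the ray */
    };

    /* Reorder three stack items so that *top holds the smallest distance. */
    static void demo_stack_sort3(DemoStackItem *top, DemoStackItem *mid, DemoStackItem *bot)
    {
      if (mid->dist < top->dist) std::swap(*top, *mid);
      if (bot->dist < top->dist) std::swap(*top, *bot);
      if (bot->dist < mid->dist) std::swap(*mid, *bot);
    }

    int main()
    {
      /* Stack grows upward: index 2 is the top and is popped first. */
      DemoStackItem stack[3] = {{40, 7.5f}, {17, 2.0f}, {23, 4.25f}};
      demo_stack_sort3(&stack[2], &stack[1], &stack[0]);

      for (int ptr = 2; ptr >= 0; --ptr) {
        /* Pops in order of increasing distance: addr 17, then 23, then 40. */
        printf("pop addr=%d dist=%.2f\n", stack[ptr].addr, stack[ptr].dist);
      }
      return 0;
    }

Visiting the nearest child first makes it more likely that isect->t shrinks early, so farther subtrees pushed below it can be culled by the node_dist > isect->t test without being descended.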
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
deleted file mode 100644
index eddc48c487e..00000000000
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This is a template BVH traversal function for volumes, where
- * various features can be enabled/disabled. This way we can compile optimized
- * versions for each case without new features slowing things down.
- *
- * BVH_INSTANCING: object instancing
- * BVH_MOTION: motion blur rendering
- */
-
-#if BVH_FEATURE(BVH_HAIR)
-# define NODE_INTERSECT qbvh_node_intersect
-#else
-# define NODE_INTERSECT qbvh_aligned_node_intersect
-#endif
-
-ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect_array,
- const uint max_hits,
- const uint visibility)
-{
- /* TODO(sergey):
- * - Test if pushing distance on the stack helps.
- * - Likely and unlikely for if() statements.
- * - Test restrict attribute for pointers.
- */
-
- /* Traversal stack in CUDA thread-local memory. */
- QBVHStackItem traversal_stack[BVH_QSTACK_SIZE];
- traversal_stack[0].addr = ENTRYPOINT_SENTINEL;
-
- /* Traversal variables in registers. */
- int stack_ptr = 0;
- int node_addr = kernel_data.bvh.root;
-
- /* Ray parameters in registers. */
- const float tmax = ray->t;
- float3 P = ray->P;
- float3 dir = bvh_clamp_direction(ray->D);
- float3 idir = bvh_inverse_direction(dir);
- int object = OBJECT_NONE;
- float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
- Transform ob_itfm;
-#endif
-
- uint num_hits = 0;
- isect_array->t = tmax;
-
-#if BVH_FEATURE(BVH_INSTANCING)
- int num_hits_in_instance = 0;
-#endif
-
- ssef tnear(0.0f), tfar(isect_t);
-#if BVH_FEATURE(BVH_HAIR)
- sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-#endif
- sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-
-#ifdef __KERNEL_AVX2__
- float3 P_idir = P * idir;
- sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z);
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z));
-#endif
-
- /* Offsets to select the side that becomes the lower or upper bound. */
- int near_x, near_y, near_z;
- int far_x, far_y, far_z;
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
-
- /* Traversal loop. */
- do {
- do {
- /* Traverse internal nodes. */
- while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
- float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
-
-#ifdef __VISIBILITY_FLAG__
- if ((__float_as_uint(inodes.x) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-#endif
-
- ssef dist;
- int child_mask = NODE_INTERSECT(kg,
- tnear,
- tfar,
-#ifdef __KERNEL_AVX2__
- P_idir4,
-#endif
-#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4,
-#endif
-#if BVH_FEATURE(BVH_HAIR)
- dir4,
-#endif
- idir4,
- near_x,
- near_y,
- near_z,
- far_x,
- far_y,
- far_z,
- node_addr,
- &dist);
-
- if (child_mask != 0) {
- float4 cnodes;
-#if BVH_FEATURE(BVH_HAIR)
- if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13);
- }
- else
-#endif
- {
- cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7);
- }
-
- /* One child is hit, continue with that child. */
- int r = __bscf(child_mask);
- if (child_mask == 0) {
- node_addr = __float_as_int(cnodes[r]);
- continue;
- }
-
- /* Two children are hit, push far child, and continue with
- * closer child.
- */
- int c0 = __float_as_int(cnodes[r]);
- float d0 = ((float *)&dist)[r];
- r = __bscf(child_mask);
- int c1 = __float_as_int(cnodes[r]);
- float d1 = ((float *)&dist)[r];
- if (child_mask == 0) {
- if (d1 < d0) {
- node_addr = c1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
- continue;
- }
- else {
- node_addr = c0;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- continue;
- }
- }
-
- /* Here starts the slow path for 3 or 4 hit children. We push
- * all nodes onto the stack to sort them there.
- */
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c1;
- traversal_stack[stack_ptr].dist = d1;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c0;
- traversal_stack[stack_ptr].dist = d0;
-
- /* Three children are hit, push all onto stack and sort 3
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c2 = __float_as_int(cnodes[r]);
- float d2 = ((float *)&dist)[r];
- if (child_mask == 0) {
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2]);
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- /* Four children are hit, push all onto stack and sort 4
- * stack items, continue with closest child.
- */
- r = __bscf(child_mask);
- int c3 = __float_as_int(cnodes[r]);
- float d3 = ((float *)&dist)[r];
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c3;
- traversal_stack[stack_ptr].dist = d3;
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = c2;
- traversal_stack[stack_ptr].dist = d2;
- qbvh_stack_sort(&traversal_stack[stack_ptr],
- &traversal_stack[stack_ptr - 1],
- &traversal_stack[stack_ptr - 2],
- &traversal_stack[stack_ptr - 3]);
- }
-
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-
- /* If node is leaf, fetch triangle list. */
- if (node_addr < 0) {
- float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
-
- if ((__float_as_uint(leaf.z) & visibility) == 0) {
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- continue;
- }
-
- int prim_addr = __float_as_int(leaf.x);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (prim_addr >= 0) {
-#endif
- int prim_addr2 = __float_as_int(leaf.y);
- const uint type = __float_as_int(leaf.w);
- const uint p_type = type & PRIMITIVE_ALL;
- bool hit;
-
- /* Pop. */
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
-
- /* Primitive intersection. */
- switch (p_type) {
- case PRIMITIVE_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- /* Only primitives from volume object. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
- if (hit) {
- /* Move on to next entry in intersections array. */
- isect_array++;
- num_hits++;
-#if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-#endif
- isect_array->t = isect_t;
- if (num_hits == max_hits) {
-#if BVH_FEATURE(BVH_INSTANCING)
- if (object != OBJECT_NONE) {
-# if BVH_FEATURE(BVH_MOTION)
- float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-# else
- Transform itfm = object_fetch_transform(
- kg, object, OBJECT_INVERSE_TRANSFORM);
- float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-# endif
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
-#endif /* BVH_FEATURE(BVH_INSTANCING) */
- return num_hits;
- }
- }
- }
- break;
- }
-#if BVH_FEATURE(BVH_MOTION)
- case PRIMITIVE_MOTION_TRIANGLE: {
- for (; prim_addr < prim_addr2; prim_addr++) {
- kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- /* Only primitives from volume object. */
- uint tri_object = (object == OBJECT_NONE) ?
- kernel_tex_fetch(__prim_object, prim_addr) :
- object;
- int object_flag = kernel_tex_fetch(__object_flag, tri_object);
- if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
- continue;
- }
- /* Intersect ray against primitive. */
- hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
- if (hit) {
- /* Move on to next entry in intersections array. */
- isect_array++;
- num_hits++;
-# if BVH_FEATURE(BVH_INSTANCING)
- num_hits_in_instance++;
-# endif
- isect_array->t = isect_t;
- if (num_hits == max_hits) {
-# if BVH_FEATURE(BVH_INSTANCING)
- if (object != OBJECT_NONE) {
-# if BVH_FEATURE(BVH_MOTION)
- float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-# else
- Transform itfm = object_fetch_transform(
- kg, object, OBJECT_INVERSE_TRANSFORM);
- float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-# endif
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
-# endif /* BVH_FEATURE(BVH_INSTANCING) */
- return num_hits;
- }
- }
- }
- break;
- }
-#endif
- }
- }
-#if BVH_FEATURE(BVH_INSTANCING)
- else {
- /* Instance push. */
- object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME) {
-# if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
-# else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
-# endif
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect_t);
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
-
- ++stack_ptr;
- kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
- traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
-
- node_addr = kernel_tex_fetch(__object_node, object);
- }
- else {
- /* Pop. */
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
- }
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
-#if BVH_FEATURE(BVH_INSTANCING)
- if (stack_ptr >= 0) {
- kernel_assert(object != OBJECT_NONE);
-
- /* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-# else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-# endif
- /* Scale isect->t to adjust for instancing. */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-# if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-# else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-# endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
-
- qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z);
- tfar = ssef(isect_t);
-# if BVH_FEATURE(BVH_HAIR)
- dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
-# endif
- idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
-# ifdef __KERNEL_AVX2__
- P_idir = P * idir;
- P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
-# endif
-# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
- org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
-# endif
-
- object = OBJECT_NONE;
- node_addr = traversal_stack[stack_ptr].addr;
- --stack_ptr;
- }
-#endif /* FEATURE(BVH_INSTANCING) */
- } while (node_addr != ENTRYPOINT_SENTINEL);
-
- return num_hits;
-}
-
-#undef NODE_INTERSECT
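
One way to read the "Scale isect->t to adjust for instancing" step in the deleted traversals above; this is a reconstruction of the geometry, under the assumption that dir and itfm at that point still denote the world-space ray direction and the world-to-object transform. If the world-space ray is X(t) = P + t\,d and the instance push applies the world-to-object transform M (linear part only for the direction) and renormalizes, the same point is reached at a rescaled parameter:

    X'(t') = M P + t' \frac{M d}{\lVert M d \rVert}, \qquad t' = t\,\lVert M d \rVert,
    \quad\text{so}\quad t = \frac{t'}{\lVert M d \rVert}.

Under that assumption, this is the factor t_fac = 1.0f / len(transform_direction(&itfm, dir)) that the code multiplies into every hit recorded inside the instance on the instance pop, so all entries of isect_array end up parameterized along the same world-space ray.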
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 5e26f90a878..6070fd983f5 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+// clang-format off
#include "kernel/closure/bsdf_ashikhmin_velvet.h"
#include "kernel/closure/bsdf_diffuse.h"
#include "kernel/closure/bsdf_oren_nayar.h"
@@ -32,6 +33,7 @@
#include "kernel/closure/bsdf_principled_sheen.h"
#include "kernel/closure/bssrdf.h"
#include "kernel/closure/volume.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
@@ -73,6 +75,40 @@ ccl_device_inline float bsdf_get_roughness_squared(const ShaderClosure *sc)
return bsdf_get_specular_roughness_squared(sc);
}
+/* An additional term to smooth illumination on grazing angles when using bump mapping.
+ * Based on "Taming the Shadow Terminator" by Matt Jen-Yuan Chiang,
+ * Yining Karl Li and Brent Burley. */
+ccl_device_inline float bump_shadowing_term(float3 Ng, float3 N, float3 I)
+{
+ float g = safe_divide(dot(Ng, I), dot(N, I) * dot(Ng, N));
+
+ /* If the incoming light is on the unshadowed side, return full brightness. */
+ if (g >= 1.0f) {
+ return 1.0f;
+ }
+
+ /* If the incoming light points away from the surface, return black. */
+ if (g < 0.0f) {
+ return 0.0f;
+ }
+
+ /* Return smoothed value to avoid discontinuity at perpendicular angle. */
+ float g2 = sqr(g);
+ return -g2 * g + g2 + g;
+}
+
+/* Shadow terminator workaround, taken from Appleseed.
+ * Original code is under the MIT License
+ * Copyright (c) 2019 Francois Beaune, The appleseedhq Organization */
+ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multiplier)
+{
+ cos_in = min(cos_in, 1.0f);
+
+ const float angle = fast_acosf(cos_in);
+ const float val = max(cosf(angle * frequency_multiplier), 0.0f) / cos_in;
+ return val;
+}
+
ccl_device_inline int bsdf_sample(KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
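
Written out, the two correction factors added in the hunk above behave as follows (a restatement of the code, with g and c shorthand for the quantities computed in bump_shadowing_term() and shift_cos_in()):

    g = \frac{N_g \cdot I}{(N \cdot I)\,(N_g \cdot N)}, \qquad
    G(g) = \begin{cases} 0 & g < 0 \\ -g^3 + g^2 + g & 0 \le g < 1 \\ 1 & g \ge 1 \end{cases}

    s(c, f) = \frac{\max(\cos(f \arccos c),\, 0)}{c}

G is C^1-continuous at g = 1 (G(1) = 1, G'(1) = 0), which is why the cubic is used instead of a hard cutoff. The terminator offset s scales the incoming angle rather than the cosine and is only applied when the per-object frequency_multiplier f exceeds 1.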
@@ -83,13 +119,16 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
+ /* For curves use the smooth normal, particularly for ribbons the geometric
+ * normal gives too much darkening otherwise. */
int label;
+ const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sc->N : sd->Ng;
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
label = bsdf_diffuse_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -104,7 +143,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
label = bsdf_oren_nayar_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -119,7 +158,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
# ifdef __OSL__
case CLOSURE_BSDF_PHONG_RAMP_ID:
label = bsdf_phong_ramp_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -133,7 +172,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
label = bsdf_diffuse_ramp_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -148,7 +187,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
# endif
case CLOSURE_BSDF_TRANSLUCENT_ID:
label = bsdf_translucent_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -162,7 +201,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_REFLECTION_ID:
label = bsdf_reflection_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -176,7 +215,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_REFRACTION_ID:
label = bsdf_refraction_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -190,7 +229,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
label = bsdf_transparent_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -205,12 +244,10 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
label = bsdf_microfacet_ggx_sample(kg,
sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -226,7 +263,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
label = bsdf_microfacet_multi_ggx_sample(kg,
sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -243,7 +280,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
label = bsdf_microfacet_multi_ggx_glass_sample(kg,
sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -257,11 +294,10 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
&sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
label = bsdf_microfacet_beckmann_sample(kg,
sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -274,9 +310,8 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
label = bsdf_ashikhmin_shirley_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -290,7 +325,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
label = bsdf_ashikhmin_velvet_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -304,7 +339,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
label = bsdf_diffuse_toon_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -318,7 +353,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
label = bsdf_glossy_toon_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -332,7 +367,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
label = bsdf_hair_reflection_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -346,7 +381,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
label = bsdf_hair_transmission_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -366,7 +401,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
label = bsdf_principled_diffuse_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -380,7 +415,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
label = bsdf_principled_sheen_sample(sc,
- sd->Ng,
+ Ng,
sd->I,
sd->dI.dx,
sd->dI.dy,
@@ -424,6 +459,19 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
}
}
}
+ else {
+ /* Shadow terminator offset. */
+ const float frequency_multiplier =
+ kernel_tex_fetch(__objects, sd->object).shadow_terminator_offset;
+ if (frequency_multiplier > 1.0f) {
+ *eval *= shift_cos_in(dot(*omega_in, sc->N), frequency_multiplier);
+ }
+ if (label & LABEL_DIFFUSE) {
+ if (!isequal_float3(sc->N, sd->N)) {
+ *eval *= bump_shadowing_term((label & LABEL_TRANSMIT) ? -sd->N : sd->N, sc->N, *omega_in);
+ }
+ }
+ }
return label;
}
@@ -440,9 +488,12 @@ ccl_device_inline
const float3 omega_in,
float *pdf)
{
+ /* For curves use the smooth normal, particularly for ribbons the geometric
+ * normal gives too much darkening otherwise. */
+ const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sd->N : sd->Ng;
float3 eval;
- if (dot(sd->Ng, omega_in) >= 0.0f) {
+ if (dot(Ng, omega_in) >= 0.0f) {
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
@@ -475,8 +526,6 @@ ccl_device_inline
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
break;
@@ -490,12 +539,10 @@ ccl_device_inline
sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
@@ -535,6 +582,17 @@ ccl_device_inline
eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
+ if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
+ if (!isequal_float3(sc->N, sd->N)) {
+ eval *= bump_shadowing_term(sd->N, sc->N, omega_in);
+ }
+ }
+ /* Shadow terminator offset. */
+ const float frequency_multiplier =
+ kernel_tex_fetch(__objects, sd->object).shadow_terminator_offset;
+ if (frequency_multiplier > 1.0f) {
+ eval *= shift_cos_in(dot(omega_in, sc->N), frequency_multiplier);
+ }
}
else {
switch (sc->type) {
@@ -561,8 +619,6 @@ ccl_device_inline
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
break;
@@ -576,12 +632,10 @@ ccl_device_inline
sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
@@ -621,6 +675,11 @@ ccl_device_inline
eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
+ if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
+ if (!isequal_float3(sc->N, sd->N)) {
+ eval *= bump_shadowing_term(-sd->N, sc->N, omega_in);
+ }
+ }
}
return eval;
@@ -640,18 +699,14 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
bsdf_microfacet_ggx_blur(sc, roughness);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
bsdf_microfacet_beckmann_blur(sc, roughness);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
bsdf_ashikhmin_shirley_blur(sc, roughness);
break;
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
@@ -680,18 +735,14 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
return bsdf_microfacet_merge(a, b);
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
return bsdf_ashikhmin_velvet_merge(a, b);

diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index b3b1c37748d..0d50172a907 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -34,18 +34,9 @@ CCL_NAMESPACE_BEGIN
ccl_device int bsdf_ashikhmin_shirley_setup(MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
- bsdf->alpha_y = bsdf->alpha_x;
-
- bsdf->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
- return SD_BSDF | SD_BSDF_HAS_EVAL;
-}
-
-ccl_device int bsdf_ashikhmin_shirley_aniso_setup(MicrofacetBsdf *bsdf)
-{
- bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
bsdf->alpha_y = clamp(bsdf->alpha_y, 1e-4f, 1.0f);
- bsdf->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
+ bsdf->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@@ -85,15 +76,11 @@ ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderCl
float HdotI = fmaxf(fabsf(dot(H, I)), 1e-6f);
float HdotN = fmaxf(dot(H, N), 1e-6f);
- float pump =
- 1.0f /
- fmaxf(
- 1e-6f,
- (HdotI *
- fmaxf(
- NdotO,
- NdotI))); /* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely) */
- /*float pump = 1.0f / fmaxf(1e-4f, ((NdotO + NdotI) * (NdotO*NdotI))); */ /* pump from d-brdf paper */
+ /* pump from original paper
+ * (first derivative disc., but cancels the HdotI in the pdf nicely) */
+ float pump = 1.0f / fmaxf(1e-6f, (HdotI * fmaxf(NdotO, NdotI)));
+ /* pump from d-brdf paper */
+ /*float pump = 1.0f / fmaxf(1e-4f, ((NdotO + NdotI) * (NdotO*NdotI))); */
float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(bsdf->alpha_x);
float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(bsdf->alpha_y);
@@ -105,9 +92,8 @@ ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderCl
float norm = (n_x + 1.0f) / (8.0f * M_PI_F);
out = NdotO * norm * lobe * pump;
- *pdf =
- norm * lobe /
- HdotI; /* this is p_h / 4(H.I) (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */
+ /* this is p_h / 4(H.I) (conversion from 'wh measure' to 'wi measure', eq. 8 in paper). */
+ *pdf = norm * lobe / HdotI;
}
else {
/* anisotropic */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 8122bcc1424..3d3f20edab3 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -42,6 +42,8 @@ typedef ccl_addr_space struct VelvetBsdf {
float invsigma2;
} VelvetBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(VelvetBsdf), "VelvetBsdf is too large!");
+
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
{
float sigma = fmaxf(bsdf->sigma, 0.01f);
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 76b50548455..ea604ed0311 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -39,6 +39,8 @@ typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
} DiffuseBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(DiffuseBsdf), "DiffuseBsdf is too large!");
+
/* DIFFUSE */
ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 9d13eb8d4e0..aa62c1c7ceb 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -43,6 +43,8 @@ typedef ccl_addr_space struct DiffuseRampBsdf {
float3 *colors;
} DiffuseRampBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(DiffuseRampBsdf), "DiffuseRampBsdf is too large!");
+
ccl_device float3 bsdf_diffuse_ramp_get_color(const float3 colors[8], float pos)
{
int MAXCOLORS = 8;
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 6b2a9a97d30..7ca9424b815 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -44,6 +44,8 @@ typedef ccl_addr_space struct HairBsdf {
float offset;
} HairBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(HairBsdf), "HairBsdf is too large!");
+
ccl_device int bsdf_hair_reflection_setup(HairBsdf *bsdf)
{
bsdf->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
@@ -224,7 +226,7 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc,
fast_sincosf(phi, &sinphi, &cosphi);
*omega_in = (cosphi * costheta_i) * locy - (sinphi * costheta_i) * locx + (sintheta_i)*Tg;
- //differentials - TODO: find a better approximation for the reflective bounce
+ // differentials - TODO: find a better approximation for the reflective bounce
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = 2 * dot(locy, dIdx) * locy - dIdx;
*domega_in_dy = 2 * dot(locy, dIdy) * locy - dIdy;
@@ -285,7 +287,7 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc,
fast_sincosf(phi, &sinphi, &cosphi);
*omega_in = (cosphi * costheta_i) * locy - (sinphi * costheta_i) * locx + (sintheta_i)*Tg;
- //differentials - TODO: find a better approximation for the transmission bounce
+ // differentials - TODO: find a better approximation for the transmission bounce
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = 2 * dot(locy, dIdx) * locy - dIdx;
*domega_in_dy = 2 * dot(locy, dIdy) * locy - dIdy;
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index a4bba2fbf6c..389bd62ba68 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -60,7 +60,8 @@ ccl_device_inline float cos_from_sin(const float s)
return safe_sqrtf(1.0f - s * s);
}
-/* Gives the change in direction in the normal plane for the given angles and p-th-order scattering. */
+/* Gives the change in direction in the normal plane for the given angles and p-th-order
+ * scattering. */
ccl_device_inline float delta_phi(int p, float gamma_o, float gamma_t)
{
return 2.0f * p * gamma_t - 2.0f * gamma_o + p * M_PI_F;
@@ -205,9 +206,6 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
float3 X = safe_normalize(sd->dPdu);
float3 Y = safe_normalize(cross(X, sd->I));
float3 Z = safe_normalize(cross(X, Y));
- /* TODO: the solution below works where sd->Ng is the normal
- * pointing from the center of the curve to the shading point.
- * It doesn't work for triangles, see https://developer.blender.org/T43625 */
/* h -1..0..1 means the ray goes from grazing the hair, to hitting it at
* the center, to grazing the other edge. This is the sine of the angle
@@ -215,7 +213,9 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
/* TODO: we convert this value to a cosine later and discard the sign, so
* we could probably save some operations. */
- float h = dot(cross(sd->Ng, X), Z);
+ float h = (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) ?
+ -sd->v :
+ dot(cross(sd->Ng, X), Z);
kernel_assert(fabsf(h) < 1.0f + 1e-4f);
kernel_assert(isfinite3_safe(Y));
@@ -492,6 +492,36 @@ ccl_device void bsdf_principled_hair_blur(ShaderClosure *sc, float roughness)
bsdf->m0_roughness = fmaxf(roughness, bsdf->m0_roughness);
}
+/* Hair Albedo */
+
+ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
+ const float azimuthal_roughness)
+{
+ const float x = azimuthal_roughness;
+ return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
+}
+
+ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc)
+{
+ PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc;
+ return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
+}
+
+ccl_device_inline float3
+bsdf_principled_hair_sigma_from_reflectance(const float3 color, const float azimuthal_roughness)
+{
+ const float3 sigma = log3(color) /
+ bsdf_principled_hair_albedo_roughness_scale(azimuthal_roughness);
+ return sigma * sigma;
+}
+
+ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const float eumelanin,
+ const float pheomelanin)
+{
+ return eumelanin * make_float3(0.506f, 0.841f, 1.653f) +
+ pheomelanin * make_float3(0.343f, 0.733f, 1.924f);
+}
+
CCL_NAMESPACE_END
#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
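
Restated as formulas (taken directly from the functions added above), with v the azimuthal roughness, C the per-channel reflectance, and S(v) the quintic fit returned by bsdf_principled_hair_albedo_roughness_scale():

    S(v) = 0.245 v^5 + 5.574 v^4 - 10.73 v^3 + 2.532 v^2 - 0.215 v + 5.969

    A(\sigma, v) = \exp\!\big(-\sqrt{\sigma}\, S(v)\big), \qquad
    \sigma_{\text{reflectance}}(C, v) = \left(\frac{\ln C}{S(v)}\right)^{\!2}, \qquad
    \sigma_{\text{melanin}}(e, p) = e\,(0.506, 0.841, 1.653) + p\,(0.343, 0.733, 1.924)

The reflectance inversion is exact per channel: substituting \sigma_{\text{reflectance}} back into A recovers C, since \ln C \le 0 for C \le 1. The melanin mapping is a linear combination of the eumelanin (e) and pheomelanin (p) concentrations.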
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index b4da3123f28..d9e81535b62 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -37,6 +37,7 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct MicrofacetExtra {
float3 color, cspec0;
+ float3 fresnel_color;
float clearcoat;
} MicrofacetExtra;
@@ -48,6 +49,8 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float3 T;
} MicrofacetBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf is too large!");
+
/* Beckmann and GGX microfacet importance sampling. */
ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg,
@@ -253,9 +256,7 @@ ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float
{
float3 F = make_float3(1.0f, 1.0f, 1.0f);
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
- bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID ||
- bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
-
+ bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
if (use_fresnel) {
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
@@ -274,6 +275,22 @@ ccl_device_forceinline float D_GTR1(float NdotH, float alpha)
return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
}
+ccl_device_forceinline void bsdf_microfacet_fresnel_color(const ShaderData *sd,
+ MicrofacetBsdf *bsdf)
+{
+ kernel_assert(CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type));
+
+ float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+ bsdf->extra->fresnel_color = interpolate_fresnel_color(
+ sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0);
+
+ if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
+ bsdf->extra->fresnel_color *= 0.25f * bsdf->extra->clearcoat;
+ }
+
+ bsdf->sample_weight *= average(bsdf->extra->fresnel_color);
+}
+
/* GGX microfacet with Smith shadow-masking from:
*
* Microfacet Models for Refraction through Rough Surfaces
@@ -292,46 +309,46 @@ ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
bsdf->extra = NULL;
bsdf->alpha_x = saturate(bsdf->alpha_x);
- bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->alpha_y = saturate(bsdf->alpha_y);
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+/* Required to maintain OSL interface. */
+ccl_device int bsdf_microfacet_ggx_isotropic_setup(MicrofacetBsdf *bsdf)
{
- bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
- bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
- bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+ bsdf->alpha_y = bsdf->alpha_x;
- float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
- float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
- bsdf->sample_weight *= F;
+ return bsdf_microfacet_ggx_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+ bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
bsdf->alpha_x = saturate(bsdf->alpha_x);
- bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->alpha_y = saturate(bsdf->alpha_y);
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
+ bsdf_microfacet_fresnel_color(sd, bsdf);
+
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
{
- bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
- bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
- bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
-
- float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
- float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
- bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F;
+ bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
bsdf->alpha_x = saturate(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
+ bsdf_microfacet_fresnel_color(sd, bsdf);
+
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@@ -350,36 +367,6 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
(bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat)));
}
-ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
-{
- bsdf->extra = NULL;
-
- bsdf->alpha_x = saturate(bsdf->alpha_x);
- bsdf->alpha_y = saturate(bsdf->alpha_y);
-
- bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
-
- return SD_BSDF | SD_BSDF_HAS_EVAL;
-}
-
-ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
-{
- bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
- bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
- bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
-
- float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
- float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
- bsdf->sample_weight *= F;
-
- bsdf->alpha_x = saturate(bsdf->alpha_x);
- bsdf->alpha_y = saturate(bsdf->alpha_y);
-
- bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID;
-
- return SD_BSDF | SD_BSDF_HAS_EVAL;
-}
-
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
bsdf->extra = NULL;
@@ -629,8 +616,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg,
*eval = make_float3(1e6f, 1e6f, 1e6f);
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
- bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID ||
- bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+ bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
/* if fresnel is used, calculate the color with reflection_color(...) */
if (use_fresnel) {
@@ -804,19 +790,18 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg,
ccl_device int bsdf_microfacet_beckmann_setup(MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturate(bsdf->alpha_x);
- bsdf->alpha_y = bsdf->alpha_x;
+ bsdf->alpha_y = saturate(bsdf->alpha_y);
bsdf->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device int bsdf_microfacet_beckmann_aniso_setup(MicrofacetBsdf *bsdf)
+/* Required to maintain OSL interface. */
+ccl_device int bsdf_microfacet_beckmann_isotropic_setup(MicrofacetBsdf *bsdf)
{
- bsdf->alpha_x = saturate(bsdf->alpha_x);
- bsdf->alpha_y = saturate(bsdf->alpha_y);
+ bsdf->alpha_y = bsdf->alpha_x;
- bsdf->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
- return SD_BSDF | SD_BSDF_HAS_EVAL;
+ return bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device int bsdf_microfacet_beckmann_refraction_setup(MicrofacetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 2cc1a9c5299..9795c8da065 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -16,7 +16,8 @@
CCL_NAMESPACE_BEGIN
-/* Most of the code is based on the supplemental implementations from https://eheitzresearch.wordpress.com/240-2/. */
+/* Most of the code is based on the supplemental implementations from
+ * https://eheitzresearch.wordpress.com/240-2/. */
/* === GGX Microfacet distribution functions === */
@@ -80,7 +81,8 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI,
return make_float2(slopeX, -slopeY);
}
-/* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
+/* Visible normal sampling for the GGX distribution
+ * (based on page 7 of the supplemental implementation). */
ccl_device_forceinline float3 mf_sample_vndf(const float3 wi,
const float2 alpha,
const float randx,
@@ -134,7 +136,8 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w,
return make_float3(phase, phase, phase);
}
-/* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
+/* Phase function for dielectric transmissive materials, including both reflection and refraction
+ * according to the dielectric fresnel term. */
ccl_device_forceinline float3 mf_sample_phase_glass(
const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
{
@@ -227,7 +230,8 @@ ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float l
return powf(C1, lambda);
}
-/* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */
+/* Sampling from the visible height distribution (based on page 17 of the supplemental
+ * implementation). */
ccl_device_forceinline bool mf_sample_height(
const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
{
@@ -254,7 +258,8 @@ ccl_device_forceinline bool mf_sample_height(
}
/* === PDF approximations for the different phase functions. ===
- * As explained in bsdf_microfacet_multi_impl.h, using approximations with MIS still produces an unbiased result. */
+ * As explained in bsdf_microfacet_multi_impl.h, using approximations with MIS still produces an
+ * unbiased result. */
/* Approximation for the albedo of the single-scattering GGX distribution,
* the missing energy is then approximated as a diffuse reflection for the PDF. */
@@ -342,7 +347,8 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi,
}
}
-/* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */
+/* === Actual random walk implementations === */
+/* One version of mf_eval and mf_sample per phase function. */
#define MF_NAME_JOIN(x, y) x##_##y
#define MF_NAME_EVAL(x, y) MF_NAME_JOIN(x, y)
@@ -372,17 +378,13 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
bsdf->alpha_y = clamp(bsdf->alpha_y, 1e-4f, 1.0f);
- bsdf->extra->color.x = saturate(bsdf->extra->color.x);
- bsdf->extra->color.y = saturate(bsdf->extra->color.y);
- bsdf->extra->color.z = saturate(bsdf->extra->color.z);
- bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
- bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
- bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+ bsdf->extra->color = saturate3(bsdf->extra->color);
+ bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}
-ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
+ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
{
if (is_zero(bsdf->T))
bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
@@ -392,39 +394,14 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
-ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf,
- const ShaderData *sd)
+ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
{
if (is_zero(bsdf->T))
bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
- float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
- float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
- bsdf->sample_weight *= F;
-
- return bsdf_microfacet_multi_ggx_common_setup(bsdf);
-}
-
-ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
-{
- bsdf->alpha_y = bsdf->alpha_x;
-
- bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
-
- return bsdf_microfacet_multi_ggx_common_setup(bsdf);
-}
-
-ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
-{
- bsdf->alpha_y = bsdf->alpha_x;
-
- bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
-
- float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
- float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
- bsdf->sample_weight *= F;
+ bsdf_microfacet_fresnel_color(sd, bsdf);
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
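The Fresnel-aware setup functions above now defer to a shared bsdf_microfacet_fresnel_color() helper instead of repeating the F0/average-Fresnel weighting inline. A hypothetical sketch of such a helper, assembled only from the removed lines (the real helper lives in the kernel headers and may store additional per-closure state):

/* Hypothetical sketch, not the kernel's definition. */
ccl_device_inline void bsdf_microfacet_fresnel_color(const ShaderData *sd, MicrofacetBsdf *bsdf)
{
  const float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
  const float F = average(
      interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
  bsdf->sample_weight *= F;
}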
@@ -562,9 +539,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf)
bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
bsdf->alpha_y = bsdf->alpha_x;
bsdf->ior = max(0.0f, bsdf->ior);
- bsdf->extra->color.x = saturate(bsdf->extra->color.x);
- bsdf->extra->color.y = saturate(bsdf->extra->color.y);
- bsdf->extra->color.z = saturate(bsdf->extra->color.z);
+ bsdf->extra->color = saturate3(bsdf->extra->color);
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
@@ -577,18 +552,12 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsd
bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
bsdf->alpha_y = bsdf->alpha_x;
bsdf->ior = max(0.0f, bsdf->ior);
- bsdf->extra->color.x = saturate(bsdf->extra->color.x);
- bsdf->extra->color.y = saturate(bsdf->extra->color.y);
- bsdf->extra->color.z = saturate(bsdf->extra->color.z);
- bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
- bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
- bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+ bsdf->extra->color = saturate3(bsdf->extra->color);
+ bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
- float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
- float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
- bsdf->sample_weight *= F;
+ bsdf_microfacet_fresnel_color(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 79247ee8057..04d9b22d7d2 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -16,14 +16,14 @@
/* Evaluate the BSDF from wi to wo.
* Evaluation is split into the analytical single-scattering BSDF and the multi-scattering BSDF,
- * which is evaluated stochastically through a random walk. At each bounce (except for the first one),
- * the amount of reflection from here towards wo is evaluated before bouncing again.
+ * which is evaluated stochastically through a random walk. At each bounce (except for the first
+ * one), the amount of reflection from here towards wo is evaluated before bouncing again.
*
- * Because of the random walk, the evaluation is not deterministic, but its expected value is equal to
- * the correct BSDF, which is enough for Monte-Carlo rendering. The PDF also can't be determined
- * analytically, so the single-scattering PDF plus a diffuse term to account for the multi-scattered
- * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
- * the balance heuristic isn't necessarily optimal anymore.
+ * Because of the random walk, the evaluation is not deterministic, but its expected value is equal
+ * to the correct BSDF, which is enough for Monte-Carlo rendering. The PDF also can't be determined
+ * analytically, so the single-scattering PDF plus a diffuse term to account for the
+ * multi-scattered energy is used. In combination with MIS, that is enough to produce an unbiased
+ * result, although the balance heuristic isn't necessarily optimal anymore.
*/
ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
float3 wo,
@@ -36,7 +36,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
bool use_fresnel,
const float3 cspec0)
{
- /* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */
+ /* Evaluating for a shallower incoming direction produces less noise, and the properties of the
+ * BSDF guarantee reciprocity. */
bool swapped = false;
#ifdef MF_MULTI_GLASS
if (wi.z * wo.z < 0.0f) {
@@ -180,9 +181,9 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
return eval;
}
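The documentation block above makes two claims worth unpacking: the random-walk evaluation is noisy, but its expected value equals the true BSDF, and that alone is sufficient for a Monte Carlo renderer. A small standalone illustration of that property on a toy integrand (nothing from the kernel): multiplying a deterministic evaluator by an independent random factor with expectation 1 leaves the limit of the Monte Carlo estimate unchanged.

#include <cstdio>
#include <random>

int main()
{
  std::mt19937 rng(42);
  std::uniform_real_distribution<double> u(0.0, 1.0);

  const int N = 1000000;
  double sum_exact = 0.0, sum_noisy = 0.0;
  for (int i = 0; i < N; i++) {
    const double x = u(rng);
    const double f = x * x;            /* stand-in for the true BSDF value */
    const double noise = 2.0 * u(rng); /* independent factor with expectation 1 */
    sum_exact += f;
    sum_noisy += f * noise; /* noisy, but unbiased, evaluation */
  }
  /* Both estimates converge to the integral of x^2 over [0, 1], i.e. 1/3. */
  printf("exact: %f  noisy: %f\n", sum_exact / N, sum_noisy / N);
  return 0;
}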
-/* Perform a random walk on the microsurface starting from wi, returning the direction in which the walk
- * escaped the surface in wo. The function returns the throughput between wi and wo.
- * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
+/* Perform a random walk on the microsurface starting from wi, returning the direction in which the
+ * walk escaped the surface in wo. The function returns the throughput between wi and wo. Without
+ * reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
*/
ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
float3 *wo,
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 104ed5b2818..41e5736bf49 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -27,6 +27,8 @@ typedef ccl_addr_space struct OrenNayarBsdf {
float b;
} OrenNayarBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(OrenNayarBsdf), "OrenNayarBsdf is too large!");
+
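These static_assert additions (repeated for each closure below) encode a memory-layout invariant: closures are stored in fixed-size ShaderClosure slots, so any closure struct that outgrows the slot must fail at compile time rather than corrupt neighboring memory at run time. A minimal standalone illustration of the pattern; Slot and SheenClosure are placeholder names, not kernel types:

#include <cstddef>

/* Fixed-size storage slot, analogous to ShaderClosure. */
struct Slot {
  int type;
  float weight[3];
  float pad[12]; /* reserves room for the largest closure */
};

/* One specific closure that must fit inside a Slot. */
struct SheenClosure {
  int type;
  float weight[3];
  float avg_value;
};

/* Compilation breaks as soon as SheenClosure outgrows the slot. */
static_assert(sizeof(Slot) >= sizeof(SheenClosure), "SheenClosure is too large!");

int main()
{
  return 0;
}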
ccl_device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc,
float3 n,
float3 v,
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index b6fd0e68681..cf5484383f2 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -44,6 +44,8 @@ typedef ccl_addr_space struct PhongRampBsdf {
float3 *colors;
} PhongRampBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(PhongRampBsdf), "PhongRampBsdf is too large!");
+
ccl_device float3 bsdf_phong_ramp_get_color(const float3 colors[8], float pos)
{
int MAXCOLORS = 8;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index d7795974ef5..43646aaeb5b 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -30,6 +30,9 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
float roughness;
} PrincipledDiffuseBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(PrincipledDiffuseBsdf),
+ "PrincipledDiffuseBsdf is too large!");
+
ccl_device float3 calculate_principled_diffuse_brdf(
const PrincipledDiffuseBsdf *bsdf, float3 N, float3 V, float3 L, float3 H, float *pdf)
{
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index bc522095b3b..3707de29d73 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -26,10 +26,26 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
SHADER_CLOSURE_BASE;
+ float avg_value;
} PrincipledSheenBsdf;
-ccl_device float3 calculate_principled_sheen_brdf(
- const PrincipledSheenBsdf *bsdf, float3 N, float3 V, float3 L, float3 H, float *pdf)
+static_assert(sizeof(ShaderClosure) >= sizeof(PrincipledSheenBsdf),
+ "PrincipledSheenBsdf is too large!");
+
+ccl_device_inline float calculate_avg_principled_sheen_brdf(float3 N, float3 I)
+{
+ /* To compute the average, we set the half-vector to the normal, resulting in
+ * NdotI = NdotL = NdotV = LdotH */
+ float NdotI = dot(N, I);
+ if (NdotI < 0.0f) {
+ return 0.0f;
+ }
+
+ return schlick_fresnel(NdotI) * NdotI;
+}
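The averaging trick in the comment above (setting H = N so that NdotI = NdotL = NdotV = LdotH) reduces the sheen response to schlick_fresnel(NdotI) * NdotI, which the setup function below folds into sample_weight. A standalone numeric illustration using the same (1 - u)^5 Schlick term that appears later in bsdf_util.h; everything here is a toy reimplementation, not kernel code:

#include <cstdio>

/* Schlick's approximation, matching the kernel's schlick_fresnel(): (1 - u)^5. */
static float schlick_fresnel(float u)
{
  const float m = 1.0f - u;
  const float m2 = m * m;
  return m2 * m2 * m;
}

/* Average sheen response with H = N, so every dot product collapses to N.I. */
static float avg_principled_sheen(float NdotI)
{
  return (NdotI < 0.0f) ? 0.0f : schlick_fresnel(NdotI) * NdotI;
}

int main()
{
  /* Grazing angles keep a noticeable weight, near-normal incidence almost none. */
  const float angles[] = {0.1f, 0.5f, 0.9f};
  for (int i = 0; i < 3; i++) {
    printf("N.I = %.1f -> avg = %.4f\n", angles[i], avg_principled_sheen(angles[i]));
  }
  return 0;
}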
+
+ccl_device float3
+calculate_principled_sheen_brdf(float3 N, float3 V, float3 L, float3 H, float *pdf)
{
float NdotL = dot(N, L);
float NdotV = dot(N, V);
@@ -46,9 +62,11 @@ ccl_device float3 calculate_principled_sheen_brdf(
return make_float3(value, value, value);
}
-ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf)
+ccl_device int bsdf_principled_sheen_setup(const ShaderData *sd, PrincipledSheenBsdf *bsdf)
{
bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID;
+ bsdf->avg_value = calculate_avg_principled_sheen_brdf(bsdf->N, sd->I);
+ bsdf->sample_weight *= bsdf->avg_value;
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@@ -66,7 +84,7 @@ ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc,
if (dot(N, omega_in) > 0.0f) {
*pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
- return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf);
+ return calculate_principled_sheen_brdf(N, V, L, H, pdf);
}
else {
*pdf = 0.0f;
@@ -104,7 +122,7 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
if (dot(Ng, *omega_in) > 0) {
float3 H = normalize(I + *omega_in);
- *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf);
+ *eval = calculate_principled_sheen_brdf(N, I, *omega_in, H, pdf);
#ifdef __RAY_DIFFERENTIALS__
// TODO: find a better approximation for the diffuse bounce
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index f37fd228087..cc5de21ed0e 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -42,6 +42,8 @@ typedef ccl_addr_space struct ToonBsdf {
float smooth;
} ToonBsdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(ToonBsdf), "ToonBsdf is too large!");
+
/* DIFFUSE TOON */
ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index a9a27edd7de..a73dee1b045 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -134,20 +134,6 @@ ccl_device float schlick_fresnel(float u)
return m2 * m2 * m; // pow(m, 5)
}
-ccl_device float smooth_step(float edge0, float edge1, float x)
-{
- float result;
- if (x < edge0)
- result = 0.0f;
- else if (x >= edge1)
- result = 1.0f;
- else {
- float t = (x - edge0) / (edge1 - edge0);
- result = (3.0f - 2.0f * t) * (t * t);
- }
- return result;
-}
-
/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
ccl_device_forceinline float3
interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0)
@@ -155,7 +141,7 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0
/* Calculate the fresnel interpolation factor
* The value from fresnel_dielectric_cos(...) has to be normalized because
* the cspec0 keeps the F0 color
- */
+ */
float F0_norm = 1.0f / (1.0f - F0);
float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
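Spelled out, the normalization above remaps the dielectric Fresnel value from its range [F0, 1] at this IOR back to [0, 1]:

  FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) / (1 - F0)

so FH is 0 at normal incidence and approaches 1 at grazing angles. The blend itself is not visible in this hunk, but per the function's own comment it amounts to interpolating cspec0 toward white, roughly cspec0 * (1 - FH) + white * FH.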
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 57804eca269..4d88a822821 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -30,6 +30,8 @@ typedef ccl_addr_space struct Bssrdf {
float channels;
} Bssrdf;
+static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!");
+
/* Planar Truncated Gaussian
*
* Note how this is different from the typical gaussian, this one integrates
@@ -224,12 +226,12 @@ ccl_device float bssrdf_burley_eval(const float d, float r)
if (r >= Rm)
return 0.0f;
- /* Burley refletance profile, equation (3).
+ /* Burley reflectance profile, equation (3).
*
* NOTES:
* - Surface albedo is already included into sc->weight, no need to
* multiply by this term here.
- * - This is normalized diffuse model, so the equation is mutliplied
+ * - This is normalized diffuse model, so the equation is multiplied
* by 2*pi, which also matches cdf().
*/
float exp_r_3_d = expf(-r / (3.0f * d));
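For reference, Burley's normalized diffusion profile (the "equation (3)" cited above) is

  R(r) = (exp(-r / d) + exp(-r / (3 * d))) / (8 * pi * d * r)

with the surface albedo folded into the closure weight as the note says. Multiplying by 2 * pi * r, which is what matching cdf() implies, leaves a radial density of (exp(-r / d) + exp(-r / (3 * d))) / (4 * d). Computing exp_r_3_d = exp(-r / (3 * d)) first is convenient because exp(-r / d) is simply that value cubed; the remainder of the evaluation lies outside this hunk.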
@@ -450,7 +452,8 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
bssrdf_gaussian_sample(radius, xi, r, h);
}
- else { /*if(bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
+ else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
+ * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */
bssrdf_burley_sample(radius, xi, r, h);
}
}
@@ -466,7 +469,8 @@ ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r)
else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
return bssrdf_gaussian_pdf(radius, r);
}
- else { /*if(bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
+ else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
+ * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
return bssrdf_burley_pdf(radius, r);
}
}
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 473bc0e8a82..1430f712701 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -40,6 +40,9 @@ typedef ccl_addr_space struct HenyeyGreensteinVolume {
float g;
} HenyeyGreensteinVolume;
+static_assert(sizeof(ShaderClosure) >= sizeof(HenyeyGreensteinVolume),
+ "HenyeyGreensteinVolume is too large!");
+
/* Given cosine between rays, return probability density that a photon bounces
* to that direction. The g parameter controls how different it is from the
* uniform sphere. g=0 uniform diffuse-like, g=1 close to sharp single ray. */
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index 809ccfe8be6..8a2af957146 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -18,8 +18,9 @@ CCL_NAMESPACE_BEGIN
#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass.
- * Repeat the loop for every secondary frame if there are any. */
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always
+ * points to the current pixel in the first pass. Repeat the loop for every secondary frame if
+ * there are any. */
#define FOR_PIXEL_WINDOW \
for (int frame = 0; frame < tile_info->num_frames; frame++) { \
pixel.z = tile_info->frames[frame]; \
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 1e0d6e93453..59d4ace2bef 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -20,8 +20,8 @@ CCL_NAMESPACE_BEGIN
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
- * Repeat the loop for every secondary frame if there are any. */
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set
+ * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */
#define FOR_PIXEL_WINDOW_SSE \
for (int frame = 0; frame < tile_info->num_frames; frame++) { \
pixel.z = tile_info->frames[frame]; \
@@ -109,7 +109,6 @@ ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
if (use_time) {
scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
- ;
}
scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index a94266a8786..24200c29203 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -197,7 +197,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
bool use_time)
{
int4 clip_area = rect_clip(rect, filter_window);
- /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ /* fy and fy are in filter-window-relative coordinates,
+ * while x and y are in feature-window-relative coordinates. */
for (int y = clip_area.y; y < clip_area.w; y++) {
for (int x = clip_area.x; x < clip_area.z; x++) {
const int low = max(rect.x, x - f);
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index 8211311313d..97cecba190e 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -16,14 +16,19 @@
CCL_NAMESPACE_BEGIN
-/* First step of the shadow prefiltering, performs the shadow division and stores all data
+/**
+ * First step of the shadow prefiltering, performs the shadow division and stores all data
* in a nice and easy rectangular array that can be passed to the NLM filter.
*
* Calculates:
- * unfiltered: Contains the two half images of the shadow feature pass
- * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
- * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
- * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ * \param unfiltered: Contains the two half images of the shadow feature pass
+ * \param sampleVariance: The sample-based variance calculated in the kernel.
+ * Note: This calculation is biased in general,
+ * and especially here since the variance of the ratio can only be approximated.
+ * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy
+ * (since it's essentially the buffer variance of the two variance halves)
+ * \param bufferVariance: The buffer-based variance of the shadow feature.
+ * Unbiased, but quite noisy.
*/
ccl_device void kernel_filter_divide_shadow(int sample,
CCL_FILTER_TILE_INFO,
@@ -140,25 +145,34 @@ ccl_device void kernel_filter_write_feature(int sample,
combined_buffer[out_offset] = from[idx];
}
+#define GET_COLOR(image) \
+ make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride])
+#define SET_COLOR(image, color) \
+ image[idx] = color.x; \
+ image[idx + pass_stride] = color.y; \
+ image[idx + 2 * pass_stride] = color.z
+
ccl_device void kernel_filter_detect_outliers(int x,
int y,
- ccl_global float *image,
- ccl_global float *variance,
+ ccl_global float *in,
+ ccl_global float *variance_out,
ccl_global float *depth,
- ccl_global float *out,
+ ccl_global float *image_out,
int4 rect,
int pass_stride)
{
int buffer_w = align_up(rect.z - rect.x, 4);
+ ccl_global float *image_in = in;
+ ccl_global float *variance_in = in + 3 * pass_stride;
+
int n = 0;
float values[25];
float pixel_variance, max_variance = 0.0f;
for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
- float3 color = make_float3(
- image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]);
+ float3 color = GET_COLOR(image_in);
color = max(color, make_float3(0.0f, 0.0f, 0.0f));
float L = average(color);
@@ -176,8 +190,7 @@ ccl_device void kernel_filter_detect_outliers(int x,
values[i] = L;
n++;
- float3 pixel_var = make_float3(
- variance[idx], variance[idx + pass_stride], variance[idx + 2 * pass_stride]);
+ float3 pixel_var = GET_COLOR(variance_in);
float var = average(pixel_var);
if ((x1 == x) && (y1 == y)) {
pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
@@ -192,8 +205,12 @@ ccl_device void kernel_filter_detect_outliers(int x,
max_variance += 1e-4f;
int idx = (y - rect.y) * buffer_w + (x - rect.x);
- float3 color = make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]);
+
+ float3 color = GET_COLOR(image_in);
+ float3 variance = GET_COLOR(variance_in);
color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+ variance = max(variance, make_float3(0.0f, 0.0f, 0.0f));
+
float L = average(color);
float ref = 2.0f * values[(int)(n * 0.75f)];
@@ -204,36 +221,43 @@ ccl_device void kernel_filter_detect_outliers(int x,
if (L > ref) {
/* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
- * should actually be at the reference value:
- * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
- * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+ * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is
+ * that the pixel should actually be at the reference value: If the reference is within the
+ * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very
+ * unlikely that the pixel should be darker, which indicates a legitimate highlight.
*/
if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
depth[idx] = -depth[idx];
color *= ref / L;
- variance[idx] = variance[idx + pass_stride] = variance[idx + 2 * pass_stride] = max_variance;
+ variance = make_float3(max_variance, max_variance, max_variance);
}
else {
float stddev = sqrtf(pixel_variance);
if (L - 3 * stddev < ref) {
/* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+ * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM
+ * weights. */
depth[idx] = -depth[idx];
float fac = ref / L;
color *= fac;
- variance[idx] *= fac * fac;
- variance[idx + pass_stride] *= fac * fac;
- variance[idx + 2 * pass_stride] *= fac * fac;
+ variance *= sqr(fac);
}
}
}
- out[idx] = color.x;
- out[idx + pass_stride] = color.y;
- out[idx + 2 * pass_stride] = color.z;
+
+ /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results.
+ * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we
+ * scale by the square of that (since we have variance instead of standard deviation). */
+ color = color_highlight_compress(color, &variance);
+
+ SET_COLOR(image_out, color);
+ SET_COLOR(variance_out, variance);
}
+#undef GET_COLOR
+#undef SET_COLOR
+
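The highlight compression pair referenced above (color_highlight_compress() here, color_highlight_uncompress() in kernel_filter_finalize further down) is a log(1 + x) transform with first-order variance propagation. A standalone per-channel sketch of the described math; the kernel's helpers operate on float3 and are not shown in this diff, so the bodies below are illustrative only:

#include <cmath>
#include <cstdio>

/* Forward transform: y = log(1 + x). Its derivative is 1 / (1 + x), so the
 * variance is scaled by the square of that factor (first-order propagation). */
static void highlight_compress(float *color, float *variance)
{
  const float scale = 1.0f / (1.0f + *color);
  *variance *= scale * scale;
  *color = logf(1.0f + *color);
}

/* Inverse transform applied after denoising: x = exp(y) - 1. */
static float highlight_uncompress(float color)
{
  return expf(color) - 1.0f;
}

int main()
{
  float c = 50.0f, v = 4.0f; /* a bright, noisy highlight */
  highlight_compress(&c, &v);
  printf("compressed: %f (variance %f)\n", c, v);
  printf("round trip: %f\n", highlight_uncompress(c));
  return 0;
}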
/* Combine A/B buffers.
* Calculates the combined mean and the buffer variance. */
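For two independent half buffers a and b of the same pixel, each with per-pixel variance s^2, the combined mean (a + b) / 2 has variance s^2 / 2 while E[(a - b)^2] = 2 * s^2, so (a - b)^2 / 4 is an unbiased (if noisy) per-pixel estimate of the combined mean's variance. That is the general split-buffer identity this comment refers to; the kernel's exact expression is not shown in this hunk.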
ccl_device void kernel_filter_combine_halves(int x,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 850f20584da..17941689ad5 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -108,7 +108,7 @@ ccl_device_inline void kernel_filter_finalize(int x,
/* The weighted average of pixel colors (essentially, the NLM-filtered image).
* In case the solution of the linear model fails due to numerical issues or
- * returns non-sensical negative values, fall back to this value. */
+ * returns nonsensical negative values, fall back to this value. */
float3 mean_color = XtWY[0] / XtWX[0];
math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
@@ -119,8 +119,8 @@ ccl_device_inline void kernel_filter_finalize(int x,
final_color = mean_color;
}
- /* Clamp pixel value to positive values. */
- final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+ /* Clamp pixel value to positive values and reverse the highlight compression transform. */
+ final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f)));
ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
buffer_params.z;
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index 69e3c7c458d..880a661214e 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -55,7 +55,8 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
- /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ /* === Scale the shifted feature passes to a range of [-1; 1] ===
+ * Will be baked into the transform later. */
float feature_scale[DENOISE_FEATURES];
math_vector_zero(feature_scale, num_features);
@@ -69,8 +70,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
- * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ * This transformation maps the num_features-dimensional feature space to a reduced feature
+ * (r-feature) space which generally has fewer dimensions.
+ * This mainly helps to prevent over-fitting. */
float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW
@@ -83,7 +85,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
*rank = 0;
- /* Prevent overfitting when a small window is used. */
+ /* Prevent over-fitting when a small window is used. */
int max_rank = min(num_features, num_pixels / 3);
if (pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
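The pca_threshold < 0.0f branch visible above appears to pick the reduced rank from the eigenvalue energy (threshold_energy) rather than a fixed cutoff. A standalone sketch of that general idea, keeping the largest eigenvalues until they cover a requested fraction of the total energy; the function name and the exact stopping rule below are illustrative, not the kernel's:

#include <cstdio>

/* Keep eigenvalues, largest first, until they account for the requested
 * fraction of the total energy; the remaining dimensions are dropped. */
static int select_rank_by_energy(const float *eigenvalues, int n, float keep_fraction)
{
  float total = 0.0f;
  for (int i = 0; i < n; i++) {
    total += eigenvalues[i];
  }
  const float target = keep_fraction * total;

  float accumulated = 0.0f;
  int rank = 0;
  while (rank < n && accumulated < target) {
    accumulated += eigenvalues[rank++];
  }
  return rank;
}

int main()
{
  /* Eigenvalues sorted in decreasing order, as the decomposition produces them. */
  const float eigenvalues[] = {5.0f, 2.0f, 1.0f, 0.2f, 0.05f};
  printf("rank = %d\n", select_rank_by_energy(eigenvalues, 5, 0.95f));
  return 0;
}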
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index 89cddfd927f..adc85881fe5 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -61,7 +61,8 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
- /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ /* === Scale the shifted feature passes to a range of [-1; 1] ===
+ * Will be baked into the transform later. */
float feature_scale[DENOISE_FEATURES];
math_vector_zero(feature_scale, num_features);
@@ -75,8 +76,9 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
- * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ * This transformation maps the num_features-dimensional feature space to a reduced feature
+ * (r-feature) space which generally has fewer dimensions.
+ * This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 22397b292db..5a124b5d73b 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -58,7 +58,8 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
}
- /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ /* === Scale the shifted feature passes to a range of [-1; 1] ===
+ * Will be baked into the transform later. */
float4 feature_scale[DENOISE_FEATURES];
math_vector_zero_sse(feature_scale, num_features);
FOR_PIXEL_WINDOW_SSE
@@ -72,8 +73,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
filter_calculate_scale_sse(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
- * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ * This transformation maps the num_features-dimensional feature space to a reduced feature
+ * (r-feature) space which generally has fewer dimensions.
+ * This mainly helps to prevent over-fitting. */
float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, num_features);
FOR_PIXEL_WINDOW_SSE
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index e81c1b781c8..5ff4d5f7053 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+// clang-format off
#include "kernel/geom/geom_attribute.h"
#include "kernel/geom/geom_object.h"
#ifdef __PATCH_EVAL__
@@ -30,3 +31,4 @@
#include "kernel/geom/geom_curve_intersect.h"
#include "kernel/geom/geom_volume.h"
#include "kernel/geom/geom_primitive.h"
+// clang-format on
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 456608bfa22..e1b0e6fb81c 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -29,17 +29,11 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
{
-#ifdef __HAIR__
- if (sd->type & PRIMITIVE_ALL_CURVE) {
- return ATTR_PRIM_CURVE;
- }
- else
-#endif
- if (subd_triangle_patch(kg, sd) != ~0) {
+ if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
return ATTR_PRIM_SUBD;
}
else {
- return ATTR_PRIM_TRIANGLE;
+ return ATTR_PRIM_GEOMETRY;
}
}
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index e0aacb434eb..6ff0c7f2044 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -23,33 +23,6 @@ CCL_NAMESPACE_BEGIN
#ifdef __HAIR__
-/* Interpolation of curve geometry */
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
- float fc = 0.71f;
- float data[4];
- float t2 = t * t;
- data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc;
- data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t;
- data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc;
- data[3] = 3.0f * fc * t2 - 2.0f * fc * t;
- return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
- float data[4];
- float fc = 0.71f;
- float t2 = t * t;
- float t3 = t2 * t;
- data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t;
- data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f;
- data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t;
- data[3] = fc * t3 - fc * t2;
- return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
/* Reading attributes on various curve elements */
ccl_device float curve_attribute_float(
@@ -83,6 +56,16 @@ ccl_device float curve_attribute_float(
return (1.0f - sd->u) * f0 + sd->u * f1;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = 0.0f;
+ if (dy)
+ *dy = 0.0f;
+# endif
+
+ return kernel_tex_fetch(__attributes_float, desc.offset);
+ }
else {
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -133,6 +116,16 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
return (1.0f - sd->u) * f0 + sd->u * f1;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = make_float2(0.0f, 0.0f);
+ if (dy)
+ *dy = make_float2(0.0f, 0.0f);
+# endif
+
+ return kernel_tex_fetch(__attributes_float2, desc.offset);
+ }
else {
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -183,6 +176,16 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
return (1.0f - sd->u) * f0 + sd->u * f1;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = make_float3(0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float3(0.0f, 0.0f, 0.0f);
+# endif
+
+ return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+ }
else {
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -195,6 +198,66 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
}
}
+ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
+{
+ if (desc.element == ATTR_ELEMENT_CURVE) {
+ /* idea: we can't derive any useful differentials here, but for tiled
+ * mipmap image caching it would be useful to avoid reading the highest
+ * detail level always. maybe a derivative based on the hair density
+ * could be computed somehow? */
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+# endif
+
+ return kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim);
+ }
+ else if (desc.element == ATTR_ELEMENT_CURVE_KEY ||
+ desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+ int k1 = k0 + 1;
+
+ float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+ float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = sd->du.dx * (f1 - f0);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+# endif
+
+ return (1.0f - sd->u) * f0 + sd->u * f1;
+ }
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+# endif
+
+ return kernel_tex_fetch(__attributes_float3, desc.offset);
+ }
+ else {
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+# endif
+
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+}
+
/* Curve thickness */
ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
@@ -208,12 +271,12 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
float4 P_curve[2];
- if (sd->type & PRIMITIVE_CURVE) {
+ if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
}
else {
- motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+ motion_curve_keys_linear(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
}
r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index 5fd277c2f99..06d2c016f5b 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -1,4 +1,7 @@
/*
+ * Copyright 2009-2020 Intel Corporation. Adapted from Embree with
+ * with modifications.
+ * modifications.
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
@@ -14,802 +17,685 @@
CCL_NAMESPACE_BEGIN
-/* Curve primitive intersection functions. */
+/* Curve primitive intersection functions.
+ *
+ * The code here was adapted from curve_intersector_sweep.h in Embree, to get
+ * an exact match between Embree CPU ray-tracing and our GPU ray-tracing. */
+
+#define CURVE_NUM_BEZIER_SUBDIVISIONS 3
+#define CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE (CURVE_NUM_BEZIER_SUBDIVISIONS + 1)
+#define CURVE_NUM_BEZIER_STEPS 2
+#define CURVE_NUM_JACOBIAN_ITERATIONS 5
#ifdef __HAIR__
-# ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
+/* Catmull-rom curve evaluation. */
+
+ccl_device_inline float4 catmull_rom_basis_eval(const float4 curve[4], float u)
{
- return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
+ const float t = u;
+ const float s = 1.0f - u;
+ const float n0 = -t * s * s;
+ const float n1 = 2.0f + t * t * (3.0f * t - 5.0f);
+ const float n2 = 2.0f + s * s * (3.0f * s - 5.0f);
+ const float n3 = -s * t * t;
+ return 0.5f * (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3);
}
-# endif
-/* On CPU pass P and dir by reference to aligned vector. */
-ccl_device_forceinline bool cardinal_curve_intersect(KernelGlobals *kg,
- Intersection *isect,
- const float3 ccl_ref P,
- const float3 ccl_ref dir,
- uint visibility,
- int object,
- int curveAddr,
- float time,
- int type,
- uint *lcg_state,
- float difl,
- float extmax)
+ccl_device_inline float4 catmull_rom_basis_derivative(const float4 curve[4], float u)
{
- const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+ const float t = u;
+ const float s = 1.0f - u;
+ const float n0 = -s * s + 2.0f * s * t;
+ const float n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t;
+ const float n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s;
+ const float n3 = -2.0f * s * t + t * t;
+ return 0.5f * (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3);
+}
- if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
- const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
- if (time < prim_time.x || time > prim_time.y) {
- return false;
- }
- }
+ccl_device_inline float4 catmull_rom_basis_derivative2(const float4 curve[4], float u)
+{
- int segment = PRIMITIVE_UNPACK_SEGMENT(type);
- float epsilon = 0.0f;
- float r_st, r_en;
+ const float t = u;
+ const float n0 = -3.0f * t + 2.0f;
+ const float n1 = 9.0f * t - 5.0f;
+ const float n2 = -9.0f * t + 4.0f;
+ const float n3 = 3.0f * t - 1.0f;
+ return (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3);
+}
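The three functions above evaluate a Catmull-Rom segment and its first and second derivatives from four (position, radius) keys packed into float4. A small standalone usage example evaluating the same position weights at the segment midpoint; the float4 type and operators below are simplified stand-ins for the kernel's math types:

#include <cstdio>

struct float4 {
  float x, y, z, w;
};

static float4 operator*(float s, const float4 &a)
{
  return {s * a.x, s * a.y, s * a.z, s * a.w};
}

static float4 operator+(const float4 &a, const float4 &b)
{
  return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w};
}

/* Same weights as catmull_rom_basis_eval() above. */
static float4 catmull_rom_eval(const float4 curve[4], float u)
{
  const float t = u, s = 1.0f - u;
  const float n0 = -t * s * s;
  const float n1 = 2.0f + t * t * (3.0f * t - 5.0f);
  const float n2 = 2.0f + s * s * (3.0f * s - 5.0f);
  const float n3 = -s * t * t;
  return 0.5f * (n0 * curve[0] + n1 * curve[1] + n2 * curve[2] + n3 * curve[3]);
}

int main()
{
  /* Four keys: xyz position plus radius in w. */
  const float4 curve[4] = {
      {0.0f, 0.0f, 0.0f, 0.10f},
      {1.0f, 0.0f, 0.0f, 0.10f},
      {2.0f, 1.0f, 0.0f, 0.05f},
      {3.0f, 1.0f, 0.0f, 0.05f},
  };
  const float4 p = catmull_rom_eval(curve, 0.5f);
  printf("P(0.5) = (%f, %f, %f), radius = %f\n", p.x, p.y, p.z, p.w);
  return 0;
}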
- int depth = kernel_data.curve.subdivisions;
- int flags = kernel_data.curve.curveflags;
- int prim = kernel_tex_fetch(__prim_index, curveAddr);
+/* Thick Curve */
-# ifdef __KERNEL_SSE2__
- ssef vdir = load4f(dir);
- ssef vcurve_coef[4];
- const float3 *curve_coef = (float3 *)vcurve_coef;
+ccl_device_inline float3 dnormalize(const float3 p, const float3 dp)
+{
+ const float pp = dot(p, p);
+ const float pdp = dot(p, dp);
+ return (pp * dp - pdp * p) / (pp * sqrtf(pp));
+}
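dnormalize() is the quotient-rule derivative of normalization: with pp = dot(p, p) and pdp = dot(p, dp),

  d/dt (p / |p|) = (pp * dp - pdp * p) / (pp * sqrt(pp))

where the numerator is |p|^2 times the component of dp perpendicular to p and the denominator is |p|^3, which is exactly the expression returned above.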
- {
- ssef dtmp = vdir * vdir;
- ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
- ssef rd_ss = load1f_first(1.0f) / d_ss;
-
- ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
- int2 &v00 = (int2 &)v00vec;
-
- int k0 = v00.x + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, v00.x);
- int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-# if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && \
- (!defined(_MSC_VER) || _MSC_VER > 1800)
- avxf P_curve_0_1, P_curve_2_3;
- if (is_curve_primitive) {
- P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
- P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
- }
- else {
- int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
- motion_cardinal_curve_keys_avx(
- kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1, &P_curve_2_3);
- }
-# else /* __KERNEL_AVX2__ */
- ssef P_curve[4];
-
- if (is_curve_primitive) {
- P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
- P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
- P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
- P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+ccl_device_inline float sqr_point_to_line_distance(const float3 PmQ0, const float3 Q1mQ0)
+{
+ const float3 N = cross(PmQ0, Q1mQ0);
+ const float3 D = Q1mQ0;
+ return dot(N, N) / dot(D, D);
+}
+
+ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
+ const float3 cylinder_end,
+ const float cylinder_radius,
+ const float3 ray_dir,
+ float2 *t_o,
+ float *u0_o,
+ float3 *Ng0_o,
+ float *u1_o,
+ float3 *Ng1_o)
+{
+ /* Calculate quadratic equation to solve. */
+ const float rl = 1.0f / len(cylinder_end - cylinder_start);
+ const float3 P0 = cylinder_start, dP = (cylinder_end - cylinder_start) * rl;
+ const float3 O = -P0, dO = ray_dir;
+
+ const float dOdO = dot(dO, dO);
+ const float OdO = dot(dO, O);
+ const float OO = dot(O, O);
+ const float dOz = dot(dP, dO);
+ const float Oz = dot(dP, O);
+
+ const float A = dOdO - sqr(dOz);
+ const float B = 2.0f * (OdO - dOz * Oz);
+ const float C = OO - sqr(Oz) - sqr(cylinder_radius);
+
+ /* We miss the cylinder if determinant is smaller than zero. */
+ const float D = B * B - 4.0f * A * C;
+ if (!(D >= 0.0f)) {
+ *t_o = make_float2(FLT_MAX, -FLT_MAX);
+ return false;
+ }
+
+ /* Special case for rays that are parallel to the cylinder. */
+ const float eps = 16.0f * FLT_EPSILON * max(fabsf(dOdO), fabsf(sqr(dOz)));
+ if (fabsf(A) < eps) {
+ if (C <= 0.0f) {
+ *t_o = make_float2(-FLT_MAX, FLT_MAX);
+ return true;
}
else {
- int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
- motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4 *)&P_curve);
+ *t_o = make_float2(-FLT_MAX, FLT_MAX);
+ return false;
}
-# endif /* __KERNEL_AVX2__ */
-
- ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
- ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
- ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
- ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
- ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
- ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
- ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
- ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-# if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && \
- (!defined(_MSC_VER) || _MSC_VER > 1800)
- const avxf vPP = _mm256_broadcast_ps(&P.m128);
- const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
- const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
- const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
- const avxf p01 = madd(
- shuffle<0>(P_curve_0_1 - vPP),
- htfm00,
- madd(shuffle<1>(P_curve_0_1 - vPP), htfm11, shuffle<2>(P_curve_0_1 - vPP) * htfm22));
- const avxf p23 = madd(
- shuffle<0>(P_curve_2_3 - vPP),
- htfm00,
- madd(shuffle<1>(P_curve_2_3 - vPP), htfm11, shuffle<2>(P_curve_2_3 - vPP) * htfm22));
-
- const ssef p0 = _mm256_castps256_ps128(p01);
- const ssef p1 = _mm256_extractf128_ps(p01, 1);
- const ssef p2 = _mm256_castps256_ps128(p23);
- const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
- const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
- r_st = ((float4 &)P_curve_1).w;
- const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
- r_en = ((float4 &)P_curve_2).w;
-# else /* __KERNEL_AVX2__ */
- ssef htfm[] = {htfm0, htfm1, htfm2};
- ssef vP = load4f(P);
- ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
- ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
- ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
- ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
- r_st = ((float4 &)P_curve[1]).w;
- r_en = ((float4 &)P_curve[2]).w;
-# endif /* __KERNEL_AVX2__ */
-
- float fc = 0.71f;
- ssef vfc = ssef(fc);
- ssef vfcxp3 = vfc * p3;
-
- vcurve_coef[0] = p1;
- vcurve_coef[1] = vfc * (p2 - p0);
- vcurve_coef[2] = madd(
- ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
- vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
}
-# else
- float3 curve_coef[4];
- /* curve Intersection check */
- /* obtain curve parameters */
+ /* Standard case for rays that are not parallel to the cylinder. */
+ const float Q = sqrtf(D);
+ const float rcp_2A = 1.0f / (2.0f * A);
+ const float t0 = (-B - Q) * rcp_2A;
+ const float t1 = (-B + Q) * rcp_2A;
+
+ /* Calculates u and Ng for near hit. */
{
- /* ray transform created - this should be created at beginning of intersection loop */
- Transform htfm;
- float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
- htfm = make_transform(dir.z / d,
- 0,
- -dir.x / d,
- 0,
- -dir.x * dir.y / d,
- d,
- -dir.y * dir.z / d,
- 0,
- dir.x,
- dir.y,
- dir.z,
- 0);
-
- float4 v00 = kernel_tex_fetch(__curves, prim);
-
- int k0 = __float_as_int(v00.x) + segment;
- int k1 = k0 + 1;
-
- int ka = max(k0 - 1, __float_as_int(v00.x));
- int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
- float4 P_curve[4];
-
- if (is_curve_primitive) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
- P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
- }
- else {
- int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
- motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
- }
+ *u0_o = (t0 * dOz + Oz) * rl;
+ const float3 Pr = t0 * ray_dir;
+ const float3 Pl = (*u0_o) * (cylinder_end - cylinder_start) + cylinder_start;
+ *Ng0_o = Pr - Pl;
+ }
- float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
- float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
- float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
- float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
- float fc = 0.71f;
- curve_coef[0] = p1;
- curve_coef[1] = -fc * p0 + fc * p2;
- curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
- curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
- r_st = P_curve[1].w;
- r_en = P_curve[2].w;
+ /* Calculates u and Ng for far hit. */
+ {
+ *u1_o = (t1 * dOz + Oz) * rl;
+ const float3 Pr = t1 * ray_dir;
+ const float3 Pl = (*u1_o) * (cylinder_end - cylinder_start) + cylinder_start;
+ *Ng1_o = Pr - Pl;
}
-# endif
- float r_curr = max(r_st, r_en);
-
- if ((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
- epsilon = 2 * r_curr;
-
- /* find bounds - this is slow for cubic curves */
- float upper, lower;
-
- float zextrem[4];
- curvebounds(&lower,
- &upper,
- &zextrem[0],
- &zextrem[1],
- &zextrem[2],
- &zextrem[3],
- curve_coef[0].z,
- curve_coef[1].z,
- curve_coef[2].z,
- curve_coef[3].z);
- if (lower - r_curr > isect->t || upper + r_curr < epsilon)
- return false;
+ *t_o = make_float2(t0, t1);
- /* minimum width extension */
- float mw_extension = min(difl * fabsf(upper), extmax);
- float r_ext = mw_extension + r_curr;
-
- float xextrem[4];
- curvebounds(&lower,
- &upper,
- &xextrem[0],
- &xextrem[1],
- &xextrem[2],
- &xextrem[3],
- curve_coef[0].x,
- curve_coef[1].x,
- curve_coef[2].x,
- curve_coef[3].x);
- if (lower > r_ext || upper < -r_ext)
- return false;
+ return true;
+}
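The quadratic above comes from requiring a point t * ray_dir along the ray (cylinder_intersect() treats the ray as starting at the coordinate origin; the caller compensates with the dt offsets) to lie at distance cylinder_radius from the axis through cylinder_start with unit direction dP. With O = -P0, projecting out the axis component and expanding gives A * t^2 + B * t + C = 0 with

  A = dot(dO, dO) - dOz^2
  B = 2 * (dot(O, dO) - dOz * Oz)
  C = dot(O, O) - Oz^2 - cylinder_radius^2

where dOz = dot(dP, dO) and Oz = dot(dP, O), matching the variables in the code. A negative discriminant D = B^2 - 4 * A * C means the ray misses the infinite cylinder, and the branch for A close to zero handles rays that run parallel to the axis.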
- float yextrem[4];
- curvebounds(&lower,
- &upper,
- &yextrem[0],
- &yextrem[1],
- &yextrem[2],
- &yextrem[3],
- curve_coef[0].y,
- curve_coef[1].y,
- curve_coef[2].y,
- curve_coef[3].y);
- if (lower > r_ext || upper < -r_ext)
- return false;
+ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_dir)
+{
+ const float3 O = -P;
+ const float3 D = ray_dir;
+ const float ON = dot(O, N);
+ const float DN = dot(D, N);
+ const float min_rcp_input = 1e-18f;
+ const bool eps = fabsf(DN) < min_rcp_input;
+ const float t = -ON / DN;
+ const float lower = (eps || DN < 0.0f) ? -FLT_MAX : t;
+ const float upper = (eps || DN > 0.0f) ? FLT_MAX : t;
+ return make_float2(lower, upper);
+}
- /* setup recurrent loop */
- int level = 1 << depth;
- int tree = 0;
- float resol = 1.0f / (float)level;
- bool hit = false;
-
- /* begin loop */
- while (!(tree >> (depth))) {
- const float i_st = tree * resol;
- const float i_en = i_st + (level * resol);
-
-# ifdef __KERNEL_SSE2__
- ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
- ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]),
- vi_st,
- vcurve_coef[0]);
- ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]),
- vi_en,
- vcurve_coef[0]);
-
- ssef vbmin = min(vp_st, vp_en);
- ssef vbmax = max(vp_st, vp_en);
-
- float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
- float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
- float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
- float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-# else
- float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st +
- curve_coef[0];
- float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en +
- curve_coef[0];
-
- float bminx = min(p_st.x, p_en.x);
- float bmaxx = max(p_st.x, p_en.x);
- float bminy = min(p_st.y, p_en.y);
- float bmaxy = max(p_st.y, p_en.y);
- float bminz = min(p_st.z, p_en.z);
- float bmaxz = max(p_st.z, p_en.z);
-# endif
+ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+ const float dt,
+ const float4 curve[4],
+ float u,
+ float t,
+ const bool use_backfacing,
+ Intersection *isect)
+{
+ const float length_ray_dir = len(ray_dir);
+
+ /* Error of curve evaluations is proportional to largest coordinate. */
+ const float4 box_min = min(min(curve[0], curve[1]), min(curve[2], curve[3]));
+ const float4 box_max = max(min(curve[0], curve[1]), max(curve[2], curve[3]));
+ const float4 box_abs = max(fabs(box_min), fabs(box_max));
+ const float P_err = 16.0f * FLT_EPSILON *
+ max(box_abs.x, max(box_abs.y, max(box_abs.z, box_abs.w)));
+ const float radius_max = box_max.w;
+
+ for (int i = 0; i < CURVE_NUM_JACOBIAN_ITERATIONS; i++) {
+ const float3 Q = ray_dir * t;
+ const float3 dQdt = ray_dir;
+ const float Q_err = 16.0f * FLT_EPSILON * length_ray_dir * t;
+
+ const float4 P4 = catmull_rom_basis_eval(curve, u);
+ const float4 dPdu4 = catmull_rom_basis_derivative(curve, u);
+
+ const float3 P = float4_to_float3(P4);
+ const float3 dPdu = float4_to_float3(dPdu4);
+ const float radius = P4.w;
+ const float dradiusdu = dPdu4.w;
+
+ const float3 ddPdu = float4_to_float3(catmull_rom_basis_derivative2(curve, u));
+
+ const float3 R = Q - P;
+ const float len_R = len(R);
+ const float R_err = max(Q_err, P_err);
+ const float3 dRdu = -dPdu;
+ const float3 dRdt = dQdt;
+
+ const float3 T = normalize(dPdu);
+ const float3 dTdu = dnormalize(dPdu, ddPdu);
+ const float cos_err = P_err / len(dPdu);
+
+ const float f = dot(R, T);
+ const float f_err = len_R * P_err + R_err + cos_err * (1.0f + len_R);
+ const float dfdu = dot(dRdu, T) + dot(R, dTdu);
+ const float dfdt = dot(dRdt, T);
+
+ const float K = dot(R, R) - sqr(f);
+ const float dKdu = (dot(R, dRdu) - f * dfdu);
+ const float dKdt = (dot(R, dRdt) - f * dfdt);
+ const float rsqrt_K = inversesqrtf(K);
+
+ const float g = sqrtf(K) - radius;
+ const float g_err = R_err + f_err + 16.0f * FLT_EPSILON * radius_max;
+ const float dgdu = dKdu * rsqrt_K - dradiusdu;
+ const float dgdt = dKdt * rsqrt_K;
+
+ const float invdet = 1.0f / (dfdu * dgdt - dgdu * dfdt);
+ u -= (dgdt * f - dfdt * g) * invdet;
+ t -= (-dgdu * f + dfdu * g) * invdet;
+
+ if (fabsf(f) < f_err && fabsf(g) < g_err) {
+ t += dt;
+ if (!(0.0f <= t && t <= isect->t)) {
+ return false; /* Rejects NaNs */
+ }
+ if (!(u >= 0.0f && u <= 1.0f)) {
+ return false; /* Rejects NaNs */
+ }
- if (xextrem[0] >= i_st && xextrem[0] <= i_en) {
- bminx = min(bminx, xextrem[1]);
- bmaxx = max(bmaxx, xextrem[1]);
- }
- if (xextrem[2] >= i_st && xextrem[2] <= i_en) {
- bminx = min(bminx, xextrem[3]);
- bmaxx = max(bmaxx, xextrem[3]);
- }
- if (yextrem[0] >= i_st && yextrem[0] <= i_en) {
- bminy = min(bminy, yextrem[1]);
- bmaxy = max(bmaxy, yextrem[1]);
- }
- if (yextrem[2] >= i_st && yextrem[2] <= i_en) {
- bminy = min(bminy, yextrem[3]);
- bmaxy = max(bmaxy, yextrem[3]);
- }
- if (zextrem[0] >= i_st && zextrem[0] <= i_en) {
- bminz = min(bminz, zextrem[1]);
- bmaxz = max(bmaxz, zextrem[1]);
- }
- if (zextrem[2] >= i_st && zextrem[2] <= i_en) {
- bminz = min(bminz, zextrem[3]);
- bmaxz = max(bmaxz, zextrem[3]);
+ /* Backface culling. */
+ const float3 R = normalize(Q - P);
+ const float3 U = dradiusdu * R + dPdu;
+ const float3 V = cross(dPdu, R);
+ const float3 Ng = cross(V, U);
+ if (!use_backfacing && dot(ray_dir, Ng) > 0.0f) {
+ return false;
+ }
+
+ /* Record intersection. */
+ isect->t = t;
+ isect->u = u;
+ isect->v = 0.0f;
+
+ return true;
}
+ }
+ return false;
+}
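curve_intersect_iterative() is a two-dimensional Newton iteration: f measures the hit point's offset along the curve tangent and g its radial distance minus the local curve radius, and each step applies the explicitly inverted 2x2 Jacobian (the invdet line). A standalone sketch of that same update rule on a toy system, unrelated to the curve residuals themselves:

#include <cstdio>

/* Solve f(u, t) = u^2 + t - 3 = 0 and g(u, t) = u + t^2 - 5 = 0 with the same
 * explicit 2x2 Newton update used above; from this start it converges to (1, 2). */
int main()
{
  float u = 1.0f, t = 1.0f;
  for (int i = 0; i < 8; i++) {
    const float f = u * u + t - 3.0f;
    const float g = u + t * t - 5.0f;
    const float dfdu = 2.0f * u, dfdt = 1.0f;
    const float dgdu = 1.0f, dgdt = 2.0f * t;

    /* Inverse of the 2x2 Jacobian, written out exactly as in the kernel code. */
    const float invdet = 1.0f / (dfdu * dgdt - dgdu * dfdt);
    u -= (dgdt * f - dfdt * g) * invdet;
    t -= (-dgdu * f + dfdu * g) * invdet;
  }
  printf("u = %f, t = %f\n", u, t);
  return 0;
}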
- float r1 = r_st + (r_en - r_st) * i_st;
- float r2 = r_st + (r_en - r_st) * i_en;
- r_curr = max(r1, r2);
+ccl_device bool curve_intersect_recursive(const float3 ray_orig,
+ const float3 ray_dir,
+ float4 curve[4],
+ Intersection *isect)
+{
+ /* Move ray closer to make intersection stable. */
+ const float3 center = float4_to_float3(0.25f * (curve[0] + curve[1] + curve[2] + curve[3]));
+ const float dt = dot(center - ray_orig, ray_dir) / dot(ray_dir, ray_dir);
+ const float3 ref = ray_orig + ray_dir * dt;
+ const float4 ref4 = make_float4(ref.x, ref.y, ref.z, 0.0f);
+ curve[0] -= ref4;
+ curve[1] -= ref4;
+ curve[2] -= ref4;
+ curve[3] -= ref4;
+
+ const bool use_backfacing = false;
+ const float step_size = 1.0f / (float)(CURVE_NUM_BEZIER_STEPS);
+
+ int depth = 0;
+
+ /* todo: optimize stack for GPU somehow? Possibly some bitflags are enough, and
+ * u0/u1 can be derived from the depth. */
+ struct {
+ float u0, u1;
+ int i;
+ } stack[CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE];
+
+ bool found = false;
+
+ float u0 = 0.0f;
+ float u1 = 1.0f;
+ int i = 0;
+
+ while (1) {
+ for (; i < CURVE_NUM_BEZIER_STEPS; i++) {
+ const float step = i * step_size;
+
+ /* Subdivide curve. */
+ const float dscale = (u1 - u0) * (1.0f / 3.0f) * step_size;
+ const float vu0 = mix(u0, u1, step);
+ const float vu1 = mix(u0, u1, step + step_size);
+
+ const float4 P0 = catmull_rom_basis_eval(curve, vu0);
+ const float4 dP0du = dscale * catmull_rom_basis_derivative(curve, vu0);
+ const float4 P3 = catmull_rom_basis_eval(curve, vu1);
+ const float4 dP3du = dscale * catmull_rom_basis_derivative(curve, vu1);
+
+ const float4 P1 = P0 + dP0du;
+ const float4 P2 = P3 - dP3du;
+
+ /* Calculate bounding cylinders. */
+ const float rr1 = sqr_point_to_line_distance(float4_to_float3(dP0du),
+ float4_to_float3(P3 - P0));
+ const float rr2 = sqr_point_to_line_distance(float4_to_float3(dP3du),
+ float4_to_float3(P3 - P0));
+ const float maxr12 = sqrtf(max(rr1, rr2));
+ const float one_plus_ulp = 1.0f + 2.0f * FLT_EPSILON;
+ const float one_minus_ulp = 1.0f - 2.0f * FLT_EPSILON;
+ float r_outer = max(max(P0.w, P1.w), max(P2.w, P3.w)) + maxr12;
+ float r_inner = min(min(P0.w, P1.w), min(P2.w, P3.w)) - maxr12;
+ r_outer = one_plus_ulp * r_outer;
+ r_inner = max(0.0f, one_minus_ulp * r_inner);
+ bool valid = true;
+
+ /* Intersect with outer cylinder. */
+ float2 tc_outer;
+ float u_outer0, u_outer1;
+ float3 Ng_outer0, Ng_outer1;
+ valid = cylinder_intersect(float4_to_float3(P0),
+ float4_to_float3(P3),
+ r_outer,
+ ray_dir,
+ &tc_outer,
+ &u_outer0,
+ &Ng_outer0,
+ &u_outer1,
+ &Ng_outer1);
+ if (!valid) {
+ continue;
+ }
- mw_extension = min(difl * fabsf(bmaxz), extmax);
- float r_ext = mw_extension + r_curr;
- float coverage = 1.0f;
+ /* Intersect with cap-planes. */
+ float2 tp = make_float2(-dt, isect->t - dt);
+ tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
+ const float2 h0 = half_plane_intersect(
+ float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
+ tp = make_float2(max(tp.x, h0.x), min(tp.y, h0.y));
+ const float2 h1 = half_plane_intersect(
+ float4_to_float3(P3), -float4_to_float3(dP3du), ray_dir);
+ tp = make_float2(max(tp.x, h1.x), min(tp.y, h1.y));
+ valid = tp.x <= tp.y;
+ if (!valid) {
+ continue;
+ }
- if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext || bmaxx < -r_ext ||
- bminy > r_ext || bmaxy < -r_ext) {
- /* the bounding box does not overlap the square centered at O */
- tree += level;
- level = tree & -tree;
- }
- else if (level == 1) {
-
- /* the maximum recursion depth is reached.
- * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
- * dP* is reversed if necessary.*/
- float t = isect->t;
- float u = 0.0f;
- float gd = 0.0f;
-
- if (flags & CURVE_KN_RIBBONS) {
- float3 tg = (p_en - p_st);
-# ifdef __KERNEL_SSE__
- const float3 tg_sq = tg * tg;
- float w = tg_sq.x + tg_sq.y;
-# else
- float w = tg.x * tg.x + tg.y * tg.y;
-# endif
- if (w == 0) {
- tree++;
- level = tree & -tree;
- continue;
- }
-# ifdef __KERNEL_SSE__
- const float3 p_sttg = p_st * tg;
- w = -(p_sttg.x + p_sttg.y) / w;
+ /* Clamp and correct u parameter. */
+ u_outer0 = clamp(u_outer0, 0.0f, 1.0f);
+ u_outer1 = clamp(u_outer1, 0.0f, 1.0f);
+ u_outer0 = mix(u0, u1, (step + u_outer0) * (1.0f / (float)(CURVE_NUM_BEZIER_STEPS + 1)));
+ u_outer1 = mix(u0, u1, (step + u_outer1) * (1.0f / (float)(CURVE_NUM_BEZIER_STEPS + 1)));
+
+ /* Intersect with inner cylinder. */
+ float2 tc_inner;
+ float u_inner0, u_inner1;
+ float3 Ng_inner0, Ng_inner1;
+ const bool valid_inner = cylinder_intersect(float4_to_float3(P0),
+ float4_to_float3(P3),
+ r_inner,
+ ray_dir,
+ &tc_inner,
+ &u_inner0,
+ &Ng_inner0,
+ &u_inner1,
+ &Ng_inner1);
+
+      /* In unstable areas we subdivide deeper. */
+# if 0
+ const bool unstable0 = (!valid_inner) |
+ (fabsf(dot(normalize(ray_dir), normalize(Ng_inner0))) < 0.3f);
+ const bool unstable1 = (!valid_inner) |
+ (fabsf(dot(normalize(ray_dir), normalize(Ng_inner1))) < 0.3f);
# else
- w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+      /* On the GPU it appears to be a little faster if this is always enabled. */
+ (void)valid_inner;
+
+ const bool unstable0 = true;
+ const bool unstable1 = true;
# endif
- w = saturate(w);
-
- /* compute u on the curve segment */
- u = i_st * (1 - w) + i_en * w;
- r_curr = r_st + (r_en - r_st) * u;
- /* compare x-y distances */
- float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u +
- curve_coef[0];
-
- float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
- if (dot(tg, dp_st) < 0)
- dp_st *= -1;
- if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
- tree++;
- level = tree & -tree;
- continue;
+
+ /* Subtract the inner interval from the current hit interval. */
+ float2 tp0 = make_float2(tp.x, min(tp.y, tc_inner.x));
+ float2 tp1 = make_float2(max(tp.x, tc_inner.y), tp.y);
+ bool valid0 = valid && (tp0.x <= tp0.y);
+ bool valid1 = valid && (tp1.x <= tp1.y);
+ if (!(valid0 || valid1)) {
+ continue;
+ }
+
+ /* Process one or two hits. */
+ bool recurse = false;
+ if (valid0) {
+ const int termDepth = unstable0 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
+ CURVE_NUM_BEZIER_SUBDIVISIONS;
+ if (depth >= termDepth) {
+ found |= curve_intersect_iterative(
+ ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
}
- float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
- if (dot(tg, dp_en) < 0)
- dp_en *= -1;
- if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
- tree++;
- level = tree & -tree;
- continue;
+ else {
+ recurse = true;
}
+ }
- /* compute coverage */
- float r_ext = r_curr;
- coverage = 1.0f;
- if (difl != 0.0f) {
- mw_extension = min(difl * fabsf(bmaxz), extmax);
- r_ext = mw_extension + r_curr;
-# ifdef __KERNEL_SSE__
- const float3 p_curr_sq = p_curr * p_curr;
- const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
- float d = dxxx.x;
-# else
- float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-# endif
- float d0 = d - r_curr;
- float d1 = d + r_curr;
- float inv_mw_extension = 1.0f / mw_extension;
- if (d0 >= 0)
- coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) *
- 0.5f;
- else // inside
- coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) *
- 0.5f;
+ if (valid1 && (tp1.x + dt <= isect->t)) {
+ const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
+ CURVE_NUM_BEZIER_SUBDIVISIONS;
+ if (depth >= termDepth) {
+ found |= curve_intersect_iterative(
+ ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
}
-
- if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon ||
- isect->t < p_curr.z) {
- tree++;
- level = tree & -tree;
- continue;
+ else {
+ recurse = true;
}
+ }
- t = p_curr.z;
+ if (recurse) {
+ stack[depth].u0 = u0;
+ stack[depth].u1 = u1;
+ stack[depth].i = i + 1;
+ depth++;
- /* stochastic fade from minimum width */
- if (difl != 0.0f && lcg_state) {
- if (coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
- return hit;
- }
+ u0 = vu0;
+ u1 = vu1;
+ i = -1;
}
- else {
- float l = len(p_en - p_st);
- /* minimum width extension */
- float or1 = r1;
- float or2 = r2;
-
- if (difl != 0.0f) {
- mw_extension = min(len(p_st - P) * difl, extmax);
- or1 = r1 < mw_extension ? mw_extension : r1;
- mw_extension = min(len(p_en - P) * difl, extmax);
- or2 = r2 < mw_extension ? mw_extension : r2;
- }
- /* --- */
- float invl = 1.0f / l;
- float3 tg = (p_en - p_st) * invl;
- gd = (or2 - or1) * invl;
- float difz = -dot(p_st, tg);
- float cyla = 1.0f - (tg.z * tg.z * (1 + gd * gd));
- float invcyla = 1.0f / cyla;
- float halfb = (-p_st.z - tg.z * (difz + gd * (difz * gd + or1)));
- float tcentre = -halfb * invcyla;
- float zcentre = difz + (tg.z * tcentre);
- float3 tdif = -p_st;
- tdif.z += tcentre;
- float tdifz = dot(tdif, tg);
- float tb = 2 * (tdif.z - tg.z * (tdifz + gd * (tdifz * gd + or1)));
- float tc = dot(tdif, tdif) - tdifz * tdifz * (1 + gd * gd) - or1 * or1 -
- 2 * or1 * tdifz * gd;
- float td = tb * tb - 4 * cyla * tc;
- if (td < 0.0f) {
- tree++;
- level = tree & -tree;
- continue;
- }
+ }
- float rootd = sqrtf(td);
- float correction = (-tb - rootd) * 0.5f * invcyla;
- t = tcentre + correction;
-
- float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
- if (dot(tg, dp_st) < 0)
- dp_st *= -1;
- float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
- if (dot(tg, dp_en) < 0)
- dp_en *= -1;
-
- if (flags & CURVE_KN_BACKFACING &&
- (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 ||
- isect->t < t || t <= 0.0f)) {
- correction = (-tb + rootd) * 0.5f * invcyla;
- t = tcentre + correction;
- }
+ if (depth > 0) {
+ depth--;
+ u0 = stack[depth].u0;
+ u1 = stack[depth].u1;
+ i = stack[depth].i;
+ }
+ else {
+ break;
+ }
+ }
- if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 ||
- isect->t < t || t <= 0.0f) {
- tree++;
- level = tree & -tree;
- continue;
- }
+ return found;
+}
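
The loop above replaces recursion with an explicit stack: each entry stores the parent interval [u0, u1] and the loop index to resume at, and setting i = -1 restarts the inner loop inside the child interval. A minimal stand-alone sketch of the same pattern (plain C, not Cycles code; it brackets roots of a scalar function instead of clipping against bounding cylinders, and the constants only mirror CURVE_NUM_BEZIER_STEPS / CURVE_NUM_BEZIER_SUBDIVISIONS):

#include <math.h>
#include <stdio.h>

#define NUM_STEPS 4 /* analogous to CURVE_NUM_BEZIER_STEPS */
#define MAX_DEPTH 6 /* analogous to CURVE_NUM_BEZIER_SUBDIVISIONS */

static float f(float u)
{
  return cosf(12.0f * u) - 0.2f;
}

int main(void)
{
  struct {
    float u0, u1;
    int i;
  } stack[MAX_DEPTH];
  int depth = 0, i = 0;
  float u0 = 0.0f, u1 = 1.0f;

  while (1) {
    for (; i < NUM_STEPS; i++) {
      const float a = u0 + (u1 - u0) * (i / (float)NUM_STEPS);
      const float b = u0 + (u1 - u0) * ((i + 1) / (float)NUM_STEPS);
      if (f(a) * f(b) > 0.0f) {
        continue; /* no sign change: reject, like the outer cylinder test */
      }
      if (depth >= MAX_DEPTH) {
        printf("root near u = %f\n", 0.5f * (a + b)); /* terminal depth: report */
      }
      else {
        stack[depth].u0 = u0; /* push parent interval and resume index */
        stack[depth].u1 = u1;
        stack[depth].i = i + 1;
        depth++;
        u0 = a;
        u1 = b;
        i = -1; /* restart the inner loop inside the child interval */
      }
    }
    if (depth == 0) {
      break;
    }
    depth--; /* pop and resume the parent interval */
    u0 = stack[depth].u0;
    u1 = stack[depth].u1;
    i = stack[depth].i;
  }
  return 0;
}
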
- float w = (zcentre + (tg.z * correction)) * invl;
- w = saturate(w);
- /* compute u on the curve segment */
- u = i_st * (1 - w) + i_en * w;
+/* Ribbons */
- /* stochastic fade from minimum width */
- if (difl != 0.0f && lcg_state) {
- r_curr = r1 + (r2 - r1) * w;
- r_ext = or1 + (or2 - or1) * w;
- coverage = r_curr / r_ext;
+ccl_device_inline bool cylinder_culling_test(const float2 p1, const float2 p2, const float r)
+{
+ /* Performs culling against a cylinder. */
+ const float2 dp = p2 - p1;
+ const float num = dp.x * p1.y - dp.y * p1.x;
+ const float den2 = dot(p2 - p1, p2 - p1);
+ return num * num <= r * r * den2;
+}
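
In the test above, num is the 2D cross product dp x p1, so |num| / sqrt(den2) is the distance from the origin (which is the ray in ray space) to the supporting line of the segment; squaring both sides avoids the division. A small stand-alone check (plain C, illustrative only):

#include <assert.h>

static int culling_test_2d(float p1x, float p1y, float p2x, float p2y, float r)
{
  const float dpx = p2x - p1x, dpy = p2y - p1y;
  const float num = dpx * p1y - dpy * p1x; /* 2D cross product dp x p1 */
  const float den2 = dpx * dpx + dpy * dpy;
  return num * num <= r * r * den2; /* equivalent to |num| / |dp| <= r */
}

int main(void)
{
  /* The segment x = 1 lies at distance 1 from the origin, so the test
   * should pass exactly when the radius reaches 1. */
  assert(!culling_test_2d(1.0f, 0.0f, 1.0f, 2.0f, 0.5f)); /* too thin: culled */
  assert(culling_test_2d(1.0f, 0.0f, 1.0f, 2.0f, 1.5f));  /* wide enough: kept */
  return 0;
}
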
- if (coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
- return hit;
- }
- }
- /* we found a new intersection */
+/*! Intersects a ray with a quad with backface culling
+ * enabled. The quad v0,v1,v2,v3 is split into two triangles
+ * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two
+ * triangles gets intersected. */
+ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,
+ const float3 quad_v0,
+ const float3 quad_v1,
+ const float3 quad_v2,
+ const float3 quad_v3,
+ float *u_o,
+ float *v_o,
+ float *t_o)
+{
+  /* Vertices are passed in ray space, so the ray origin is zero and the direction is +Z. */
+ const float3 O = make_float3(0.0f, 0.0f, 0.0f);
+ const float3 D = make_float3(0.0f, 0.0f, 1.0f);
+ const float3 va = quad_v0 - O;
+ const float3 vb = quad_v1 - O;
+ const float3 vc = quad_v2 - O;
+ const float3 vd = quad_v3 - O;
+
+ const float3 edb = vb - vd;
+ const float WW = dot(cross(vd, edb), D);
+ const float3 v0 = (WW <= 0.0f) ? va : vc;
+ const float3 v1 = (WW <= 0.0f) ? vb : vd;
+ const float3 v2 = (WW <= 0.0f) ? vd : vb;
+
+  /* Calculate triangle edges. */
+ const float3 e0 = v2 - v0;
+ const float3 e1 = v0 - v1;
+
+  /* Perform edge tests. */
+ const float U = dot(cross(v0, e0), D);
+ const float V = dot(cross(v1, e1), D);
+ if (!(max(U, V) <= 0.0f)) {
+ return false;
+ }
-# ifdef __VISIBILITY_FLAG__
- /* visibility flag test. we do it here under the assumption
- * that most triangles are culled by node flags */
- if (kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-# endif
- {
- /* record intersection */
- isect->t = t;
- isect->u = u;
- isect->v = gd;
- isect->prim = curveAddr;
- isect->object = object;
- isect->type = type;
- hit = true;
- }
+  /* Calculate geometry normal and denominator. */
+ const float3 Ng = cross(e1, e0);
+ const float den = dot(Ng, D);
+ const float rcpDen = 1.0f / den;
+
+  /* Perform depth test. */
+ const float t = rcpDen * dot(v0, Ng);
+ if (!(0.0f <= t && t <= ray_tfar)) {
+ return false;
+ }
+
+  /* Avoid division by 0. */
+ if (!(den != 0.0f)) {
+ return false;
+ }
+
+  /* Update hit information. */
+ *t_o = t;
+ *u_o = U * rcpDen;
+ *v_o = V * rcpDen;
+ *u_o = (WW <= 0.0f) ? *u_o : 1.0f - *u_o;
+ *v_o = (WW <= 0.0f) ? *v_o : 1.0f - *v_o;
+ return true;
+}
- tree++;
- level = tree & -tree;
+ccl_device_inline void ribbon_ray_space(const float3 ray_dir, float3 ray_space[3])
+{
+ const float3 dx0 = make_float3(0, ray_dir.z, -ray_dir.y);
+ const float3 dx1 = make_float3(-ray_dir.z, 0, ray_dir.x);
+ ray_space[0] = normalize(dot(dx0, dx0) > dot(dx1, dx1) ? dx0 : dx1);
+ ray_space[1] = normalize(cross(ray_dir, ray_space[0]));
+ ray_space[2] = ray_dir;
+}
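
ribbon_ray_space() picks whichever of the two candidate perpendiculars to the ray direction is longer, which avoids a degenerate first axis when the ray is nearly parallel to a coordinate plane. A small stand-alone check of the construction (plain C with a local vector type, not the Cycles float3):

#include <assert.h>
#include <math.h>

typedef struct {
  float x, y, z;
} vec3;

static vec3 v3(float x, float y, float z)
{
  vec3 v = {x, y, z};
  return v;
}
static float dot3(vec3 a, vec3 b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z;
}
static vec3 cross3(vec3 a, vec3 b)
{
  return v3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
}
static vec3 norm3(vec3 a)
{
  const float l = sqrtf(dot3(a, a));
  return v3(a.x / l, a.y / l, a.z / l);
}

int main(void)
{
  const vec3 d = norm3(v3(0.3f, -0.7f, 0.65f));
  /* Two candidates perpendicular to d; keep the longer one for stability. */
  const vec3 dx0 = v3(0.0f, d.z, -d.y);
  const vec3 dx1 = v3(-d.z, 0.0f, d.x);
  const vec3 s0 = norm3(dot3(dx0, dx0) > dot3(dx1, dx1) ? dx0 : dx1);
  const vec3 s1 = norm3(cross3(d, s0));
  /* The resulting frame is orthogonal, with d as its third axis. */
  assert(fabsf(dot3(s0, d)) < 1e-5f);
  assert(fabsf(dot3(s1, d)) < 1e-5f);
  assert(fabsf(dot3(s0, s1)) < 1e-5f);
  return 0;
}
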
+
+ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
+ const float3 ray_org,
+ const float4 P4)
+{
+ float3 P = float4_to_float3(P4) - ray_org;
+ return make_float4(dot(ray_space[0], P), dot(ray_space[1], P), dot(ray_space[2], P), P4.w);
+}
+
+ccl_device_inline bool ribbon_intersect(const float3 ray_org,
+ const float3 ray_dir,
+ const float ray_tfar,
+ const int N,
+ float4 curve[4],
+ Intersection *isect)
+{
+ /* Transform control points into ray space. */
+ float3 ray_space[3];
+ ribbon_ray_space(ray_dir, ray_space);
+
+ curve[0] = ribbon_to_ray_space(ray_space, ray_org, curve[0]);
+ curve[1] = ribbon_to_ray_space(ray_space, ray_org, curve[1]);
+ curve[2] = ribbon_to_ray_space(ray_space, ray_org, curve[2]);
+ curve[3] = ribbon_to_ray_space(ray_space, ray_org, curve[3]);
+
+ const float4 mx = max(max(fabs(curve[0]), fabs(curve[1])), max(fabs(curve[2]), fabs(curve[3])));
+ const float eps = 4.0f * FLT_EPSILON * max(max(mx.x, mx.y), max(mx.z, mx.w));
+ const float step_size = 1.0f / (float)N;
+
+ /* Evaluate first point and radius scaled normal direction. */
+ float4 p0 = catmull_rom_basis_eval(curve, 0.0f);
+ float3 dp0dt = float4_to_float3(catmull_rom_basis_derivative(curve, 0.0f));
+ if (max3(fabs(dp0dt)) < eps) {
+ const float4 p1 = catmull_rom_basis_eval(curve, step_size);
+ dp0dt = float4_to_float3(p1 - p0);
+ }
+ float3 wn0 = normalize(make_float3(dp0dt.y, -dp0dt.x, 0.0f)) * p0.w;
+
+ /* Evaluate the bezier curve. */
+ for (int i = 0; i < N; i++) {
+ const float u = i * step_size;
+ const float4 p1 = catmull_rom_basis_eval(curve, u + step_size);
+ bool valid = cylinder_culling_test(
+ make_float2(p0.x, p0.y), make_float2(p1.x, p1.y), max(p0.w, p1.w));
+ if (!valid) {
+ continue;
}
- else {
- /* split the curve into two curves and process */
- level = level >> 1;
+
+ /* Evaluate next point. */
+ float3 dp1dt = float4_to_float3(catmull_rom_basis_derivative(curve, u + step_size));
+ dp1dt = (max3(fabs(dp1dt)) < eps) ? float4_to_float3(p1 - p0) : dp1dt;
+ const float3 wn1 = normalize(make_float3(dp1dt.y, -dp1dt.x, 0.0f)) * p1.w;
+
+ /* Construct quad coordinates. */
+ const float3 lp0 = float4_to_float3(p0) + wn0;
+ const float3 lp1 = float4_to_float3(p1) + wn1;
+ const float3 up0 = float4_to_float3(p0) - wn0;
+ const float3 up1 = float4_to_float3(p1) - wn1;
+
+ /* Intersect quad. */
+ float vu, vv, vt;
+ bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt);
+
+ if (valid0) {
+      /* Ignore self-intersections. */
+ const float avoidance_factor = 2.0f;
+ if (avoidance_factor != 0.0f) {
+ float r = mix(p0.w, p1.w, vu);
+ valid0 = vt > avoidance_factor * r;
+ }
+
+ if (valid0) {
+ vv = 2.0f * vv - 1.0f;
+
+ /* Record intersection. */
+ isect->t = vt;
+ isect->u = u + vu * step_size;
+ isect->v = vv;
+ return true;
+ }
}
- }
- return hit;
+ p0 = p1;
+ wn0 = wn1;
+ }
+ return false;
}
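
When a quad reports a hit, the local coordinates are remapped: vu in [0, 1] along the quad becomes a global curve parameter u + vu * step_size, and vv in [0, 1] across the ribbon becomes a signed coordinate 2 * vv - 1 in [-1, 1]. A tiny sketch with illustrative values (plain C, not Cycles code):

#include <stdio.h>

int main(void)
{
  const int N = 4;                   /* number of sweep quads along the curve */
  const float step_size = 1.0f / (float)N;
  const int i = 2;                   /* index of the quad that was hit */
  const float vu = 0.25f, vv = 0.1f; /* hit coordinates inside that quad */

  const float curve_u = i * step_size + vu * step_size; /* 0.5625 along the curve */
  const float ribbon_v = 2.0f * vv - 1.0f;              /* -0.8 across the ribbon */
  printf("u = %g, v = %g\n", curve_u, ribbon_v);
  return 0;
}
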
ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
Intersection *isect,
- float3 P,
- float3 direction,
+ const float3 P,
+ const float3 dir,
uint visibility,
int object,
int curveAddr,
float time,
- int type,
- uint *lcg_state,
- float difl,
- float extmax)
+ int type)
{
- /* define few macros to minimize code duplication for SSE */
-# ifndef __KERNEL_SSE2__
-# define len3_squared(x) len_squared(x)
-# define len3(x) len(x)
-# define dot3(x, y) dot(x, y)
-# endif
-
- const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+ const bool is_motion = (type & PRIMITIVE_ALL_MOTION);
- if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+# ifndef __KERNEL_OPTIX__ /* See OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */
+ if (is_motion && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
if (time < prim_time.x || time > prim_time.y) {
return false;
}
}
+# endif
int segment = PRIMITIVE_UNPACK_SEGMENT(type);
- /* curve Intersection check */
- int flags = kernel_data.curve.curveflags;
-
int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
float4 v00 = kernel_tex_fetch(__curves, prim);
- int cnum = __float_as_int(v00.x);
- int k0 = cnum + segment;
+ int k0 = __float_as_int(v00.x) + segment;
int k1 = k0 + 1;
-# ifndef __KERNEL_SSE2__
- float4 P_curve[2];
-
- if (is_curve_primitive) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
- }
- else {
- int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
- motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
- }
-
- float or1 = P_curve[0].w;
- float or2 = P_curve[1].w;
- float3 p1 = float4_to_float3(P_curve[0]);
- float3 p2 = float4_to_float3(P_curve[1]);
-
- /* minimum width extension */
- float r1 = or1;
- float r2 = or2;
- float3 dif = P - p1;
- float3 dif_second = P - p2;
- if (difl != 0.0f) {
- float pixelsize = min(len3(dif) * difl, extmax);
- r1 = or1 < pixelsize ? pixelsize : or1;
- pixelsize = min(len3(dif_second) * difl, extmax);
- r2 = or2 < pixelsize ? pixelsize : or2;
- }
- /* --- */
-
- float3 p21_diff = p2 - p1;
- float3 sphere_dif1 = (dif + dif_second) * 0.5f;
- float3 dir = direction;
- float sphere_b_tmp = dot3(dir, sphere_dif1);
- float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-# else
- ssef P_curve[2];
+ int ka = max(k0 - 1, __float_as_int(v00.x));
+ int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
- if (is_curve_primitive) {
- P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
- P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+ float4 curve[4];
+ if (!is_motion) {
+ curve[0] = kernel_tex_fetch(__curve_keys, ka);
+ curve[1] = kernel_tex_fetch(__curve_keys, k0);
+ curve[2] = kernel_tex_fetch(__curve_keys, k1);
+ curve[3] = kernel_tex_fetch(__curve_keys, kb);
}
else {
int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
- motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4 *)&P_curve);
+ motion_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, curve);
}
- const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-
- ssef r12 = or12;
- const ssef vP = load4f(P);
- const ssef dif = vP - P_curve[0];
- const ssef dif_second = vP - P_curve[1];
- if (difl != 0.0f) {
- const ssef len1_sq = len3_squared_splat(dif);
- const ssef len2_sq = len3_squared_splat(dif_second);
- const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
- const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
- r12 = max(or12, pixelsize12);
- }
- float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
- float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
- const ssef p21_diff = P_curve[1] - P_curve[0];
- const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
- const ssef dir = load4f(direction);
- const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
- const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-# endif
-
- float mr = max(r1, r2);
- float l = len3(p21_diff);
- float invl = 1.0f / l;
- float sp_r = mr + 0.5f * l;
-
- float sphere_b = dot3(dir, sphere_dif2);
- float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
- if (sdisc < 0.0f)
- return false;
-
- /* obtain parameters and test midpoint distance for suitable modes */
-# ifndef __KERNEL_SSE2__
- float3 tg = p21_diff * invl;
-# else
- const ssef tg = p21_diff * invl;
-# endif
- float gd = (r2 - r1) * invl;
-
- float dirz = dot3(dir, tg);
- float difz = dot3(dif, tg);
-
- float a = 1.0f - (dirz * dirz * (1 + gd * gd));
-
- float halfb = dot3(dir, dif) - dirz * (difz + gd * (difz * gd + r1));
-
- float tcentre = -halfb / a;
- float zcentre = difz + (dirz * tcentre);
-
- if ((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
- return false;
- if ((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) &&
- !(flags & CURVE_KN_INTERSECTCORRECTION))
+# ifdef __VISIBILITY_FLAG__
+ if (!(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)) {
return false;
-
- /* test minimum separation */
-# ifndef __KERNEL_SSE2__
- float3 cprod = cross(tg, dir);
- float cprod2sq = len3_squared(cross(tg, dif));
-# else
- const ssef cprod = cross(tg, dir);
- float cprod2sq = len3_squared(cross_zxy(tg, dif));
+ }
# endif
- float cprodsq = len3_squared(cprod);
- float distscaled = dot3(cprod, dif);
- if (cprodsq == 0)
- distscaled = cprod2sq;
- else
- distscaled = (distscaled * distscaled) / cprodsq;
-
- if (distscaled > mr * mr)
- return false;
-
- /* calculate true intersection */
-# ifndef __KERNEL_SSE2__
- float3 tdif = dif + tcentre * dir;
-# else
- const ssef tdif = madd(ssef(tcentre), dir, dif);
-# endif
- float tdifz = dot3(tdif, tg);
- float tdifma = tdifz * gd + r1;
- float tb = 2 * (dot3(dir, tdif) - dirz * (tdifz + gd * tdifma));
- float tc = dot3(tdif, tdif) - tdifz * tdifz - tdifma * tdifma;
- float td = tb * tb - 4 * a * tc;
+ if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+ /* todo: adaptive number of subdivisions could help performance here. */
+ const int subdivisions = kernel_data.bvh.curve_subdivisions;
+ if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) {
+ isect->prim = curveAddr;
+ isect->object = object;
+ isect->type = type;
+ return true;
+ }
- if (td < 0.0f)
return false;
-
- float rootd = 0.0f;
- float correction = 0.0f;
- if (flags & CURVE_KN_ACCURATE) {
- rootd = sqrtf(td);
- correction = ((-tb - rootd) / (2 * a));
}
-
- float t = tcentre + correction;
-
- if (t < isect->t) {
-
- if (flags & CURVE_KN_INTERSECTCORRECTION) {
- rootd = sqrtf(td);
- correction = ((-tb - rootd) / (2 * a));
- t = tcentre + correction;
- }
-
- float z = zcentre + (dirz * correction);
- // bool backface = false;
-
- if (flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
- // backface = true;
- correction = ((-tb + rootd) / (2 * a));
- t = tcentre + correction;
- z = zcentre + (dirz * correction);
- }
-
- /* stochastic fade from minimum width */
- float adjradius = or1 + z * (or2 - or1) * invl;
- adjradius = adjradius / (r1 + z * gd);
- if (lcg_state && adjradius != 1.0f) {
- if (lcg_step_float(lcg_state) > adjradius)
- return false;
+ else {
+ if (curve_intersect_recursive(P, dir, curve, isect)) {
+ isect->prim = curveAddr;
+ isect->object = object;
+ isect->type = type;
+ return true;
}
- /* --- */
-
- if (t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
- if (flags & CURVE_KN_ENCLOSEFILTER) {
- float enc_ratio = 1.01f;
- if ((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
- float a2 = 1.0f - (dirz * dirz * (1 + gd * gd * enc_ratio * enc_ratio));
- float c2 = dot3(dif, dif) - difz * difz * (1 + gd * gd * enc_ratio * enc_ratio) -
- r1 * r1 * enc_ratio * enc_ratio - 2 * r1 * difz * gd * enc_ratio;
- if (a2 * c2 < 0.0f)
- return false;
- }
- }
-# ifdef __VISIBILITY_FLAG__
- /* visibility flag test. we do it here under the assumption
- * that most triangles are culled by node flags */
- if (kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-# endif
- {
- /* record intersection */
- isect->t = t;
- isect->u = z * invl;
- isect->v = gd;
- isect->prim = curveAddr;
- isect->object = object;
- isect->type = type;
-
- return true;
- }
- }
+ return false;
}
-
- return false;
-
-# ifndef __KERNEL_SSE2__
-# undef len3_squared
-# undef len3
-# undef dot3
-# endif
}
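
The control points ka, k0, k1, kb above form the four-key Catmull-Rom window around a segment, with the neighbours clamped at the first and last key of the curve (assuming, as the fetches above suggest, that __curves stores the first key index in v00.x and the key count in v00.y). A small sketch of the indexing (plain C, illustrative values):

#include <stdio.h>

static int imax(int a, int b)
{
  return a > b ? a : b;
}
static int imin(int a, int b)
{
  return a < b ? a : b;
}

int main(void)
{
  const int first_key = 100, num_keys = 5; /* keys 100..104, segments 0..3 */
  for (int segment = 0; segment < num_keys - 1; segment++) {
    const int k0 = first_key + segment;
    const int k1 = k0 + 1;
    const int ka = imax(k0 - 1, first_key);                /* clamp at curve start */
    const int kb = imin(k1 + 1, first_key + num_keys - 1); /* clamp at curve end */
    printf("segment %d -> keys %d %d %d %d\n", segment, ka, k0, k1, kb);
  }
  return 0;
}
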
-ccl_device_inline float3 curve_refine(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
+ ShaderData *sd,
+ const Intersection *isect,
+ const Ray *ray)
{
- int flag = kernel_data.curve.curveflags;
float t = isect->t;
float3 P = ray->P;
float3 D = ray->D;
@@ -832,118 +718,63 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
- float3 tg;
+ int ka = max(k0 - 1, __float_as_int(v00.x));
+ int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
- if (flag & CURVE_KN_INTERPOLATE) {
- int ka = max(k0 - 1, __float_as_int(v00.x));
- int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+ float4 P_curve[4];
- float4 P_curve[4];
+ if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
+ P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+ P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+ P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+ P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+ }
+ else {
+ motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+ }
- if (sd->type & PRIMITIVE_CURVE) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
- P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
- }
- else {
- motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
- }
+ sd->u = isect->u;
- float3 p[4];
- p[0] = float4_to_float3(P_curve[0]);
- p[1] = float4_to_float3(P_curve[1]);
- p[2] = float4_to_float3(P_curve[2]);
- p[3] = float4_to_float3(P_curve[3]);
+ P = P + D * t;
- P = P + D * t;
+ const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u);
+ const float3 dPdu = float4_to_float3(dPdu4);
-# ifdef __UV__
- sd->u = isect->u;
- sd->v = 0.0f;
-# endif
+ if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+ /* Rounded smooth normals for ribbons, to approximate thick curve shape. */
+ const float3 tangent = normalize(dPdu);
+ const float3 bitangent = normalize(cross(tangent, -D));
+ const float sine = isect->v;
+ const float cosine = safe_sqrtf(1.0f - sine * sine);
- tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+ sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
+ sd->Ng = -D;
+ sd->v = isect->v;
- if (kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
- sd->Ng = normalize(-(D - tg * (dot(tg, D))));
- }
- else {
-# ifdef __EMBREE__
- if (kernel_data.bvh.scene) {
- sd->Ng = normalize(isect->Ng);
- }
- else
+# if 0
+ /* This approximates the position and geometric normal of a thick curve too,
+     * but causes too many issues with incorrect self-intersections. */
+ const float dPdu_radius = dPdu4.w;
+ sd->Ng = sd->N;
+ P += sd->N * dPdu_radius;
# endif
- {
- /* direction from inside to surface of curve */
- float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
- sd->Ng = normalize(P - p_curr);
-
- /* adjustment for changing radius */
- float gd = isect->v;
-
- if (gd != 0.0f) {
- sd->Ng = sd->Ng - gd * tg;
- sd->Ng = normalize(sd->Ng);
- }
- }
- }
-
- /* todo: sometimes the normal is still so that this is detected as
- * backfacing even if cull backfaces is enabled */
-
- sd->N = sd->Ng;
}
else {
- float4 P_curve[2];
-
- if (sd->type & PRIMITIVE_CURVE) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
- }
- else {
- motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
- }
-
- float l = 1.0f;
- tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-
- P = P + D * t;
-
- float3 dif = P - float4_to_float3(P_curve[0]);
-
-# ifdef __UV__
- sd->u = dot(dif, tg) / l;
+    /* For thick curves, compute the normal using the direction from inside the curve.
+     * This could be optimized by recording the normal in the intersection,
+     * but for OptiX that would exceed the size of the payload. */
+ const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
+ const float3 Ng = normalize(P - P_inside);
+
+ sd->N = Ng;
+ sd->Ng = Ng;
sd->v = 0.0f;
-# endif
-
- if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
- sd->Ng = -(D - tg * dot(tg, D));
- sd->Ng = normalize(sd->Ng);
- }
- else {
- float gd = isect->v;
-
- /* direction from inside to surface of curve */
- float denom = fmaxf(P_curve[0].w + sd->u * l * gd, 1e-8f);
- sd->Ng = (dif - tg * sd->u * l) / denom;
-
- /* adjustment for changing radius */
- if (gd != 0.0f) {
- sd->Ng = sd->Ng - gd * tg;
- }
-
- sd->Ng = normalize(sd->Ng);
- }
-
- sd->N = sd->Ng;
}
# ifdef __DPDU__
/* dPdu/dPdv */
- sd->dPdu = tg;
- sd->dPdv = cross(tg, sd->Ng);
+ sd->dPdu = dPdu;
+ sd->dPdv = cross(dPdu, sd->Ng);
# endif
if (isect->object != OBJECT_NONE) {
@@ -956,7 +787,10 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
P = transform_point(&tfm, P);
}
- return P;
+ sd->P = P;
+
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ sd->shader = __float_as_int(curvedata.z);
}
#endif
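
Both the intersection and shading code above rely on catmull_rom_basis_eval() and catmull_rom_basis_derivative(), which are defined elsewhere; they are assumed to be the standard uniform Catmull-Rom basis with tension 0.5, which interpolates the two middle control points. A scalar sketch of that assumed basis (plain C, not the Cycles implementation):

#include <stdio.h>

static float catmull_rom(float p0, float p1, float p2, float p3, float t)
{
  return 0.5f * ((2.0f * p1) + (-p0 + p2) * t +
                 (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3) * t * t +
                 (-p0 + 3.0f * p1 - 3.0f * p2 + p3) * t * t * t);
}

static float catmull_rom_derivative(float p0, float p1, float p2, float p3, float t)
{
  return 0.5f * ((-p0 + p2) +
                 2.0f * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3) * t +
                 3.0f * (-p0 + 3.0f * p1 - 3.0f * p2 + p3) * t * t);
}

int main(void)
{
  /* The spline passes through p1 at t = 0 and p2 at t = 1. */
  printf("%g %g\n",
         catmull_rom(0.0f, 1.0f, 2.0f, 4.0f, 0.0f),  /* 1 */
         catmull_rom(0.0f, 1.0f, 2.0f, 4.0f, 1.0f)); /* 2 */
  printf("%g\n", catmull_rom_derivative(0.0f, 1.0f, 2.0f, 4.0f, 0.5f));
  return 0;
}
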
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 7380c506bf4..0f66f4af755 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -36,7 +36,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
   * zero iterations and rendering is really slow with motion curves. Until other
   * areas are sped up it's probably not so crucial to optimize this out.
*/
- uint attr_offset = object_attribute_map_offset(kg, object) + ATTR_PRIM_CURVE;
+ uint attr_offset = object_attribute_map_offset(kg, object) + ATTR_PRIM_GEOMETRY;
uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
while (attr_map.x != id) {
@@ -50,14 +50,14 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
- int offset,
- int numkeys,
- int numsteps,
- int step,
- int k0,
- int k1,
- float4 keys[2])
+ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
+ int offset,
+ int numkeys,
+ int numsteps,
+ int step,
+ int k0,
+ int k1,
+ float4 keys[2])
{
if (step == numsteps) {
/* center step: regular key location */
@@ -77,7 +77,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
}
/* return 2 curve key locations */
-ccl_device_inline void motion_curve_keys(
+ccl_device_inline void motion_curve_keys_linear(
KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
{
/* get motion info */
@@ -97,24 +97,24 @@ ccl_device_inline void motion_curve_keys(
/* fetch key coordinates */
float4 next_keys[2];
- motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, keys);
- motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step + 1, k0, k1, next_keys);
+ motion_curve_keys_for_step_linear(kg, offset, numkeys, numsteps, step, k0, k1, keys);
+ motion_curve_keys_for_step_linear(kg, offset, numkeys, numsteps, step + 1, k0, k1, next_keys);
/* interpolate between steps */
keys[0] = (1.0f - t) * keys[0] + t * next_keys[0];
keys[1] = (1.0f - t) * keys[1] + t * next_keys[1];
}
-ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg,
- int offset,
- int numkeys,
- int numsteps,
- int step,
- int k0,
- int k1,
- int k2,
- int k3,
- float4 keys[4])
+ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
+ int offset,
+ int numkeys,
+ int numsteps,
+ int step,
+ int k0,
+ int k1,
+ int k2,
+ int k3,
+ float4 keys[4])
{
if (step == numsteps) {
/* center step: regular key location */
@@ -138,15 +138,15 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg,
}
 /* return 4 curve key locations */
-ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
- int object,
- int prim,
- float time,
- int k0,
- int k1,
- int k2,
- int k3,
- float4 keys[4])
+ccl_device_inline void motion_curve_keys(KernelGlobals *kg,
+ int object,
+ int prim,
+ float time,
+ int k0,
+ int k1,
+ int k2,
+ int k3,
+ float4 keys[4])
{
/* get motion info */
int numsteps, numkeys;
@@ -165,9 +165,8 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
/* fetch key coordinates */
float4 next_keys[4];
- motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys);
- motion_cardinal_curve_keys_for_step(
- kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys);
+ motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys);
+ motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys);
/* interpolate between steps */
keys[0] = (1.0f - t) * keys[0] + t * next_keys[0];
@@ -176,53 +175,6 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
keys[3] = (1.0f - t) * keys[3] + t * next_keys[3];
}
-# if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-/* Similar to above, but returns keys as pair of two AVX registers with each
- * holding two float4.
- */
-ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg,
- int object,
- int prim,
- float time,
- int k0,
- int k1,
- int k2,
- int k3,
- avxf *out_keys_0_1,
- avxf *out_keys_2_3)
-{
- /* Get motion info. */
- int numsteps, numkeys;
- object_motion_info(kg, object, &numsteps, NULL, &numkeys);
-
- /* Figure out which steps we need to fetch and their interpolation factor. */
- int maxstep = numsteps * 2;
- int step = min((int)(time * maxstep), maxstep - 1);
- float t = time * maxstep - step;
-
- /* Find attribute. */
- AttributeElement elem;
- int offset = find_attribute_curve_motion(kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
- kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
- /* Fetch key coordinates. */
- float4 next_keys[4];
- float4 keys[4];
- motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys);
- motion_cardinal_curve_keys_for_step(
- kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys);
-
- const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128);
- const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128);
- const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128);
- const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128);
-
- /* Interpolate between steps. */
- *out_keys_0_1 = (1.0f - t) * keys_0_1 + t * next_keys_0_1;
- *out_keys_2_3 = (1.0f - t) * keys_2_3 + t * next_keys_2_3;
-}
-# endif
-
#endif
CCL_NAMESPACE_END
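
The step selection above maps time in [0, 1] onto 2 * numsteps sub-intervals, clamps to the last one, and blends the two surrounding key sets with keys = (1 - t) * keys + t * next_keys. A small sketch of that mapping (plain C, illustrative values only):

#include <stdio.h>

int main(void)
{
  const int numsteps = 2;           /* extra motion steps stored per object */
  const int maxstep = numsteps * 2; /* number of sub-intervals over the frame */
  for (int n = 0; n <= 4; n++) {
    const float time = n / 4.0f;
    int step = (int)(time * maxstep);
    if (step > maxstep - 1) {
      step = maxstep - 1; /* time == 1 falls into the last interval */
    }
    const float t = time * maxstep - step;
    /* Per control point: key = (1 - t) * key[step] + t * key[step + 1]. */
    printf("time %.2f -> steps %d/%d, blend t = %.2f\n", time, step, step + 1, t);
  }
  return 0;
}
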
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index 49d4829af38..859d919f0bb 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -103,17 +103,21 @@ ccl_device_inline
const Ray *ray,
float3 verts[3])
{
+# ifdef __KERNEL_OPTIX__
+ /* isect->t is always in world space with OptiX. */
+ return motion_triangle_refine(kg, sd, isect, ray, verts);
+# else
float3 P = ray->P;
float3 D = ray->D;
float t = isect->t;
-# ifdef __INTERSECTION_REFINE__
+# ifdef __INTERSECTION_REFINE__
if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
+# ifdef __OBJECT_MOTION__
Transform tfm = sd->ob_itfm;
-# else
+# else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+# endif
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -135,19 +139,20 @@ ccl_device_inline
P = P + D * rt;
if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
+# ifdef __OBJECT_MOTION__
Transform tfm = sd->ob_tfm;
-# else
+# else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
+# endif
P = transform_point(&tfm, P);
}
return P;
-# else /* __INTERSECTION_REFINE__ */
+# else /* __INTERSECTION_REFINE__ */
return P + D * t;
-# endif /* __INTERSECTION_REFINE__ */
+# endif /* __INTERSECTION_REFINE__ */
+# endif
}
#endif /* __BVH_LOCAL__ */
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 2792fd64c61..614e2e3b92b 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -81,13 +81,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1;
Transform tfm;
-# ifdef __EMBREE__
- if (kernel_data.bvh.scene) {
- transform_motion_array_interpolate_straight(&tfm, motion, num_steps, time);
- }
- else
-# endif
- transform_motion_array_interpolate(&tfm, motion, num_steps, time);
+ transform_motion_array_interpolate(&tfm, motion, num_steps, time);
return tfm;
}
@@ -227,6 +221,17 @@ ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).surface_area;
}
+/* Color of the object */
+
+ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
+{
+ if (object == OBJECT_NONE)
+ return make_float3(0.0f, 0.0f, 0.0f);
+
+ const ccl_global KernelObject *kobject = &kernel_tex_fetch(__objects, object);
+ return make_float3(kobject->color[0], kobject->color[1], kobject->color[2]);
+}
+
/* Pass ID number of object */
ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
@@ -315,6 +320,26 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).patch_map_offset;
}
+/* Volume step size */
+
+ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
+{
+ if (object == OBJECT_NONE) {
+ return 1.0f;
+ }
+
+ return kernel_tex_fetch(__objects, object).surface_area;
+}
+
+ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
+{
+ if (object == OBJECT_NONE) {
+ return kernel_data.background.volume_step_size;
+ }
+
+ return kernel_tex_fetch(__object_volume_step, object);
+}
+
/* Pass ID for shader */
ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
@@ -386,24 +411,10 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
ccl_device_inline float3 bvh_clamp_direction(float3 dir)
{
- /* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction */
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- const ssef oopes(8.271806E-25f, 8.271806E-25f, 8.271806E-25f, 0.0f);
- const ssef mask = _mm_cmpgt_ps(fabs(dir), oopes);
- const ssef signdir = signmsk(dir.m128) | oopes;
-# ifndef __KERNEL_AVX__
- ssef res = mask & ssef(dir);
- res = _mm_or_ps(res, _mm_andnot_ps(mask, signdir));
-# else
- ssef res = _mm_blendv_ps(signdir, dir, mask);
-# endif
- return float3(res);
-#else /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
const float ooeps = 8.271806E-25f;
return make_float3((fabsf(dir.x) > ooeps) ? dir.x : copysignf(ooeps, dir.x),
(fabsf(dir.y) > ooeps) ? dir.y : copysignf(ooeps, dir.y),
(fabsf(dir.z) > ooeps) ? dir.z : copysignf(ooeps, dir.z));
-#endif /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
}
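
Snapping tiny direction components away from zero (while keeping their sign) guarantees that the inverse direction used by the BVH slab tests stays finite. A small sketch of the clamp (plain C; the threshold 8.271806e-25 is exp2f(-80), as in the code above):

#include <math.h>
#include <stdio.h>

static float clamp_component(float d)
{
  const float ooeps = 8.271806E-25f;
  return (fabsf(d) > ooeps) ? d : copysignf(ooeps, d);
}

int main(void)
{
  const float d[3] = {0.0f, -1e-30f, 0.5f}; /* a nearly axis-aligned direction */
  for (int i = 0; i < 3; i++) {
    const float c = clamp_component(d[i]);
    printf("dir %g -> clamped %g, inverse %g\n", d[i], c, 1.0f / c);
  }
  return 0;
}
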
ccl_device_inline float3 bvh_inverse_direction(float3 dir)
@@ -431,38 +442,6 @@ ccl_device_inline float bvh_instance_push(
return t;
}
-#ifdef __QBVH__
-/* Same as above, but optimized for QBVH scene intersection,
- * which needs to modify two max distances.
- *
- * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
- * so we can avoid having this duplication.
- */
-ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
- int object,
- const Ray *ray,
- float3 *P,
- float3 *dir,
- float3 *idir,
- float *t,
- float *t1)
-{
- Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-
- *P = transform_point(&tfm, ray->P);
-
- float len;
- *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
- *idir = bvh_inverse_direction(*dir);
-
- if (*t != FLT_MAX)
- *t *= len;
-
- if (*t1 != -FLT_MAX)
- *t1 *= len;
-}
-#endif
-
 /* Transform ray to exit static object in BVH */
ccl_device_inline float bvh_instance_pop(
@@ -525,39 +504,6 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
return t;
}
-# ifdef __QBVH__
-/* Same as above, but optimized for QBVH scene intersection,
- * which needs to modify two max distances.
- *
- * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
- * so we can avoid having this duplication.
- */
-ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg,
- int object,
- const Ray *ray,
- float3 *P,
- float3 *dir,
- float3 *idir,
- float *t,
- float *t1,
- Transform *itfm)
-{
- object_fetch_transform_motion_test(kg, object, ray->time, itfm);
-
- *P = transform_point(itfm, ray->P);
-
- float len;
- *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
- *idir = bvh_inverse_direction(*dir);
-
- if (*t != FLT_MAX)
- *t *= len;
-
- if (*t1 != -FLT_MAX)
- *t1 *= len;
-}
-# endif
-
 /* Transform ray to exit motion blurred object in BVH */
ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index df19199f68e..8b4b91b96c8 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -380,15 +380,15 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg,
return val;
}
-ccl_device float3 patch_eval_uchar4(KernelGlobals *kg,
+ccl_device float4 patch_eval_uchar4(KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
float u,
float v,
int channel,
- float3 *du,
- float3 *dv)
+ float4 *du,
+ float4 *dv)
{
int indices[PATCH_MAX_CONTROL_VERTS];
float weights[PATCH_MAX_CONTROL_VERTS];
@@ -398,14 +398,14 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg,
int num_control = patch_eval_control_verts(
kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv);
- float3 val = make_float3(0.0f, 0.0f, 0.0f);
+ float4 val = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
if (du)
- *du = make_float3(0.0f, 0.0f, 0.0f);
+ *du = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
if (dv)
- *dv = make_float3(0.0f, 0.0f, 0.0f);
+ *dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for (int i = 0; i < num_control; i++) {
- float3 v = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, offset + indices[i]));
+ float4 v = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, offset + indices[i]));
val += v * weights[i];
if (du)
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 7f2b52a24c4..997abf438d0 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -162,6 +162,32 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
}
}
+ccl_device_inline float4 primitive_attribute_float4(KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
+{
+ if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+ if (subd_triangle_patch(kg, sd) == ~0)
+ return triangle_attribute_float4(kg, sd, desc, dx, dy);
+ else
+ return subd_triangle_attribute_float4(kg, sd, desc, dx, dy);
+ }
+#ifdef __HAIR__
+ else if (sd->type & PRIMITIVE_ALL_CURVE) {
+ return curve_attribute_float4(kg, sd, desc, dx, dy);
+ }
+#endif
+ else {
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+}
+
ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 8d5b3c12833..3eef9857ae3 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -217,6 +217,14 @@ ccl_device_noinline float subd_triangle_attribute_float(
return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = 0.0f;
+ if (dy)
+ *dy = 0.0f;
+
+ return kernel_tex_fetch(__attributes_float, desc.offset);
+ }
else {
if (dx)
*dx = 0.0f;
@@ -352,6 +360,14 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = make_float2(0.0f, 0.0f);
+ if (dy)
+ *dy = make_float2(0.0f, 0.0f);
+
+ return kernel_tex_fetch(__attributes_float2, desc.offset);
+ }
else {
if (dx)
*dx = make_float2(0.0f, 0.0f);
@@ -382,13 +398,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
float3 a, dads, dadt;
-
- if (desc.element == ATTR_ELEMENT_CORNER_BYTE) {
- a = patch_eval_uchar4(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
- }
- else {
- a = patch_eval_float3(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
- }
+ a = patch_eval_float3(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
# ifdef __RAY_DIFFERENTIALS__
if (dx || dy) {
@@ -460,7 +470,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
}
- else if (desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+ else if (desc.element == ATTR_ELEMENT_CORNER) {
float2 uv[3];
subd_triangle_patch_uv(kg, sd, uv);
@@ -469,18 +479,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
float3 f0, f1, f2, f3;
- if (desc.element == ATTR_ELEMENT_CORNER) {
- f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
- f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
- f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
- f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
- }
- else {
- f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[0] + desc.offset));
- f1 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[1] + desc.offset));
- f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[2] + desc.offset));
- f3 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset));
- }
+ f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
+ f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
+ f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
+ f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
if (subd_triangle_patch_num_corners(kg, patch) != 4) {
f1 = (f1 + f0) * 0.5f;
@@ -500,6 +502,14 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = make_float3(0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+ return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+ }
else {
if (dx)
*dx = make_float3(0.0f, 0.0f, 0.0f);
@@ -510,4 +520,110 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
}
}
+ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
+{
+ int patch = subd_triangle_patch(kg, sd);
+
+#ifdef __PATCH_EVAL__
+ if (desc.flags & ATTR_SUBDIVIDED) {
+ float2 uv[3];
+ subd_triangle_patch_uv(kg, sd, uv);
+
+ float2 dpdu = uv[0] - uv[2];
+ float2 dpdv = uv[1] - uv[2];
+
+ /* p is [s, t] */
+ float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+
+ float4 dads, dadt;
+
+ float4 a = patch_eval_uchar4(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
+
+# ifdef __RAY_DIFFERENTIALS__
+ if (dx || dy) {
+ float dsdu = dpdu.x;
+ float dtdu = dpdu.y;
+ float dsdv = dpdv.x;
+ float dtdv = dpdv.y;
+
+ if (dx) {
+ float dudx = sd->du.dx;
+ float dvdx = sd->dv.dx;
+
+ float dsdx = dsdu * dudx + dsdv * dvdx;
+ float dtdx = dtdu * dudx + dtdv * dvdx;
+
+ *dx = dads * dsdx + dadt * dtdx;
+ }
+ if (dy) {
+ float dudy = sd->du.dy;
+ float dvdy = sd->dv.dy;
+
+ float dsdy = dsdu * dudy + dsdv * dvdy;
+ float dtdy = dtdu * dudy + dtdv * dvdy;
+
+ *dy = dads * dsdy + dadt * dtdy;
+ }
+ }
+# endif
+ return a;
+ }
+ else
+#endif /* __PATCH_EVAL__ */
+ if (desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+ float2 uv[3];
+ subd_triangle_patch_uv(kg, sd, uv);
+
+ int corners[4];
+ subd_triangle_patch_corners(kg, patch, corners);
+
+ float4 f0 = color_uchar4_to_float4(
+ kernel_tex_fetch(__attributes_uchar4, corners[0] + desc.offset));
+ float4 f1 = color_uchar4_to_float4(
+ kernel_tex_fetch(__attributes_uchar4, corners[1] + desc.offset));
+ float4 f2 = color_uchar4_to_float4(
+ kernel_tex_fetch(__attributes_uchar4, corners[2] + desc.offset));
+ float4 f3 = color_uchar4_to_float4(
+ kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset));
+
+ if (subd_triangle_patch_num_corners(kg, patch) != 4) {
+ f1 = (f1 + f0) * 0.5f;
+ f3 = (f3 + f0) * 0.5f;
+ }
+
+ float4 a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+ float4 b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+ float4 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+ if (dy)
+ *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+#endif
+
+ return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+ }
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ return color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, desc.offset));
+ }
+ else {
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+}
+
CCL_NAMESPACE_END
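
The corner path above blends the four patch corner values bilinearly at each of the triangle's patch coordinates, i.e. mix(mix(f0, f1, x), mix(f3, f2, x), y), before the barycentric blend across the triangle. A scalar sketch of that corner interpolation (plain C, illustrative values):

#include <stdio.h>

static float mixf(float a, float b, float t)
{
  return a + (b - a) * t;
}

static float patch_corner_bilinear(const float f[4], float x, float y)
{
  return mixf(mixf(f[0], f[1], x), mixf(f[3], f[2], x), y);
}

int main(void)
{
  const float f[4] = {1.0f, 2.0f, 4.0f, 3.0f}; /* corner values f0, f1, f2, f3 */
  printf("%g %g %g %g\n",
         patch_corner_bilinear(f, 0.0f, 0.0f),  /* 1: corner f0 */
         patch_corner_bilinear(f, 1.0f, 0.0f),  /* 2: corner f1 */
         patch_corner_bilinear(f, 1.0f, 1.0f),  /* 4: corner f2 */
         patch_corner_bilinear(f, 0.5f, 0.5f)); /* 2.5: patch centre */
  return 0;
}
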
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 9938c0ba2c3..0278f3ade8e 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -153,6 +153,14 @@ ccl_device float triangle_attribute_float(
return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = 0.0f;
+ if (dy)
+ *dy = 0.0f;
+
+ return kernel_tex_fetch(__attributes_float, desc.offset);
+ }
else {
if (dx)
*dx = 0.0f;
@@ -212,6 +220,14 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = make_float2(0.0f, 0.0f);
+ if (dy)
+ *dy = make_float2(0.0f, 0.0f);
+
+ return kernel_tex_fetch(__attributes_float2, desc.offset);
+ }
else {
if (dx)
*dx = make_float2(0.0f, 0.0f);
@@ -255,20 +271,13 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
}
- else if (desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+ else if (desc.element == ATTR_ELEMENT_CORNER) {
int tri = desc.offset + sd->prim * 3;
float3 f0, f1, f2;
- if (desc.element == ATTR_ELEMENT_CORNER) {
- f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
- f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
- f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
- }
- else {
- f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 0));
- f1 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 1));
- f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 2));
- }
+ f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
+ f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
+ f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
#ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -279,6 +288,14 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
}
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = make_float3(0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+ return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+ }
else {
if (dx)
*dx = make_float3(0.0f, 0.0f, 0.0f);
@@ -289,4 +306,53 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
}
}
+ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
+{
+ if (desc.element == ATTR_ELEMENT_CORNER_BYTE || desc.element == ATTR_ELEMENT_VERTEX) {
+ float4 f0, f1, f2;
+
+ if (desc.element == ATTR_ELEMENT_CORNER_BYTE) {
+ int tri = desc.offset + sd->prim * 3;
+ f0 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 0));
+ f1 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 1));
+ f2 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 2));
+ }
+ else {
+ uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+ f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+ f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+ f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ if (dx)
+ *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+ if (dy)
+ *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+#endif
+
+ return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+ }
+ else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ return color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, desc.offset));
+ }
+ else {
+ if (dx)
+ *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ if (dy)
+ *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+}
+
CCL_NAMESPACE_END
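
The interpolation above is the usual barycentric blend val = u * f0 + v * f1 + (1 - u - v) * f2, and the __RAY_DIFFERENTIALS__ branch is its chain-rule derivative with respect to screen x and y, since only u and v depend on the pixel position. A tiny worked example (plain C, illustrative values):

#include <stdio.h>

int main(void)
{
  const float f0 = 2.0f, f1 = 4.0f, f2 = 1.0f; /* per-corner attribute values */
  const float u = 0.25f, v = 0.5f;             /* barycentrics at the hit */
  const float dudx = 0.01f, dvdx = -0.02f;     /* barycentric screen derivatives */

  const float val = u * f0 + v * f1 + (1.0f - u - v) * f2;
  /* d(val)/dx = du/dx * f0 + dv/dx * f1 - (du/dx + dv/dx) * f2 */
  const float dvaldx = dudx * f0 + dvdx * f1 - (dudx + dvdx) * f2;
  printf("val = %g, dval/dx = %g\n", val, dvaldx); /* 2.75 and -0.05 */
  return 0;
}
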
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index bcad03102d2..b0cce274b94 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -71,424 +71,6 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
return false;
}
-#ifdef __KERNEL_AVX2__
-# define cross256(A, B, C, D) _mm256_fmsub_ps(A, B, _mm256_mul_ps(C, D))
-ccl_device_inline int ray_triangle_intersect8(KernelGlobals *kg,
- float3 ray_P,
- float3 ray_dir,
- Intersection **isect,
- uint visibility,
- int object,
- __m256 *triA,
- __m256 *triB,
- __m256 *triC,
- int prim_addr,
- int prim_num,
- uint *num_hits,
- uint max_hits,
- int *num_hits_in_instance,
- float isect_t)
-{
-
- const unsigned char prim_num_mask = (1 << prim_num) - 1;
-
- const __m256i zero256 = _mm256_setzero_si256();
-
- const __m256 Px256 = _mm256_set1_ps(ray_P.x);
- const __m256 Py256 = _mm256_set1_ps(ray_P.y);
- const __m256 Pz256 = _mm256_set1_ps(ray_P.z);
-
- const __m256 dirx256 = _mm256_set1_ps(ray_dir.x);
- const __m256 diry256 = _mm256_set1_ps(ray_dir.y);
- const __m256 dirz256 = _mm256_set1_ps(ray_dir.z);
-
- /* Calculate vertices relative to ray origin. */
- __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256);
- __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256);
- __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256);
-
- __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256);
- __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256);
- __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256);
-
- __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256);
- __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256);
- __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256);
-
- __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256);
- __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256);
- __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256);
-
- __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256);
- __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256);
- __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256);
-
- __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256);
- __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256);
- __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256);
-
- /* Calculate triangle edges. */
- __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256);
- __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256);
- __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256);
-
- __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256);
- __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256);
- __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256);
-
- __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256);
- __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256);
- __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256);
-
- /* Perform edge tests. */
- /* cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx) */
- __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256);
- __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256);
- __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256);
- /* vertical dot */
- __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256);
- U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256);
- U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256);
-
- __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256);
- __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256);
- __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256);
- /* vertical dot */
- __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256);
- V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);
- V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);
-
- __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256);
- __m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256);
- __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256);
- /* vertical dot */
- __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256);
- W_256 = _mm256_fmadd_ps(W_y_256, diry256, W_256);
- W_256 = _mm256_fmadd_ps(W_z_256, dirz256, W_256);
-
- __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31);
- __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31);
- __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31);
- __m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1);
-
- const __m256i one256 = _mm256_set1_epi32(1);
- const __m256i two256 = _mm256_set1_epi32(2);
-
- __m256i mask_minmaxUVW_256 = _mm256_or_si256(_mm256_cmpeq_epi32(one256, UVW_256_1),
- _mm256_cmpeq_epi32(two256, UVW_256_1));
-
- unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256));
- if ((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set
- return false;
- }
-
- /* Calculate geometry normal and denominator. */
- __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256);
- __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256);
- __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256);
-
- Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256);
- Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256);
- Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256);
-
- /* vertical dot */
- __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256);
- den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256, den_256);
- den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256, den_256);
-
- /* Perform depth test. */
- __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256);
- T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256, T_256);
- T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256, T_256);
-
- const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000);
- __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000);
-
- __m256 sign_T_256 = _mm256_castsi256_ps(
- _mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256));
-
- unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256);
- if (((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) {
- return false;
- }
-
- __m256 xor_signmask_256 = _mm256_castsi256_ps(
- _mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256));
-
- ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8];
- ccl_align(32) unsigned int mask_minmaxUVW8[8];
-
- if (visibility == PATH_RAY_SHADOW_OPAQUE) {
- __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);
- __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256);
- __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256));
- __m256 rayt_256 = _mm256_set1_ps((*isect)->t);
- __m256i mask1 = _mm256_cmpgt_epi32(
- _mm256_castps_si256(sign_T_256),
- _mm256_castps_si256(_mm256_mul_ps(
- _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)),
- rayt_256)));
- mask0 = _mm256_or_si256(mask1, mask0);
- mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask)
- mask_final_256 = _mm256_andnot_si256(
- maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden)
- unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256));
- if ((mask_final & prim_num_mask) == 0) {
- return false;
- }
- const int i = __bsf(mask_final);
- __m256 inv_den_256 = _mm256_rcp_ps(den_256);
- U_256 = _mm256_mul_ps(U_256, inv_den_256);
- V_256 = _mm256_mul_ps(V_256, inv_den_256);
- T_256 = _mm256_mul_ps(T_256, inv_den_256);
- _mm256_store_ps(U8, U_256);
- _mm256_store_ps(V8, V_256);
- _mm256_store_ps(T8, T_256);
- /* NOTE: Here we assume visibility for all triangles in the node is
- * the same. */
- (*isect)->u = U8[i];
- (*isect)->v = V8[i];
- (*isect)->t = T8[i];
- (*isect)->prim = (prim_addr + i);
- (*isect)->object = object;
- (*isect)->type = PRIMITIVE_TRIANGLE;
- return true;
- }
- else {
- _mm256_store_ps(den8, den_256);
- _mm256_store_ps(U8, U_256);
- _mm256_store_ps(V8, V_256);
- _mm256_store_ps(T8, T_256);
-
- _mm256_store_ps(sign_T8, sign_T_256);
- _mm256_store_ps(xor_signmask8, xor_signmask_256);
- _mm256_store_si256((__m256i *)mask_minmaxUVW8, mask_minmaxUVW_256);
-
- int ret = false;
-
- if (visibility == PATH_RAY_SHADOW) {
- for (int i = 0; i < prim_num; i++) {
- if (mask_minmaxUVW8[i]) {
- continue;
- }
-# ifdef __VISIBILITY_FLAG__
- if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
- continue;
- }
-# endif
- if ((sign_T8[i] < 0.0f) || (sign_T8[i] > (*isect)->t * xor_signmask8[i])) {
- continue;
- }
- if (!den8[i]) {
- continue;
- }
- const float inv_den = 1.0f / den8[i];
- (*isect)->u = U8[i] * inv_den;
- (*isect)->v = V8[i] * inv_den;
- (*isect)->t = T8[i] * inv_den;
- (*isect)->prim = (prim_addr + i);
- (*isect)->object = object;
- (*isect)->type = PRIMITIVE_TRIANGLE;
- const int prim = kernel_tex_fetch(__prim_index, (*isect)->prim);
- int shader = 0;
-# ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE)
-# endif
- {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
-# ifdef __HAIR__
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-# endif
- const int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
- /* If no transparent shadows, all light is blocked. */
- if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
- return 2;
- }
- /* If maximum number of hits reached, block all light. */
- else if (num_hits == NULL || *num_hits == max_hits) {
- return 2;
- }
- /* Move on to next entry in intersections array. */
- ret = true;
- (*isect)++;
- (*num_hits)++;
- (*num_hits_in_instance)++;
- (*isect)->t = isect_t;
- }
- }
- else {
- for (int i = 0; i < prim_num; i++) {
- if (mask_minmaxUVW8[i]) {
- continue;
- }
-# ifdef __VISIBILITY_FLAG__
- if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) {
- continue;
- }
-# endif
- if ((sign_T8[i] < 0.0f) || (sign_T8[i] > (*isect)->t * xor_signmask8[i])) {
- continue;
- }
- if (!den8[i]) {
- continue;
- }
- const float inv_den = 1.0f / den8[i];
- (*isect)->u = U8[i] * inv_den;
- (*isect)->v = V8[i] * inv_den;
- (*isect)->t = T8[i] * inv_den;
- (*isect)->prim = (prim_addr + i);
- (*isect)->object = object;
- (*isect)->type = PRIMITIVE_TRIANGLE;
- ret = true;
- }
- }
- return ret;
- }
-}
-
-ccl_device_inline int triangle_intersect8(KernelGlobals *kg,
- Intersection **isect,
- float3 P,
- float3 dir,
- uint visibility,
- int object,
- int prim_addr,
- int prim_num,
- uint *num_hits,
- uint max_hits,
- int *num_hits_in_instance,
- float isect_t)
-{
- __m128 tri_a[8], tri_b[8], tri_c[8];
- __m256 tritmp[12], tri[12];
- __m256 triA[3], triB[3], triC[3];
-
- int i, r;
-
- uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
- for (i = 0; i < prim_num; i++) {
- tri_a[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++];
- tri_b[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++];
- tri_c[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++];
- }
- //create 9 or 12 placeholders
- tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256
- tri[1] = _mm256_castps128_ps256(tri_b[0]); //_mm256_zextps128_ps256
- tri[2] = _mm256_castps128_ps256(tri_c[0]); //_mm256_zextps128_ps256
-
- tri[3] = _mm256_castps128_ps256(tri_a[1]); //_mm256_zextps128_ps256
- tri[4] = _mm256_castps128_ps256(tri_b[1]); //_mm256_zextps128_ps256
- tri[5] = _mm256_castps128_ps256(tri_c[1]); //_mm256_zextps128_ps256
-
- tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256
- tri[7] = _mm256_castps128_ps256(tri_b[2]); //_mm256_zextps128_ps256
- tri[8] = _mm256_castps128_ps256(tri_c[2]); //_mm256_zextps128_ps256
-
- if (prim_num > 3) {
- tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256
- tri[10] = _mm256_castps128_ps256(tri_b[3]); //_mm256_zextps128_ps256
- tri[11] = _mm256_castps128_ps256(tri_c[3]); //_mm256_zextps128_ps256
- }
-
- for (i = 4, r = 0; i < prim_num; i++, r += 3) {
- tri[r] = _mm256_insertf128_ps(tri[r], tri_a[i], 1);
- tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1);
- tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1);
- }
-
- //------------------------------------------------
- //0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1
- //1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1
- //2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1
-
- //3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1
- //4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1
- //5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1
-
- //6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1
- //7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1
- //8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1
-
- //9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1
- //10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1
- //11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1
-
- //"transpose"
- tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); //0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5
- tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); //1! Za0 Za1 1 1 Za4 Za5 1 1
-
- tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); //2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7
- tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); //3! Za2 Za3 1 1 Za6 Za7 1 1
-
- tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); //4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5
- tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); //5! Zb0 Zb1 1 1 Zb4 Zb5 1 1
-
- tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); //6! Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7
- tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); //7! Zb2 Zb3 1 1 Zb6 Zb7 1 1
-
- tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); //8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5
- tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); //9! Zc0 Zc1 1 1 Zc4 Zc5 1 1
-
- tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); //10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7
- tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); //11! Zc2 Zc3 1 1 Zc6 Zc7 1 1
-
- /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
- triA[0] = _mm256_castpd_ps(
- _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]),
- _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7
- triA[1] = _mm256_castpd_ps(
- _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]),
- _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7
- triA[2] = _mm256_castpd_ps(
- _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]),
- _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7
-
- triB[0] = _mm256_castpd_ps(
- _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]),
- _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb5 Xb7
- triB[1] = _mm256_castpd_ps(
- _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]),
- _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb5 Yb7
- triB[2] = _mm256_castpd_ps(
- _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]),
- _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb5 Zb7
-
- triC[0] = _mm256_castpd_ps(
- _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]),
- _mm256_castps_pd(tritmp[10]))); //Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7
- triC[1] = _mm256_castpd_ps(
- _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]),
- _mm256_castps_pd(tritmp[10]))); //Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7
- triC[2] = _mm256_castpd_ps(
- _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]),
- _mm256_castps_pd(tritmp[11]))); //Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7
-
- /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
-
- int result = ray_triangle_intersect8(kg,
- P,
- dir,
- isect,
- visibility,
- object,
- triA,
- triB,
- triC,
- prim_addr,
- prim_num,
- num_hits,
- max_hits,
- num_hits_in_instance,
- isect_t);
- return result;
-}
-
-#endif /* __KERNEL_AVX2__ */
-
/* Special ray intersection routines for subsurface scattering. In that case we
 * only want to intersect with primitives in the same object, and in case of
* multiple hits we pick a single random primitive as the intersection point.
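
The block deleted above vectorized Cycles' signed-volume (Plucker-style) triangle test across eight triangles per AVX2 iteration: U, V and W are edge functions (signed tetrahedron volumes formed by the ray direction and each translated edge) that must agree in sign for a hit, and the depth test compares T against den * t with the sign of den factored out so no division happens on the reject path. A minimal scalar sketch of the same test, with a local Vec3 standing in for the kernel's float3 (the function names here are illustrative, not the kernel API):

#include <math.h>

struct Vec3 {
  float x, y, z;
};
static Vec3 sub(Vec3 a, Vec3 b) { return {a.x - b.x, a.y - b.y, a.z - b.z}; }
static Vec3 add(Vec3 a, Vec3 b) { return {a.x + b.x, a.y + b.y, a.z + b.z}; }
static float dot(Vec3 a, Vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
static Vec3 cross(Vec3 a, Vec3 b) {
  return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}

/* Scalar version of the 8-wide test above: returns true on a hit in (0, tmax]
 * and outputs barycentric u, v and the hit distance t. */
static bool ray_triangle_hit(Vec3 P, Vec3 dir, float tmax,
                             Vec3 a, Vec3 b, Vec3 c,
                             float *u, float *v, float *t)
{
  /* Translate vertices so the ray origin sits at the coordinate origin. */
  const Vec3 v0 = sub(a, P), v1 = sub(b, P), v2 = sub(c, P);
  /* Triangle edges. */
  const Vec3 e0 = sub(v2, v0), e1 = sub(v0, v1), e2 = sub(v1, v2);

  /* Edge tests: U, V and W must not have mixed signs. */
  const float U = dot(cross(add(v2, v0), e0), dir);
  const float V = dot(cross(add(v0, v1), e1), dir);
  const float W = dot(cross(add(v1, v2), e2), dir);
  const float minUVW = fminf(U, fminf(V, W));
  const float maxUVW = fmaxf(U, fmaxf(V, W));
  if (minUVW < 0.0f && maxUVW > 0.0f) {
    return false;
  }

  /* Geometric normal, doubled so that U + V + W == dot(Ng, dir) == den. */
  Vec3 Ng = cross(e1, e0);
  Ng = add(Ng, Ng);
  const float den = dot(Ng, dir);
  if (den == 0.0f) {
    return false; /* Ray parallel to the triangle plane. */
  }

  /* Depth test against (0, tmax] without dividing: strip the sign of den from
   * T and compare against |den| * tmax, which is what the sign_T/xor trick
   * in the vectorized code computes. */
  const float T = dot(Ng, v0);
  const float sign_T = (den < 0.0f) ? -T : T;
  if (sign_T < 0.0f || sign_T > fabsf(den) * tmax) {
    return false;
  }

  const float inv_den = 1.0f / den;
  *u = U * inv_den;
  *v = V * inv_den;
  *t = T * inv_den;
  return true;
}

The deleted AVX2 code performed exactly these steps in structure-of-arrays form, with the movemask/__bsf sequence selecting the first surviving lane for the shadow-opaque early-out.
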
@@ -681,16 +263,20 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
const Intersection *isect,
const Ray *ray)
{
+#ifdef __KERNEL_OPTIX__
+ /* isect->t is always in world space with OptiX. */
+ return triangle_refine(kg, sd, isect, ray);
+#else
float3 P = ray->P;
float3 D = ray->D;
float t = isect->t;
if (isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
+# ifdef __OBJECT_MOTION__
Transform tfm = sd->ob_itfm;
-#else
+# else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
+# endif
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -699,7 +285,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
P = P + D * t;
-#ifdef __INTERSECTION_REFINE__
+# ifdef __INTERSECTION_REFINE__
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
@@ -719,19 +305,20 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
float rt = dot(edge2, qvec) / det;
P = P + D * rt;
}
-#endif /* __INTERSECTION_REFINE__ */
+# endif /* __INTERSECTION_REFINE__ */
if (isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
+# ifdef __OBJECT_MOTION__
Transform tfm = sd->ob_tfm;
-#else
+# else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
+# endif
P = transform_point(&tfm, P);
}
return P;
+#endif
}
CCL_NAMESPACE_END
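
In the hunk above, triangle_refine_local moves the ray into object space when needed and then, under __INTERSECTION_REFINE__, re-solves the hit distance from the approximate hit point so that floating-point error accumulated along long rays is largely cancelled. A hedged sketch of just that refinement step, reusing the Vec3/sub/add/dot/cross helpers from the previous sketch (refine_hit_point is an illustrative name, not a kernel function):

/* O is the ray origin already advanced to P + D * t, as in the code above. */
static Vec3 refine_hit_point(Vec3 O, Vec3 D, Vec3 a, Vec3 b, Vec3 c)
{
  const Vec3 edge1 = sub(a, c);
  const Vec3 edge2 = sub(b, c);
  const Vec3 tvec = sub(O, c);
  const Vec3 pvec = cross(D, edge2);
  const float det = dot(edge1, pvec);
  if (det != 0.0f) {
    /* Residual distance along D from O to the exact triangle plane
     * (Moller-Trumbore t term); it is tiny when O is already close. */
    const Vec3 qvec = cross(tvec, edge1);
    const float rt = dot(edge2, qvec) / det;
    O = add(O, Vec3{D.x * rt, D.y * rt, D.z * rt});
  }
  return O;
}

The OptiX branch added at the top of the function skips this entirely and forwards to triangle_refine, because isect->t is already a world-space distance there.
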
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 96cf35a40dc..f43a7841b46 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -51,10 +51,14 @@ ccl_device float volume_attribute_float(KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
- float3 P = volume_normalized_position(kg, sd, sd->P);
+ /* todo: optimize this so we don't have to transform both here and in
+ * kernel_tex_image_interp_3d when possible. Also could optimize for the
+ * common case where transform is translation/scale only. */
+ float3 P = sd->P;
+ object_inverse_position_transform(kg, sd, &P);
InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
INTERPOLATION_NONE;
- float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp);
+ float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P, interp);
return average(float4_to_float3(r));
}
@@ -62,10 +66,11 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
- float3 P = volume_normalized_position(kg, sd, sd->P);
+ float3 P = sd->P;
+ object_inverse_position_transform(kg, sd, &P);
InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
INTERPOLATION_NONE;
- float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp);
+ float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P, interp);
if (r.w > 1e-6f && r.w != 1.0f) {
/* For RGBA colors, unpremultiply after interpolation. */
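
The volume lookup now transforms the shading point by the object's inverse transform and hands the position to the 3D texture fetch as a float3. The unpremultiply step kept at the end exists because RGBA grids are interpolated with premultiplied alpha, which keeps fully transparent voxels from bleeding color into their neighbors; the straight color is recovered only after interpolation. A tiny standalone version of that step (Rgba stands in for the kernel's float4):

struct Rgba {
  float r, g, b, a;
};

/* Undo premultiplied alpha after interpolation, mirroring the r.w checks above:
 * skip near-zero alpha (division would blow up) and alpha == 1 (a no-op). */
static Rgba unpremultiply(Rgba c)
{
  if (c.a > 1e-6f && c.a != 1.0f) {
    const float inv = 1.0f / c.a;
    c.r *= inv;
    c.g *= inv;
    c.b *= inv;
  }
  return c;
}
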
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index dfdd8843f29..b907c6a2bac 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -19,8 +19,8 @@
/* CPU Kernel Interface */
-#include "util/util_types.h"
#include "kernel/kernel_types.h"
+#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
@@ -38,7 +38,7 @@ void *kernel_osl_memory(KernelGlobals *kg);
bool kernel_osl_use(KernelGlobals *kg);
void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
-void kernel_tex_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index b9d723222a1..79ea03f4f6f 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -36,21 +36,18 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval,
eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
eval->transparent = make_float3(0.0f, 0.0f, 0.0f);
- eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
- eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
+ eval->volume = make_float3(0.0f, 0.0f, 0.0f);
if (type == CLOSURE_BSDF_TRANSPARENT_ID)
eval->transparent = value;
- else if (CLOSURE_IS_BSDF_DIFFUSE(type))
+ else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
eval->diffuse = value;
else if (CLOSURE_IS_BSDF_GLOSSY(type))
eval->glossy = value;
else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
eval->transmission = value;
- else if (CLOSURE_IS_BSDF_BSSRDF(type))
- eval->subsurface = value;
else if (CLOSURE_IS_PHASE(type))
- eval->scatter = value;
+ eval->volume = value;
}
else
#endif
@@ -73,16 +70,14 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval,
value *= mis_weight;
#ifdef __PASSES__
if (eval->use_light_pass) {
- if (CLOSURE_IS_BSDF_DIFFUSE(type))
+ if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
eval->diffuse += value;
else if (CLOSURE_IS_BSDF_GLOSSY(type))
eval->glossy += value;
else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
eval->transmission += value;
- else if (CLOSURE_IS_BSDF_BSSRDF(type))
- eval->subsurface += value;
else if (CLOSURE_IS_PHASE(type))
- eval->scatter += value;
+ eval->volume += value;
/* skipping transparent, this function is used for eval(), will be zero then */
}
@@ -98,7 +93,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
#ifdef __PASSES__
if (eval->use_light_pass) {
return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) &&
- is_zero(eval->transparent) && is_zero(eval->subsurface) && is_zero(eval->scatter);
+ is_zero(eval->transparent) && is_zero(eval->volume);
}
else
#endif
@@ -114,8 +109,7 @@ ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
eval->diffuse *= value;
eval->glossy *= value;
eval->transmission *= value;
- eval->subsurface *= value;
- eval->scatter *= value;
+ eval->volume *= value;
/* skipping transparent, this function is used for eval(), will be zero then */
}
@@ -144,8 +138,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
eval->diffuse *= value;
eval->glossy *= value;
eval->transmission *= value;
- eval->subsurface *= value;
- eval->scatter *= value;
+ eval->volume *= value;
/* skipping transparent, this function is used for eval(), will be zero then */
}
@@ -160,7 +153,7 @@ ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
{
#ifdef __PASSES__
if (eval->use_light_pass) {
- return eval->diffuse + eval->glossy + eval->transmission + eval->subsurface + eval->scatter;
+ return eval->diffuse + eval->glossy + eval->transmission + eval->volume;
}
else
#endif
@@ -174,32 +167,29 @@ ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
* visible as the first non-transparent hit, while indirectly visible are the
* bounces after that. */
-ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
+ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
{
/* clear all */
#ifdef __PASSES__
- L->use_light_pass = use_light_pass;
+ L->use_light_pass = kernel_data.film.use_light_pass;
- if (use_light_pass) {
+ if (kernel_data.film.use_light_pass) {
L->indirect = make_float3(0.0f, 0.0f, 0.0f);
L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->color_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->color_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f);
L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->direct_volume = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->indirect_volume = make_float3(0.0f, 0.0f, 0.0f);
L->transparent = 0.0f;
L->emission = make_float3(0.0f, 0.0f, 0.0f);
@@ -211,8 +201,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.volume = make_float3(0.0f, 0.0f, 0.0f);
L->state.direct = make_float3(0.0f, 0.0f, 0.0f);
}
else
@@ -264,11 +253,9 @@ ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
L_state->diffuse = bsdf_eval->diffuse * value;
L_state->glossy = bsdf_eval->glossy * value;
L_state->transmission = bsdf_eval->transmission * value;
- L_state->subsurface = bsdf_eval->subsurface * value;
- L_state->scatter = bsdf_eval->scatter * value;
+ L_state->volume = bsdf_eval->volume * value;
- *throughput = L_state->diffuse + L_state->glossy + L_state->transmission +
- L_state->subsurface + L_state->scatter;
+ *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume;
L_state->direct = *throughput;
}
@@ -285,7 +272,37 @@ ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
}
}
-ccl_device_inline void path_radiance_accum_emission(PathRadiance *L,
+#ifdef __CLAMP_SAMPLE__
+ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce)
+{
+ float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
+ kernel_data.integrator.sample_clamp_direct;
+ float sum = reduce_add(fabs(*L));
+ if (sum > limit) {
+ *L *= limit / sum;
+ }
+}
+
+ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg,
+ float3 *L,
+ float3 *throughput,
+ int bounce)
+{
+ float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
+ kernel_data.integrator.sample_clamp_direct;
+
+ float sum = reduce_add(fabs(*L));
+ if (sum > limit) {
+ float clamp_factor = limit / sum;
+ *L *= clamp_factor;
+ *throughput *= clamp_factor;
+ }
+}
+
+#endif
+
+ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg,
+ PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 value)
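
The helpers added above implement the firefly clamp at accumulation time: the sum of absolute RGB components is compared against the direct or indirect limit (picked from the bounce number), and the contribution is scaled down uniformly rather than discarded, which preserves its hue. A standalone check of that behavior (Col and the numeric values are illustrative, not kernel data):

#include <assert.h>
#include <math.h>

struct Col {
  float x, y, z;
};

static Col clamp_contribution(Col L, float limit)
{
  const float sum = fabsf(L.x) + fabsf(L.y) + fabsf(L.z);
  if (sum > limit) {
    const float scale = limit / sum; /* uniform scale keeps the color's hue */
    L.x *= scale;
    L.y *= scale;
    L.z *= scale;
  }
  return L;
}

int main()
{
  /* A sample 10x over the limit is scaled down to the limit, not dropped. */
  const Col c = clamp_contribution({80.0f, 15.0f, 5.0f}, 10.0f);
  assert(fabsf(c.x + c.y + c.z - 10.0f) < 1e-4f);
  return 0;
}
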
@@ -296,33 +313,41 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L,
}
#endif
+ float3 contribution = throughput * value;
+#ifdef __CLAMP_SAMPLE__
+ path_radiance_clamp(kg, &contribution, state->bounce - 1);
+#endif
+
#ifdef __PASSES__
if (L->use_light_pass) {
if (state->bounce == 0)
- L->emission += throughput * value;
+ L->emission += contribution;
else if (state->bounce == 1)
- L->direct_emission += throughput * value;
+ L->direct_emission += contribution;
else
- L->indirect += throughput * value;
+ L->indirect += contribution;
}
else
#endif
{
- L->emission += throughput * value;
+ L->emission += contribution;
}
}
-ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg,
+ PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
float3 ao)
{
+#ifdef __PASSES__
/* Store AO pass. */
if (L->use_light_pass && state->bounce == 0) {
L->ao += alpha * throughput * ao;
}
+#endif
#ifdef __SHADOW_TRICKS__
/* For shadow catcher, accumulate ratio. */
@@ -337,21 +362,23 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#endif
+ float3 contribution = throughput * bsdf * ao;
+
#ifdef __PASSES__
if (L->use_light_pass) {
if (state->bounce == 0) {
/* Directly visible lighting. */
- L->direct_diffuse += throughput * bsdf * ao;
+ L->direct_diffuse += contribution;
}
else {
/* Indirectly visible lighting after BSDF bounce. */
- L->indirect += throughput * bsdf * ao;
+ L->indirect += contribution;
}
}
else
#endif
{
- L->emission += throughput * bsdf * ao;
+ L->emission += contribution;
}
}
@@ -372,7 +399,8 @@ ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L,
#endif
}
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg,
+ PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
BsdfEval *bsdf_eval,
@@ -392,15 +420,23 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
}
#endif
+ float3 shaded_throughput = throughput * shadow;
+
#ifdef __PASSES__
if (L->use_light_pass) {
+ /* Compute the clamping based on the total contribution.
+ * The resulting scale is then applied to all individual components. */
+ float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval);
+# ifdef __CLAMP_SAMPLE__
+ path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce);
+# endif
+
if (state->bounce == 0) {
/* directly visible lighting */
- L->direct_diffuse += throughput * bsdf_eval->diffuse * shadow;
- L->direct_glossy += throughput * bsdf_eval->glossy * shadow;
- L->direct_transmission += throughput * bsdf_eval->transmission * shadow;
- L->direct_subsurface += throughput * bsdf_eval->subsurface * shadow;
- L->direct_scatter += throughput * bsdf_eval->scatter * shadow;
+ L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse;
+ L->direct_glossy += shaded_throughput * bsdf_eval->glossy;
+ L->direct_transmission += shaded_throughput * bsdf_eval->transmission;
+ L->direct_volume += shaded_throughput * bsdf_eval->volume;
if (is_lamp) {
L->shadow.x += shadow.x * shadow_fac;
@@ -410,13 +446,15 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
}
else {
/* indirectly visible lighting after BSDF bounce */
- L->indirect += throughput * bsdf_eval_sum(bsdf_eval) * shadow;
+ L->indirect += full_contribution;
}
}
else
#endif
{
- L->emission += throughput * bsdf_eval->diffuse * shadow;
+ float3 contribution = shaded_throughput * bsdf_eval->diffuse;
+ path_radiance_clamp(kg, &contribution, state->bounce);
+ L->emission += contribution;
}
}
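
In the light-pass branch above, one physical light sample is split across the diffuse, glossy, transmission and volume buffers, so the clamp factor is computed once from the full contribution and folded into the shared shaded_throughput; every component is then scaled identically and the per-pass split still sums to the clamped total. A quick standalone check of that invariant (plain floats stand in for float3 and the BsdfEval channels):

#include <assert.h>
#include <math.h>

int main()
{
  /* Per-closure eval values and the shared throughput * shadow factor. */
  const float diffuse = 3.0f, glossy = 5.0f, transmission = 2.0f, volume = 0.0f;
  float shaded_throughput = 4.0f;
  const float limit = 10.0f;

  /* Clamp the total contribution and fold the factor into the throughput. */
  float full = shaded_throughput * (diffuse + glossy + transmission + volume);
  if (full > limit) {
    const float factor = limit / full;
    full *= factor;
    shaded_throughput *= factor;
  }

  /* Each pass scaled by the same throughput still sums to the clamped total. */
  const float split_sum = shaded_throughput * diffuse + shaded_throughput * glossy +
                          shaded_throughput * transmission + shaded_throughput * volume;
  assert(fabsf(split_sum - full) < 1e-4f);
  return 0;
}
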
@@ -437,7 +475,8 @@ ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L,
#endif
}
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
+ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg,
+ PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 value)
@@ -454,23 +493,29 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#endif
+ float3 contribution = throughput * value;
+#ifdef __CLAMP_SAMPLE__
+ path_radiance_clamp(kg, &contribution, state->bounce - 1);
+#endif
+
#ifdef __PASSES__
if (L->use_light_pass) {
if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)
- L->background += throughput * value;
+ L->background += contribution;
else if (state->bounce == 1)
- L->direct_emission += throughput * value;
+ L->direct_emission += contribution;
else
- L->indirect += throughput * value;
+ L->indirect += contribution;
}
else
#endif
{
- L->emission += throughput * value;
+ L->emission += contribution;
}
#ifdef __DENOISING_FEATURES__
- L->denoising_albedo += state->denoising_feature_weight * value;
+ L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput *
+ value;
#endif /* __DENOISING_FEATURES__ */
}
@@ -503,15 +548,13 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
L->direct_diffuse += L->state.diffuse * L->direct_emission;
L->direct_glossy += L->state.glossy * L->direct_emission;
L->direct_transmission += L->state.transmission * L->direct_emission;
- L->direct_subsurface += L->state.subsurface * L->direct_emission;
- L->direct_scatter += L->state.scatter * L->direct_emission;
+ L->direct_volume += L->state.volume * L->direct_emission;
L->indirect = safe_divide_color(L->indirect, L->state.direct);
L->indirect_diffuse += L->state.diffuse * L->indirect;
L->indirect_glossy += L->state.glossy * L->indirect;
L->indirect_transmission += L->state.transmission * L->indirect;
- L->indirect_subsurface += L->state.subsurface * L->indirect;
- L->indirect_scatter += L->state.scatter * L->indirect;
+ L->indirect_volume += L->state.volume * L->indirect;
}
#endif
}
@@ -523,8 +566,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.volume = make_float3(0.0f, 0.0f, 0.0f);
L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -585,15 +627,13 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
/* Light Passes are used */
#ifdef __PASSES__
float3 L_direct, L_indirect;
- float clamp_direct = kernel_data.integrator.sample_clamp_direct;
- float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
if (L->use_light_pass) {
path_radiance_sum_indirect(L);
- L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission +
- L->direct_subsurface + L->direct_scatter + L->emission;
+ L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume +
+ L->emission;
L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission +
- L->indirect_subsurface + L->indirect_scatter;
+ L->indirect_volume;
if (!kernel_data.background.transparent)
L_direct += L->background;
@@ -609,55 +649,15 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->direct_volume = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->indirect_volume = make_float3(0.0f, 0.0f, 0.0f);
L->emission = make_float3(0.0f, 0.0f, 0.0f);
}
-
- /* Clamp direct and indirect samples */
-# ifdef __CLAMP_SAMPLE__
- else if (sum > clamp_direct || sum > clamp_indirect) {
- float scale;
-
- /* Direct */
- float sum_direct = fabsf(L_direct.x) + fabsf(L_direct.y) + fabsf(L_direct.z);
- if (sum_direct > clamp_direct) {
- scale = clamp_direct / sum_direct;
- L_direct *= scale;
-
- L->direct_diffuse *= scale;
- L->direct_glossy *= scale;
- L->direct_transmission *= scale;
- L->direct_subsurface *= scale;
- L->direct_scatter *= scale;
- L->emission *= scale;
- L->background *= scale;
- }
-
- /* Indirect */
- float sum_indirect = fabsf(L_indirect.x) + fabsf(L_indirect.y) + fabsf(L_indirect.z);
- if (sum_indirect > clamp_indirect) {
- scale = clamp_indirect / sum_indirect;
- L_indirect *= scale;
-
- L->indirect_diffuse *= scale;
- L->indirect_glossy *= scale;
- L->indirect_transmission *= scale;
- L->indirect_subsurface *= scale;
- L->indirect_scatter *= scale;
- }
-
- /* Sum again, after clamping */
- L_sum = L_direct + L_indirect;
- }
-# endif
}
/* No Light Passes */
@@ -696,7 +696,7 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
kernel_assert(L->use_light_pass);
*clean = L->emission + L->background;
- *noisy = L->direct_scatter + L->indirect_scatter;
+ *noisy = L->direct_volume + L->indirect_volume;
# define ADD_COMPONENT(flag, component) \
if (kernel_data.film.denoising_flags & flag) \
@@ -710,8 +710,6 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
- ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
- ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
# undef ADD_COMPONENT
#else
*noisy = L->emission;
@@ -748,14 +746,12 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
- safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface);
- safe_float3_add(L->direct_scatter, L_sample->direct_scatter);
+ safe_float3_add(L->direct_volume, L_sample->direct_volume);
safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
- safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface);
- safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter);
+ safe_float3_add(L->indirect_volume, L_sample->indirect_volume);
safe_float3_add(L->background, L_sample->background);
safe_float3_add(L->ao, L_sample->ao);
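
Throughout this file the per-closure channels of BsdfEval and PathRadiance are consolidated: BSSRDF (subsurface) contributions now accumulate into the diffuse channel, and the separate phase-function "scatter" channel becomes a single volume channel. A compact sketch of that channel mapping, with an illustrative enum in place of Cycles' CLOSURE_IS_* macros:

struct Eval {
  float diffuse, glossy, transmission, volume, transparent;
};

enum ClosureClass { CL_DIFFUSE, CL_BSSRDF, CL_GLOSSY, CL_TRANSMISSION, CL_PHASE, CL_TRANSPARENT };

/* Accumulate a contribution into the channel used for the light passes:
 * subsurface folds into diffuse, volume phase functions into volume. */
static void eval_accum(Eval *eval, ClosureClass type, float value)
{
  switch (type) {
    case CL_DIFFUSE:
    case CL_BSSRDF:
      eval->diffuse += value;
      break;
    case CL_GLOSSY:
      eval->glossy += value;
      break;
    case CL_TRANSMISSION:
      eval->transmission += value;
      break;
    case CL_PHASE:
      eval->volume += value;
      break;
    case CL_TRANSPARENT:
      eval->transparent += value;
      break;
  }
}
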
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
new file mode 100644
index 00000000000..98b7bf7e7dc
--- /dev/null
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
+#define __KERNEL_ADAPTIVE_SAMPLING_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+
+ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
+ ccl_global float *buffer,
+ int sample)
+{
+ /* TODO Stefan: Is this better in linear, sRGB or something else? */
+ float4 I = *((ccl_global float4 *)buffer);
+ float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
+ * A small epsilon is added to the divisor to prevent division by zero. */
+ float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
+ (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
+ if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
+ /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
+ buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ }
+}
+
+/* Adjust the values of an adaptively sampled pixel. */
+
+ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
+ ccl_global float *buffer,
+ float sample_multiplier)
+{
+ *(ccl_global float4 *)(buffer) *= sample_multiplier;
+
+  /* Scale the aux pass too; this is necessary for progressive rendering to work properly. */
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
+ *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+
+#ifdef __PASSES__
+ int flag = kernel_data.film.pass_flag;
+
+ if (flag & PASSMASK(NORMAL))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+
+ if (flag & PASSMASK(UV))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+
+ if (flag & PASSMASK(MOTION)) {
+ *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
+ *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ }
+
+ if (kernel_data.film.use_light_pass) {
+ int light_flag = kernel_data.film.light_pass_flag;
+
+ if (light_flag & PASSMASK(MIST))
+ *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
+
+ /* Shadow pass omitted on purpose. It has its own scale parameter. */
+
+ if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(VOLUME_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(DIFFUSE_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(VOLUME_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
+
+ if (light_flag & PASSMASK(EMISSION))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
+ if (light_flag & PASSMASK(BACKGROUND))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
+ if (light_flag & PASSMASK(AO))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
+
+ if (light_flag & PASSMASK(DIFFUSE_COLOR))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_COLOR))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_COLOR))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_color) *= sample_multiplier;
+ }
+#endif
+
+#ifdef __DENOISING_FEATURES__
+
+# define scale_float3_variance(buffer, offset, scale) \
+ *(buffer + offset) *= scale; \
+ *(buffer + offset + 1) *= scale; \
+ *(buffer + offset + 2) *= scale; \
+ *(buffer + offset + 3) *= scale * scale; \
+ *(buffer + offset + 4) *= scale * scale; \
+ *(buffer + offset + 5) *= scale * scale;
+
+# define scale_shadow_variance(buffer, offset, scale) \
+ *(buffer + offset) *= scale; \
+ *(buffer + offset + 1) *= scale; \
+ *(buffer + offset + 2) *= scale * scale;
+
+ if (kernel_data.film.pass_denoising_data) {
+ scale_shadow_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
+ scale_shadow_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
+ if (kernel_data.film.pass_denoising_clean) {
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+ *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
+ }
+ else {
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+ }
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
+ *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
+ 1) *= sample_multiplier * sample_multiplier;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
+ /* Cryptomatte. */
+ if (kernel_data.film.cryptomatte_passes) {
+ int num_slots = 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
+ num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
+ ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
+ kernel_data.film.pass_cryptomatte);
+ for (int slot = 0; slot < num_slots; slot++) {
+ id_buffer[slot].y *= sample_multiplier;
+ }
+ }
+
+ /* AOVs. */
+ for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) {
+ *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier;
+ }
+ for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) {
+ *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier;
+ }
+}
+
+/* This is a simple box filter in two passes.
+ * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
+
+ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+{
+ bool any = false;
+ bool prev = false;
+ for (int x = tile->x; x < tile->x + tile->w; ++x) {
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if ((*aux).w == 0.0f) {
+ any = true;
+ if (x > tile->x && !prev) {
+ index = index - 1;
+ buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ (*aux).w = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ (*aux).w = 0.0f;
+ }
+ prev = false;
+ }
+ }
+ return any;
+}
+
+ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+{
+ bool prev = false;
+ bool any = false;
+ for (int y = tile->y; y < tile->y + tile->h; ++y) {
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if ((*aux).w == 0.0f) {
+ any = true;
+ if (y > tile->y && !prev) {
+ index = index - tile->stride;
+ buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ (*aux).w = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ (*aux).w = 0.0f;
+ }
+ prev = false;
+ }
+ }
+ return any;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
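
The stopping test in the new header compares the fully accumulated color I against an auxiliary buffer A that holds a subset of the samples, normalizes the difference by a brightness term, and flags the pixel as converged once the error drops below threshold * sample. A standalone sketch of just that test (pixel_has_converged is an illustrative name; the real kernel reads I and A from the film passes shown above):

#include <math.h>

/* Per-pixel convergence test. I and A are accumulated (not averaged) RGB sums;
 * the small epsilon term keeps the divisor away from zero for black pixels. */
static bool pixel_has_converged(const float I[3], const float A[3],
                                int sample, float threshold)
{
  const float error = (fabsf(I[0] - A[0]) + fabsf(I[1] - A[1]) + fabsf(I[2] - A[2])) /
                      (sample * 0.0001f + sqrtf(I[0] + I[1] + I[2]));
  /* Both buffers accumulate, hence the comparison against threshold * sample. */
  return error < threshold * (float)sample;
}

The two filter passes that follow in the header then clear the convergence flag of pixels adjacent to an unconverged one, so additional samples are spent on small neighborhoods rather than on isolated pixels.
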
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 64840c00f16..4bae9e5e1b4 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -18,19 +18,33 @@ CCL_NAMESPACE_BEGIN
#ifdef __BAKING__
-ccl_device_inline void compute_light_pass(
+ccl_device_noinline void compute_light_pass(
KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample)
{
kernel_assert(kernel_data.film.use_light_pass);
- PathRadiance L_sample;
- PathState state;
- Ray ray;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
- /* emission and indirect shader data memory used by various functions */
- ShaderData emission_sd, indirect_sd;
+ /* Emission and indirect shader data memory used by various functions. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ ShaderData indirect_sd;
+
+ /* Init radiance. */
+ path_radiance_init(kg, L);
+
+ /* Init path state. */
+ PathState state;
+ path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL);
+
+ /* Evaluate surface shader. */
+ shader_eval_surface(kg, sd, &state, NULL, state.flag);
+ /* TODO, disable more closures we don't need besides transparent */
+ shader_bsdf_disable_transparency(kg, sd);
+
+ /* Init ray. */
+ Ray ray;
ray.P = sd->P + sd->Ng;
ray.D = -sd->Ng;
ray.t = FLT_MAX;
@@ -38,18 +52,6 @@ ccl_device_inline void compute_light_pass(
ray.time = 0.5f;
# endif
- /* init radiance */
- path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
-
- /* init path state */
- path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
-
- /* evaluate surface shader */
- shader_eval_surface(kg, sd, &state, state.flag);
-
- /* TODO, disable more closures we don't need besides transparent */
- shader_bsdf_disable_transparency(kg, sd);
-
# ifdef __BRANCHED_PATH__
if (!kernel_data.integrator.branched) {
/* regular path tracer */
@@ -57,8 +59,7 @@ ccl_device_inline void compute_light_pass(
/* sample ambient occlusion */
if (pass_filter & BAKE_FILTER_AO) {
- kernel_path_ao(
- kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd));
+ kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd));
}
/* sample emission */
@@ -66,26 +67,27 @@ ccl_device_inline void compute_light_pass(
bool is_volume_boundary = (state.volume_bounce > 0) || (state.volume_bounds_bounce > 0);
float3 emission = indirect_primitive_emission(
kg, sd, 0.0f, sd->P_pick, sd->N_pick, state.flag, state.ray_pdf, is_volume_boundary);
- path_radiance_accum_emission(&L_sample, &state, throughput, emission);
+ path_radiance_accum_emission(kg, L, &state, throughput, emission);
}
bool is_sss_sample = false;
# ifdef __SUBSURFACE__
/* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
- /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+ if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
+ /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
+ * if scattering was successful. */
SubsurfaceIndirectRays ss_indirect;
kernel_path_subsurface_init_indirect(&ss_indirect);
if (kernel_path_subsurface_scatter(
- kg, sd, &emission_sd, &L_sample, &state, &ray, &throughput, &ss_indirect)) {
+ kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) {
while (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(
- kg, &ss_indirect, &state, &ray, &L_sample, &throughput);
+ kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput);
+ kernel_path_indirect(kg, &indirect_sd, emission_sd, &ray, throughput, &state, L);
indirect_sd.P_pick = sd->P_pick;
indirect_sd.N_pick = sd->N_pick;
kernel_path_indirect(
- kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
+ kg, &indirect_sd, emission_sd, &ray, throughput, &state, L);
}
is_sss_sample = true;
}
@@ -94,20 +96,20 @@ ccl_device_inline void compute_light_pass(
/* sample light and BSDF */
if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
- kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample);
+ kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L);
- if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) {
+ if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) {
# ifdef __LAMP_MIS__
state.ray_t = 0.0f;
# endif
/* compute indirect light */
indirect_sd.P_pick = sd->P_pick;
indirect_sd.N_pick = sd->N_pick;
- kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
+ kernel_path_indirect(kg, &indirect_sd, emission_sd, &ray, throughput, &state, L);
/* sum and reset indirect light pass variables for the next samples */
- path_radiance_sum_indirect(&L_sample);
- path_radiance_reset_indirect(&L_sample);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
# ifdef __BRANCHED_PATH__
@@ -117,7 +119,7 @@ ccl_device_inline void compute_light_pass(
/* sample ambient occlusion */
if (pass_filter & BAKE_FILTER_AO) {
- kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput);
+ kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput);
}
/* sample emission */
@@ -125,15 +127,16 @@ ccl_device_inline void compute_light_pass(
bool is_volume_boundary = (state.volume_bounce > 0) || (state.volume_bounds_bounce > 0);
float3 emission = indirect_primitive_emission(
kg, sd, 0.0f, sd->P_pick, sd->N_pick, state.flag, state.ray_pdf, is_volume_boundary);
- path_radiance_accum_emission(&L_sample, &state, throughput, emission);
+ path_radiance_accum_emission(kg, L, &state, throughput, emission);
}
# ifdef __SUBSURFACE__
/* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
- /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+ if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
+ /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
+ * if scattering was successful. */
kernel_branched_path_subsurface_scatter(
- kg, sd, &indirect_sd, &emission_sd, &L_sample, &state, &ray, throughput);
+ kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
}
# endif
@@ -144,19 +147,16 @@ ccl_device_inline void compute_light_pass(
if (kernel_data.integrator.use_direct_light) {
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_surface_connect_light(
- kg, sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
+ kg, sd, emission_sd, &state, throughput, 1.0f, L, all);
}
# endif
/* indirect light */
kernel_branched_path_surface_indirect_light(
- kg, sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
+ kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L);
}
}
# endif
-
- /* accumulate into master L */
- path_radiance_accum_sample(L, &L_sample);
}
/* this helps with AA but it's not the real solution as it does not AA the geometry
@@ -184,10 +184,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
return shader_bsdf_glossy(kg, sd);
case SHADER_EVAL_TRANSMISSION:
return shader_bsdf_transmission(kg, sd);
-# ifdef __SUBSURFACE__
- case SHADER_EVAL_SUBSURFACE:
- return shader_bsdf_subsurface(kg, sd);
-# endif
default:
kernel_assert(!"Unknown bake type passed to BSDF evaluate");
return make_float3(0.0f, 0.0f, 0.0f);
@@ -215,12 +211,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
}
else {
/* surface color of the pass only */
- shader_eval_surface(kg, sd, state, 0);
+ shader_eval_surface(kg, sd, state, NULL, 0);
return kernel_bake_shader_bsdf(kg, sd, type);
}
}
else {
- shader_eval_surface(kg, sd, state, 0);
+ shader_eval_surface(kg, sd, state, NULL, 0);
color = kernel_bake_shader_bsdf(kg, sd, type);
}
@@ -235,41 +231,28 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
return out;
}
-ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
- ccl_global float4 *output,
- ShaderEvalType type,
- int pass_filter,
- int i,
- int offset,
- int sample)
+ccl_device void kernel_bake_evaluate(
+ KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i * 2];
- uint4 diff = input[i * 2 + 1];
-
- float3 out = make_float3(0.0f, 0.0f, 0.0f);
+ /* Setup render buffers. */
+ const int index = offset + x + y * stride;
+ const int pass_stride = kernel_data.film.pass_stride;
+ buffer += index * pass_stride;
- int object = in.x;
- int prim = in.y;
+ ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
+ ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
+ ccl_global float *output = buffer + kernel_data.film.pass_combined;
+ int prim = __float_as_uint(primitive[1]);
if (prim == -1)
return;
- float u = __uint_as_float(in.z);
- float v = __uint_as_float(in.w);
-
- float dudx = __uint_as_float(diff.x);
- float dudy = __uint_as_float(diff.y);
- float dvdx = __uint_as_float(diff.z);
- float dvdy = __uint_as_float(diff.w);
+ prim += kernel_data.bake.tri_offset;
+ /* Random number generator. */
+ uint rng_hash = hash_uint2(x, y) ^ kernel_data.integrator.seed;
int num_samples = kernel_data.integrator.aa_samples;
- /* random number generator */
- uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed);
-
float filter_x, filter_y;
if (sample == 0) {
filter_x = filter_y = 0.5f;
@@ -278,23 +261,29 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
}
- /* subpixel u/v offset */
+ /* Barycentric UV with subpixel offset. */
+ float u = primitive[2];
+ float v = primitive[3];
+
+ float dudx = differential[0];
+ float dudy = differential[1];
+ float dvdx = differential[2];
+ float dvdy = differential[3];
+
if (sample > 0) {
u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
1.0f - u);
}
- /* triangle */
+ /* Shader data setup. */
+ int object = kernel_data.bake.object_index;
int shader;
float3 P, Ng;
triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
- /* light passes */
- PathRadiance L;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
-
+ ShaderData sd;
shader_setup_from_sample(
kg,
&sd,
@@ -312,7 +301,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
LAMP_NONE);
sd.I = sd.N;
- /* update differentials */
+ /* Setup differentials. */
sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
sd.du.dx = dudx;
@@ -320,17 +309,24 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
sd.dv.dx = dvdx;
sd.dv.dy = dvdy;
- /* set RNG state for shaders that use sampling */
+ /* Set RNG state for shaders that use sampling. */
+ PathState state = {0};
state.rng_hash = rng_hash;
state.rng_offset = 0;
state.sample = sample;
state.num_samples = num_samples;
state.min_ray_pdf = FLT_MAX;
- /* light passes if we need more than color */
- if (pass_filter & ~BAKE_FILTER_COLOR)
+ /* Light passes if we need more than color. */
+ PathRadiance L;
+ int pass_filter = kernel_data.bake.pass_filter;
+
+ if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR)
compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
+ float3 out = make_float3(0.0f, 0.0f, 0.0f);
+
+ ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type;
switch (type) {
/* data passes */
case SHADER_EVAL_NORMAL:
@@ -338,7 +334,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
case SHADER_EVAL_EMISSION: {
if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) {
int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0;
- shader_eval_surface(kg, &sd, &state, path_flag);
+ shader_eval_surface(kg, &sd, &state, NULL, path_flag);
}
if (type == SHADER_EVAL_NORMAL) {
@@ -391,11 +387,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT)
out += L.indirect_transmission;
- if ((pass_filter & BAKE_FILTER_SUBSURFACE_DIRECT) == BAKE_FILTER_SUBSURFACE_DIRECT)
- out += L.direct_subsurface;
- if ((pass_filter & BAKE_FILTER_SUBSURFACE_INDIRECT) == BAKE_FILTER_SUBSURFACE_INDIRECT)
- out += L.indirect_subsurface;
-
if ((pass_filter & BAKE_FILTER_EMISSION) != 0)
out += L.emission;
@@ -420,13 +411,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter);
break;
}
- case SHADER_EVAL_SUBSURFACE: {
-# ifdef __SUBSURFACE__
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_subsurface, L.indirect_subsurface, type, pass_filter);
-# endif
- break;
- }
# endif
/* extra */
@@ -451,7 +435,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
/* evaluate */
int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, path_flag | PATH_RAY_EMISSION);
+ shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
out = shader_background_eval(&sd);
break;
}
@@ -463,10 +447,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
}
/* write output */
- const float output_fac = 1.0f / num_samples;
- const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
-
- output[i] = (sample == 0) ? scaled_result : output[i] + scaled_result;
+ const float4 result = make_float4(out.x, out.y, out.z, 1.0f);
+ kernel_write_pass_float4(output, result);
}
#endif /* __BAKING__ */
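
With the rewrite above, baking no longer keeps a separately averaged output array: each sample writes its unweighted float4 result into the combined pass of the render buffer, and the average over samples is taken when the buffer is resolved. A small sketch of that accumulate-then-resolve split (write_pass and resolve_pass are illustrative stand-ins for the kernel's buffer I/O):

/* Accumulate one sample's result into a 4-float pass of the render buffer. */
static void write_pass(float *pass, const float value[4])
{
  pass[0] += value[0];
  pass[1] += value[1];
  pass[2] += value[2];
  pass[3] += value[3];
}

/* Resolve for display or export: divide the accumulated sum by the sample count. */
static void resolve_pass(const float *pass, int num_samples, float out[4])
{
  const float inv = 1.0f / (float)num_samples;
  out[0] = pass[0] * inv;
  out[1] = pass[1] * inv;
  out[2] = pass[2] * inv;
  out[3] = pass[3] * inv;
}
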
@@ -530,7 +512,7 @@ ccl_device void kernel_background_evaluate(KernelGlobals *kg,
/* evaluate */
int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, path_flag | PATH_RAY_EMISSION);
+ shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
float3 color = shader_background_eval(&sd);
/* write output */
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 1085930c33a..efe46d5b0dd 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -128,7 +128,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
/* Ray differentials, computed from scratch using the raster coordinates
* because we don't want to be affected by depth of field. We compute
- * ray origin and direction for the center and two neighbouring pixels
+ * ray origin and direction for the center and two neighboring pixels
* and simply take their differences. */
float3 Pnostereo = transform_point(&cameratoworld, make_float3(0.0f, 0.0f, 0.0f));
@@ -237,7 +237,9 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg,
/* Panorama Camera */
ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
+#ifdef __CAMERA_MOTION__
const ccl_global DecomposedTransform *cam_motion,
+#endif
float raster_x,
float raster_y,
float lens_u,
@@ -303,7 +305,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
#ifdef __RAY_DIFFERENTIALS__
/* Ray differentials, computed from scratch using the raster coordinates
* because we don't want to be affected by depth of field. We compute
- * ray origin and direction for the center and two neighbouring pixels
+ * ray origin and direction for the center and two neighboring pixels
* and simply take their differences. */
float3 Pcenter = Pcamera;
float3 Dcenter = panorama_to_direction(cam, Pcenter.x, Pcenter.y);
@@ -377,9 +379,9 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
const int shutter_table_offset = kernel_data.cam.shutter_table_offset;
ray->time = lookup_table_read(kg, time, shutter_table_offset, SHUTTER_TABLE_SIZE);
/* TODO(sergey): Currently single rolling shutter effect type only
- * where scanlines are acquired from top to bottom and whole scanline
+ * where scan-lines are acquired from top to bottom and whole scan-line
* is acquired at once (no delay in acquisition happens between pixels
- * of single scanline).
+ * of single scan-line).
*
* Might want to support more models in the future.
*/
@@ -413,8 +415,12 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
camera_sample_orthographic(kg, raster_x, raster_y, lens_u, lens_v, ray);
}
else {
+#ifdef __CAMERA_MOTION__
const ccl_global DecomposedTransform *cam_motion = kernel_tex_array(__camera_motion);
camera_sample_panorama(&kernel_data.cam, cam_motion, raster_x, raster_y, lens_u, lens_v, ray);
+#else
+ camera_sample_panorama(&kernel_data.cam, raster_x, raster_y, lens_u, lens_v, ray);
+#endif
}
}
@@ -435,8 +441,22 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
float3 camD = make_float3(cameratoworld.x.z, cameratoworld.y.z, cameratoworld.z.z);
return fabsf(dot((P - camP), camD));
}
- else
+ else {
+ return len(P - camP);
+ }
+}
+
+ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
+{
+ if (kernel_data.cam.type != CAMERA_PANORAMA) {
+ Transform worldtocamera = kernel_data.cam.worldtocamera;
+ return transform_point(&worldtocamera, P).z;
+ }
+ else {
+ Transform cameratoworld = kernel_data.cam.cameratoworld;
+ float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
return len(P - camP);
+ }
}
ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
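Editor's note: the new camera_z_depth above returns planar camera-space Z for projective cameras and falls back to radial distance for panoramic ones. A standalone C++ sketch of that logic follows; the Transform layout (three rows of a 3x4 affine matrix) mirrors Cycles but the names are simplified stand-ins.

#include <cmath>

struct float3 { float x, y, z; };
struct float4 { float x, y, z, w; };
struct Transform { float4 x, y, z; };  /* three rows of a 3x4 affine matrix */

static inline float3 transform_point(const Transform *t, const float3 p)
{
  return {t->x.x * p.x + t->x.y * p.y + t->x.z * p.z + t->x.w,
          t->y.x * p.x + t->y.y * p.y + t->y.z * p.z + t->y.w,
          t->z.x * p.x + t->z.y * p.y + t->z.z * p.z + t->z.w};
}

/* Perspective/orthographic: planar Z depth in camera space.
 * Panorama: no meaningful view plane, so use the distance to the camera origin. */
static inline float camera_z_depth_sketch(const Transform &worldtocamera,
                                          const float3 cam_origin,
                                          const float3 P,
                                          bool is_panorama)
{
  if (!is_panorama) {
    return transform_point(&worldtocamera, P).z;
  }
  const float dx = P.x - cam_origin.x, dy = P.y - cam_origin.y, dz = P.z - cam_origin.z;
  return std::sqrt(dx * dx + dy * dy + dz * dz);
}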
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index e8fedca4489..88f6a264a5a 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -35,11 +35,11 @@
# define __NODES_FEATURES__ NODE_FEATURE_ALL
#endif
+#include "util/util_half.h"
#include "util/util_math.h"
#include "util/util_simd.h"
-#include "util/util_half.h"
-#include "util/util_types.h"
#include "util/util_texture.h"
+#include "util/util_types.h"
#define ccl_addr_space
@@ -79,7 +79,7 @@ template<typename T> struct texture {
}
#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
- * compatibility with existing indicies and data structures.
+ * compatibility with existing indices and data structures.
*/
ccl_always_inline avxf fetch_avxf(const int index)
{
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 469b81d120b..4094e173da9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -37,8 +37,11 @@ typedef unsigned long long uint64_t;
typedef unsigned short half;
typedef unsigned long long CUtexObject;
-#define FLT_MIN 1.175494350822287507969e-38f
-#define FLT_MAX 340282346638528859811704183484516925440.0f
+#ifdef CYCLES_CUBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
__device__ half __float2half(const float f)
{
@@ -58,6 +61,7 @@ __device__ half __float2half(const float f)
# define ccl_device_forceinline __device__ __forceinline__
#endif
#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
#define ccl_constant const
@@ -67,11 +71,13 @@ __device__ half __float2half(const float f)
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
/* TODO(sergey): In theory we might use references with CUDA, however
* performance impact yet to be investigated.
*/
#define ccl_ref
#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
#define ATTR_FALLTHROUGH
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index e040ea88d7c..ba7ab43a47a 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -35,6 +35,7 @@
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
#define ccl_device_noinline ccl_device ccl_noinline
+#define ccl_device_noinline_cpu ccl_device
#define ccl_may_alias
#define ccl_static_constant static __constant
#define ccl_constant __constant
@@ -45,6 +46,13 @@
#define ccl_restrict restrict
#define ccl_ref
#define ccl_align(n) __attribute__((aligned(n)))
+#define ccl_optional_struct_init
+
+#if __OPENCL_VERSION__ >= 200
+# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1)))
+#else
+# define ccl_loop_no_unroll
+#endif
#ifdef __SPLIT_KERNEL__
# define ccl_addr_space __global
@@ -124,6 +132,8 @@
#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
#define fmodf(x, y) fmod((float)(x), (float)(y))
#define sinhf(x) sinh(((float)(x)))
+#define coshf(x) cosh(((float)(x)))
+#define tanhf(x) tanh(((float)(x)))
/* Use native functions with possibly lower precision for performance,
* no issues found so far. */
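Editor's note: the OpenCL compat hunk above adds a ccl_loop_no_unroll macro that expands to an unroll hint on OpenCL 2.0 and to nothing elsewhere. The sketch below illustrates how such a macro is typically placed directly before a loop; the compiler branches shown are illustrative assumptions, not the exact Cycles set.

/* Illustrative stand-in for a ccl_loop_no_unroll-style macro. */
#if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ >= 200
#  define loop_no_unroll __attribute__((opencl_unroll_hint(1)))
#elif defined(__clang__)
#  define loop_no_unroll _Pragma("clang loop unroll(disable)")
#else
#  define loop_no_unroll
#endif

float sum_weights_sketch(const float *w, int n)
{
  float sum = 0.0f;
  loop_no_unroll
  for (int i = 0; i < n; i++) {
    sum += w[i];
  }
  return sum;
}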
diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/kernel_compat_optix.h
new file mode 100644
index 00000000000..970f5cf864c
--- /dev/null
+++ b/intern/cycles/kernel/kernel_compat_optix.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_COMPAT_OPTIX_H__
+#define __KERNEL_COMPAT_OPTIX_H__
+
+#define OPTIX_DONT_INCLUDE_CUDA
+#include <optix.h>
+
+#define __KERNEL_GPU__
+#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too
+#define __KERNEL_OPTIX__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef unsigned short half;
+typedef unsigned long long CUtexObject;
+#ifdef CYCLES_CUBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+# define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
+#define ccl_device \
+ __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_constant const
+#define ccl_local
+#define ccl_local_param
+#define ccl_private
+#define ccl_may_alias
+#define ccl_addr_space
+#define ccl_loop_no_unroll
+#define ccl_restrict __restrict__
+#define ccl_ref
+#define ccl_align(n) __align__(n)
+
+// Zero initialize structs to help the compiler figure out scoping
+#define ccl_optional_struct_init = {}
+
+#define kernel_data __params.data // See kernel_globals.h
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+
+#define kernel_assert(cond)
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+#endif /* __KERNEL_COMPAT_OPTIX_H__ */
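Editor's note: the OptiX compat header defines ccl_optional_struct_init as "= {}" so locals are zero-initialized, which the comment says helps the compiler with scoping, while other devices leave it empty. A small standalone C++ sketch of that macro in use follows; SKETCH_OPTIX and LightSampleSketch are stand-ins, not the real Cycles symbols.

#ifdef SKETCH_OPTIX
#  define ccl_optional_struct_init = {}
#else
#  define ccl_optional_struct_init
#endif

struct LightSampleSketch {
  float pdf;
  float t;
  int prim;
};

void sample_lights_sketch(int num_lights)
{
  for (int i = 0; i < num_lights; i++) {
    /* Expands to "LightSampleSketch ls = {};" when SKETCH_OPTIX is defined,
     * and to a plain uninitialized declaration otherwise. */
    LightSampleSketch ls ccl_optional_struct_init;
    (void)ls;
  }
}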
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 76fcc9d9e51..dc51d01bab7 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -17,17 +17,17 @@
CCL_NAMESPACE_BEGIN
/* Direction Emission */
-ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- float3 I,
- differential3 dI,
- float t,
- float time)
+ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
+ ShaderData *emission_sd,
+ LightSample *ls,
+ ccl_addr_space PathState *state,
+ float3 I,
+ differential3 dI,
+ float t,
+ float time)
{
/* setup shading at emitter */
- float3 eval;
+ float3 eval = make_float3(0.0f, 0.0f, 0.0f);
if (shader_constant_emission_eval(kg, ls->shader, &eval)) {
if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
@@ -73,7 +73,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
/* No proper path flag, we're evaluating this for all closures. that's
* weak but we'd have to do multiple evaluations otherwise. */
path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, PATH_RAY_EMISSION);
+ shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION);
path_state_modify_bounce(state, false);
/* Evaluate closures. */
@@ -90,18 +90,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
eval *= ls->eval_fac;
+ if (ls->lamp != LAMP_NONE) {
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, ls->lamp);
+ eval *= make_float3(klight->strength[0], klight->strength[1], klight->strength[2]);
+ }
+
return eval;
}
-ccl_device_noinline bool direct_emission(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- Ray *ray,
- BsdfEval *eval,
- bool *is_lamp,
- float rand_terminate)
+ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
+ ShaderData *sd,
+ ShaderData *emission_sd,
+ LightSample *ls,
+ ccl_addr_space PathState *state,
+ Ray *ray,
+ BsdfEval *eval,
+ bool *is_lamp,
+ float rand_terminate)
{
if (ls->pdf == 0.0f)
return false;
@@ -140,16 +145,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
#ifdef __PASSES__
/* use visibility flag to skip lights */
if (ls->shader & SHADER_EXCLUDE_ANY) {
- if (ls->shader & SHADER_EXCLUDE_DIFFUSE) {
+ if (ls->shader & SHADER_EXCLUDE_DIFFUSE)
eval->diffuse = make_float3(0.0f, 0.0f, 0.0f);
- eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
- }
if (ls->shader & SHADER_EXCLUDE_GLOSSY)
eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
if (ls->shader & SHADER_EXCLUDE_TRANSMIT)
eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
if (ls->shader & SHADER_EXCLUDE_SCATTER)
- eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
+ eval->volume = make_float3(0.0f, 0.0f, 0.0f);
}
#endif
@@ -203,7 +206,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
/* Indirect Primitive Emission */
-ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg,
+ccl_device_noinline_cpu float3 indirect_primitive_emission(KernelGlobals *kg,
ShaderData *sd,
float t,
float3 P,
@@ -236,18 +239,16 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg,
/* Indirect Lamp Emission */
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
+ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg,
+ ShaderData *emission_sd,
+ ccl_addr_space PathState *state,
float3 N,
- Ray *ray,
- float3 *emission)
+ PathRadiance *L,
+ Ray *ray,
+ float3 throughput)
{
- bool hit_lamp = false;
-
- *emission = make_float3(0.0f, 0.0f, 0.0f);
for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
- LightSample ls;
+ LightSample ls ccl_optional_struct_init;
if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
continue;
@@ -265,7 +266,7 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
}
#endif
- float3 L = direct_emissive_eval(
+ float3 lamp_L = direct_emissive_eval(
kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time);
bool has_volume = false;
@@ -276,7 +277,7 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
volume_ray.t = ls.t;
float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f);
kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp);
- L *= volume_tp;
+ lamp_L *= volume_tp;
}
has_volume = ((emission_sd->flag & SD_HAS_VOLUME) != 0);
@@ -289,23 +290,21 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
/* multiply with light picking probability to pdf */
ls.pdf *= light_distribution_pdf(kg, ray->P, N, ~ls.lamp, -1, has_volume);
float mis_weight = power_heuristic(state->ray_pdf, ls.pdf);
- L *= mis_weight;
+ lamp_L *= mis_weight;
}
- *emission += L;
- hit_lamp = true;
+ path_radiance_accum_emission(kg, L, state, throughput, lamp_L);
}
-
- return hit_lamp;
}
/* Indirect Background */
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
- ShaderData *emission_sd,
+ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
+ ShaderData *emission_sd,
float3 N,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray)
+ ccl_addr_space PathState *state,
+ ccl_global float *buffer,
+ ccl_addr_space Ray *ray)
{
#ifdef __BACKGROUND__
int shader = kernel_data.background.surface_shader;
@@ -328,7 +327,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
has_volume = ((emission_sd->flag & SD_HAS_VOLUME) != 0);
# endif
/* Evaluate background shader. */
- float3 L;
+ float3 L = make_float3(0.0f, 0.0f, 0.0f);
if (!shader_constant_emission_eval(kg, shader, &L)) {
# ifdef __SPLIT_KERNEL__
Ray priv_ray = *ray;
@@ -338,7 +337,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
# endif
path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, state->flag | PATH_RAY_EMISSION);
+ shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION);
path_state_modify_bounce(state, false);
L = shader_background_eval(emission_sd);
@@ -362,7 +361,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
/* check if background light exists or if we should skip pdf */
int res_x = kernel_data.integrator.pdf_background_res_x;
- if (!(state->flag & PATH_RAY_MIS_SKIP) && res_x) {
+ if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
/* multiple importance sampling, get background light pdf for ray
* direction, and compute weight with respect to BSDF pdf */
float pdf = background_light_pdf(kg, P_pick, ray->D);
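Editor's note: the indirect lamp emission hunk above keeps weighting BSDF-sampled lamp hits with power_heuristic(state->ray_pdf, ls.pdf) before accumulating into the path radiance. A standalone C++ sketch of that power heuristic (beta = 2) follows; names are stand-ins and the pdf values are made up for illustration.

#include <cstdio>

static inline float power_heuristic_sketch(float a, float b)
{
  const float a2 = a * a;
  return a2 / (a2 + b * b);
}

int main()
{
  const float bsdf_pdf = 0.8f;  /* pdf of having generated this ray from the BSDF */
  const float light_pdf = 0.2f; /* pdf of sampling the same direction from the lamp */
  float lamp_contribution = 1.0f;
  /* Down-weight the BSDF-sampled hit so combining both strategies stays unbiased. */
  lamp_contribution *= power_heuristic_sketch(bsdf_pdf, light_pdf);
  printf("weighted contribution: %f\n", lamp_contribution);
  return 0;
}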
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index d20f1adf663..8344f4b4f47 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -16,18 +16,60 @@
CCL_NAMESPACE_BEGIN
-ccl_device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
+ccl_device float4 film_get_pass_result(KernelGlobals *kg,
+ ccl_global float *buffer,
+ float sample_scale,
+ int index,
+ bool use_display_sample_scale)
{
- float exposure = kernel_data.film.exposure;
- float4 result = irradiance * scale;
+ float4 pass_result;
+
+ int display_pass_stride = kernel_data.film.display_pass_stride;
+ int display_pass_components = kernel_data.film.display_pass_components;
+
+ if (display_pass_components == 4) {
+ float4 in = *(ccl_global float4 *)(buffer + display_pass_stride +
+ index * kernel_data.film.pass_stride);
+ float alpha = use_display_sample_scale ?
+ (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) :
+ 1.0f;
+
+ pass_result = make_float4(in.x, in.y, in.z, alpha);
+
+ int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride;
+ if (display_divide_pass_stride != -1) {
+ ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride +
+ index * kernel_data.film.pass_stride);
+ float3 divided = safe_divide_even_color(float4_to_float3(pass_result),
+ float4_to_float3(*divide_in));
+ pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w);
+ }
+
+ if (kernel_data.film.use_display_exposure) {
+ float exposure = kernel_data.film.exposure;
+ pass_result *= make_float4(exposure, exposure, exposure, alpha);
+ }
+ }
+ else if (display_pass_components == 1) {
+ ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride +
+ index * kernel_data.film.pass_stride);
+ pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale);
+ }
+
+ return pass_result;
+}
+
+ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale)
+{
+ float4 result;
/* conversion to srgb */
- result.x = color_linear_to_srgb(result.x * exposure);
- result.y = color_linear_to_srgb(result.y * exposure);
- result.z = color_linear_to_srgb(result.z * exposure);
+ result.x = color_linear_to_srgb(rgba_in.x * scale);
+ result.y = color_linear_to_srgb(rgba_in.y * scale);
+ result.z = color_linear_to_srgb(rgba_in.z * scale);
/* clamp since alpha might be > 1.0 due to russian roulette */
- result.w = saturate(result.w);
+ result.w = saturate(rgba_in.w * scale);
return result;
}
@@ -57,15 +99,15 @@ ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg,
/* buffer offset */
int index = offset + x + y * stride;
- rgba += index;
- buffer += index * kernel_data.film.pass_stride;
+ bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
+ float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
/* map colors */
- float4 irradiance = *((ccl_global float4 *)buffer);
- float4 float_result = film_map(kg, irradiance, sample_scale);
- uchar4 byte_result = film_float_to_byte(float_result);
+ float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
+ uchar4 uchar_result = film_float_to_byte(float_result);
- *rgba = byte_result;
+ rgba += index;
+ *rgba = uchar_result;
}
ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
@@ -80,20 +122,11 @@ ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
/* buffer offset */
int index = offset + x + y * stride;
- ccl_global float4 *in = (ccl_global float4 *)(buffer + index * kernel_data.film.pass_stride);
- ccl_global half *out = (ccl_global half *)rgba + index * 4;
-
- float exposure = kernel_data.film.exposure;
-
- float4 rgba_in = *in;
+ bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
+ float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
- if (exposure != 1.0f) {
- rgba_in.x *= exposure;
- rgba_in.y *= exposure;
- rgba_in.z *= exposure;
- }
-
- float4_store_half(out, rgba_in, sample_scale);
+ ccl_global half *out = (ccl_global half *)rgba + index * 4;
+ float4_store_half(out, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
}
CCL_NAMESPACE_END
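Editor's note: film_get_pass_result above reads the selected display pass, optionally divides it by a second pass, and only applies the sample scale and exposure where appropriate. A condensed standalone C++ sketch of that flow follows; the buffer layout, field names, and the unconditional exposure step are simplifying assumptions, not the exact Cycles behavior.

struct float4 { float x, y, z, w; };

static inline float4 get_display_value_sketch(const float *pixel,     /* one pixel's passes */
                                              int pass_offset,        /* selected pass */
                                              int divide_pass_offset, /* -1 if unused */
                                              float sample_scale,     /* 1 / num_samples */
                                              float exposure)
{
  float4 v = {pixel[pass_offset + 0], pixel[pass_offset + 1], pixel[pass_offset + 2], 1.0f};
  if (divide_pass_offset != -1) {
    /* e.g. dividing a color pass by an albedo-like pass; guard against zero. */
    const float *d = pixel + divide_pass_offset;
    v.x = (d[0] != 0.0f) ? v.x / d[0] : v.x;
    v.y = (d[1] != 0.0f) ? v.y / d[1] : v.y;
    v.z = (d[2] != 0.0f) ? v.z / d[2] : v.z;
  }
  else {
    /* Only un-divided passes still need averaging over samples. */
    v.x *= sample_scale;
    v.y *= sample_scale;
    v.z *= sample_scale;
  }
  v.x *= exposure;
  v.y *= exposure;
  v.z *= exposure;
  return v;
}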
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 9dbf3b7ea2e..c186e8560eb 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -22,8 +22,8 @@
#include "kernel/kernel_profiling.h"
#ifdef __KERNEL_CPU__
-# include "util/util_vector.h"
# include "util/util_map.h"
+# include "util/util_vector.h"
#endif
#ifdef __KERNEL_OPENCL__
@@ -90,12 +90,43 @@ typedef struct KernelGlobals {
#endif /* __KERNEL_CPU__ */
+#ifdef __KERNEL_OPTIX__
+
+typedef struct ShaderParams {
+ uint4 *input;
+ float4 *output;
+ int type;
+ int filter;
+ int sx;
+ int offset;
+ int sample;
+} ShaderParams;
+
+typedef struct KernelParams {
+ WorkTile tile;
+ KernelData data;
+ ShaderParams shader;
+# define KERNEL_TEX(type, name) const type *name;
+# include "kernel/kernel_textures.h"
+} KernelParams;
+
+typedef struct KernelGlobals {
+# ifdef __VOLUME__
+ VolumeState volume_state;
+# endif
+ Intersection hits_stack[64];
+} KernelGlobals;
+
+extern "C" __constant__ KernelParams __params;
+
+#else /* __KERNEL_OPTIX__ */
+
/* For CUDA, constant memory textures must be globals, so we can't put them
* into a struct. As a result we don't actually use this struct and use actual
* globals and simply pass along a NULL pointer everywhere, which we hope gets
* optimized out. */
-#ifdef __KERNEL_CUDA__
+# ifdef __KERNEL_CUDA__
__constant__ KernelData __data;
typedef struct KernelGlobals {
@@ -103,10 +134,12 @@ typedef struct KernelGlobals {
Intersection hits_stack[64];
} KernelGlobals;
-# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-# include "kernel/kernel_textures.h"
+# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+# include "kernel/kernel_textures.h"
+
+# endif /* __KERNEL_CUDA__ */
-#endif /* __KERNEL_CUDA__ */
+#endif /* __KERNEL_OPTIX__ */
/* OpenCL */
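Editor's note: the new OptiX KernelParams above uses the kernel_textures.h X-macro list to declare one pointer member per texture. The standalone C++ sketch below shows that expansion pattern with two illustrative entries; the entry names and the KernelParamsSketch type are stand-ins, not the real texture list.

struct float4 { float x, y, z, w; };
typedef unsigned int uint;

struct KernelParamsSketch {
  /* Each KERNEL_TEX(type, name) line expands to: const type *name; */
#define KERNEL_TEX(type, name) const type *name;
  KERNEL_TEX(float4, __lights)
  KERNEL_TEX(uint, __sample_pattern_lut)
#undef KERNEL_TEX
};

/* Device code then reads e.g. params.__lights[index], which is roughly what
 * kernel_tex_fetch(t, index) resolves to on OptiX via the __params constant. */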
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
index c1f4e39e5e7..1ca42e933d1 100644
--- a/intern/cycles/kernel/kernel_id_passes.h
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -1,18 +1,18 @@
/*
-* Copyright 2018 Blender Foundation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright 2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
CCL_NAMESPACE_BEGIN
@@ -32,7 +32,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
if (id_buffer[slot].x == ID_NONE) {
/* Use an atomic to claim this slot.
- * If a different thread got here first, try again from this slot on. */
+ * If a different thread got here first, try again from this slot on. */
float old_id = atomic_compare_and_swap_float(buffer + slot * 2, ID_NONE, id);
if (old_id != ID_NONE && old_id != id) {
continue;
@@ -54,7 +54,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
break;
}
/* If there already is a slot for that ID, add the weight.
- * If no slot was found, add it to the last. */
+ * If no slot was found, add it to the last. */
else if (id_buffer[slot].x == id || slot == num_slots - 1) {
id_buffer[slot].y += weight;
break;
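Editor's note: kernel_write_id_slots above scans per-pixel ID slots, claims an empty one with an atomic compare-and-swap, and accumulates weight into the matching slot. A standalone C++ sketch of that claiming scheme follows, using std::atomic in place of the GPU atomics; names, the ID_NONE value, and the non-atomic weight add are simplifications.

#include <atomic>

static constexpr float ID_NONE_SKETCH = 0.0f;

struct IdSlot {
  std::atomic<float> id{ID_NONE_SKETCH};
  float weight = 0.0f;
};

void write_id_weight_sketch(IdSlot *slots, int num_slots, float id, float weight)
{
  for (int slot = 0; slot < num_slots; slot++) {
    if (slots[slot].id.load() == ID_NONE_SKETCH) {
      /* Claim the empty slot; if another thread won the race with a different
       * ID, keep scanning from this slot on. */
      float expected = ID_NONE_SKETCH;
      if (!slots[slot].id.compare_exchange_strong(expected, id) && expected != id) {
        continue;
      }
    }
    /* Matching slot found, or last slot used as the catch-all. */
    if (slots[slot].id.load() == id || slot == num_slots - 1) {
      slots[slot].weight += weight; /* the kernel does this atomically */
      break;
    }
  }
}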
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index f7270a14940..b9c48b86a5d 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -38,43 +38,13 @@ ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
ccl_device_inline int cmj_fast_div_pow2(int a, int b)
{
kernel_assert(b > 1);
-#if defined(__KERNEL_SSE2__)
-# ifdef _MSC_VER
- unsigned long ctz;
- _BitScanForward(&ctz, b);
- return a >> ctz;
-# else
- return a >> __builtin_ctz(b);
-# endif
-#elif defined(__KERNEL_CUDA__)
- return a >> (__ffs(b) - 1);
-#else
- return a / b;
-#endif
+ return a >> count_trailing_zeros(b);
}
ccl_device_inline uint cmj_w_mask(uint w)
{
kernel_assert(w > 1);
-#if defined(__KERNEL_SSE2__)
-# ifdef _MSC_VER
- unsigned long leading_zero;
- _BitScanReverse(&leading_zero, w);
- return ((1 << (1 + leading_zero)) - 1);
-# else
- return ((1 << (32 - __builtin_clz(w))) - 1);
-# endif
-#elif defined(__KERNEL_CUDA__)
- return ((1 << (32 - __clz(w))) - 1);
-#else
- w |= w >> 1;
- w |= w >> 2;
- w |= w >> 4;
- w |= w >> 8;
- w |= w >> 16;
-
- return w;
-#endif
+ return ((1 << (32 - count_leading_zeros(w))) - 1);
}
ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
@@ -225,4 +195,37 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
}
#endif
+ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
+{
+ /* Fallback to random */
+ if (sample >= NUM_PMJ_SAMPLES) {
+ const int p = rng_hash + dimension;
+ return cmj_randfloat(sample, p);
+ }
+ else {
+ const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
+ const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
+ return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f;
+ }
+}
+
+ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
+{
+ if (sample >= NUM_PMJ_SAMPLES) {
+ const int p = rng_hash + dimension;
+ const float fx = cmj_randfloat(sample, p);
+ const float fy = cmj_randfloat(sample, p + 1);
+ return make_float2(fx, fy);
+ }
+ else {
+ const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
+ const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
+ const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff;
+ const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f;
+ const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) -
+ 1.0f;
+ return make_float2(fx, fy);
+ }
+}
+
CCL_NAMESPACE_END
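Editor's note: pmj_sample_1D above scrambles a table entry with "__uint_as_float(bits ^ mask) - 1.0f", where the mask touches only the low 23 mantissa bits. The standalone C++ sketch below shows that bit trick under the assumption (made for this sketch) that the table stores each sample offset into the [1, 2) float range; helper names are stand-ins.

#include <cstdint>
#include <cstring>
#include <cstdio>

static inline float uint_as_float_sketch(uint32_t u)
{
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

static inline uint32_t float_as_uint_sketch(float f)
{
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return u;
}

int main()
{
  const float sample = 0.3125f;                                /* precomputed sample in [0, 1) */
  const uint32_t stored = float_as_uint_sketch(sample + 1.0f); /* as the table would hold it */
  const uint32_t mask = 0x0051c2a7u & 0x007fffffu;             /* hash limited to mantissa bits */
  /* XOR only perturbs the mantissa, so the result stays in [1, 2); subtract 1. */
  const float scrambled = uint_as_float_sketch(stored ^ mask) - 1.0f;
  printf("scrambled sample in [0,1): %f\n", scrambled);
  return 0;
}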
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 48cc284694a..5c47cc4ce23 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#include "kernel_light_background.h"
+
CCL_NAMESPACE_BEGIN
/* Light Sample result */
@@ -86,515 +88,6 @@ ccl_device void kernel_update_light_picking(ShaderData *sd, ccl_addr_space PathS
#endif
}
-/* Area light sampling */
-
-/* Uses the following paper:
- *
- * Carlos Urena et al.
- * An Area-Preserving Parametrization for Spherical Rectangles.
- *
- * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
- *
- * Note: light_p is modified when sample_coord is true.
- */
-ccl_device_inline float rect_light_sample(float3 P,
- float3 *light_p,
- float3 axisu,
- float3 axisv,
- float randu,
- float randv,
- bool sample_coord)
-{
- /* In our name system we're using P for the center,
- * which is o in the paper.
- */
-
- float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
- float axisu_len, axisv_len;
- /* Compute local reference system R. */
- float3 x = normalize_len(axisu, &axisu_len);
- float3 y = normalize_len(axisv, &axisv_len);
- float3 z = cross(x, y);
- /* Compute rectangle coords in local reference system. */
- float3 dir = corner - P;
- float z0 = dot(dir, z);
- /* Flip 'z' to make it point against Q. */
- if (z0 > 0.0f) {
- z *= -1.0f;
- z0 *= -1.0f;
- }
- float x0 = dot(dir, x);
- float y0 = dot(dir, y);
- float x1 = x0 + axisu_len;
- float y1 = y0 + axisv_len;
- /* Compute internal angles (gamma_i). */
- float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1);
- float4 nz = make_float4(y0, x1, y1, x0) * diff;
- nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz);
- float g0 = safe_acosf(-nz.x * nz.y);
- float g1 = safe_acosf(-nz.y * nz.z);
- float g2 = safe_acosf(-nz.z * nz.w);
- float g3 = safe_acosf(-nz.w * nz.x);
- /* Compute predefined constants. */
- float b0 = nz.x;
- float b1 = nz.z;
- float b0sq = b0 * b0;
- float k = M_2PI_F - g2 - g3;
- /* Compute solid angle from internal angles. */
- float S = g0 + g1 - k;
-
- if (sample_coord) {
- /* Compute cu. */
- float au = randu * S + k;
- float fu = (cosf(au) * b0 - b1) / sinf(au);
- float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
- cu = clamp(cu, -1.0f, 1.0f);
- /* Compute xu. */
- float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
- xu = clamp(xu, x0, x1);
- /* Compute yv. */
- float z0sq = z0 * z0;
- float y0sq = y0 * y0;
- float y1sq = y1 * y1;
- float d = sqrtf(xu * xu + z0sq);
- float h0 = y0 / sqrtf(d * d + y0sq);
- float h1 = y1 / sqrtf(d * d + y1sq);
- float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
- float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
-
- /* Transform (xu, yv, z0) to world coords. */
- *light_p = P + xu * x + yv * y + z0 * z;
- }
-
- /* return pdf */
- if (S != 0.0f)
- return 1.0f / S;
- else
- return 0.0f;
-}
-
-ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv)
-{
- to_unit_disk(&randu, &randv);
- return ru * randu + rv * randv;
-}
-
-ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
-{
- float3 ru, rv;
-
- make_orthonormals(v, &ru, &rv);
-
- return ellipse_sample(ru, rv, randu, randv);
-}
-
-ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
-{
- return normalize(D + disk_light_sample(D, randu, randv) * radius);
-}
-
-ccl_device float3
-sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
-{
- return disk_light_sample(normalize(P - center), randu, randv) * radius;
-}
-
-ccl_device float spot_light_attenuation(float3 dir,
- float spot_angle,
- float spot_smooth,
- LightSample *ls)
-{
- float3 I = ls->Ng;
-
- float attenuation = dot(dir, I);
-
- if (attenuation <= spot_angle) {
- attenuation = 0.0f;
- }
- else {
- float t = attenuation - spot_angle;
-
- if (t < spot_smooth && spot_smooth != 0.0f)
- attenuation *= smoothstepf(t / spot_smooth);
- }
-
- return attenuation;
-}
-
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
-{
- float cos_pi = dot(Ng, I);
-
- if (cos_pi <= 0.0f)
- return 0.0f;
-
- return t * t / cos_pi;
-}
-
-/* Background Light */
-
-#ifdef __BACKGROUND_MIS__
-
-/* TODO(sergey): In theory it should be all fine to use noinline for all
- * devices, but we're so close to the release so better not screw things
- * up for CPU at least.
- */
-# ifdef __KERNEL_GPU__
-ccl_device_noinline
-# else
-ccl_device
-# endif
- float3
- background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
-{
- /* for the following, the CDF values are actually a pair of floats, with the
- * function value as X and the actual CDF as Y. The last entry's function
- * value is the CDF total. */
- int res_x = kernel_data.integrator.pdf_background_res_x;
- int res_y = kernel_data.integrator.pdf_background_res_y;
- int cdf_width = res_x + 1;
-
- /* this is basically std::lower_bound as used by pbrt */
- int first = 0;
- int count = res_y;
-
- while (count > 0) {
- int step = count >> 1;
- int middle = first + step;
-
- if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) {
- first = middle + 1;
- count -= step + 1;
- }
- else
- count = step;
- }
-
- int index_v = max(0, first - 1);
- kernel_assert(index_v >= 0 && index_v < res_y);
-
- float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
- float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1);
- float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
-
- /* importance-sampled V direction */
- float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv);
- float v = (index_v + dv) / res_y;
-
- /* this is basically std::lower_bound as used by pbrt */
- first = 0;
- count = res_x;
- while (count > 0) {
- int step = count >> 1;
- int middle = first + step;
-
- if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y <
- randu) {
- first = middle + 1;
- count -= step + 1;
- }
- else
- count = step;
- }
-
- int index_u = max(0, first - 1);
- kernel_assert(index_u >= 0 && index_u < res_x);
-
- float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
- index_v * cdf_width + index_u);
- float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf,
- index_v * cdf_width + index_u + 1);
- float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
- index_v * cdf_width + res_x);
-
- /* importance-sampled U direction */
- float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu);
- float u = (index_u + du) / res_x;
-
- /* compute pdf */
- float denom = cdf_last_u.x * cdf_last_v.x;
- float sin_theta = sinf(M_PI_F * v);
-
- if (sin_theta == 0.0f || denom == 0.0f)
- *pdf = 0.0f;
- else
- *pdf = (cdf_u.x * cdf_v.x) / (M_2PI_F * M_PI_F * sin_theta * denom);
-
- /* compute direction */
- return equirectangular_to_direction(u, v);
-}
-
-/* TODO(sergey): Same as above, after the release we should consider using
- * 'noinline' for all devices.
- */
-# ifdef __KERNEL_GPU__
-ccl_device_noinline
-# else
-ccl_device
-# endif
- float
- background_map_pdf(KernelGlobals *kg, float3 direction)
-{
- float2 uv = direction_to_equirectangular(direction);
- int res_x = kernel_data.integrator.pdf_background_res_x;
- int res_y = kernel_data.integrator.pdf_background_res_y;
- int cdf_width = res_x + 1;
-
- float sin_theta = sinf(uv.y * M_PI_F);
-
- if (sin_theta == 0.0f)
- return 0.0f;
-
- int index_u = clamp(float_to_int(uv.x * res_x), 0, res_x - 1);
- int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1);
-
- /* pdfs in V direction */
- float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
- index_v * cdf_width + res_x);
- float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
-
- float denom = cdf_last_u.x * cdf_last_v.x;
-
- if (denom == 0.0f)
- return 0.0f;
-
- /* pdfs in U direction */
- float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
- index_v * cdf_width + index_u);
- float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
-
- return (cdf_u.x * cdf_v.x) / (M_2PI_F * M_PI_F * sin_theta * denom);
-}
-
-ccl_device_inline bool background_portal_data_fetch_and_check_side(
- KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
-{
- int portal = kernel_data.integrator.portal_offset + index;
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
-
- *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
- /* Check whether portal is on the right side. */
- if (dot(*dir, P - *lightpos) > 1e-4f)
- return true;
-
- return false;
-}
-
-ccl_device_inline float background_portal_pdf(
- KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
-{
- float portal_pdf = 0.0f;
-
- int num_possible = 0;
- for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
- if (p == ignore_portal)
- continue;
-
- float3 lightpos, dir;
- if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
- continue;
-
- /* There's a portal that could be sampled from this position. */
- if (is_possible) {
- *is_possible = true;
- }
- num_possible++;
-
- int portal = kernel_data.integrator.portal_offset + p;
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
- float3 axisu = make_float3(
- klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
- float3 axisv = make_float3(
- klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
- bool is_round = (klight->area.invarea < 0.0f);
-
- if (!ray_quad_intersect(P,
- direction,
- 1e-4f,
- FLT_MAX,
- lightpos,
- axisu,
- axisv,
- dir,
- NULL,
- NULL,
- NULL,
- NULL,
- is_round))
- continue;
-
- if (is_round) {
- float t;
- float3 D = normalize_len(lightpos - P, &t);
- portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
- }
- else {
- portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
- }
- }
-
- if (ignore_portal >= 0) {
- /* We have skipped a portal that could be sampled as well. */
- num_possible++;
- }
-
- return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
-}
-
-ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
-{
- int num_possible_portals = 0;
- for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
- float3 lightpos, dir;
- if (background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
- num_possible_portals++;
- }
- return num_possible_portals;
-}
-
-ccl_device float3 background_portal_sample(KernelGlobals *kg,
- float3 P,
- float randu,
- float randv,
- int num_possible,
- int *sampled_portal,
- float *pdf)
-{
- /* Pick a portal, then re-normalize randv. */
- randv *= num_possible;
- int portal = (int)randv;
- randv -= portal;
-
- /* TODO(sergey): Some smarter way of finding portal to sample
- * is welcome.
- */
- for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
- /* Search for the sampled portal. */
- float3 lightpos, dir;
- if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
- continue;
-
- if (portal == 0) {
- /* p is the portal to be sampled. */
- int portal = kernel_data.integrator.portal_offset + p;
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
- float3 axisu = make_float3(
- klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
- float3 axisv = make_float3(
- klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
- bool is_round = (klight->area.invarea < 0.0f);
-
- float3 D;
- if (is_round) {
- lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
- float t;
- D = normalize_len(lightpos - P, &t);
- *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
- }
- else {
- *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true);
- D = normalize(lightpos - P);
- }
-
- *pdf /= num_possible;
- *sampled_portal = p;
- return D;
- }
-
- portal--;
- }
-
- return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device_inline float3
-background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
-{
- /* Probability of sampling portals instead of the map. */
- float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
-
- /* Check if there are portals in the scene which we can sample. */
- if (portal_sampling_pdf > 0.0f) {
- int num_portals = background_num_possible_portals(kg, P);
- if (num_portals > 0) {
- if (portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) {
- if (portal_sampling_pdf < 1.0f) {
- randu /= portal_sampling_pdf;
- }
- int portal;
- float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
- if (num_portals > 1) {
- /* Ignore the chosen portal, its pdf is already included. */
- *pdf += background_portal_pdf(kg, P, D, portal, NULL);
- }
- /* We could also have sampled the map, so combine with MIS. */
- if (portal_sampling_pdf < 1.0f) {
- float cdf_pdf = background_map_pdf(kg, D);
- *pdf = (portal_sampling_pdf * (*pdf) + (1.0f - portal_sampling_pdf) * cdf_pdf);
- }
- return D;
- }
- else {
- /* Sample map, but with nonzero portal_sampling_pdf for MIS. */
- randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
- }
- }
- else {
- /* We can't sample a portal.
- * Check if we can sample the map instead.
- */
- if (portal_sampling_pdf == 1.0f) {
- /* Use uniform as a fallback if we can't sample the map. */
- *pdf = 1.0f / M_4PI_F;
- return sample_uniform_sphere(randu, randv);
- }
- else {
- portal_sampling_pdf = 0.0f;
- }
- }
- }
-
- float3 D = background_map_sample(kg, randu, randv, pdf);
- /* Use MIS if portals could be sampled as well. */
- if (portal_sampling_pdf > 0.0f) {
- float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL);
- *pdf = (portal_sampling_pdf * portal_pdf + (1.0f - portal_sampling_pdf) * (*pdf));
- }
- return D;
-}
-
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
-{
- /* Probability of sampling portals instead of the map. */
- float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
-
- float portal_pdf = 0.0f, map_pdf = 0.0f;
- if (portal_sampling_pdf > 0.0f) {
- /* Evaluate PDF of sampling this direction by portal sampling. */
- bool is_possible = false;
- portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible) * portal_sampling_pdf;
- if (!is_possible) {
- /* Portal sampling is not possible here because all portals point to the wrong side.
- * If map sampling is possible, it would be used instead, otherwise fallback sampling is used. */
- if (portal_sampling_pdf == 1.0f) {
- return 1.0f / M_4PI_F;
- }
- else {
- /* Force map sampling. */
- portal_sampling_pdf = 0.0f;
- }
- }
- }
- if (portal_sampling_pdf < 1.0f) {
- /* Evaluate PDF of sampling this direction by map sampling. */
- map_pdf = background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf);
- }
- return portal_pdf + map_pdf;
-}
-#endif
-
/* Regular Light */
/* returns the PDF of sampling a point on this lamp */
@@ -663,7 +156,7 @@ ccl_device_inline bool lamp_light_sample(
/* spot light attenuation */
float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
ls->eval_fac *= spot_light_attenuation(
- dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls);
+ dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
if (ls->eval_fac == 0.0f) {
return false;
}
@@ -799,7 +292,7 @@ ccl_device bool lamp_light_eval(
/* spot light attenuation */
float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
ls->eval_fac *= spot_light_attenuation(
- dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls);
+ dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
if (ls->eval_fac == 0.0f)
return false;
@@ -870,20 +363,18 @@ ccl_device_inline bool triangle_world_space_vertices(
triangle_vertices(kg, prim, V);
}
-#ifdef __INSTANCING__
if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-# ifdef __OBJECT_MOTION__
+#ifdef __OBJECT_MOTION__
float object_time = (time >= 0.0f) ? time : 0.5f;
Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL);
-# else
+#else
Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-# endif
+#endif
V[0] = transform_point(&tfm, V[0]);
V[1] = transform_point(&tfm, V[1]);
V[2] = transform_point(&tfm, V[2]);
has_motion = true;
}
-#endif
return has_motion;
}
@@ -1074,11 +565,19 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
}
}
else {
- /* compute random point in triangle */
- randu = sqrtf(randu);
+ /* compute random point in triangle. From Eric Heitz's "A Low-Distortion Map Between Triangle
+ * and Square" */
+ float u = randu;
+ float v = randv;
+ if (v > u) {
+ u *= 0.5f;
+ v -= u;
+ }
+ else {
+ v *= 0.5f;
+ u -= v;
+ }
- const float u = 1.0f - randu;
- const float v = randv * randu;
const float t = 1.0f - u - v;
ls->P = u * V[0] + v * V[1] + t * V[2];
/* compute incoming direction, distance and pdf */
@@ -1142,7 +641,7 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
int len = kernel_data.integrator.num_distribution + 1;
float r = *randu;
- while (len > 0) {
+ do {
int half_len = len >> 1;
int middle = first + half_len;
@@ -1153,7 +652,7 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
first = middle + 1;
len = len - half_len - 1;
}
- }
+ } while (len > 0);
/* Clamping should not be needed but float rounding errors seem to
* make this fail on rare occasions. */
@@ -1170,7 +669,7 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
/* Generic Light */
-ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
{
return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
}
@@ -1840,7 +1339,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
return (ls->pdf > 0.0f);
}
-ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
+ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index)
{
return kernel_tex_fetch(__lights, index).samples;
}
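Editor's note: triangle_light_sample above replaces the sqrt-based warp with Eric Heitz's low-distortion map between the unit square and a triangle. A standalone C++ sketch of that mapping applied to three vertices follows; vertex values and names are illustrative.

#include <cstdio>

struct float3 { float x, y, z; };

static inline float3 sample_triangle_sketch(float randu, float randv,
                                            const float3 &V0, const float3 &V1, const float3 &V2)
{
  /* Low-distortion square-to-triangle map (Heitz). */
  float u = randu, v = randv;
  if (v > u) {
    u *= 0.5f;
    v -= u;
  }
  else {
    v *= 0.5f;
    u -= v;
  }
  const float t = 1.0f - u - v;
  return {u * V0.x + v * V1.x + t * V2.x,
          u * V0.y + v * V1.y + t * V2.y,
          u * V0.z + v * V1.z + t * V2.z};
}

int main()
{
  const float3 V0 = {0, 0, 0}, V1 = {1, 0, 0}, V2 = {0, 1, 0};
  const float3 P = sample_triangle_sketch(0.7f, 0.3f, V0, V1, V2);
  printf("%f %f %f\n", P.x, P.y, P.z);
  return 0;
}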
diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h
new file mode 100644
index 00000000000..30e336f0f80
--- /dev/null
+++ b/intern/cycles/kernel/kernel_light_background.h
@@ -0,0 +1,448 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_light_common.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Background Light */
+
+#ifdef __BACKGROUND_MIS__
+
+ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+{
+ /* for the following, the CDF values are actually a pair of floats, with the
+ * function value as X and the actual CDF as Y. The last entry's function
+ * value is the CDF total. */
+ int res_x = kernel_data.background.map_res_x;
+ int res_y = kernel_data.background.map_res_y;
+ int cdf_width = res_x + 1;
+
+ /* this is basically std::lower_bound as used by pbrt */
+ int first = 0;
+ int count = res_y;
+
+ while (count > 0) {
+ int step = count >> 1;
+ int middle = first + step;
+
+ if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) {
+ first = middle + 1;
+ count -= step + 1;
+ }
+ else
+ count = step;
+ }
+
+ int index_v = max(0, first - 1);
+ kernel_assert(index_v >= 0 && index_v < res_y);
+
+ float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
+ float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1);
+ float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
+
+ /* importance-sampled V direction */
+ float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv);
+ float v = (index_v + dv) / res_y;
+
+ /* this is basically std::lower_bound as used by pbrt */
+ first = 0;
+ count = res_x;
+ while (count > 0) {
+ int step = count >> 1;
+ int middle = first + step;
+
+ if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y <
+ randu) {
+ first = middle + 1;
+ count -= step + 1;
+ }
+ else
+ count = step;
+ }
+
+ int index_u = max(0, first - 1);
+ kernel_assert(index_u >= 0 && index_u < res_x);
+
+ float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
+ index_v * cdf_width + index_u);
+ float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf,
+ index_v * cdf_width + index_u + 1);
+ float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
+ index_v * cdf_width + res_x);
+
+ /* importance-sampled U direction */
+ float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu);
+ float u = (index_u + du) / res_x;
+
+ /* compute pdf */
+ float sin_theta = sinf(M_PI_F * v);
+ float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
+
+ if (sin_theta == 0.0f || denom == 0.0f)
+ *pdf = 0.0f;
+ else
+ *pdf = (cdf_u.x * cdf_v.x) / denom;
+
+ /* compute direction */
+ return equirectangular_to_direction(u, v);
+}
+
+/* TODO(sergey): Same as above, after the release we should consider using
+ * 'noinline' for all devices.
+ */
+ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
+{
+ float2 uv = direction_to_equirectangular(direction);
+ int res_x = kernel_data.background.map_res_x;
+ int res_y = kernel_data.background.map_res_y;
+ int cdf_width = res_x + 1;
+
+ float sin_theta = sinf(uv.y * M_PI_F);
+
+ if (sin_theta == 0.0f)
+ return 0.0f;
+
+ int index_u = clamp(float_to_int(uv.x * res_x), 0, res_x - 1);
+ int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1);
+
+ /* pdfs in V direction */
+ float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
+ index_v * cdf_width + res_x);
+ float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
+
+ float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
+
+ if (denom == 0.0f)
+ return 0.0f;
+
+ /* pdfs in U direction */
+ float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
+ index_v * cdf_width + index_u);
+ float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
+
+ return (cdf_u.x * cdf_v.x) / denom;
+}
+
+ccl_device_inline bool background_portal_data_fetch_and_check_side(
+ KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
+{
+ int portal = kernel_data.background.portal_offset + index;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+
+ *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
+
+ /* Check whether portal is on the right side. */
+ if (dot(*dir, P - *lightpos) > 1e-4f)
+ return true;
+
+ return false;
+}
+
+ccl_device_inline float background_portal_pdf(
+ KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
+{
+ float portal_pdf = 0.0f;
+
+ int num_possible = 0;
+ for (int p = 0; p < kernel_data.background.num_portals; p++) {
+ if (p == ignore_portal)
+ continue;
+
+ float3 lightpos, dir;
+ if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+ continue;
+
+ /* There's a portal that could be sampled from this position. */
+ if (is_possible) {
+ *is_possible = true;
+ }
+ num_possible++;
+
+ int portal = kernel_data.background.portal_offset + p;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+ float3 axisu = make_float3(
+ klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+ float3 axisv = make_float3(
+ klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+ bool is_round = (klight->area.invarea < 0.0f);
+
+ if (!ray_quad_intersect(P,
+ direction,
+ 1e-4f,
+ FLT_MAX,
+ lightpos,
+ axisu,
+ axisv,
+ dir,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ is_round))
+ continue;
+
+ if (is_round) {
+ float t;
+ float3 D = normalize_len(lightpos - P, &t);
+ portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+ }
+ else {
+ portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+ }
+ }
+
+ if (ignore_portal >= 0) {
+ /* We have skipped a portal that could be sampled as well. */
+ num_possible++;
+ }
+
+ return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
+}
+
+ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+{
+ int num_possible_portals = 0;
+ for (int p = 0; p < kernel_data.background.num_portals; p++) {
+ float3 lightpos, dir;
+ if (background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+ num_possible_portals++;
+ }
+ return num_possible_portals;
+}
+
+ccl_device float3 background_portal_sample(KernelGlobals *kg,
+ float3 P,
+ float randu,
+ float randv,
+ int num_possible,
+ int *sampled_portal,
+ float *pdf)
+{
+ /* Pick a portal, then re-normalize randv. */
+ randv *= num_possible;
+ int portal = (int)randv;
+ randv -= portal;
+
+ /* TODO(sergey): Some smarter way of finding portal to sample
+ * is welcome.
+ */
+ for (int p = 0; p < kernel_data.background.num_portals; p++) {
+ /* Search for the sampled portal. */
+ float3 lightpos, dir;
+ if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+ continue;
+
+ if (portal == 0) {
+ /* p is the portal to be sampled. */
+ int portal = kernel_data.background.portal_offset + p;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+ float3 axisu = make_float3(
+ klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+ float3 axisv = make_float3(
+ klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+ bool is_round = (klight->area.invarea < 0.0f);
+
+ float3 D;
+ if (is_round) {
+ lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
+ float t;
+ D = normalize_len(lightpos - P, &t);
+ *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+ }
+ else {
+ *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true);
+ D = normalize(lightpos - P);
+ }
+
+ *pdf /= num_possible;
+ *sampled_portal = p;
+ return D;
+ }
+
+ portal--;
+ }
+
+ return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
+ float randu,
+ float randv,
+ float *pdf)
+{
+ float3 D;
+ const float3 N = float4_to_float3(kernel_data.background.sun);
+ const float angle = kernel_data.background.sun.w;
+ sample_uniform_cone(N, angle, randu, randv, &D, pdf);
+ return D;
+}
+
+ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
+{
+ const float3 N = float4_to_float3(kernel_data.background.sun);
+ const float angle = kernel_data.background.sun.w;
+ return pdf_uniform_cone(N, D, angle);
+}
+
+ccl_device_inline float3
+background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+{
+ float portal_method_pdf = kernel_data.background.portal_weight;
+ float sun_method_pdf = kernel_data.background.sun_weight;
+ float map_method_pdf = kernel_data.background.map_weight;
+
+ int num_portals = 0;
+ if (portal_method_pdf > 0.0f) {
+ /* Check if there are portals in the scene which we can sample. */
+ num_portals = background_num_possible_portals(kg, P);
+ if (num_portals == 0) {
+ portal_method_pdf = 0.0f;
+ }
+ }
+
+ float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf);
+ if (pdf_fac == 0.0f) {
+ /* Use uniform as a fallback if we can't use any strategy. */
+ *pdf = 1.0f / M_4PI_F;
+ return sample_uniform_sphere(randu, randv);
+ }
+
+ pdf_fac = 1.0f / pdf_fac;
+ portal_method_pdf *= pdf_fac;
+ sun_method_pdf *= pdf_fac;
+ map_method_pdf *= pdf_fac;
+
+ /* We have 100% in total and split it between the three categories.
+ * Therefore, we pick portals if randu is between 0 and portal_method_pdf,
+ * sun if randu is between portal_method_pdf and (portal_method_pdf + sun_method_pdf)
+ * and map if randu is between (portal_method_pdf + sun_method_pdf) and 1. */
+ float sun_method_cdf = portal_method_pdf + sun_method_pdf;
+
+ int method = 0;
+ float3 D;
+ if (randu < portal_method_pdf) {
+ method = 0;
+ /* Rescale randu. */
+ if (portal_method_pdf != 1.0f) {
+ randu /= portal_method_pdf;
+ }
+
+ /* Sample a portal. */
+ int portal;
+ D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
+ if (num_portals > 1) {
+ /* Ignore the chosen portal, its pdf is already included. */
+ *pdf += background_portal_pdf(kg, P, D, portal, NULL);
+ }
+
+ /* Skip MIS if this is the only method. */
+ if (portal_method_pdf == 1.0f) {
+ return D;
+ }
+ *pdf *= portal_method_pdf;
+ }
+ else if (randu < sun_method_cdf) {
+ method = 1;
+ /* Rescale randu. */
+ if (sun_method_pdf != 1.0f) {
+ randu = (randu - portal_method_pdf) / sun_method_pdf;
+ }
+
+ D = background_sun_sample(kg, randu, randv, pdf);
+
+ /* Skip MIS if this is the only method. */
+ if (sun_method_pdf == 1.0f) {
+ return D;
+ }
+ *pdf *= sun_method_pdf;
+ }
+ else {
+ method = 2;
+ /* Rescale randu. */
+ if (map_method_pdf != 1.0f) {
+ randu = (randu - sun_method_cdf) / map_method_pdf;
+ }
+
+ D = background_map_sample(kg, randu, randv, pdf);
+
+ /* Skip MIS if this is the only method. */
+ if (map_method_pdf == 1.0f) {
+ return D;
+ }
+ *pdf *= map_method_pdf;
+ }
+
+ /* MIS weighting. */
+ if (method != 0 && portal_method_pdf != 0.0f) {
+ *pdf += portal_method_pdf * background_portal_pdf(kg, P, D, -1, NULL);
+ }
+ if (method != 1 && sun_method_pdf != 0.0f) {
+ *pdf += sun_method_pdf * background_sun_pdf(kg, D);
+ }
+ if (method != 2 && map_method_pdf != 0.0f) {
+ *pdf += map_method_pdf * background_map_pdf(kg, D);
+ }
+ return D;
+}
+
+ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+{
+ float portal_method_pdf = kernel_data.background.portal_weight;
+ float sun_method_pdf = kernel_data.background.sun_weight;
+ float map_method_pdf = kernel_data.background.map_weight;
+
+ float portal_pdf = 0.0f;
+ /* Portals are a special case here since we need to compute their pdf in order
+ * to find out if we can sample them. */
+ if (portal_method_pdf > 0.0f) {
+ /* Evaluate PDF of sampling this direction by portal sampling. */
+ bool is_possible = false;
+ portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible);
+ if (!is_possible) {
+ /* Portal sampling is not possible here because all portals point to the wrong side.
+ * If other methods can be used instead, do so, otherwise uniform sampling is used as a
+ * fallback. */
+ portal_method_pdf = 0.0f;
+ }
+ }
+
+ float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf);
+ if (pdf_fac == 0.0f) {
+ /* Use uniform as a fallback if we can't use any strategy. */
+ return kernel_data.integrator.pdf_lights / M_4PI_F;
+ }
+
+ pdf_fac = 1.0f / pdf_fac;
+ portal_method_pdf *= pdf_fac;
+ sun_method_pdf *= pdf_fac;
+ map_method_pdf *= pdf_fac;
+
+ float pdf = portal_pdf * portal_method_pdf;
+ if (sun_method_pdf != 0.0f) {
+ pdf += background_sun_pdf(kg, direction) * sun_method_pdf;
+ }
+ if (map_method_pdf != 0.0f) {
+ pdf += background_map_pdf(kg, direction) * map_method_pdf;
+ }
+
+ return pdf * kernel_data.integrator.pdf_lights;
+}
+
+#endif
+
+CCL_NAMESPACE_END
\ No newline at end of file
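
background_light_sample() above is a one-sample MIS combination: it picks one of the portal/sun/map strategies with probability proportional to its weight, rescales randu so it can be reused inside the chosen strategy, then multiplies the returned pdf by the selection probability and adds the pdfs of the other strategies, each weighted by its own selection probability. The following standalone host-side sketch shows the same pattern reduced to two hypothetical 1D strategies over z = cos(theta), a "sun" cone and a uniform "map" stand-in; all names are illustrative, not kernel API.

#include <cstdio>

static float sun_pdf(float z, float cos_angle)
{
  return (z > cos_angle) ? 1.0f / (1.0f - cos_angle) : 0.0f;
}

static float map_pdf(float /*z*/)
{
  return 0.5f; /* uniform over z in [-1, 1] */
}

static float sample_background_z(
    float randu, float sun_weight, float map_weight, float cos_angle, float *pdf)
{
  const float sun_prob = sun_weight / (sun_weight + map_weight);
  float z;
  if (randu < sun_prob) {
    randu /= sun_prob;                          /* rescale into [0, 1) */
    z = cos_angle + randu * (1.0f - cos_angle); /* sample inside the cone */
  }
  else {
    randu = (randu - sun_prob) / (1.0f - sun_prob);
    z = -1.0f + 2.0f * randu; /* sample the whole sphere */
  }
  /* One-sample MIS: sum both densities weighted by their selection probabilities. */
  *pdf = sun_prob * sun_pdf(z, cos_angle) + (1.0f - sun_prob) * map_pdf(z);
  return z;
}

int main()
{
  float pdf = 0.0f;
  const float z = sample_background_z(0.3f, 2.0f, 1.0f, 0.9f, &pdf);
  std::printf("z = %.3f  pdf = %.3f\n", z, pdf);
  return 0;
}

The kernel additionally skips the MIS sum when only one strategy is enabled, which this sketch omits for brevity.
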
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
new file mode 100644
index 00000000000..39503a4b479
--- /dev/null
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Area light sampling */
+
+/* Uses the following paper:
+ *
+ * Carlos Urena et al.
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ *
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ *
+ * Note: light_p is modified when sample_coord is true.
+ */
+ccl_device_inline float rect_light_sample(float3 P,
+ float3 *light_p,
+ float3 axisu,
+ float3 axisv,
+ float randu,
+ float randv,
+ bool sample_coord)
+{
+ /* In our naming convention we use P for the center,
+ * which is o in the paper.
+ */
+
+ float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
+ float axisu_len, axisv_len;
+ /* Compute local reference system R. */
+ float3 x = normalize_len(axisu, &axisu_len);
+ float3 y = normalize_len(axisv, &axisv_len);
+ float3 z = cross(x, y);
+ /* Compute rectangle coords in local reference system. */
+ float3 dir = corner - P;
+ float z0 = dot(dir, z);
+ /* Flip 'z' to make it point against Q. */
+ if (z0 > 0.0f) {
+ z *= -1.0f;
+ z0 *= -1.0f;
+ }
+ float x0 = dot(dir, x);
+ float y0 = dot(dir, y);
+ float x1 = x0 + axisu_len;
+ float y1 = y0 + axisv_len;
+ /* Compute internal angles (gamma_i). */
+ float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1);
+ float4 nz = make_float4(y0, x1, y1, x0) * diff;
+ nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz);
+ float g0 = safe_acosf(-nz.x * nz.y);
+ float g1 = safe_acosf(-nz.y * nz.z);
+ float g2 = safe_acosf(-nz.z * nz.w);
+ float g3 = safe_acosf(-nz.w * nz.x);
+ /* Compute predefined constants. */
+ float b0 = nz.x;
+ float b1 = nz.z;
+ float b0sq = b0 * b0;
+ float k = M_2PI_F - g2 - g3;
+ /* Compute solid angle from internal angles. */
+ float S = g0 + g1 - k;
+
+ if (sample_coord) {
+ /* Compute cu. */
+ float au = randu * S + k;
+ float fu = (cosf(au) * b0 - b1) / sinf(au);
+ float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+ cu = clamp(cu, -1.0f, 1.0f);
+ /* Compute xu. */
+ float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
+ xu = clamp(xu, x0, x1);
+ /* Compute yv. */
+ float z0sq = z0 * z0;
+ float y0sq = y0 * y0;
+ float y1sq = y1 * y1;
+ float d = sqrtf(xu * xu + z0sq);
+ float h0 = y0 / sqrtf(d * d + y0sq);
+ float h1 = y1 / sqrtf(d * d + y1sq);
+ float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+ float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+ /* Transform (xu, yv, z0) to world coords. */
+ *light_p = P + xu * x + yv * y + z0 * z;
+ }
+
+ /* return pdf */
+ if (S != 0.0f)
+ return 1.0f / S;
+ else
+ return 0.0f;
+}
+
+ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv)
+{
+ to_unit_disk(&randu, &randv);
+ return ru * randu + rv * randv;
+}
+
+ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
+{
+ float3 ru, rv;
+
+ make_orthonormals(v, &ru, &rv);
+
+ return ellipse_sample(ru, rv, randu, randv);
+}
+
+ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
+{
+ return normalize(D + disk_light_sample(D, randu, randv) * radius);
+}
+
+ccl_device float3
+sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
+{
+ return disk_light_sample(normalize(P - center), randu, randv) * radius;
+}
+
+ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, float3 N)
+{
+ float attenuation = dot(dir, N);
+
+ if (attenuation <= spot_angle) {
+ attenuation = 0.0f;
+ }
+ else {
+ float t = attenuation - spot_angle;
+
+ if (t < spot_smooth && spot_smooth != 0.0f)
+ attenuation *= smoothstepf(t / spot_smooth);
+ }
+
+ return attenuation;
+}
+
+ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+{
+ float cos_pi = dot(Ng, I);
+
+ if (cos_pi <= 0.0f)
+ return 0.0f;
+
+ return t * t / cos_pi;
+}
+
+CCL_NAMESPACE_END
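
rect_light_sample() above returns the reciprocal of the rectangle's solid angle S, i.e. a pdf with respect to solid angle, where S = g0 + g1 + g2 + g3 - 2*pi (the k term folds g2 and g3 into the expression). The short host-side check below reproduces that computation for an axis-aligned rectangle and compares it against a known closed form; it is a verification sketch, not kernel code.

#include <cmath>
#include <cstdio>

static float rect_solid_angle(float x0, float x1, float y0, float y1, float z0)
{
  const float pi = 3.14159265358979323846f;
  /* Edge differences and (normalized) nz values, matching 'diff' and 'nz'
   * in rect_light_sample(). */
  const float d[4] = {x0 - x1, y1 - y0, x1 - x0, y0 - y1};
  const float n[4] = {y0 * d[0], x1 * d[1], y1 * d[2], x0 * d[3]};
  float nz[4];
  for (int i = 0; i < 4; i++) {
    nz[i] = n[i] / std::sqrt(z0 * z0 * d[i] * d[i] + n[i] * n[i]);
  }
  /* Internal angles gamma_i, then Girard's formula for the solid angle. */
  const float g0 = std::acos(-nz[0] * nz[1]);
  const float g1 = std::acos(-nz[1] * nz[2]);
  const float g2 = std::acos(-nz[2] * nz[3]);
  const float g3 = std::acos(-nz[3] * nz[0]);
  return g0 + g1 + g2 + g3 - 2.0f * pi;
}

int main()
{
  /* 2x2 rectangle one unit below the viewer (z0 < 0, as after the flip in the
   * kernel code); the closed form 4*atan(1/sqrt(3)) = 2*pi/3 is the reference. */
  const float S = rect_solid_angle(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f);
  std::printf("solid angle = %.5f sr, pdf = %.5f\n", S, 1.0f / S);
  return 0;
}
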
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index a933be970c2..0edcc1a5a14 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -85,8 +85,9 @@ ccl_device_inline void sample_uniform_hemisphere(
ccl_device_inline void sample_uniform_cone(
const float3 N, float angle, float randu, float randv, float3 *omega_in, float *pdf)
{
- float z = cosf(angle * randu);
- float r = sqrtf(max(0.0f, 1.0f - z * z));
+ float zMin = cosf(angle);
+ float z = zMin - zMin * randu + randu;
+ float r = safe_sqrtf(1.0f - sqr(z));
float phi = M_2PI_F * randv;
float x = r * cosf(phi);
float y = r * sinf(phi);
@@ -94,7 +95,17 @@ ccl_device_inline void sample_uniform_cone(
float3 T, B;
make_orthonormals(N, &T, &B);
*omega_in = x * T + y * B + z * N;
- *pdf = 0.5f * M_1_PI_F / (1.0f - cosf(angle));
+ *pdf = M_1_2PI_F / (1.0f - zMin);
+}
+
+ccl_device_inline float pdf_uniform_cone(const float3 N, float3 D, float angle)
+{
+ float zMin = cosf(angle);
+ float z = dot(N, D);
+ if (z > zMin) {
+ return M_1_2PI_F / (1.0f - zMin);
+ }
+ return 0.0f;
}
/* sample uniform point on the surface of a sphere */
@@ -199,21 +210,27 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
float NdotNg = dot(N, Ng);
float3 X = normalize(N - NdotNg * Ng);
+ /* Keep math expressions. */
+ /* clang-format off */
/* Calculate N.z and N.x in the local coordinate system.
*
* The goal of this computation is to find a N' that is rotated towards Ng just enough
* to lift R' above the threshold (here called t), therefore dot(R', Ng) = t.
*
- * According to the standard reflection equation, this means that we want dot(2*dot(N', I)*N' - I, Ng) = t.
+ * According to the standard reflection equation,
+ * this means that we want dot(2*dot(N', I)*N' - I, Ng) = t.
*
- * Since the Z axis of our local coordinate system is Ng, dot(x, Ng) is just x.z, so we get 2*dot(N', I)*N'.z - I.z = t.
+ * Since the Z axis of our local coordinate system is Ng, dot(x, Ng) is just x.z, so we get
+ * 2*dot(N', I)*N'.z - I.z = t.
*
- * The rotation is simple to express in the coordinate system we formed - since N lies in the X-Z-plane, we know that
- * N' will also lie in the X-Z-plane, so N'.y = 0 and therefore dot(N', I) = N'.x*I.x + N'.z*I.z .
+ * The rotation is simple to express in the coordinate system we formed -
+ * since N lies in the X-Z-plane, we know that N' will also lie in the X-Z-plane,
+ * so N'.y = 0 and therefore dot(N', I) = N'.x*I.x + N'.z*I.z .
*
* Furthermore, we want N' to be normalized, so N'.x = sqrt(1 - N'.z^2).
*
- * With these simplifications, we get the final equation 2*(sqrt(1 - N'.z^2)*I.x + N'.z*I.z)*N'.z - I.z = t.
+ * With these simplifications,
+ * we get the final equation 2*(sqrt(1 - N'.z^2)*I.x + N'.z*I.z)*N'.z - I.z = t.
*
* The only unknown here is N'.z, so we can solve for that.
*
@@ -227,8 +244,11 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
* c = I.z*t + a
* N'.z = +-sqrt(0.5*(+-b + c)/a)
*
- * Two solutions can immediately be discarded because they're negative so N' would lie in the lower hemisphere.
+ * Two solutions can immediately be discarded because they're negative so N' would lie in the
+ * lower hemisphere.
*/
+ /* clang-format on */
+
float Ix = dot(I, X), Iz = dot(I, Ng);
float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
float a = Ix2 + Iz2;
@@ -237,8 +257,9 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
float c = Iz * threshold + a;
/* Evaluate both solutions.
- * In many cases one can be immediately discarded (if N'.z would be imaginary or larger than one), so check for that first.
- * If no option is viable (might happen in extreme cases like N being in the wrong hemisphere), give up and return Ng. */
+ * In many cases one can be immediately discarded (if N'.z would be imaginary or larger than
+ * one), so check for that first. If no option is viable (might happen in extreme cases like N
+ * being in the wrong hemisphere), give up and return Ng. */
float fac = 0.5f / a;
float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c);
bool valid1 = (N1_z2 > 1e-5f) && (N1_z2 <= (1.0f + 1e-5f));
@@ -256,8 +277,9 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
valid1 = (R1 >= 1e-5f);
valid2 = (R2 >= 1e-5f);
if (valid1 && valid2) {
- /* If both solutions are valid, return the one with the shallower reflection since it will be closer to the input
- * (if the original reflection wasn't shallow, we would not be in this part of the function). */
+ /* If both solutions are valid, return the one with the shallower reflection since it will be
+ * closer to the input (if the original reflection wasn't shallow, we would not be in this
+ * part of the function). */
N_new = (R1 < R2) ? N1 : N2;
}
else {
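
The reworked sample_uniform_cone() in the hunk above draws z = cos(theta) uniformly in [cos(angle), 1] via z = zMin + randu * (1 - zMin), so the density is the constant M_1_2PI_F / (1 - zMin) inside the cone, and the new pdf_uniform_cone() evaluates that same density for an arbitrary direction so background_sun_pdf() can use it for MIS. A quick standalone Monte Carlo check that this density integrates to one over the sphere (host code; the helper name is my own, not kernel API):

#include <cmath>
#include <cstdio>
#include <random>

static const float kPi = 3.14159265358979323846f;

/* Same density as pdf_uniform_cone() with N = +Z, so z = dot(N, D). */
static float pdf_uniform_cone_z(float z, float angle)
{
  const float z_min = std::cos(angle);
  return (z > z_min) ? 1.0f / (2.0f * kPi * (1.0f - z_min)) : 0.0f;
}

int main()
{
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);
  const float angle = 0.25f; /* ~14.3 degree half-angle */
  const int n = 1000000;
  double sum = 0.0;
  for (int i = 0; i < n; i++) {
    /* For uniform sphere directions, z = cos(theta) is uniform in [-1, 1]. */
    const float z = 1.0f - 2.0f * u01(rng);
    sum += pdf_uniform_cone_z(z, angle);
  }
  /* Monte Carlo integral of the pdf over the sphere; should print ~1.0. */
  std::printf("integral ~ %.3f\n", 4.0 * kPi * sum / n);
  return 0;
}
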
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 462ec037ee7..753cf4561b2 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,84 +14,11 @@
* limitations under the License.
*/
-#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
-# define __ATOMIC_PASS_WRITE__
-#endif
-
#include "kernel/kernel_id_passes.h"
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
-{
- ccl_global float *buf = buffer;
-#ifdef __ATOMIC_PASS_WRITE__
- atomic_add_and_fetch_float(buf, value);
-#else
- *buf += value;
-#endif
-}
-
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
-{
-#ifdef __ATOMIC_PASS_WRITE__
- ccl_global float *buf_x = buffer + 0;
- ccl_global float *buf_y = buffer + 1;
- ccl_global float *buf_z = buffer + 2;
-
- atomic_add_and_fetch_float(buf_x, value.x);
- atomic_add_and_fetch_float(buf_y, value.y);
- atomic_add_and_fetch_float(buf_z, value.z);
-#else
- ccl_global float3 *buf = (ccl_global float3 *)buffer;
- *buf += value;
-#endif
-}
-
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
-{
-#ifdef __ATOMIC_PASS_WRITE__
- ccl_global float *buf_x = buffer + 0;
- ccl_global float *buf_y = buffer + 1;
- ccl_global float *buf_z = buffer + 2;
- ccl_global float *buf_w = buffer + 3;
-
- atomic_add_and_fetch_float(buf_x, value.x);
- atomic_add_and_fetch_float(buf_y, value.y);
- atomic_add_and_fetch_float(buf_z, value.z);
- atomic_add_and_fetch_float(buf_w, value.w);
-#else
- ccl_global float4 *buf = (ccl_global float4 *)buffer;
- *buf += value;
-#endif
-}
-
#ifdef __DENOISING_FEATURES__
-ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
-{
- kernel_write_pass_float(buffer, value);
-
- /* The online one-pass variance update that's used for the megakernel can't easily be implemented
- * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
- kernel_write_pass_float(buffer + 1, value * value);
-}
-
-# ifdef __ATOMIC_PASS_WRITE__
-# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
-# else
-ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
-{
- buffer[0] += value.x;
- buffer[1] += value.y;
- buffer[2] += value.z;
-}
-# endif
-
-ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
-{
- kernel_write_pass_float3_unaligned(buffer, value);
- kernel_write_pass_float3_unaligned(buffer + 3, value * value);
-}
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
ccl_global float *buffer,
@@ -102,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
if (kernel_data.film.pass_denoising_data == 0)
return;
- buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+ buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
+ DENOISING_PASS_SHADOW_B :
+ DENOISING_PASS_SHADOW_A;
path_total = ensure_finite(path_total);
path_total_shaded = ensure_finite(path_total_shaded);
@@ -113,14 +42,12 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
float value = path_total_shaded / max(path_total, 1e-7f);
kernel_write_pass_float(buffer + 2, value * value);
}
-#endif /* __DENOISING_FEATURES__ */
ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
ShaderData *sd,
ccl_addr_space PathState *state,
PathRadiance *L)
{
-#ifdef __DENOISING_FEATURES__
if (state->denoising_feature_weight == 0.0f) {
return;
}
@@ -133,7 +60,8 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
}
float3 normal = make_float3(0.0f, 0.0f, 0.0f);
- float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+ float3 diffuse_albedo = make_float3(0.0f, 0.0f, 0.0f);
+ float3 specular_albedo = make_float3(0.0f, 0.0f, 0.0f);
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
@@ -145,10 +73,31 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
sum_weight += sc->sample_weight;
+
+ float3 closure_albedo = sc->weight;
+ /* Closures that include a Fresnel term typically have weights close to 1 even though their
+ * actual contribution is significantly lower.
+ * To account for this, we scale their weight by the average fresnel factor (the same is also
+ * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
+ if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf *)sc;
+ closure_albedo *= bsdf->extra->fresnel_color;
+ }
+ else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
+ PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)sc;
+ closure_albedo *= bsdf->avg_value;
+ }
+ else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
+ closure_albedo *= bsdf_principled_hair_albedo(sc);
+ }
+
if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
- albedo += sc->weight;
+ diffuse_albedo += closure_albedo;
sum_nonspecular_weight += sc->sample_weight;
}
+ else {
+ specular_albedo += closure_albedo;
+ }
}
/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
@@ -156,18 +105,22 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
if (sum_weight != 0.0f) {
normal /= sum_weight;
}
+
+ /* Transform normal into camera space. */
+ const Transform worldtocamera = kernel_data.cam.worldtocamera;
+ normal = transform_direction(&worldtocamera, normal);
+
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
- L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+ L->denoising_albedo += ensure_finite3(state->denoising_feature_weight *
+ state->denoising_feature_throughput * diffuse_albedo);
state->denoising_feature_weight = 0.0f;
}
-#else
- (void)kg;
- (void)sd;
- (void)state;
- (void)L;
-#endif /* __DENOISING_FEATURES__ */
+ else {
+ state->denoising_feature_throughput *= specular_albedo;
+ }
}
+#endif /* __DENOISING_FEATURES__ */
#ifdef __KERNEL_DEBUG__
ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
@@ -241,7 +194,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
if (state->sample == 0) {
if (flag & PASSMASK(DEPTH)) {
- float depth = camera_distance(kg, sd->P);
+ float depth = camera_z_depth(kg, sd->P);
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
}
if (flag & PASSMASK(OBJECT_ID)) {
@@ -301,8 +254,6 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput;
if (light_flag & PASSMASK_COMPONENT(TRANSMISSION))
L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(SUBSURFACE))
- L->color_subsurface += shader_bsdf_subsurface(kg, sd) * throughput;
if (light_flag & PASSMASK(MIST)) {
/* bring depth into 0..1 range */
@@ -348,11 +299,8 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect,
L->indirect_transmission);
- if (light_flag & PASSMASK(SUBSURFACE_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect,
- L->indirect_subsurface);
if (light_flag & PASSMASK(VOLUME_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_scatter);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume);
if (light_flag & PASSMASK(DIFFUSE_DIRECT))
kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
if (light_flag & PASSMASK(GLOSSY_DIRECT))
@@ -360,11 +308,8 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct,
L->direct_transmission);
- if (light_flag & PASSMASK(SUBSURFACE_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct,
- L->direct_subsurface);
if (light_flag & PASSMASK(VOLUME_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_scatter);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume);
if (light_flag & PASSMASK(EMISSION))
kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
@@ -380,8 +325,6 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
if (light_flag & PASSMASK(TRANSMISSION_COLOR))
kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
L->color_transmission);
- if (light_flag & PASSMASK(SUBSURFACE_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, L->color_subsurface);
if (light_flag & PASSMASK(SHADOW)) {
float4 shadow = L->shadow;
shadow.w = kernel_data.film.pass_shadow_scale;
@@ -403,7 +346,9 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
float alpha;
float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
- kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+ if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+ }
kernel_write_light_passes(kg, buffer, L);
@@ -446,6 +391,45 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, L);
#endif
+
+ /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
+ * criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+ * Carlo global illumination" except that here it is applied per pixel and not in hierarchical
+ * tiles. */
+ if (kernel_data.film.pass_adaptive_aux_buffer &&
+ kernel_data.integrator.adaptive_threshold > 0.0f) {
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
+ }
+#ifdef __KERNEL_CPU__
+ if ((sample > kernel_data.integrator.adaptive_min_samples) &&
+ kernel_data.integrator.adaptive_stop_per_sample) {
+ const int step = kernel_data.integrator.adaptive_step;
+
+ if ((sample & (step - 1)) == (step - 1)) {
+ kernel_do_adaptive_stopping(kg, buffer, sample);
+ }
+ }
+#endif
+ }
+
+ /* Write the sample count as negative numbers initially to mark the samples as in progress.
+ * Once the tile has finished rendering, the sign gets flipped and all the pixel values
+ * are scaled as if they were taken at a uniform sample count. */
+ if (kernel_data.film.pass_sample_count) {
+ /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
+ * passes. */
+#ifdef __ATOMIC_PASS_WRITE__
+ atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
+ 0x80000000);
+#else
+ if (buffer[kernel_data.film.pass_sample_count] > 0) {
+ buffer[kernel_data.film.pass_sample_count] *= -1.0f;
+ }
+#endif
+ kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
+ }
}
CCL_NAMESPACE_END
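
The adaptive-sampling hunk above writes 2 * L_sum into the auxiliary pass for half of the samples (those selected by sample_is_even()), so after N samples the aux pass and the combined pass hold two estimates of the same pixel and their difference measures the remaining noise; the actual stopping test lives in kernel_do_adaptive_stopping(), which is not part of this hunk. A standalone illustration of that half-buffer idea, with fake radiance values and an arbitrary threshold standing in for the real criterion:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>

int main()
{
  std::mt19937 rng(7);
  std::normal_distribution<float> sample_value(0.5f, 0.2f); /* fake radiance samples */
  float combined = 0.0f, aux = 0.0f;
  const int num_samples = 256;
  for (int s = 0; s < num_samples; s++) {
    const float L = std::max(0.0f, sample_value(rng));
    combined += L;     /* what the combined pass accumulates */
    if ((s & 1) == 0)  /* stand-in for sample_is_even() */
      aux += 2.0f * L; /* the doubled half-buffer write from the hunk above */
  }
  const float mean_all = combined / num_samples;
  const float mean_half = aux / num_samples;
  const float error = std::fabs(mean_all - mean_half) / std::max(mean_all, 1e-4f);
  /* 0.1f is an arbitrary stand-in threshold, not the kernel's criterion. */
  std::printf("full = %.4f  half = %.4f  relative error = %.4f -> %s\n",
              mean_all, mean_half, error, error < 0.1f ? "stop" : "keep sampling");
  return 0;
}
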
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index b5219a0d04a..eec3556c3a1 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -18,6 +18,7 @@
# include "kernel/osl/osl_shader.h"
#endif
+// clang-format off
#include "kernel/kernel_random.h"
#include "kernel/kernel_projection.h"
#include "kernel/kernel_montecarlo.h"
@@ -27,9 +28,11 @@
#include "kernel/geom/geom.h"
#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_write_passes.h"
#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
#include "kernel/kernel_light.h"
+#include "kernel/kernel_adaptive_sampling.h"
#include "kernel/kernel_passes.h"
#if defined(__VOLUME__) || defined(__SUBSURFACE__)
@@ -47,6 +50,7 @@
#include "kernel/kernel_path_surface.h"
#include "kernel/kernel_path_volume.h"
#include "kernel/kernel_path_subsurface.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
@@ -65,25 +69,7 @@ ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg,
ray->t = kernel_data.background.ao_distance;
}
-#ifdef __HAIR__
- float difl = 0.0f, extmax = 0.0f;
- uint lcg_state = 0;
-
- if (kernel_data.bvh.have_curves) {
- if ((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
- float3 pixdiff = ray->dD.dx + ray->dD.dy;
- /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
- difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
- }
-
- extmax = kernel_data.curve.maximum_width;
- lcg_state = lcg_state_init_addrspace(state, 0x51633e2d);
- }
-
- bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax);
-#else
- bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f);
-#endif /* __HAIR__ */
+ bool hit = scene_intersect(kg, ray, visibility, isect);
#ifdef __KERNEL_DEBUG__
if (state->flag & PATH_RAY_CAMERA) {
@@ -110,7 +96,7 @@ ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg,
#ifdef __LAMP_MIS__
if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
/* ray starting from previous non-transparent bounce */
- Ray light_ray;
+ Ray light_ray ccl_optional_struct_init;
float3 N_pick;
if (state->ray_t == 0.0f) {
light_ray.P = emission_sd->P_pick;
@@ -141,6 +127,7 @@ ccl_device_forceinline void kernel_path_background(KernelGlobals *kg,
ccl_addr_space Ray *ray,
float3 throughput,
ShaderData *sd,
+ ccl_global float *buffer,
PathRadiance *L)
{
/* eval background shader if nothing hit */
@@ -161,8 +148,8 @@ ccl_device_forceinline void kernel_path_background(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
- float3 L_background = indirect_background(kg, sd, sd->N_pick, state, ray);
- path_radiance_accum_background(L, state, throughput, L_background);
+ float3 L_background = indirect_background(kg, sd, sd->N_pick, state, buffer, ray);
+ path_radiance_accum_background(kg, L, state, throughput, L_background);
#endif /* __BACKGROUND__ */
}
@@ -194,19 +181,19 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *k
Ray volume_ray = *ray;
volume_ray.t = (hit) ? isect->t : FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+ float step_size = volume_stack_step_size(kg, state->volume_stack);
# ifdef __VOLUME_DECOUPLED__
int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
- bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+ bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method);
if (decoupled) {
/* cache steps along volume for repeated sampling */
VolumeSegment volume_segment;
shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, heterogeneous);
+ kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
kernel_update_light_picking(sd, state);
@@ -214,7 +201,7 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *k
/* emission */
if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+ path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -254,7 +241,7 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *k
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+ kg, state, sd, &volume_ray, L, throughput, step_size);
kernel_update_light_picking(sd, state);
@@ -297,7 +284,7 @@ ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
float3 bg = make_float3(0.0f, 0.0f, 0.0f);
if (!kernel_data.background.transparent) {
- bg = indirect_background(kg, emission_sd, sd->N_pick, state, ray);
+ bg = indirect_background(kg, emission_sd, sd->N_pick, state, NULL, ray);
}
path_radiance_accum_shadowcatcher(L, throughput, bg);
}
@@ -312,19 +299,11 @@ ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
#ifdef __HOLDOUT__
if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
+ const float3 holdout_weight = shader_holdout_apply(kg, sd);
if (kernel_data.background.transparent) {
- float3 holdout_weight;
- if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
- holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
- }
- else {
- holdout_weight = shader_holdout_eval(kg, sd);
- }
- /* any throughput is ok, should all be identical here */
L->transparent += average(holdout_weight * throughput);
}
-
- if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ if (isequal_float3(holdout_weight, make_float3(1.0f, 1.0f, 1.0f))) {
return false;
}
}
@@ -364,7 +343,7 @@ ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
float3 emission = indirect_primitive_emission(
kg, sd, ray_length, P_pick, N_pick, state->flag, state->ray_pdf, has_volume);
- path_radiance_accum_emission(L, state, throughput, emission);
+ path_radiance_accum_emission(kg, L, state, throughput, emission);
}
#endif /* __EMISSION__ */
@@ -375,13 +354,19 @@ ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
return true;
}
-ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 ao_alpha)
+#ifdef __KERNEL_OPTIX__
+ccl_device_inline /* inline trace calls */
+#else
+ccl_device_noinline
+#endif
+ void
+ kernel_path_ao(KernelGlobals *kg,
+ ShaderData *sd,
+ ShaderData *emission_sd,
+ PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ float3 ao_alpha)
{
PROFILING_INIT(kg, PROFILING_AO);
@@ -410,7 +395,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
+ path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
@@ -463,7 +448,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
/* Shade background. */
if (!hit) {
- kernel_path_background(kg, state, ray, throughput, sd, L);
+ kernel_path_background(kg, state, ray, throughput, sd, NULL, L);
break;
}
else if (path_state_ao_bounce(kg, state)) {
@@ -480,7 +465,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
# endif
/* Evaluate shader. */
- shader_eval_surface(kg, sd, state, state->flag);
+ shader_eval_surface(kg, sd, state, NULL, state->flag);
shader_prepare_closures(sd, state);
/* Apply shadow catcher, holdout, emission. */
@@ -490,8 +475,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
/* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
+ * mainly due to the mixed in MIS that we use. gives too many unneeded
+ * shader evaluations, only need emission if we are going to terminate */
float probability = path_state_continuation_probability(kg, state, throughput);
if (probability == 0.0f) {
@@ -508,7 +493,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
kernel_update_light_picking(sd, state);
+# ifdef __DENOISING_FEATURES__
kernel_update_denoising_features(kg, sd, state, L);
+# endif
# ifdef __AO__
/* ambient occlusion */
@@ -519,7 +506,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
# ifdef __SUBSURFACE__
/* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
+ * the closures with a diffuse BSDF */
if (sd->flag & SD_BSSRDF) {
if (kernel_path_subsurface_scatter(
kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
@@ -529,12 +516,10 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
# endif /* __SUBSURFACE__ */
# if defined(__EMISSION__)
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_indirect) ||
- (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
- }
+ int all = (kernel_data.integrator.sample_all_lights_indirect) ||
+ (state->flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_branched_path_surface_connect_light(
+ kg, sd, emission_sd, state, throughput, 1.0f, L, all);
# endif /* defined(__EMISSION__) */
# ifdef __VOLUME__
@@ -605,7 +590,7 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
/* Shade background. */
if (!hit) {
- kernel_path_background(kg, state, ray, throughput, &sd, L);
+ kernel_path_background(kg, state, ray, throughput, &sd, buffer, L);
break;
}
else if (path_state_ao_bounce(kg, state)) {
@@ -622,7 +607,7 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
# endif
/* Evaluate shader. */
- shader_eval_surface(kg, &sd, state, state->flag);
+ shader_eval_surface(kg, &sd, state, buffer, state->flag);
shader_prepare_closures(&sd, state);
/* Apply shadow catcher, holdout, emission. */
@@ -632,8 +617,8 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
}
/* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
+ * mainly due to the mixed in MIS that we use. gives too many unneeded
+ * shader evaluations, only need emission if we are going to terminate */
float probability = path_state_continuation_probability(kg, state, throughput);
if (probability == 0.0f) {
@@ -649,7 +634,9 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
kernel_update_light_picking(&sd, state);
+# ifdef __DENOISING_FEATURES__
kernel_update_denoising_features(kg, &sd, state, L);
+# endif
# ifdef __AO__
/* ambient occlusion */
@@ -660,7 +647,7 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
# ifdef __SUBSURFACE__
/* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
+ * the closures with a diffuse BSDF */
if (sd.flag & SD_BSSRDF) {
if (kernel_path_subsurface_scatter(
kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
@@ -669,8 +656,10 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
}
# endif /* __SUBSURFACE__ */
+# ifdef __EMISSION__
/* direct lighting */
kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
+# endif /* __EMISSION__ */
# ifdef __VOLUME__
}
@@ -706,21 +695,31 @@ ccl_device void kernel_path_trace(
buffer += index * pass_stride;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if ((*aux).w > 0.0f) {
+ return;
+ }
+ }
+
/* Initialize random numbers and sample ray. */
uint rng_hash;
Ray ray;
kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
+# ifndef __KERNEL_OPTIX__
if (ray.t == 0.0f) {
return;
}
+# endif
/* Initialize state. */
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
PathRadiance L;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(kg, &L);
ShaderDataTinyStorage emission_sd_storage;
ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
@@ -728,6 +727,13 @@ ccl_device void kernel_path_trace(
PathState state;
path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
+# ifdef __KERNEL_OPTIX__
+ /* Force struct into local memory to avoid costly spilling on trace calls. */
+ if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */
+ for (int i = 0; i < sizeof(L); ++i)
+ reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0;
+# endif
+
/* Integrate. */
kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd);
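
kernel_path_trace() now begins by reading the w channel of the pixel's adaptive-sampling aux pass and returns immediately when it is positive, presumably set by the stopping test (kernel_do_adaptive_stopping() in this patch), so converged pixels stop consuming samples. A plain host-side sketch of that skip; the struct layout and names are illustrative only:

#include <cstdio>
#include <vector>

struct AuxPixel {
  float x, y, z, w; /* w > 0 marks the pixel as converged */
};

static void path_trace_sample(std::vector<AuxPixel> &aux, int index)
{
  if (aux[index].w > 0.0f) {
    return; /* the early return added at the top of kernel_path_trace() */
  }
  /* ... set up RNG and camera ray, integrate, accumulate into the buffer ... */
  std::printf("pixel %d: traced one more sample\n", index);
}

int main()
{
  std::vector<AuxPixel> aux(4, AuxPixel{0.0f, 0.0f, 0.0f, 0.0f});
  aux[2].w = 1.0f; /* pretend pixel 2 already passed the stopping test */
  for (int s = 0; s < 2; s++) {
    for (int i = 0; i < 4; i++) {
      path_trace_sample(aux, i);
    }
  }
  return 0;
}
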
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 7ec535ee215..64da8c0b83a 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(
- L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
+ kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf);
@@ -91,7 +91,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
Ray volume_ray = *ray;
volume_ray.t = (hit) ? isect->t : FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+ float step_size = volume_stack_step_size(kg, state->volume_stack);
# ifdef __VOLUME_DECOUPLED__
/* decoupled ray marching only supported on CPU */
@@ -100,7 +100,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
VolumeSegment volume_segment;
shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, heterogeneous);
+ kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
kernel_update_light_picking(sd, state);
@@ -150,7 +150,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
/* emission and transmittance */
if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+ path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
*throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -175,7 +175,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, sd, &volume_ray, L, &tp, heterogeneous);
+ kg, &ps, sd, &volume_ray, L, &tp, step_size);
kernel_update_light_picking(sd, &ps);
@@ -206,14 +206,14 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
# endif /* __VOLUME__ */
/* bounce off surface and integrate indirect light */
-ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- float3 throughput,
- float num_samples_adjust,
- PathState *state,
- PathRadiance *L)
+ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
+ ShaderData *sd,
+ ShaderData *indirect_sd,
+ ShaderData *emission_sd,
+ float3 throughput,
+ float num_samples_adjust,
+ PathState *state,
+ PathRadiance *L)
{
float sum_sample_weight = 0.0f;
# ifdef __DENOISING_FEATURES__
@@ -385,7 +385,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* initialize */
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(L, kernel_data.film.use_light_pass);
+ path_radiance_init(kg, L);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -415,7 +415,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* Shade background. */
if (!hit) {
- kernel_path_background(kg, &state, &ray, throughput, &sd, L);
+ kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L);
break;
}
@@ -428,7 +428,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
# endif
- shader_eval_surface(kg, &sd, &state, state.flag);
+ shader_eval_surface(kg, &sd, &state, buffer, state.flag);
shader_merge_closures(&sd);
/* Apply shadow catcher, holdout, emission. */
@@ -440,8 +440,8 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* transparency termination */
if (state.flag & PATH_RAY_TRANSPARENT) {
/* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
+ * mainly due to the mixed in MIS that we use. gives too many unneeded
+ * shader evaluations, only need emission if we are going to terminate */
float probability = path_state_continuation_probability(kg, &state, throughput);
if (probability == 0.0f) {
@@ -459,7 +459,9 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
kernel_update_light_picking(&sd, &state);
+# ifdef __DENOISING_FEATURES__
kernel_update_denoising_features(kg, &sd, &state, L);
+# endif
# ifdef __AO__
/* ambient occlusion */
@@ -535,6 +537,14 @@ ccl_device void kernel_branched_path_trace(
buffer += index * pass_stride;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if ((*aux).w > 0.0f) {
+ return;
+ }
+ }
+
/* initialize random numbers and ray */
uint rng_hash;
Ray ray;
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index cdca0b1f9bf..c389c815ae2 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -41,9 +41,11 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
if (kernel_data.film.pass_denoising_data) {
state->flag |= PATH_RAY_STORE_SHADOW_INFO;
state->denoising_feature_weight = 1.0f;
+ state->denoising_feature_throughput = make_float3(1.0f, 1.0f, 1.0f);
}
else {
state->denoising_feature_weight = 0.0f;
+ state->denoising_feature_throughput = make_float3(0.0f, 0.0f, 0.0f);
}
#endif /* __DENOISING_FEATURES__ */
@@ -209,8 +211,8 @@ ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
return 0.0f;
}
else if (state->flag & PATH_RAY_TRANSPARENT) {
- /* Do at least one bounce without RR. */
- if (state->transparent_bounce <= 1) {
+ /* Do at least specified number of bounces without RR. */
+ if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
return 1.0f;
}
#ifdef __SHADOW_TRICKS__
@@ -221,8 +223,8 @@ ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
#endif
}
else {
- /* Do at least one bounce without RR. */
- if (state->bounce <= 1) {
+ /* Do at least specified number of bounces without RR. */
+ if (state->bounce <= kernel_data.integrator.min_bounce) {
return 1.0f;
}
#ifdef __SHADOW_TRICKS__
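
The change above replaces the hardcoded "at least one bounce before Russian roulette" with the user-controlled min_bounce / transparent_min_bounce values, so paths below that depth always continue with probability 1 and only deeper bounces are subject to termination. A minimal standalone roulette loop showing the pattern; the clamped probability and the 0.6 attenuation are placeholders, not the kernel's actual continuation heuristic:

#include <algorithm>
#include <cstdio>
#include <random>

int main()
{
  std::mt19937 rng(3);
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);
  const int min_bounce = 3; /* stand-in for kernel_data.integrator.min_bounce */
  float throughput = 1.0f;
  for (int bounce = 0;; bounce++) {
    const float probability = (bounce <= min_bounce) ?
                                  1.0f :
                                  std::min(1.0f, std::max(0.05f, throughput));
    if (u01(rng) >= probability) {
      std::printf("terminated at bounce %d\n", bounce);
      break;
    }
    throughput /= probability; /* reweight so the estimator stays unbiased */
    throughput *= 0.6f;        /* fake BSDF attenuation for this sketch */
  }
  return 0;
}
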
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 3ad737acd85..ca27b68a627 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -224,8 +224,9 @@ ccl_device void accum_light_tree_contribution(KernelGlobals *kg,
#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \
defined(__BAKING__)
-/* branched path tracing: connect path directly to position on one or more lights and add it to L */
-ccl_device_noinline void kernel_branched_path_surface_connect_light(
+/* branched path tracing: connect path directly to position on one or more lights and add it to L
+ */
+ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
@@ -438,9 +439,9 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg,
{
/* sample BSDF */
float bsdf_pdf;
- BsdfEval bsdf_eval;
- float3 bsdf_omega_in;
- differential3 bsdf_domega_in;
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ float3 bsdf_omega_in ccl_optional_struct_init;
+ differential3 bsdf_domega_in ccl_optional_struct_init;
float bsdf_u, bsdf_v;
path_branched_rng_2D(
kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
@@ -570,9 +571,9 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
if (sd->flag & SD_BSDF) {
/* sample BSDF */
float bsdf_pdf;
- BsdfEval bsdf_eval;
- float3 bsdf_omega_in;
- differential3 bsdf_domega_in;
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ float3 bsdf_omega_in ccl_optional_struct_init;
+ differential3 bsdf_domega_in ccl_optional_struct_init;
float bsdf_u, bsdf_v;
path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
int label;
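
Several locals above (BsdfEval, float3, differential3) gain a ccl_optional_struct_init qualifier. Its definition is not part of this diff; a plausible reading, stated here only as an assumption, is a macro that expands to an empty-brace initializer on back ends where uninitialized locals are costly or trigger warnings, and to nothing elsewhere. A self-contained demo of that kind of macro (FORCE_STRUCT_INIT and BsdfEvalStub are made up for illustration):

#include <cstdio>

#ifdef FORCE_STRUCT_INIT
#  define ccl_optional_struct_init = {}
#else
#  define ccl_optional_struct_init
#endif

struct BsdfEvalStub {
  float diffuse, glossy;
};

int main()
{
  BsdfEvalStub bsdf_eval ccl_optional_struct_init; /* same usage pattern as in the diff */
  bsdf_eval.diffuse = 0.0f;
  bsdf_eval.glossy = 0.0f;
  std::printf("%f %f\n", bsdf_eval.diffuse, bsdf_eval.glossy);
  return 0;
}
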
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 1d7c289d383..f2a36d172a8 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -26,55 +26,54 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
PathRadiance *L)
{
# ifdef __EMISSION__
- if (!kernel_data.integrator.use_direct_light)
- return;
-
/* sample illumination from lights to find path contribution */
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- Ray light_ray;
- BsdfEval L_light;
- LightSample ls;
- bool is_lamp;
+ Ray light_ray ccl_optional_struct_init;
+ BsdfEval L_light ccl_optional_struct_init;
+ bool is_lamp = false;
+ bool has_emission = false;
+ light_ray.t = 0.0f;
+# ifdef __OBJECT_MOTION__
/* connect to light from given point where shader has been evaluated */
light_ray.time = sd->time;
- if (light_sample(
- kg, light_u, light_v, sd->time, sd->P_pick, sd->N_pick, state->bounce, &ls, true)) {
- float terminate = path_state_rng_light_termination(kg, state);
- if (direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
- /* trace shadow ray */
- float3 shadow;
+# endif
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
+ if (kernel_data.integrator.use_direct_light) {
+ float light_u, light_v;
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ LightSample ls ccl_optional_struct_init;
+ if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+ float terminate = path_state_rng_light_termination(kg, state);
+ has_emission = direct_emission(
+ kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
}
}
+
+ /* trace shadow ray */
+ float3 shadow;
+
+ const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
+
+ if (has_emission && !blocked) {
+ /* accumulate */
+ path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+ }
# endif /* __EMISSION__ */
}
-# ifdef __KERNEL_GPU__
-ccl_device_noinline
-# else
-ccl_device
-# endif
- bool
- kernel_path_volume_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
+ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
+ ShaderData *sd,
+ ccl_addr_space float3 *throughput,
+ ccl_addr_space PathState *state,
+ PathRadianceState *L_state,
+ ccl_addr_space Ray *ray)
{
/* sample phase function */
float phase_pdf;
- BsdfEval phase_eval;
- float3 phase_omega_in;
- differential3 phase_domega_in;
+ BsdfEval phase_eval ccl_optional_struct_init;
+ float3 phase_omega_in ccl_optional_struct_init;
+ differential3 phase_domega_in ccl_optional_struct_init;
float phase_u, phase_v;
path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
int label;
@@ -128,7 +127,7 @@ ccl_device
return true;
}
-# ifndef __SPLIT_KERNEL__
+# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__))
ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
@@ -140,94 +139,71 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
const VolumeSegment *segment)
{
# ifdef __EMISSION__
- if (!kernel_data.integrator.use_direct_light)
- return;
-
- Ray light_ray;
- BsdfEval L_light;
- bool is_lamp;
+ BsdfEval L_light ccl_optional_struct_init;
- light_ray.time = sd->time;
- if (sample_all_lights && !kernel_data.integrator.use_light_tree) {
- /* lamp sampling */
- for (int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
- continue;
-
- int num_samples = light_select_num_samples(kg, i);
- float num_samples_inv = 1.0f / num_samples;
- uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
-
- for (int j = 0; j < num_samples; j++) {
- /* sample random position on given light */
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls;
- lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
-
- float3 tp = throughput;
-
- /* sample position on volume segment */
- float rphase = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
- float rscatter = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- state,
- ray,
- sd,
- &tp,
- rphase,
- rscatter,
- segment,
- (ls.t != FLT_MAX) ? &ls.P :
- NULL,
- false);
+ int num_lights = 1;
+ if (sample_all_lights) {
+ num_lights = kernel_data.integrator.num_all_lights;
+ if (kernel_data.integrator.pdf_triangles != 0.0f) {
+ num_lights += 1;
+ }
+ }
- /* todo: split up light_sample so we don't have to call it again with new position */
- if (result == VOLUME_PATH_SCATTERED &&
- lamp_light_sample(kg, i, light_u, light_v, sd->P_pick, &ls)) {
-
- float terminate = path_branched_rng_light_termination(
- kg, state->rng_hash, state, j, num_samples);
- if (direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(
- L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
- }
- }
+ for (int i = 0; i < num_lights; ++i) {
+ /* sample one light at random */
+ int num_samples = 1;
+ int num_all_lights = 1;
+ uint lamp_rng_hash = state->rng_hash;
+ bool double_pdf = false;
+ bool is_mesh_light = false;
+ bool is_lamp = false;
+
+ if (sample_all_lights) {
+ /* lamp sampling */
+ is_lamp = i < kernel_data.integrator.num_all_lights;
+ if (is_lamp) {
+ if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
+ continue;
}
+ num_samples = light_select_num_samples(kg, i);
+ num_all_lights = kernel_data.integrator.num_all_lights;
+ lamp_rng_hash = cmj_hash(state->rng_hash, i);
+ double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
+ }
+ /* mesh light sampling */
+ else {
+ num_samples = kernel_data.integrator.mesh_light_samples;
+ double_pdf = kernel_data.integrator.num_all_lights != 0;
+ is_mesh_light = true;
}
}
- /* mesh light sampling */
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- int num_samples = kernel_data.integrator.mesh_light_samples;
- float num_samples_inv = 1.0f / num_samples;
+ float num_samples_inv = 1.0f / (num_samples * num_all_lights);
+
+ for (int j = 0; j < num_samples; j++) {
+ Ray light_ray ccl_optional_struct_init;
+ light_ray.t = 0.0f; /* reset ray */
+# ifdef __OBJECT_MOTION__
+ light_ray.time = sd->time;
+# endif
+ bool has_emission = false;
- for (int j = 0; j < num_samples; j++) {
- /* sample random position on random triangle */
+ float3 tp = throughput;
+
+ if (kernel_data.integrator.use_direct_light) {
+ /* sample random position on random light/triangle */
float light_u, light_v;
path_branched_rng_2D(
- kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+ kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
/* only sample triangle lights */
- if (kernel_data.integrator.num_all_lights)
+ if (is_mesh_light && double_pdf) {
light_u = 0.5f * light_u;
+ }
- LightSample ls;
- light_sample(
- kg, light_u, light_v, sd->time, sd->P_pick, sd->N_pick, state->bounce, &ls, true);
-
- float3 tp = throughput;
+ LightSample ls ccl_optional_struct_init;
+ const int lamp = is_lamp ? i : -1;
+ light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
/* sample position on volume segment */
float rphase = path_branched_rng_1D(
@@ -260,63 +236,24 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
if (kernel_data.integrator.num_all_lights)
ls.pdf *= 2.0f;
- float terminate = path_branched_rng_light_termination(
- kg, state->rng_hash, state, j, num_samples);
- if (direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(
- L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
- }
+ /* sample random light */
+ float terminate = path_branched_rng_light_termination(
+ kg, state->rng_hash, state, j, num_samples);
+ has_emission = direct_emission(
+ kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
}
}
}
- }
- }
- else {
- /* sample random position on random light */
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
- LightSample ls;
- light_sample(kg, light_u, light_v, sd->time, sd->P_pick, sd->N_pick, state->bounce, &ls, true);
-
- float3 tp = throughput;
-
- /* sample position on volume segment */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- state,
- ray,
- sd,
- &tp,
- rphase,
- rscatter,
- segment,
- (ls.t != FLT_MAX) ? &ls.P :
- NULL,
- false);
-
- /* todo: split up light_sample so we don't have to call it again with new position */
- if (result == VOLUME_PATH_SCATTERED &&
- light_sample(kg, light_u, light_v, sd->time, sd->P, sd->N, state->bounce, &ls, true)) {
- /* sample random light */
- float terminate = path_state_rng_light_termination(kg, state);
- if (direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
- }
+ /* trace shadow ray */
+ float3 shadow;
+
+ const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
+
+ if (has_emission && !blocked) {
+ /* accumulate */
+ path_radiance_accum_light(
+ kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
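
The restructured volume connect functions above prepare the light sample first (has_emission, and a zero-length ray when there is nothing to connect) and then issue a single unconditional shadow_blocked() call, accumulating only when has_emission && !blocked. The patch does not state the motivation, but keeping the trace call at one program point typically reduces divergence on GPU back ends. A control-flow sketch of that pattern with illustrative names:

#include <cstdio>

struct Ray {
  float t = 0.0f;
};

static bool trace_shadow(const Ray &ray) /* stand-in for shadow_blocked() */
{
  if (ray.t == 0.0f)
    return true; /* zero-length ray: nothing to connect, treat as blocked */
  return false;  /* pretend the light is visible */
}

static void connect_light(bool use_direct_light, float light_distance, float *radiance)
{
  Ray light_ray; /* t == 0 means "no connection prepared" */
  bool has_emission = false;
  if (use_direct_light) {
    light_ray.t = light_distance; /* stand-in for direct_emission() setup */
    has_emission = light_distance > 0.0f;
  }
  const bool blocked = trace_shadow(light_ray); /* single trace call site */
  if (has_emission && !blocked)
    *radiance += 1.0f; /* stand-in for path_radiance_accum_light() */
}

int main()
{
  float radiance = 0.0f;
  connect_light(true, 3.0f, &radiance);
  connect_light(false, 0.0f, &radiance);
  std::printf("radiance = %.1f\n", radiance);
  return 0;
}
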
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 91a39fc1465..451d2a0cedf 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -73,7 +73,7 @@ ccl_device void enqueue_ray_index_local(
int queue_number, /* Queue in which to enqueue ray index. */
char enqueue_flag, /* True for threads whose ray index has to be enqueued. */
int queuesize, /* queue size. */
- ccl_local_param unsigned int *local_queue_atomics, /* To to local queue atomics. */
+ ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */
ccl_global int *Queue_data, /* Queues. */
ccl_global int *Queue_index) /* To do global queue atomics. */
{
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 6779c1f7160..a9b17854f25 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -41,10 +41,9 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index + SOBOL_SKIP;
- for (uint j = 0; i; i >>= 1, j++) {
- if (i & 1) {
- result ^= kernel_tex_fetch(__sobol_directions, 32 * dimension + j);
- }
+ for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
+ j += x;
+ result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1);
}
return result;
}
@@ -57,7 +56,9 @@ ccl_device_forceinline float path_rng_1D(
#ifdef __DEBUG_CORRELATION__
return (float)drand48();
#endif
-
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
+ return pmj_sample_1D(kg, sample, rng_hash, dimension);
+ }
#ifdef __CMJ__
# ifdef __SOBOL__
if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
@@ -100,7 +101,12 @@ ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
*fy = (float)drand48();
return;
#endif
-
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
+ const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension);
+ *fx = f.x;
+ *fy = f.y;
+ return;
+ }
#ifdef __CMJ__
# ifdef __SOBOL__
if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
@@ -130,7 +136,7 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg,
float *fy)
{
/* load state */
- *rng_hash = hash_int_2d(x, y);
+ *rng_hash = hash_uint2(x, y);
*rng_hash ^= kernel_data.integrator.seed;
#ifdef __DEBUG_CORRELATION__
@@ -244,7 +250,7 @@ ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg,
fy);
}
-/* Utitility functions to get light termination value,
+/* Utility functions to get light termination value,
* since it might not be needed in many cases.
*/
ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg,
@@ -285,4 +291,31 @@ ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
return (float)*rng * (1.0f / (float)0xFFFFFFFF);
}
+ccl_device_inline bool sample_is_even(int pattern, int sample)
+{
+ if (pattern == SAMPLING_PATTERN_PMJ) {
+ /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
+ * We can use this to divide the sample sequence into two classes for easier variance
+ * estimation. */
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
+ return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
+#elif defined(__NVCC__)
+ return __popc(sample & 0xaaaaaaaa) & 1;
+#elif defined(__KERNEL_OPENCL__)
+ return popcount(sample & 0xaaaaaaaa) & 1;
+#else
+ /* TODO(Stefan): popcnt intrinsic for Windows with fallback for older CPUs. */
+ int i = sample & 0xaaaaaaaa;
+ i = i - ((i >> 1) & 0x55555555);
+ i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+ i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+ return i & 1;
+#endif
+ }
+ else {
+ /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
+ return sample & 0x1;
+ }
+}
+
CCL_NAMESPACE_END
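
sample_is_even() above classifies a PMJ02 sample by the parity of its odd-position index bits (mask 0xaaaaaaaa). A minimal standalone sketch of that classification, using the GCC/Clang __builtin_popcount() path; the helper name is illustrative.

/* Standalone sketch: class of the first 8 PMJ sample indices by parity of
 * the odd-position bits (mask 0xaaaaaaaa), mirroring sample_is_even(). */
#include <stdio.h>

static int is_even_class(int sample)
{
  return __builtin_popcount((unsigned)sample & 0xaaaaaaaau) & 1;
}

int main(void)
{
  for (int s = 0; s < 8; s++)
    printf("sample %d -> class %d\n", s, is_even_class(s));
  /* Prints 0,0,1,1,0,0,1,1: consecutive pairs alternate between the classes. */
  return 0;
}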
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 351b623addb..e461e1642b6 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -23,10 +23,12 @@
* Release.
*/
+// clang-format off
#include "kernel/closure/alloc.h"
#include "kernel/closure/bsdf_util.h"
#include "kernel/closure/bsdf.h"
#include "kernel/closure/emissive.h"
+// clang-format on
#include "kernel/svm/svm.h"
@@ -48,17 +50,21 @@ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd
}
#endif
-ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+#ifdef __KERNEL_OPTIX__
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+ void
+ shader_setup_from_ray(KernelGlobals *kg,
+ ShaderData *sd,
+ const Intersection *isect,
+ const Ray *ray)
{
PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-#ifdef __INSTANCING__
sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
isect->object;
-#endif
sd->lamp = LAMP_NONE;
sd->type = isect->type;
@@ -74,18 +80,13 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
sd->ray_length = isect->t;
-#ifdef __UV__
sd->u = isect->u;
sd->v = isect->v;
-#endif
#ifdef __HAIR__
if (sd->type & PRIMITIVE_ALL_CURVE) {
/* curve */
- float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-
- sd->shader = __float_as_int(curvedata.z);
- sd->P = curve_refine(kg, sd, isect, ray);
+ curve_shader_setup(kg, sd, isect, ray);
}
else
#endif
@@ -117,17 +118,15 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-#ifdef __INSTANCING__
if (isect->object != OBJECT_NONE) {
/* instance transform */
object_normal_transform_auto(kg, sd, &sd->N);
object_normal_transform_auto(kg, sd, &sd->Ng);
-# ifdef __DPDU__
+#ifdef __DPDU__
object_dir_transform_auto(kg, sd, &sd->dPdu);
object_dir_transform_auto(kg, sd, &sd->dPdv);
-# endif
- }
#endif
+ }
/* backfacing test */
bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
@@ -177,10 +176,8 @@ ccl_device_inline
sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
sd->type = isect->type;
-# ifdef __UV__
sd->u = isect->u;
sd->v = isect->v;
-# endif
/* fetch triangle data */
if (sd->type == PRIMITIVE_TRIANGLE) {
@@ -207,17 +204,15 @@ ccl_device_inline
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-# ifdef __INSTANCING__
if (isect->object != OBJECT_NONE) {
/* instance transform */
object_normal_transform_auto(kg, sd, &sd->N);
object_normal_transform_auto(kg, sd, &sd->Ng);
-# ifdef __DPDU__
+# ifdef __DPDU__
object_dir_transform_auto(kg, sd, &sd->dPdu);
object_dir_transform_auto(kg, sd, &sd->dPdv);
-# endif
- }
# endif
+ }
/* backfacing test */
if (backfacing) {
@@ -276,17 +271,13 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
else
sd->type = PRIMITIVE_NONE;
- /* primitive */
-#ifdef __INSTANCING__
+ /* primitive */
sd->object = object;
-#endif
sd->lamp = LAMP_NONE;
/* currently no access to bvh prim index for strand sd->prim*/
sd->prim = prim;
-#ifdef __UV__
sd->u = u;
sd->v = v;
-#endif
sd->time = time;
sd->ray_length = t;
@@ -322,23 +313,19 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if (sd->shader & SHADER_SMOOTH_NORMAL) {
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-#ifdef __INSTANCING__
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_normal_transform_auto(kg, sd, &sd->N);
}
-#endif
}
/* dPdu/dPdv */
#ifdef __DPDU__
triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-# ifdef __INSTANCING__
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_dir_transform_auto(kg, sd, &sd->dPdu);
object_dir_transform_auto(kg, sd, &sd->dPdv);
}
-# endif
#endif
}
else {
@@ -424,15 +411,11 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg,
sd->time = ray->time;
sd->ray_length = 0.0f;
-#ifdef __INSTANCING__
sd->object = OBJECT_NONE;
-#endif
sd->lamp = LAMP_NONE;
sd->prim = PRIM_NONE;
-#ifdef __UV__
sd->u = 0.0f;
sd->v = 0.0f;
-#endif
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -473,17 +456,13 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
sd->time = ray->time;
sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
-# ifdef __INSTANCING__
sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
-# endif
sd->lamp = LAMP_NONE;
sd->prim = PRIM_NONE;
sd->type = PRIMITIVE_NONE;
-# ifdef __UV__
sd->u = 0.0f;
sd->v = 0.0f;
-# endif
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -686,8 +665,7 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r
if (r < next_sum) {
sampled = i;
- /* Rescale to reuse for direction sample, to better
- * preserve stratifaction. */
+ /* Rescale to reuse for direction sample, to better preserve stratification. */
*randu = (r - partial_sum) / sc->sample_weight;
break;
}
@@ -743,8 +721,7 @@ ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
*throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
sampled = i;
- /* Rescale to reuse for direction sample, to better
- * preserve stratifaction. */
+ /* Rescale to reuse for direction sample, to better preserve stratification. */
*randu = (r - partial_sum) / sc->sample_weight;
break;
}
@@ -780,7 +757,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
- float3 eval;
+ float3 eval = make_float3(0.0f, 0.0f, 0.0f);
*pdf = 0.0f;
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
@@ -810,7 +787,7 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
int label;
- float3 eval;
+ float3 eval = make_float3(0.0f, 0.0f, 0.0f);
*pdf = 0.0f;
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
@@ -897,7 +874,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
for (int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sc = &sd->closure[i];
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+ if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type))
eval += sc->weight;
}
@@ -932,20 +910,6 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
-{
- float3 eval = make_float3(0.0f, 0.0f, 0.0f);
-
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
- eval += sc->weight;
- }
-
- return eval;
-}
-
ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
{
float3 N = make_float3(0.0f, 0.0f, 0.0f);
@@ -1053,15 +1017,36 @@ ccl_device float3 shader_emissive_eval(ShaderData *sd)
/* Holdout */
-ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
{
float3 weight = make_float3(0.0f, 0.0f, 0.0f);
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ /* For objects marked as holdout, preserve transparency and remove all other
+ * closures, replacing them with a holdout weight. */
+ if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ if ((sd->flag & SD_TRANSPARENT) && !(sd->flag & SD_HAS_ONLY_VOLUME)) {
+ weight = make_float3(1.0f, 1.0f, 1.0f) - sd->closure_transparent_extinction;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if (!CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ sc->type = NBUILTIN_CLOSURES;
+ }
+ }
- if (CLOSURE_IS_HOLDOUT(sc->type))
- weight += sc->weight;
+ sd->flag &= ~(SD_CLOSURE_FLAGS - (SD_TRANSPARENT | SD_BSDF));
+ }
+ else {
+ weight = make_float3(1.0f, 1.0f, 1.0f);
+ }
+ }
+ else {
+ for (int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if (CLOSURE_IS_HOLDOUT(sc->type)) {
+ weight += sc->weight;
+ }
+ }
}
return weight;
@@ -1072,6 +1057,7 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
ccl_device void shader_eval_surface(KernelGlobals *kg,
ShaderData *sd,
ccl_addr_space PathState *state,
+ ccl_global float *buffer,
int path_flag)
{
PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
@@ -1092,7 +1078,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __OSL__
if (kg->osl) {
- if (sd->object == OBJECT_NONE) {
+ if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
OSLShader::eval_background(kg, sd, state, path_flag);
}
else {
@@ -1103,7 +1089,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#endif
{
#ifdef __SVM__
- svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag);
+ svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag);
#else
if (sd->object == OBJECT_NONE) {
sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
@@ -1223,7 +1209,7 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
* depending on color channels, even if this is perhaps not a common case */
const ShaderClosure *sc = &sd->closure[sampled];
int label;
- float3 eval;
+ float3 eval = make_float3(0.0f, 0.0f, 0.0f);
*pdf = 0.0f;
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
@@ -1248,7 +1234,7 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
int label;
- float3 eval;
+ float3 eval = make_float3(0.0f, 0.0f, 0.0f);
*pdf = 0.0f;
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
@@ -1315,7 +1301,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
else
# endif
{
- svm_eval_nodes(kg, sd, state, SHADER_TYPE_VOLUME, path_flag);
+ svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag);
}
# endif
@@ -1344,7 +1330,7 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
else
# endif
{
- svm_eval_nodes(kg, sd, state, SHADER_TYPE_DISPLACEMENT, 0);
+ svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0);
}
#endif
}
@@ -1358,7 +1344,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
int shader = 0;
# ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
+ if (isect->type & PRIMITIVE_ALL_TRIANGLE) {
# endif
shader = kernel_tex_fetch(__tri_shader, prim);
# ifdef __HAIR__
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 6af1369feab..07043e6a769 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -17,13 +17,6 @@
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
-typedef struct VolumeState {
-# ifdef __SPLIT_KERNEL__
-# else
- PathState ps;
-# endif
-} VolumeState;
-
/* Get PathState ready for use for volume stack evaluation. */
# ifdef __SPLIT_KERNEL__
ccl_addr_space
@@ -55,16 +48,15 @@ ccl_addr_space
/* Attenuate throughput accordingly to the given intersection event.
* Returns true if the throughput is zero and traversal can be aborted.
*/
-ccl_device_forceinline bool shadow_handle_transparent_isect(
- KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
+ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg,
+ ShaderData *shadow_sd,
+ ccl_addr_space PathState *state,
#ifdef __VOLUME__
- ccl_addr_space struct PathState *volume_state,
+ ccl_addr_space PathState *volume_state,
#endif
- Intersection *isect,
- Ray *ray,
- float3 *throughput)
+ Intersection *isect,
+ Ray *ray,
+ float3 *throughput)
{
#ifdef __VOLUME__
/* Attenuation between last surface and next surface. */
@@ -79,7 +71,7 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
/* Attenuation from transparent surface. */
if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
path_state_modify_bounce(state, true);
- shader_eval_surface(kg, shadow_sd, state, PATH_RAY_SHADOW);
+ shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW);
path_state_modify_bounce(state, false);
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
}
@@ -103,8 +95,7 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
Intersection *isect,
float3 *shadow)
{
- const bool blocked = scene_intersect(
- kg, *ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
+ const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
#ifdef __VOLUME__
if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
/* Apply attenuation from current volume shader. */
@@ -164,7 +155,11 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
uint num_hits;
const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits);
# ifdef __VOLUME__
+# ifdef __KERNEL_OPTIX__
+ VolumeState &volume_state = kg->volume_state;
+# else
VolumeState volume_state;
+# endif
# endif
/* If no opaque surface found but we did find transparent hits,
* shade them.
@@ -303,7 +298,11 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
float3 *shadow)
{
# ifdef __VOLUME__
+# ifdef __KERNEL_OPTIX__
+ VolumeState &volume_state = kg->volume_state;
+# else
VolumeState volume_state;
+# endif
# endif
if (blocked && is_transparent_isect) {
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
@@ -319,8 +318,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
if (bounce >= kernel_data.integrator.transparent_max_bounce) {
return true;
}
- if (!scene_intersect(
- kg, *ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) {
+ if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) {
break;
}
if (!shader_transparent_shadow(kg, isect)) {
@@ -376,8 +374,7 @@ ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg,
Intersection *isect,
float3 *shadow)
{
- bool blocked = scene_intersect(
- kg, *ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
+ bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false;
return shadow_blocked_transparent_stepped_loop(
kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow);
@@ -390,32 +387,38 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
- Ray *ray_input,
+ Ray *ray,
float3 *shadow)
{
- Ray *ray = ray_input;
- Intersection isect;
- /* Some common early checks. */
*shadow = make_float3(1.0f, 1.0f, 1.0f);
+#if !defined(__KERNEL_OPTIX__)
+ /* Some common early checks.
+ * Avoid conditional trace call in OptiX though, since those hurt performance there.
+ */
if (ray->t == 0.0f) {
return false;
}
+#endif
#ifdef __SHADOW_TRICKS__
const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER :
PATH_RAY_SHADOW;
#else
const uint visibility = PATH_RAY_SHADOW;
#endif
- /* Do actual shadow shading. */
- /* First of all, we check if integrator requires transparent shadows.
+ /* Do actual shadow shading.
+ * First of all, we check if integrator requires transparent shadows.
* if not, we use simplest and fastest ever way to calculate occlusion.
+ * Do not do this in OptiX to avoid the additional trace call.
*/
-#ifdef __TRANSPARENT_SHADOWS__
+#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__)
+ Intersection isect;
+# ifdef __TRANSPARENT_SHADOWS__
if (!kernel_data.integrator.transparent_shadows)
-#endif
+# endif
{
return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow);
}
+#endif
#ifdef __TRANSPARENT_SHADOWS__
# ifdef __SHADOW_RECORD_ALL__
/* For the transparent shadows we try to use record-all logic on the
@@ -428,16 +431,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
if (state->transparent_bounce >= transparent_max_bounce) {
return true;
}
- const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-# ifdef __KERNEL_GPU__
- /* On GPU we do trickey with tracing opaque ray first, this avoids speed
+ uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
+# if defined(__KERNEL_OPTIX__)
+ /* Always use record-all behavior in OptiX, but ensure there are no out of bounds
+ * accesses to the hit stack.
+ */
+ max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1);
+# elif defined(__KERNEL_GPU__)
+ /* On GPU we use the trick of tracing an opaque ray first; this avoids speed
* regressions in some files.
*
* TODO(sergey): Check why using record-all behavior causes slowdown in such
* cases. Could that be caused by a higher spill pressure?
*/
- const bool blocked = scene_intersect(
- kg, *ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+ const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect);
const bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, &isect) : false;
if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) {
return shadow_blocked_transparent_stepped_loop(
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 7510e50a962..ed8572467ea 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -138,7 +138,7 @@ ccl_device void subsurface_color_bump_blur(
if (bump || texture_blur > 0.0f) {
/* average color and normal at incoming point */
- shader_eval_surface(kg, sd, state, state->flag);
+ shader_eval_surface(kg, sd, state, NULL, state->flag);
float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL);
/* we simply divide out the average color and multiply with the average
@@ -222,7 +222,7 @@ ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
/* intersect with the same object. if multiple intersections are found it
* will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
- scene_intersect_local(kg, *ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
+ scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
for (int hit = 0; hit < num_eval_hits; hit++) {
@@ -353,13 +353,19 @@ ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc,
*weight = safe_divide_color(bssrdf->weight, A);
}
-ccl_device_noinline bool subsurface_random_walk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- const float bssrdf_u,
- const float bssrdf_v)
+#ifdef __KERNEL_OPTIX__
+ccl_device_inline /* inline trace calls */
+#else
+ccl_device_noinline
+#endif
+ bool
+ subsurface_random_walk(KernelGlobals *kg,
+ LocalIntersection *ss_isect,
+ ShaderData *sd,
+ ccl_addr_space PathState *state,
+ const ShaderClosure *sc,
+ const float bssrdf_u,
+ const float bssrdf_v)
{
/* Sample diffuse surface scatter into the object. */
float3 D;
@@ -418,16 +424,21 @@ ccl_device_noinline bool subsurface_random_walk(KernelGlobals *kg,
float t = -logf(1.0f - rdist) / sample_sigma_t;
ray->t = t;
- scene_intersect_local(kg, *ray, ss_isect, sd->object, NULL, 1);
+ scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1);
hit = (ss_isect->num_hits > 0);
if (hit) {
+#ifdef __KERNEL_OPTIX__
+ /* t is always in world space with OptiX. */
+ t = ss_isect->hits[0].t;
+#else
/* Compute world space distance to surface hit. */
float3 D = ray->D;
object_inverse_dir_transform(kg, sd, &D);
D = normalize(D) * ss_isect->hits[0].t;
object_dir_transform(kg, sd, &D);
t = len(D);
+#endif
}
/* Advance to new scatter location. */
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index b6c1701211d..293335e0e08 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -35,6 +35,7 @@ KERNEL_TEX(KernelObject, __objects)
KERNEL_TEX(Transform, __object_motion_pass)
KERNEL_TEX(DecomposedTransform, __object_motion)
KERNEL_TEX(uint, __object_flag)
+KERNEL_TEX(float, __object_volume_step)
/* cameras */
KERNEL_TEX(DecomposedTransform, __camera_motion)
@@ -85,7 +86,7 @@ KERNEL_TEX(KernelShader, __shaders)
KERNEL_TEX(float, __lookup_table)
/* sobol */
-KERNEL_TEX(uint, __sobol_directions)
+KERNEL_TEX(uint, __sample_pattern_lut)
/* image textures */
KERNEL_TEX(TextureInfo, __texture_info)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 390f239f68f..c6a7524c643 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -84,9 +84,7 @@ CCL_NAMESPACE_BEGIN
/* Kernel features */
#define __SOBOL__
-#define __INSTANCING__
#define __DPDU__
-#define __UV__
#define __BACKGROUND__
#define __CAUSTICS_TRICKS__
#define __VISIBILITY_FLAG__
@@ -106,8 +104,6 @@ CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_AO_PREVIEW__
# define __SVM__
# define __EMISSION__
-# define __TEXTURES__
-# define __EXTRA_NODES__
# define __HOLDOUT__
# define __MULTI_CLOSURE__
# define __TRANSPARENT_SHADOWS__
@@ -115,7 +111,6 @@ CCL_NAMESPACE_BEGIN
# define __LAMP_MIS__
# define __CAMERA_MOTION__
# define __OBJECT_MOTION__
-# define __HAIR__
# define __BAKING__
# define __PRINCIPLED__
# define __SUBSURFACE__
@@ -128,9 +123,6 @@ CCL_NAMESPACE_BEGIN
/* Device specific features */
#ifdef __KERNEL_CPU__
-# ifdef __KERNEL_SSE2__
-# define __QBVH__
-# endif
# ifdef WITH_OSL
# define __OSL__
# endif
@@ -144,6 +136,13 @@ CCL_NAMESPACE_BEGIN
# endif
#endif /* __KERNEL_CUDA__ */
+#ifdef __KERNEL_OPTIX__
+# undef __BAKING__
+# undef __BRANCHED_PATH__
+/* TODO(pmours): Cannot use optixTrace in non-inlined functions */
+# undef __SHADER_RAYTRACE__
+#endif /* __KERNEL_OPTIX__ */
+
#ifdef __KERNEL_OPENCL__
#endif /* __KERNEL_OPENCL__ */
@@ -214,8 +213,9 @@ typedef enum ShaderEvalType {
SHADER_EVAL_DIFFUSE_COLOR,
SHADER_EVAL_GLOSSY_COLOR,
SHADER_EVAL_TRANSMISSION_COLOR,
- SHADER_EVAL_SUBSURFACE_COLOR,
SHADER_EVAL_EMISSION,
+ SHADER_EVAL_AOV_COLOR,
+ SHADER_EVAL_AOV_VALUE,
/* light passes */
SHADER_EVAL_AO,
@@ -224,7 +224,6 @@ typedef enum ShaderEvalType {
SHADER_EVAL_DIFFUSE,
SHADER_EVAL_GLOSSY,
SHADER_EVAL_TRANSMISSION,
- SHADER_EVAL_SUBSURFACE,
/* extra */
SHADER_EVAL_ENVIRONMENT,
@@ -261,6 +260,7 @@ enum PathTraceDimension {
enum SamplingPattern {
SAMPLING_PATTERN_SOBOL = 0,
SAMPLING_PATTERN_CMJ = 1,
+ SAMPLING_PATTERN_PMJ = 2,
SAMPLING_NUM_PATTERNS,
};
@@ -268,6 +268,7 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
+ /* Ray visibility. */
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
@@ -276,6 +277,7 @@ enum PathRayFlag {
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
+ /* Shadow ray visibility. */
PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7),
PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8),
PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER),
@@ -287,8 +289,11 @@ enum PathRayFlag {
PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = (1 << 11), /* visibility flag to define curve segments */
- PATH_RAY_VOLUME_SCATTER = (1 << 12), /* volume scattering */
+ /* Unused, free to reuse. */
+ PATH_RAY_UNUSED = (1 << 11),
+
+ /* Ray visibility for volume scattering. */
+ PATH_RAY_VOLUME_SCATTER = (1 << 12),
/* Special flag to tag unaligned BVH nodes. */
PATH_RAY_NODE_UNALIGNED = (1 << 13),
@@ -314,7 +319,7 @@ enum PathRayFlag {
/* Ray is to be terminated, but continue with transparent bounces and
* emission as long as we encounter them. This is required to make the
* MIS between direct and indirect light rays match, as shadow rays go
- * through transparent surfaces to reach emisison too. */
+ * through transparent surfaces to reach emission too. */
PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21),
/* Ray is to be terminated. */
PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT),
@@ -365,6 +370,10 @@ typedef enum PassType {
#endif
PASS_RENDER_TIME,
PASS_CRYPTOMATTE,
+ PASS_AOV_COLOR,
+ PASS_AOV_VALUE,
+ PASS_ADAPTIVE_AUX_BUFFER,
+ PASS_SAMPLE_COUNT,
PASS_CATEGORY_MAIN_END = 31,
PASS_MIST = 32,
@@ -382,13 +391,14 @@ typedef enum PassType {
PASS_TRANSMISSION_DIRECT,
PASS_TRANSMISSION_INDIRECT,
PASS_TRANSMISSION_COLOR,
- PASS_SUBSURFACE_DIRECT,
- PASS_SUBSURFACE_INDIRECT,
- PASS_SUBSURFACE_COLOR,
- PASS_VOLUME_DIRECT,
+ PASS_VOLUME_DIRECT = 50,
PASS_VOLUME_INDIRECT,
/* No Scatter color since it's tricky to define what it would even mean. */
PASS_CATEGORY_LIGHT_END = 63,
+
+ PASS_BAKE_PRIMITIVE,
+ PASS_BAKE_DIFFERENTIAL,
+ PASS_CATEGORY_BAKE_END = 95
} PassType;
#define PASS_ANY (~0)
@@ -435,23 +445,20 @@ typedef enum eBakePassFilter {
BAKE_FILTER_DIFFUSE = (1 << 3),
BAKE_FILTER_GLOSSY = (1 << 4),
BAKE_FILTER_TRANSMISSION = (1 << 5),
- BAKE_FILTER_SUBSURFACE = (1 << 6),
- BAKE_FILTER_EMISSION = (1 << 7),
- BAKE_FILTER_AO = (1 << 8),
+ BAKE_FILTER_EMISSION = (1 << 6),
+ BAKE_FILTER_AO = (1 << 7),
} eBakePassFilter;
typedef enum BakePassFilterCombos {
BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE |
- BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_SUBSURFACE |
- BAKE_FILTER_EMISSION | BAKE_FILTER_AO),
+ BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION |
+ BAKE_FILTER_AO),
BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE),
BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY),
BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION),
- BAKE_FILTER_SUBSURFACE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_SUBSURFACE),
BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE),
BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY),
BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION),
- BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
typedef enum DenoiseFlag {
@@ -461,9 +468,7 @@ typedef enum DenoiseFlag {
DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
- DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
- DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
- DENOISING_CLEAN_ALL_PASSES = (1 << 8) - 1,
+ DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1,
} DenoiseFlag;
#ifdef __KERNEL_DEBUG__
@@ -483,8 +488,7 @@ typedef ccl_addr_space struct PathRadianceState {
float3 diffuse;
float3 glossy;
float3 transmission;
- float3 subsurface;
- float3 scatter;
+ float3 volume;
float3 direct;
#endif
@@ -507,19 +511,16 @@ typedef ccl_addr_space struct PathRadiance {
float3 color_diffuse;
float3 color_glossy;
float3 color_transmission;
- float3 color_subsurface;
float3 direct_diffuse;
float3 direct_glossy;
float3 direct_transmission;
- float3 direct_subsurface;
- float3 direct_scatter;
+ float3 direct_volume;
float3 indirect_diffuse;
float3 indirect_glossy;
float3 indirect_transmission;
- float3 indirect_subsurface;
- float3 indirect_scatter;
+ float3 indirect_volume;
float4 shadow;
float mist;
@@ -573,8 +574,7 @@ typedef struct BsdfEval {
float3 glossy;
float3 transmission;
float3 transparent;
- float3 subsurface;
- float3 scatter;
+ float3 volume;
#endif
#ifdef __SHADOW_TRICKS__
float3 sum_no_mis;
@@ -660,9 +660,8 @@ typedef struct Ray {
* is fixed.
*/
#ifndef __KERNEL_OPENCL_AMD__
- float3 P; /* origin */
- float3 D; /* direction */
-
+ float3 P; /* origin */
+ float3 D; /* direction */
float t; /* length of the ray */
float time; /* time (for motion blur) */
#else
@@ -702,32 +701,42 @@ typedef enum PrimitiveType {
PRIMITIVE_NONE = 0,
PRIMITIVE_TRIANGLE = (1 << 0),
PRIMITIVE_MOTION_TRIANGLE = (1 << 1),
- PRIMITIVE_CURVE = (1 << 2),
- PRIMITIVE_MOTION_CURVE = (1 << 3),
+ PRIMITIVE_CURVE_THICK = (1 << 2),
+ PRIMITIVE_MOTION_CURVE_THICK = (1 << 3),
+ PRIMITIVE_CURVE_RIBBON = (1 << 4),
+ PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
/* Lamp primitive is not included below on purpose,
* since it is no real traceable primitive.
*/
- PRIMITIVE_LAMP = (1 << 4),
+ PRIMITIVE_LAMP = (1 << 6),
PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
- PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE | PRIMITIVE_MOTION_CURVE),
- PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE),
+ PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE_THICK | PRIMITIVE_MOTION_CURVE_THICK |
+ PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON),
+ PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
+ PRIMITIVE_MOTION_CURVE_RIBBON),
PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE),
/* Total number of different traceable primitives.
* NOTE: This is an actual value, not a bitflag.
*/
- PRIMITIVE_NUM_TOTAL = 4,
+ PRIMITIVE_NUM_TOTAL = 6,
} PrimitiveType;
#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type))
#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL)
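
With PRIMITIVE_NUM_TOTAL raised from 4 to 6, packed segment indices now shift by 6 bits. A small self-contained sketch of the pack/unpack round trip under that assumption, with plain integer stand-ins for the enum values.

/* Standalone sketch of the pack/unpack round trip with the new 6-bit type field. */
#include <assert.h>

#define NUM_TOTAL 6
#define PACK_SEGMENT(type, segment) (((segment) << NUM_TOTAL) | (type))
#define UNPACK_SEGMENT(packed) ((packed) >> NUM_TOTAL)

int main(void)
{
  const int type = 1 << 4; /* stand-in for PRIMITIVE_CURVE_RIBBON */
  const int segment = 7;   /* curve key segment index */
  const int packed = PACK_SEGMENT(type, segment);
  assert(UNPACK_SEGMENT(packed) == segment);
  assert((packed & ((1 << NUM_TOTAL) - 1)) == type);
  return 0;
}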
+typedef enum CurveShapeType {
+ CURVE_RIBBON = 0,
+ CURVE_THICK = 1,
+
+ CURVE_NUM_SHAPE_TYPES,
+} CurveShapeType;
+
/* Attributes */
typedef enum AttributePrimitive {
- ATTR_PRIM_TRIANGLE = 0,
- ATTR_PRIM_CURVE,
+ ATTR_PRIM_GEOMETRY = 0,
ATTR_PRIM_SUBD,
ATTR_PRIM_TYPES
@@ -755,6 +764,7 @@ typedef enum AttributeStandard {
ATTR_STD_UV,
ATTR_STD_UV_TANGENT,
ATTR_STD_UV_TANGENT_SIGN,
+ ATTR_STD_VERTEX_COLOR,
ATTR_STD_GENERATED,
ATTR_STD_GENERATED_TRANSFORM,
ATTR_STD_POSITION_UNDEFORMED,
@@ -773,6 +783,7 @@ typedef enum AttributeStandard {
ATTR_STD_VOLUME_TEMPERATURE,
ATTR_STD_VOLUME_VELOCITY,
ATTR_STD_POINTINESS,
+ ATTR_STD_RANDOM_PER_ISLAND,
ATTR_STD_NUM,
ATTR_STD_NOT_FOUND = ~0
@@ -814,8 +825,9 @@ typedef struct AttributeDescriptor {
* ShaderClosure has a fixed size, and any extra space must be allocated
* with closure_alloc_extra().
*
- * We pad the struct to 80 bytes and ensure it is aligned to 16 bytes, which
- * we assume to be the maximum required alignment for any struct. */
+ * We pad the struct to align to 16 bytes. All shader closures are assumed
+ * to fit in this struct size. CPU sizes are a bit larger because float3 is
+ * padded to be 16 bytes, while it's only 12 bytes on the GPU. */
#define SHADER_CLOSURE_BASE \
float3 weight; \
@@ -827,7 +839,10 @@ typedef ccl_addr_space struct ccl_align(16) ShaderClosure
{
SHADER_CLOSURE_BASE;
- float data[10]; /* pad to 80 bytes */
+#ifdef __KERNEL_CPU__
+ float pad[2];
+#endif
+ float data[10];
}
ShaderClosure;
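
The padding comment above hinges on float3 being 16 bytes on the CPU but only 12 bytes on the GPU. A tiny illustration of that size difference with stand-in structs; these are not the actual Cycles float3 definitions.

/* Standalone illustration of the float3 size difference the comment refers to:
 * a SIMD-friendly CPU float3 carries a fourth padding float, a packed
 * GPU-style float3 does not. */
#include <assert.h>

typedef struct { float x, y, z, w_pad; } float3_cpu; /* 16 bytes, SSE friendly */
typedef struct { float x, y, z; } float3_gpu;        /* 12 bytes, packed */

int main(void)
{
  assert(sizeof(float3_cpu) == 16);
  assert(sizeof(float3_gpu) == 12);
  return 0;
}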
@@ -890,13 +905,13 @@ enum ShaderDataFlag {
SD_HAS_DISPLACEMENT = (1 << 26),
/* Has constant emission (value stored in __shaders) */
SD_HAS_CONSTANT_EMISSION = (1 << 27),
- /* Needs to access attributes */
- SD_NEED_ATTRIBUTES = (1 << 28),
+ /* Needs to access attributes for volume rendering */
+ SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT |
- SD_HAS_CONSTANT_EMISSION | SD_NEED_ATTRIBUTES)
+ SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES)
};
/* Object flags. */
@@ -926,7 +941,8 @@ enum ShaderDataObjectFlag {
SD_OBJECT_HAS_VOLUME_ATTRIBUTES)
};
-typedef ccl_addr_space struct ShaderData {
+typedef ccl_addr_space struct ccl_align(16) ShaderData
+{
/* position */
float3 P;
/* smooth normal for shading */
@@ -1015,11 +1031,16 @@ typedef ccl_addr_space struct ShaderData {
/* At the end so we can adjust size in ShaderDataTinyStorage. */
struct ShaderClosure closure[MAX_CLOSURE];
-} ShaderData;
+}
+ShaderData;
-typedef ccl_addr_space struct ShaderDataTinyStorage {
+/* ShaderDataTinyStorage needs the same alignment as ShaderData, or else
+ * the pointer cast in AS_SHADER_DATA invokes undefined behavior. */
+typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage
+{
char pad[sizeof(ShaderData) - sizeof(ShaderClosure) * MAX_CLOSURE];
-} ShaderDataTinyStorage;
+}
+ShaderDataTinyStorage;
#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage)
/* Path State */
@@ -1051,6 +1072,7 @@ typedef struct PathState {
#ifdef __DENOISING_FEATURES__
float denoising_feature_weight;
+ float3 denoising_feature_throughput;
#endif /* __DENOISING_FEATURES__ */
/* multiple importance sampling */
@@ -1069,6 +1091,15 @@ typedef struct PathState {
#endif
} PathState;
+#ifdef __VOLUME__
+typedef struct VolumeState {
+# ifdef __SPLIT_KERNEL__
+# else
+ PathState ps;
+# endif
+} VolumeState;
+#endif
+
/* Struct to gather multiple nearby intersections. */
typedef struct LocalIntersection {
Ray ray;
@@ -1159,7 +1190,7 @@ typedef struct KernelCamera {
ProjectionTransform worldtondc;
Transform worldtocamera;
- /* Stores changes in the projeciton matrix. Use for camera zoom motion
+ /* Stores changes in the projection matrix. Use for camera zoom motion
* blur and motion pass output for perspective camera. */
ProjectionTransform perspective_pre;
ProjectionTransform perspective_post;
@@ -1181,6 +1212,7 @@ static_assert_align(KernelCamera, 16);
typedef struct KernelFilm {
float exposure;
int pass_flag;
+
int light_pass_flag;
int pass_stride;
int use_light_pass;
@@ -1198,18 +1230,15 @@ typedef struct KernelFilm {
int pass_diffuse_color;
int pass_glossy_color;
int pass_transmission_color;
- int pass_subsurface_color;
int pass_diffuse_indirect;
int pass_glossy_indirect;
int pass_transmission_indirect;
- int pass_subsurface_indirect;
int pass_volume_indirect;
int pass_diffuse_direct;
int pass_glossy_direct;
int pass_transmission_direct;
- int pass_subsurface_direct;
int pass_volume_direct;
int pass_emission;
@@ -1224,6 +1253,9 @@ typedef struct KernelFilm {
int cryptomatte_depth;
int pass_cryptomatte;
+ int pass_adaptive_aux_buffer;
+ int pass_sample_count;
+
int pass_mist;
float mist_start;
float mist_inv_depth;
@@ -1233,6 +1265,12 @@ typedef struct KernelFilm {
int pass_denoising_clean;
int denoising_flags;
+ int pass_aov_color;
+ int pass_aov_value;
+ int pass_aov_color_num;
+ int pass_aov_value_num;
+ int pad1, pad2, pad3;
+
/* XYZ to rendering color space transform. float4 instead of float3 to
* ensure consistent padding/alignment across devices. */
float4 xyz_to_r;
@@ -1240,12 +1278,25 @@ typedef struct KernelFilm {
float4 xyz_to_b;
float4 rgb_to_y;
+ int pass_bake_primitive;
+ int pass_bake_differential;
+ int pad;
+
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;
int pass_bvh_intersections;
int pass_ray_bounces;
#endif
+
+ /* viewport rendering options */
+ int display_pass_stride;
+ int display_pass_components;
+ int display_divide_pass_stride;
+ int use_display_exposure;
+ int use_display_pass_alpha;
+
+ int pad4, pad5, pad6;
} KernelFilm;
static_assert_align(KernelFilm, 16);
@@ -1253,6 +1304,7 @@ typedef struct KernelBackground {
/* only shader index */
int surface_shader;
int volume_shader;
+ float volume_step_size;
int transparent;
float transparent_roughness_squared_threshold;
@@ -1260,7 +1312,24 @@ typedef struct KernelBackground {
float ao_factor;
float ao_distance;
float ao_bounces_factor;
- float ao_pad;
+
+ /* portal sampling */
+ float portal_weight;
+ int num_portals;
+ int portal_offset;
+
+ /* sun sampling */
+ float sun_weight;
+ /* xyz store direction, w the angle. float4 instead of float3 is used
+ * to ensure consistent padding/alignment across devices. */
+ float4 sun;
+
+ /* map sampling */
+ float map_weight;
+ int map_res_x;
+ int map_res_y;
+
+ int use_mis;
} KernelBackground;
static_assert_align(KernelBackground, 16);
@@ -1279,18 +1348,12 @@ typedef struct KernelIntegrator {
float pdf_triangles;
float pdf_lights;
float pdf_inv_totarea;
- int pdf_background_res_x;
- int pdf_background_res_y;
float light_inv_rr_threshold;
int distant_lights_offset;
int background_light_index;
- /* light portals */
- float portal_pdf;
- int num_portals;
- int portal_offset;
-
/* bounces */
+ int min_bounce;
int max_bounce;
int max_diffuse_bounce;
@@ -1301,6 +1364,7 @@ typedef struct KernelIntegrator {
int ao_bounces;
/* transparent */
+ int transparent_min_bounce;
int transparent_max_bounce;
int transparent_shadows;
@@ -1334,11 +1398,15 @@ typedef struct KernelIntegrator {
/* sampler */
int sampling_pattern;
int aa_samples;
+ int adaptive_min_samples;
+ int adaptive_step;
+ int adaptive_stop_per_sample;
+ float adaptive_threshold;
/* volume render */
int use_volumes;
int volume_max_steps;
- float volume_step_size;
+ float volume_step_rate;
int volume_samples;
int start_sample;
@@ -1353,11 +1421,12 @@ typedef enum KernelBVHLayout {
BVH_LAYOUT_NONE = 0,
BVH_LAYOUT_BVH2 = (1 << 0),
- BVH_LAYOUT_BVH4 = (1 << 1),
- BVH_LAYOUT_BVH8 = (1 << 2),
- BVH_LAYOUT_EMBREE = (1 << 3),
- BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8,
- BVH_LAYOUT_ALL = (unsigned int)(-1),
+ BVH_LAYOUT_EMBREE = (1 << 1),
+ BVH_LAYOUT_OPTIX = (1 << 2),
+
+ /* Default BVH layout to use for CPU. */
+ BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE,
+ BVH_LAYOUT_ALL = (unsigned int)(~0u),
} KernelBVHLayout;
typedef struct KernelBVH {
@@ -1365,56 +1434,48 @@ typedef struct KernelBVH {
int root;
int have_motion;
int have_curves;
- int have_instancing;
int bvh_layout;
int use_bvh_steps;
+ int curve_subdivisions;
- /* Embree */
-#ifdef __EMBREE__
+ /* Custom BVH */
+#ifdef __KERNEL_OPTIX__
+ OptixTraversableHandle scene;
+#else
+# ifdef __EMBREE__
RTCScene scene;
-# ifndef __KERNEL_64_BIT__
- int pad1;
+# ifndef __KERNEL_64_BIT__
+ int pad2;
+# endif
+# else
+ int scene, pad2;
# endif
-#else
- int pad1, pad2;
#endif
} KernelBVH;
static_assert_align(KernelBVH, 16);
-typedef enum CurveFlag {
- /* runtime flags */
- CURVE_KN_BACKFACING = 1, /* backside of cylinder? */
- CURVE_KN_ENCLOSEFILTER = 2, /* don't consider strands surrounding start point? */
- CURVE_KN_INTERPOLATE = 4, /* render as a curve? */
- CURVE_KN_ACCURATE = 8, /* use accurate intersections test? */
- CURVE_KN_INTERSECTCORRECTION = 16, /* correct for width after determing closest midpoint? */
- CURVE_KN_TRUETANGENTGNORMAL = 32, /* use tangent normal for geometry? */
- CURVE_KN_RIBBONS = 64, /* use flat curve ribbons */
-} CurveFlag;
-
-typedef struct KernelCurves {
- int curveflags;
- int subdivisions;
-
- float minimum_width;
- float maximum_width;
-} KernelCurves;
-static_assert_align(KernelCurves, 16);
-
typedef struct KernelTables {
int beckmann_offset;
int pad1, pad2, pad3;
} KernelTables;
static_assert_align(KernelTables, 16);
+typedef struct KernelBake {
+ int object_index;
+ int tri_offset;
+ int type;
+ int pass_filter;
+} KernelBake;
+static_assert_align(KernelBake, 16);
+
typedef struct KernelData {
KernelCamera cam;
KernelFilm film;
KernelBackground background;
KernelIntegrator integrator;
KernelBVH bvh;
- KernelCurves curve;
KernelTables tables;
+ KernelBake bake;
} KernelData;
static_assert_align(KernelData, 16);
@@ -1427,6 +1488,7 @@ typedef struct KernelObject {
float surface_area;
float pass_id;
float random_number;
+ float color[3];
int particle_index;
float dupli_generated[3];
@@ -1439,11 +1501,12 @@ typedef struct KernelObject {
uint patch_map_offset;
uint attribute_map_offset;
uint motion_offset;
- uint pad1;
float cryptomatte_object;
float cryptomatte_asset;
- float pad2, pad3;
+
+ float shadow_terminator_offset;
+ float pad1, pad2, pad3;
} KernelObject;
static_assert_align(KernelObject, 16);
@@ -1481,6 +1544,8 @@ typedef struct KernelLight {
int samples;
float max_bounces;
float random;
+ float strength[3];
+ float pad1;
Transform tfm;
Transform itfm;
union {
@@ -1542,7 +1607,7 @@ static_assert_align(KernelShader, 16);
* Queue 1 - Active rays
* Queue 2 - Background queue
* Queue 3 - Shadow ray cast kernel - AO
- * Queeu 4 - Shadow ray cast kernel - direct lighting
+ * Queue 4 - Shadow ray cast kernel - direct lighting
*/
/* Queue names */
@@ -1654,12 +1719,16 @@ typedef struct WorkTile {
uint start_sample;
uint num_samples;
- uint offset;
+ int offset;
uint stride;
ccl_global float *buffer;
} WorkTile;
+/* Precomputed sample table sizes for PMJ02 sampler. */
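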
+#define NUM_PMJ_SAMPLES 64 * 64
+#define NUM_PMJ_PATTERNS 48
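
A back-of-the-envelope check of the table size implied by these constants, assuming the lookup table stores one 2D sample (two floats) per entry; the host-side layout is an assumption, since this patch does not show it.

/* Rough size of the PMJ sample table, assuming two floats per entry. */
#include <stdio.h>

#define NUM_PMJ_SAMPLES (64 * 64)
#define NUM_PMJ_PATTERNS 48

int main(void)
{
  const unsigned long entries = (unsigned long)NUM_PMJ_SAMPLES * NUM_PMJ_PATTERNS;
  const unsigned long bytes = entries * 2UL * sizeof(float);
  printf("%lu samples, %.1f MiB\n", entries, bytes / (1024.0 * 1024.0));
  /* 196608 samples, 1.5 MiB */
  return 0;
}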
+
CCL_NAMESPACE_END
#endif /* __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 044fca7fff6..b2468b196da 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -48,7 +48,8 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
if (sd->flag & SD_EXTINCTION) {
- *extinction = sd->closure_transparent_extinction;
+ const float density = object_volume_density(kg, sd->object);
+ *extinction = sd->closure_transparent_extinction * density;
return true;
}
else {
@@ -84,6 +85,11 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
}
}
+ const float density = object_volume_density(kg, sd->object);
+ coeff->sigma_s *= density;
+ coeff->sigma_t *= density;
+ coeff->emission *= density;
+
return true;
}
@@ -101,29 +107,39 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
#ifdef __VOLUME__
-ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
+ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
{
+ float step_size = FLT_MAX;
+
for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
+ bool heterogeneous = false;
+
if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
- return true;
+ heterogeneous = true;
}
- else if (shader_flag & SD_NEED_ATTRIBUTES) {
+ else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
/* We want to render world or objects without any volume grids
- * as homogenous, but can only verify this at runtime since other
- * heterogenous volume objects may be using the same shader. */
+ * as homogeneous, but can only verify this at run-time since other
+ * heterogeneous volume objects may be using the same shader. */
int object = stack[i].object;
if (object != OBJECT_NONE) {
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
- return true;
+ heterogeneous = true;
}
}
}
+
+ if (heterogeneous) {
+ float object_step_size = object_volume_step_size(kg, stack[i].object);
+ object_step_size *= kernel_data.integrator.volume_step_rate;
+ step_size = fminf(object_step_size, step_size);
+ }
}
- return false;
+ return step_size;
}
ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
@@ -158,12 +174,13 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
ccl_addr_space PathState *state,
+ const float object_step_size,
float t,
float *step_size,
float *step_offset)
{
const int max_steps = kernel_data.integrator.volume_max_steps;
- float step = min(kernel_data.integrator.volume_step_size, t);
+ float step = min(object_step_size, t);
/* compute exact steps in advance for malloc */
if (t > max_steps * step) {
@@ -187,7 +204,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
ShaderData *sd,
float3 *throughput)
{
- float3 sigma_t;
+ float3 sigma_t = make_float3(0.0f, 0.0f, 0.0f);
if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
*throughput *= volume_color_transmittance(sigma_t, ray->t);
@@ -199,7 +216,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
ccl_addr_space PathState *state,
Ray *ray,
ShaderData *sd,
- float3 *throughput)
+ float3 *throughput,
+ const float object_step_size)
{
float3 tp = *throughput;
const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -207,7 +225,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
/* prepare for stepping */
int max_steps = kernel_data.integrator.volume_max_steps;
float step_offset, step_size;
- kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+ kernel_volume_step_init(kg, state, object_step_size, ray->t, &step_size, &step_offset);
/* compute extinction at the start */
float t = 0.0f;
@@ -225,7 +243,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
}
float3 new_P = ray->P + ray->D * (t + step_offset);
- float3 sigma_t;
+ float3 sigma_t = make_float3(0.0f, 0.0f, 0.0f);
/* compute attenuation over segment */
if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
@@ -264,8 +282,9 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
{
shader_setup_from_volume(kg, shadow_sd, ray);
- if (volume_stack_is_heterogeneous(kg, state->volume_stack))
- kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput);
+ float step_size = volume_stack_step_size(kg, state->volume_stack);
+ if (step_size != FLT_MAX)
+ kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size);
else
kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
}
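
The homogeneous branch above attenuates shadow throughput in closed form. A minimal sketch of that attenuation, assuming volume_color_transmittance() is the usual per-channel Beer-Lambert term exp(-sigma_t * t).

/* Minimal sketch of homogeneous shadow attenuation, assuming per-channel
 * Beer-Lambert transmittance: exp(-sigma_t * t). */
#include <math.h>
#include <stdio.h>

typedef struct { float x, y, z; } vec3;

static vec3 transmittance(vec3 sigma_t, float t)
{
  vec3 T = {expf(-sigma_t.x * t), expf(-sigma_t.y * t), expf(-sigma_t.z * t)};
  return T;
}

int main(void)
{
  vec3 sigma_t = {0.5f, 1.0f, 2.0f};     /* extinction per color channel */
  vec3 T = transmittance(sigma_t, 2.0f); /* ray length 2 */
  printf("shadow throughput: %.3f %.3f %.3f\n", T.x, T.y, T.z);
  /* ~0.368 0.135 0.018: denser channels are attenuated more strongly */
  return 0;
}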
@@ -428,7 +447,7 @@ kernel_volume_integrate_homogeneous(KernelGlobals *kg,
ccl_addr_space float3 *throughput,
bool probalistic_scatter)
{
- VolumeShaderCoefficients coeff;
+ VolumeShaderCoefficients coeff ccl_optional_struct_init;
if (!volume_shader_sample(kg, sd, state, ray->P, &coeff))
return VOLUME_PATH_MISSED;
@@ -504,7 +523,7 @@ kernel_volume_integrate_homogeneous(KernelGlobals *kg,
float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t);
float3 emission = kernel_volume_emission_integrate(
&coeff, closure_flag, transmittance, ray->t);
- path_radiance_accum_emission(L, state, *throughput, emission);
+ path_radiance_accum_emission(kg, L, state, *throughput, emission);
}
/* modify throughput */
@@ -533,7 +552,8 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
Ray *ray,
ShaderData *sd,
PathRadiance *L,
- ccl_addr_space float3 *throughput)
+ ccl_addr_space float3 *throughput,
+ const float object_step_size)
{
float3 tp = *throughput;
const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -541,7 +561,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
/* prepare for stepping */
int max_steps = kernel_data.integrator.volume_max_steps;
float step_offset, step_size;
- kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+ kernel_volume_step_init(kg, state, object_step_size, ray->t, &step_size, &step_offset);
/* compute coefficients at the start */
float t = 0.0f;
@@ -559,13 +579,13 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
float dt = new_t - t;
/* use random position inside this segment to sample shader,
- * for last shorter step we remap it to fit within the segment. */
+ * for last shorter step we remap it to fit within the segment. */
if (new_t == ray->t) {
step_offset *= (new_t - t) / step_size;
}
float3 new_P = ray->P + ray->D * (t + step_offset);
- VolumeShaderCoefficients coeff;
+ VolumeShaderCoefficients coeff ccl_optional_struct_init;
/* compute segment */
if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
@@ -621,6 +641,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
new_tp = tp * transmittance;
}
else {
+ transmittance = make_float3(0.0f, 0.0f, 0.0f);
new_tp = tp;
}
@@ -628,7 +649,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
if (L && (closure_flag & SD_EMISSION)) {
float3 emission = kernel_volume_emission_integrate(
&coeff, closure_flag, transmittance, dt);
- path_radiance_accum_emission(L, state, tp, emission);
+ path_radiance_accum_emission(kg, L, state, tp, emission);
}
/* modify throughput */
@@ -671,19 +692,20 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
* ray, with the assumption that there are no surfaces blocking light
* between the endpoints. distance sampling is used to decide if we will
* scatter or not. */
-ccl_device_noinline VolumeIntegrateResult
+ccl_device_noinline_cpu VolumeIntegrateResult
kernel_volume_integrate(KernelGlobals *kg,
ccl_addr_space PathState *state,
ShaderData *sd,
Ray *ray,
PathRadiance *L,
ccl_addr_space float3 *throughput,
- bool heterogeneous)
+ float step_size)
{
shader_setup_from_volume(kg, sd, ray);
- if (heterogeneous)
- return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput);
+ if (step_size != FLT_MAX)
+ return kernel_volume_integrate_heterogeneous_distance(
+ kg, state, ray, sd, L, throughput, step_size);
else
return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
}
@@ -734,7 +756,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
Ray *ray,
ShaderData *sd,
VolumeSegment *segment,
- bool heterogeneous)
+ const float object_step_size)
{
const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -742,9 +764,9 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
int max_steps;
float step_size, step_offset;
- if (heterogeneous) {
+ if (object_step_size != FLT_MAX) {
max_steps = kernel_data.integrator.volume_max_steps;
- kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+ kernel_volume_step_init(kg, state, object_step_size, ray->t, &step_size, &step_offset);
# ifdef __KERNEL_CPU__
/* NOTE: For the branched path tracing it's possible to have direct
@@ -794,13 +816,13 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
float dt = new_t - t;
/* use random position inside this segment to sample shader,
- * for last shorter step we remap it to fit within the segment. */
+ * for last shorter step we remap it to fit within the segment. */
if (new_t == ray->t) {
step_offset *= (new_t - t) / step_size;
}
float3 new_P = ray->P + ray->D * (t + step_offset);
- VolumeShaderCoefficients coeff;
+ VolumeShaderCoefficients coeff ccl_optional_struct_init;
/* compute segment */
if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
@@ -1277,7 +1299,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
*/
if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = PRIM_NONE;
+ stack[0].object = OBJECT_NONE;
stack[1].shader = SHADER_NONE;
}
else {
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 799561a7466..d1602744f1d 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN
* Utility functions for work stealing
*/
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+#ifdef __KERNEL_CUDA__
+ /* Keeping threads for the same pixel together improves performance on CUDA. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#else /* __KERNEL_CUDA__ */
+ uint tile_pixels = tile->w * tile->h;
+ uint sample_offset = global_work_index / tile_pixels;
+ uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+#endif /* __KERNEL_CUDA__ */
+ uint y_offset = pixel_offset / tile->w;
+ uint x_offset = pixel_offset - y_offset * tile->w;
+
+ *x = tile->x + x_offset;
+ *y = tile->y + y_offset;
+ *sample = tile->start_sample + sample_offset;
+}
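
A worked example of the non-CUDA decomposition in get_work_pixel(), with made-up tile values, showing how one flat work index splits into pixel coordinates and a sample number.

/* Standalone walk-through of the non-CUDA index decomposition above. */
#include <assert.h>

int main(void)
{
  const unsigned tile_x = 64, tile_y = 128, tile_w = 32, tile_h = 16;
  const unsigned start_sample = 8, global_work_index = 1000;

  const unsigned tile_pixels = tile_w * tile_h;                               /* 512 */
  const unsigned sample_offset = global_work_index / tile_pixels;             /* 1 */
  const unsigned pixel_offset = global_work_index - sample_offset * tile_pixels; /* 488 */
  const unsigned y_offset = pixel_offset / tile_w;                            /* 15 */
  const unsigned x_offset = pixel_offset - y_offset * tile_w;                 /* 8 */

  assert(tile_x + x_offset == 72);
  assert(tile_y + y_offset == 143);
  assert(start_sample + sample_offset == 9);
  return 0;
}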
+
#ifdef __KERNEL_OPENCL__
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
#ifdef __SPLIT_KERNEL__
/* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
+ccl_device bool get_next_work_item(KernelGlobals *kg,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
+ ccl_private uint *global_work_index)
{
/* With a small amount of work there may be more threads than work due to
* rounding up of global size, stop such threads immediately. */
@@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg,
/* Test if all work for this pool is done. */
return (*global_work_index < total_work_size);
}
-#endif
-/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
- uint global_work_index,
- ccl_private uint *x,
- ccl_private uint *y,
- ccl_private uint *sample)
+ccl_device bool get_next_work(KernelGlobals *kg,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
+ ccl_private uint *global_work_index)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
- uint tile_pixels = tile->w * tile->h;
- uint sample_offset = global_work_index / tile_pixels;
- uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
- uint y_offset = pixel_offset / tile->w;
- uint x_offset = pixel_offset - y_offset * tile->w;
-
- *x = tile->x + x_offset;
- *y = tile->y + y_offset;
- *sample = tile->start_sample + sample_offset;
+ bool got_work = false;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ do {
+ got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+ if (got_work) {
+ ccl_global WorkTile *tile = &kernel_split_params.tile;
+ uint x, y, sample;
+ get_work_pixel(tile, *global_work_index, &x, &y, &sample);
+ uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if ((*aux).w == 0.0f) {
+ break;
+ }
+ }
+ } while (got_work);
+ }
+ else {
+ got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+ }
+ return got_work;
}
+#endif
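
A small worked example of the buffer-offset arithmetic used in the adaptive-sampling check above, with made-up stride and pass values; the only thing the check reads is the w component of the aux pass at that offset.

/* Made-up numbers, same indexing as the adaptive-sampling check above. */
#include <assert.h>

int main(void)
{
  const unsigned tile_offset = 0, stride = 1920, pass_stride = 36;
  const unsigned pass_adaptive_aux_buffer = 28; /* float offset within a pixel */
  const unsigned x = 100, y = 50;

  const unsigned buffer_offset = (tile_offset + x + y * stride) * pass_stride;
  const unsigned aux_offset = buffer_offset + pass_adaptive_aux_buffer;

  assert(buffer_offset == 96100 * 36);
  assert(aux_offset == 3459600 + 28);
  return 0;
}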
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h
new file mode 100644
index 00000000000..410218d91d4
--- /dev/null
+++ b/intern/cycles/kernel/kernel_write_passes.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+# define __ATOMIC_PASS_WRITE__
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
+{
+ ccl_global float *buf = buffer;
+#ifdef __ATOMIC_PASS_WRITE__
+ atomic_add_and_fetch_float(buf, value);
+#else
+ *buf += value;
+#endif
+}
+
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
+{
+#ifdef __ATOMIC_PASS_WRITE__
+ ccl_global float *buf_x = buffer + 0;
+ ccl_global float *buf_y = buffer + 1;
+ ccl_global float *buf_z = buffer + 2;
+
+ atomic_add_and_fetch_float(buf_x, value.x);
+ atomic_add_and_fetch_float(buf_y, value.y);
+ atomic_add_and_fetch_float(buf_z, value.z);
+#else
+ ccl_global float3 *buf = (ccl_global float3 *)buffer;
+ *buf += value;
+#endif
+}
+
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
+{
+#ifdef __ATOMIC_PASS_WRITE__
+ ccl_global float *buf_x = buffer + 0;
+ ccl_global float *buf_y = buffer + 1;
+ ccl_global float *buf_z = buffer + 2;
+ ccl_global float *buf_w = buffer + 3;
+
+ atomic_add_and_fetch_float(buf_x, value.x);
+ atomic_add_and_fetch_float(buf_y, value.y);
+ atomic_add_and_fetch_float(buf_z, value.z);
+ atomic_add_and_fetch_float(buf_w, value.w);
+#else
+ ccl_global float4 *buf = (ccl_global float4 *)buffer;
+ *buf += value;
+#endif
+}
+
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+{
+ kernel_write_pass_float(buffer, value);
+
+ /* The online one-pass variance update that's used for the megakernel can't easily be implemented
+ * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+ kernel_write_pass_float(buffer + 1, value * value);
+}
+
+# ifdef __ATOMIC_PASS_WRITE__
+# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+# else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+{
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+}
+# endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+{
+ kernel_write_pass_float3_unaligned(buffer, value);
+ kernel_write_pass_float3_unaligned(buffer + 3, value * value);
+}
+#endif /* __DENOISING_FEATURES__ */
+
+CCL_NAMESPACE_END
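
The *_variance helpers above only accumulate the value and its square into adjacent pass slots; the E[x^2] - 1/N * (E[x])^2 fallback mentioned in the comment is applied later, when the accumulated sums are turned into a variance estimate. A minimal sketch of that reconstruction, with hypothetical names (sum, sum_sq, N for the two accumulated pass values and the sample count):

/* Illustration only: turning the two accumulated pass values into a variance
 * estimate. 'sum' = sum of x_i, 'sum_sq' = sum of x_i^2, N = sample count. */
static float pass_variance_from_sums(float sum, float sum_sq, int N)
{
  /* Sum of squared deviations: sum(x_i^2) - (sum(x_i))^2 / N. Dividing by N
   * (or N - 1) afterwards would give the usual biased/unbiased variance. */
  return sum_sq - (sum * sum) / (float)N;
}
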
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index f2146302a27..8040bfb7b33 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -64,15 +64,17 @@ CCL_NAMESPACE_BEGIN
/* Memory Copy */
-void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size)
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
{
- if (strcmp(name, "__data") == 0)
- memcpy(&kg->__data, host, size);
- else
+ if (strcmp(name, "__data") == 0) {
+ kg->__data = *(KernelData *)host;
+ }
+ else {
assert(0);
+ }
}
-void kernel_tex_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
{
if (0) {
}
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index f5d981fb71a..ea3103f12c3 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -46,6 +46,9 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample);
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
/* Split kernels */
void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg,
@@ -89,5 +92,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index 4289e2bbb85..f87501db258 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -19,6 +19,10 @@
CCL_NAMESPACE_BEGIN
+/* Make template functions private so symbols don't conflict between kernels with different
+ * instruction sets. */
+namespace {
+
template<typename T> struct TextureInterpolator {
#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
{ \
@@ -470,7 +474,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
- switch (kernel_tex_type(id)) {
+ switch (info.data_type) {
case IMAGE_DATA_TYPE_HALF:
return TextureInterpolator<half>::interp(info, x, y);
case IMAGE_DATA_TYPE_BYTE:
@@ -494,28 +498,34 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4 kernel_tex_image_interp_3d(
- KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
- switch (kernel_tex_type(id)) {
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ switch (info.data_type) {
case IMAGE_DATA_TYPE_HALF:
- return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_BYTE:
- return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_USHORT:
- return TextureInterpolator<uint16_t>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_FLOAT:
- return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_HALF4:
- return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_BYTE4:
- return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_USHORT4:
- return TextureInterpolator<ushort4>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp);
case IMAGE_DATA_TYPE_FLOAT4:
- return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
+ return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp);
default:
assert(0);
return make_float4(
@@ -523,6 +533,8 @@ ccl_device float4 kernel_tex_image_interp_3d(
}
}
+} /* Namespace. */
+
CCL_NAMESPACE_END
#endif // __KERNEL_CPU_IMAGE_H__
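
The unnamed namespace introduced above gives the TextureInterpolator templates internal linkage, so each per-instruction-set kernel library keeps its own private instantiations and they cannot collide at link time. A minimal sketch of the mechanism, with hypothetical names:

/* Illustration only: names inside an unnamed namespace have internal linkage,
 * so two translation units compiled with different flags each get their own
 * copy and the linker never sees conflicting symbols. */
namespace {
template<typename T> T clamp01(T v)
{
  return v < T(0) ? T(0) : (v > T(1) ? T(1) : v);
}
} /* namespace */

float use_clamp01(float v)
{
  return clamp01(v); /* resolves to this translation unit's private copy */
}
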
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 9ca3f46b5b6..5aa3fb14318 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -20,6 +20,7 @@
* simply includes this file without worry of copying actual implementation over.
*/
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#ifndef KERNEL_STUB
@@ -58,6 +59,10 @@
# include "kernel/split/kernel_next_iteration_setup.h"
# include "kernel/split/kernel_indirect_subsurface.h"
# include "kernel/split/kernel_buffer_update.h"
+# include "kernel/split/kernel_adaptive_stopping.h"
+# include "kernel/split/kernel_adaptive_filter_x.h"
+# include "kernel/split/kernel_adaptive_filter_y.h"
+# include "kernel/split/kernel_adaptive_adjust_samples.h"
# endif /* __SPLIT_KERNEL__ */
#else
# define STUB_ASSERT(arch, name) \
@@ -67,6 +72,7 @@
# include "kernel/split/kernel_data_init.h"
# endif /* __SPLIT_KERNEL__ */
#endif /* KERNEL_STUB */
+// clang-format on
CCL_NAMESPACE_BEGIN
@@ -126,6 +132,18 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
# endif /* KERNEL_STUB */
}
+/* Bake */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif /* KERNEL_STUB */
+}
+
/* Shader Evaluate */
void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
@@ -140,12 +158,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
# ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, shader);
# else
- if (type >= SHADER_EVAL_BAKE) {
-# ifdef __BAKING__
- kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, filter, i, offset, sample);
-# endif
- }
- else if (type == SHADER_EVAL_DISPLACE) {
+ if (type == SHADER_EVAL_DISPLACE) {
kernel_displace_evaluate(kg, input, output, i);
}
else {
@@ -204,6 +217,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
#endif /* __SPLIT_KERNEL__ */
#undef KERNEL_STUB
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 5b552b01413..6c9642d1f03 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -28,6 +28,74 @@
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_copy_input(float *buffer,
+ CCL_FILTER_TILE_INFO,
+ int4 prefilter_rect,
+ int buffer_pass_stride)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
+ int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
+ int itile = ytile * 3 + xtile;
+ float *const in = ((float *)ccl_get_tile_buffer(itile)) +
+ (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
+ buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
+ for (int i = 0; i < buffer_pass_stride; ++i)
+ buffer[i] = in[i];
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < sw && y < sh) {
+ if (num_inputs > 0) {
+ float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
+ float *out = rgb + (x + y * sw) * 3;
+ out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
+ out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
+ out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
+ }
+ if (num_inputs > 1) {
+ float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
+ float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
+ out[0] = in[0] / num_samples;
+ out[1] = in[1] / num_samples;
+ out[2] = in[2] / num_samples;
+ }
+ if (num_inputs > 2) {
+ float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
+ float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
+ out[0] = in[0] / num_samples;
+ out[1] = in[1] / num_samples;
+ out[2] = in[2] / num_samples;
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < sw && y < sh) {
+ float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
+ float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
+ out[0] = in[0] * num_samples;
+ out[1] = in[1] * num_samples;
+ out[2] = in[2] * num_samples;
+ }
+}
+
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
CCL_FILTER_TILE_INFO,
float *unfilteredA,
@@ -97,14 +165,14 @@ kernel_cuda_filter_write_feature(int sample,
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
+ kernel_filter_write_feature(sample,
+ x + filter_area.x,
+ y + filter_area.y,
+ buffer_params,
+ from,
+ buffer,
+ out_offset,
+ prefilter_rect);
}
}
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index af311027f78..d4f41132a11 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -33,6 +33,7 @@
#include "kernel/kernel_path_branched.h"
#include "kernel/kernel_bake.h"
#include "kernel/kernel_work_stealing.h"
+#include "kernel/kernel_adaptive_sampling.h"
/* kernels */
extern "C" __global__ void
@@ -83,6 +84,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
+{
+ int work_index = ccl_global_id(0);
+ bool thread_is_active = work_index < total_work_size;
+ KernelGlobals kg;
+ if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
+ uint x = tile->x + work_index % tile->w;
+ uint y = tile->y + work_index / tile->w;
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ kernel_do_adaptive_stopping(&kg, buffer, sample);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
+{
+ KernelGlobals kg;
+ if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+ if(ccl_global_id(0) < tile->h) {
+ int y = tile->y + ccl_global_id(0);
+ kernel_do_adaptive_filter_x(&kg, y, tile);
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
+{
+ KernelGlobals kg;
+ if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+ if(ccl_global_id(0) < tile->w) {
+ int x = tile->x + ccl_global_id(0);
+ kernel_do_adaptive_filter_y(&kg, x, tile);
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
+{
+ if(kernel_data.film.pass_adaptive_aux_buffer) {
+ int work_index = ccl_global_id(0);
+ bool thread_is_active = work_index < total_work_size;
+ KernelGlobals kg;
+ if(thread_is_active) {
+ uint x = tile->x + work_index % tile->w;
+ uint y = tile->y + work_index / tile->w;
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
+ if(sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
+ }
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
@@ -144,13 +214,16 @@ kernel_cuda_background(uint4 *input,
#ifdef __BAKING__
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
+kernel_cuda_bake(WorkTile *tile, uint total_work_size)
{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int work_index = ccl_global_id(0);
+
+ if(work_index < total_work_size) {
+ uint x, y, sample;
+ get_work_pixel(tile, work_index, &x, &y, &sample);
- if(x < sx + sw) {
KernelGlobals kg;
- kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+ kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
}
}
#endif
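
The adaptive scale-samples kernel above exists so that the single uniform divide by the final sample count still produces a correct average for pixels that adaptive sampling stopped early: their pass sums are multiplied by final_sample / stopped_at before that divide. A minimal sketch of the idea, with hypothetical names rather than the kernel's actual buffer layout:

/* Illustration only: 'accum' is a pixel's pass sum, 'stopped_at' the sample
 * count at which adaptive sampling stopped it, 'final_sample' the tile-wide
 * sample count used for the uniform divide. */
static float displayed_average(float accum, int stopped_at, int final_sample)
{
  /* Scale the sum up as if the pixel had kept sampling to final_sample... */
  float scaled = accum * ((float)final_sample / (float)stopped_at);
  /* ...so the uniform divide recovers the mean of the samples actually taken. */
  return scaled / (float)final_sample; /* equals accum / stopped_at */
}
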
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
index d9f349837a8..3ec00762e72 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -61,7 +61,8 @@
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
-/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of registers */
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
# define CUDA_KERNEL_MAX_REGISTERS 64
# else
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index 7c68f08ea10..1d425d132a1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -124,7 +124,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
CUtexObject tex = (CUtexObject)info.data;
/* float4, byte4, ushort4 and half4 */
- const int texture_type = kernel_tex_type(id);
+ const int texture_type = info.data_type;
if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
if (info.interpolation == INTERPOLATION_CUBIC) {
@@ -149,14 +149,25 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4 kernel_tex_image_interp_3d(
- KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ const float x = P.x;
+ const float y = P.y;
+ const float z = P.z;
+
CUtexObject tex = (CUtexObject)info.data;
uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
- const int texture_type = kernel_tex_type(id);
+ const int texture_type = info.data_type;
if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
if (interpolation == INTERPOLATION_CUBIC) {
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 43b3d0aa0e6..95ad7599cf1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -43,6 +43,10 @@
#include "kernel/split/kernel_next_iteration_setup.h"
#include "kernel/split/kernel_indirect_subsurface.h"
#include "kernel/split/kernel_buffer_update.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
#include "kernel/kernel_film.h"
@@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
new file mode 100644
index 00000000000..ebdb99d4730
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
+
+#define KERNEL_NAME adaptive_adjust_samples
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
new file mode 100644
index 00000000000..76d82d4184e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+
+#define KERNEL_NAME adaptive_filter_x
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
new file mode 100644
index 00000000000..1e6d15ba0f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+
+#define KERNEL_NAME adaptive_filter_y
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
new file mode 100644
index 00000000000..51de0059667
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+
+#define KERNEL_NAME adaptive_stopping
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index b6390679331..9ab374d1fba 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -47,7 +47,7 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg,
int id,
int offset)
{
- const int texture_type = kernel_tex_type(id);
+ const int texture_type = info->data_type;
/* Float4 */
if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
@@ -77,7 +77,7 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg,
return make_float4(f, f, f, 1.0f);
}
/* Byte */
-#ifdef cl_khr_fp16
+#ifdef __KERNEL_CL_KHR_FP16__
/* half and half4 are optional in OpenCL */
else if (texture_type == IMAGE_DATA_TYPE_HALF) {
float f = tex_fetch(half, info, offset);
@@ -202,11 +202,19 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4
-kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp)
{
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+ if (info->use_transform_3d) {
+ Transform tfm = info->transform_3d;
+ P = transform_point(&tfm, P);
+ }
+
+ const float x = P.x;
+ const float y = P.y;
+ const float z = P.z;
+
if (info->extension == EXTENSION_CLIP) {
if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
index 6041f13b52b..c3b7b09460a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
@@ -28,3 +28,7 @@
#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
new file mode 100644
index 00000000000..3b166e59dfd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
@@ -0,0 +1,329 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include "kernel/kernel_compat_optix.h"
+#include "util/util_atomic.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics
+
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_bake.h"
+// clang-format on
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
+{
+ return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0());
+}
+template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
+{
+ return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2());
+}
+
+template<bool always = false> ccl_device_forceinline uint get_object_id()
+{
+#ifdef __OBJECT_MOTION__
+ // Always get the instance ID from the TLAS
+ // There might be a motion transform node between TLAS and BLAS which does not have one
+ uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+#else
+ uint object = optixGetInstanceId();
+#endif
+ // Choose between always returning object ID or only for instances
+ if (always)
+ // Can just remove the high bit since instance always contains object ID
+ return object & 0x7FFFFF;
+ // Set to OBJECT_NONE if this is not an instanced object
+ else if (object & 0x800000)
+ object = OBJECT_NONE;
+ return object;
+}
+
+extern "C" __global__ void __raygen__kernel_optix_path_trace()
+{
+ KernelGlobals kg; // Allocate stack storage for common data
+
+ const uint3 launch_index = optixGetLaunchIndex();
+ // Keep threads for same pixel together to improve occupancy of warps
+ uint pixel_offset = launch_index.x / __params.tile.num_samples;
+ uint sample_offset = launch_index.x % __params.tile.num_samples;
+
+ kernel_path_trace(&kg,
+ __params.tile.buffer,
+ __params.tile.start_sample + sample_offset,
+ __params.tile.x + pixel_offset,
+ __params.tile.y + launch_index.y,
+ __params.tile.offset,
+ __params.tile.stride);
+}
+
+#ifdef __BAKING__
+extern "C" __global__ void __raygen__kernel_optix_bake()
+{
+ KernelGlobals kg;
+ const ShaderParams &p = __params.shader;
+ kernel_bake_evaluate(&kg,
+ p.input,
+ p.output,
+ (ShaderEvalType)p.type,
+ p.filter,
+ p.sx + optixGetLaunchIndex().x,
+ p.offset,
+ p.sample);
+}
+#endif
+
+extern "C" __global__ void __raygen__kernel_optix_displace()
+{
+ KernelGlobals kg;
+ const ShaderParams &p = __params.shader;
+ kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_background()
+{
+ KernelGlobals kg;
+ const ShaderParams &p = __params.shader;
+ kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+}
+
+extern "C" __global__ void __miss__kernel_optix_miss()
+{
+ // 'kernel_path_lamp_emission' checks the intersection distance, so we need to set it even on a miss
+ optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
+ optixSetPayload_5(PRIMITIVE_NONE);
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_local_hit()
+{
+#ifdef __BVH_LOCAL__
+ const uint object = get_object_id<true>();
+ if (object != optixGetPayload_4() /* local_object */) {
+ // Only intersect with matching object
+ return optixIgnoreIntersection();
+ }
+
+ int hit = 0;
+ uint *const lcg_state = get_payload_ptr_0<uint>();
+ LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
+
+ if (lcg_state) {
+ const uint max_hits = optixGetPayload_5();
+ for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+ if (optixGetRayTmax() == local_isect->hits[i].t) {
+ return optixIgnoreIntersection();
+ }
+ }
+
+ hit = local_isect->num_hits++;
+
+ if (local_isect->num_hits > max_hits) {
+ hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
+ if (hit >= max_hits) {
+ return optixIgnoreIntersection();
+ }
+ }
+ }
+ else {
+ if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
+ // Record closest intersection only
+ // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
+ return optixIgnoreIntersection();
+ }
+
+ local_isect->num_hits = 1;
+ }
+
+ Intersection *isect = &local_isect->hits[hit];
+ isect->t = optixGetRayTmax();
+ isect->prim = optixGetPrimitiveIndex();
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__prim_type, isect->prim);
+
+ const float2 barycentrics = optixGetTriangleBarycentrics();
+ isect->u = 1.0f - barycentrics.y - barycentrics.x;
+ isect->v = barycentrics.x;
+
+ // Record geometric normal
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0));
+ const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1));
+ const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2));
+ local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+ // Continue tracing (without this the trace call would return after the first hit)
+ optixIgnoreIntersection();
+#endif
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
+{
+#ifdef __SHADOW_RECORD_ALL__
+ const uint prim = optixGetPrimitiveIndex();
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
+ return optixIgnoreIntersection();
+ }
+# endif
+
+ // Offset into array with num_hits
+ Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2();
+ isect->t = optixGetRayTmax();
+ isect->prim = prim;
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__prim_type, prim);
+
+ if (optixIsTriangleHit()) {
+ const float2 barycentrics = optixGetTriangleBarycentrics();
+ isect->u = 1.0f - barycentrics.y - barycentrics.x;
+ isect->v = barycentrics.x;
+ }
+# ifdef __HAIR__
+ else {
+ const float u = __uint_as_float(optixGetAttribute_0());
+ isect->u = u;
+ isect->v = __uint_as_float(optixGetAttribute_1());
+
+ // Filter out curve endcaps
+ if (u == 0.0f || u == 1.0f) {
+ return optixIgnoreIntersection();
+ }
+ }
+# endif
+
+# ifdef __TRANSPARENT_SHADOWS__
+ // Detect if this surface has a shader with transparent shadows
+ if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= optixGetPayload_3()) {
+# endif
+ // This is an opaque hit or the hit limit has been reached, abort traversal
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+# ifdef __TRANSPARENT_SHADOWS__
+ }
+
+ optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++
+
+ // Continue tracing
+ optixIgnoreIntersection();
+# endif
+#endif
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
+ uint visibility = optixGetPayload_4();
+#ifdef __VISIBILITY_FLAG__
+ const uint prim = optixGetPrimitiveIndex();
+ if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
+ return optixIgnoreIntersection();
+ }
+#endif
+
+#ifdef __HAIR__
+ if (!optixIsTriangleHit()) {
+ // Filter out curve endcaps
+ const float u = __uint_as_float(optixGetAttribute_0());
+ if (u == 0.0f || u == 1.0f) {
+ return optixIgnoreIntersection();
+ }
+ }
+#endif
+
+ // Shadow ray early termination
+ if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+ return optixTerminateRay();
+ }
+}
+
+extern "C" __global__ void __closesthit__kernel_optix_hit()
+{
+ optixSetPayload_0(__float_as_uint(optixGetRayTmax())); // Intersection distance
+ optixSetPayload_3(optixGetPrimitiveIndex());
+ optixSetPayload_4(get_object_id());
+ // Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index
+ optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex()));
+
+ if (optixIsTriangleHit()) {
+ const float2 barycentrics = optixGetTriangleBarycentrics();
+ optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
+ optixSetPayload_2(__float_as_uint(barycentrics.x));
+ }
+ else {
+ optixSetPayload_1(optixGetAttribute_0()); // Same as 'optixGetCurveParameter()'
+ optixSetPayload_2(optixGetAttribute_1());
+ }
+}
+
+#ifdef __HAIR__
+ccl_device_inline void optix_intersection_curve(const uint prim, const uint type)
+{
+ const uint object = get_object_id<true>();
+ const uint visibility = optixGetPayload_4();
+
+ float3 P = optixGetObjectRayOrigin();
+ float3 dir = optixGetObjectRayDirection();
+
+ // The direction is not normalized by default, but the curve intersection routine expects that
+ float len;
+ dir = normalize_len(dir, &len);
+
+# ifdef __OBJECT_MOTION__
+ const float time = optixGetRayTime();
+# else
+ const float time = 0.0f;
+# endif
+
+ Intersection isect;
+ isect.t = optixGetRayTmax();
+ // Transform maximum distance into object space
+ if (isect.t != FLT_MAX)
+ isect.t *= len;
+
+ if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
+ optixReportIntersection(isect.t / len,
+ type & PRIMITIVE_ALL,
+ __float_as_int(isect.u), // Attribute_0
+ __float_as_int(isect.v)); // Attribute_1
+ }
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+ const uint prim = optixGetPrimitiveIndex();
+ const uint type = kernel_tex_fetch(__prim_type, prim);
+
+ if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+ optix_intersection_curve(prim, type);
+ }
+}
+
+extern "C" __global__ void __intersection__curve_all()
+{
+ const uint prim = optixGetPrimitiveIndex();
+ const uint type = kernel_tex_fetch(__prim_type, prim);
+ optix_intersection_curve(prim, type);
+}
+#endif
+
+#ifdef __KERNEL_DEBUG__
+extern "C" __global__ void __exception__kernel_optix_exception()
+{
+ printf("Unhandled exception occurred: code %d!\n", optixGetExceptionCode());
+}
+#endif
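
The get_payload_ptr_0()/get_payload_ptr_2() helpers above rebuild 64-bit pointers from pairs of 32-bit OptiX payload registers. A minimal sketch of the packing and unpacking in plain C++, with hypothetical names; the real code passes the two halves through the trace call's payload slots:

/* Illustration only: splitting a 64-bit pointer into two 32-bit payload
 * values and reassembling it, as the payload helpers above do. */
#include <cstdint>

static void pack_ptr(void *ptr, uint32_t *lo, uint32_t *hi)
{
  uint64_t bits = (uint64_t)(uintptr_t)ptr;
  *lo = (uint32_t)(bits & 0xffffffffu); /* e.g. payload 0 (or 2) */
  *hi = (uint32_t)(bits >> 32);         /* e.g. payload 1 (or 3) */
}

static void *unpack_ptr(uint32_t lo, uint32_t hi)
{
  return (void *)(uintptr_t)(((uint64_t)hi << 32) | lo);
}
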
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 28d9ca854db..d7ab778181e 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -27,10 +27,24 @@ set(HEADER_SRC
set(LIB
cycles_render
+
+ ${OSL_LIBRARIES}
+ ${OPENIMAGEIO_LIBRARIES}
+ ${LLVM_LIBRARY}
)
+# OSL and LLVM are built without RTTI
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
+if(APPLE)
+ # Disable allocation warning on macOS prior to 10.14: OSLRenderServices
+ # contains a member which is 64 bytes aligned (the cache inside of OIIO's
+ # unordered_map_concurrent). This is not something the SDK supports, but
+ # since we take care of allocations ourselves it is OK to ignore the
+ # diagnostic message.
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-allocation")
+endif()
+
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index b395227845d..3f9de5ab33d 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -36,9 +36,11 @@
#include "kernel/osl/osl_closures.h"
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index c5edc7c9be3..76a2e41abfa 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -37,10 +37,12 @@
#include "kernel/kernel_compat_cpu.h"
#include "kernel/osl/osl_closures.h"
+// clang-format off
#include "kernel/kernel_types.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/bsdf_diffuse_ramp.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index 4b7e59ff932..b78dc8a3a67 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -37,9 +37,11 @@
#include "kernel/kernel_compat_cpu.h"
#include "kernel/osl/osl_closures.h"
+// clang-format off
#include "kernel/kernel_types.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/bsdf_phong_ramp.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index c29ddb13e2e..d656723bac2 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -36,10 +36,12 @@
#include "kernel/osl/osl_closures.h"
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_types.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index dd52c33071c..c5ca8616fbd 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -35,6 +35,7 @@
#include "kernel/kernel_compat_cpu.h"
#include "kernel/osl/osl_closures.h"
+// clang-format off
#include "kernel/kernel_types.h"
#include "kernel/kernel_montecarlo.h"
@@ -43,6 +44,7 @@
#include "kernel/closure/bsdf_diffuse.h"
#include "kernel/closure/bsdf_principled_diffuse.h"
#include "kernel/closure/bssrdf.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index aa7e2727577..7ee467a46dd 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -39,6 +39,7 @@
#include "util/util_math.h"
#include "util/util_param.h"
+// clang-format off
#include "kernel/kernel_types.h"
#include "kernel/kernel_compat_cpu.h"
#include "kernel/split/kernel_split_data_types.h"
@@ -63,6 +64,7 @@
#include "kernel/closure/bsdf_principled_diffuse.h"
#include "kernel/closure/bsdf_principled_sheen.h"
#include "kernel/closure/volume.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
@@ -98,14 +100,14 @@ CLOSURE_FLOAT3_PARAM(DiffuseClosure, params.N),
BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet)
BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley,
- ashikhmin_shirley_aniso,
+ ashikhmin_shirley,
MicrofacetBsdf,
LABEL_GLOSSY | LABEL_REFLECT)
CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, params.N),
CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, params.T),
CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, params.alpha_x),
CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, params.alpha_y),
- BSDF_CLOSURE_CLASS_END(AshikhminShirley, ashikhmin_shirley_aniso)
+ BSDF_CLOSURE_CLASS_END(AshikhminShirley, ashikhmin_shirley)
BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, ToonBsdf, LABEL_DIFFUSE)
CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, params.N),
@@ -119,42 +121,42 @@ CLOSURE_FLOAT3_PARAM(DiffuseClosure, params.N),
CLOSURE_FLOAT_PARAM(GlossyToonClosure, params.smooth),
BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon)
+ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXIsotropic,
+ microfacet_ggx_isotropic,
+ MicrofacetBsdf,
+ LABEL_GLOSSY | LABEL_REFLECT)
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXIsotropicClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXIsotropicClosure, params.alpha_x),
+ BSDF_CLOSURE_CLASS_END(MicrofacetGGXIsotropic, microfacet_ggx_isotropic)
+
BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX,
microfacet_ggx,
MicrofacetBsdf,
LABEL_GLOSSY | LABEL_REFLECT)
CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, params.T),
CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, params.alpha_y),
BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx)
- BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso,
- microfacet_ggx_aniso,
+ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannIsotropic,
+ microfacet_beckmann_isotropic,
MicrofacetBsdf,
LABEL_GLOSSY | LABEL_REFLECT)
- CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, params.N),
- CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, params.T),
- CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, params.alpha_x),
- CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, params.alpha_y),
- BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso)
+ CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannIsotropicClosure, params.N),
+ CLOSURE_FLOAT_PARAM(MicrofacetBeckmannIsotropicClosure, params.alpha_x),
+ BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannIsotropic, microfacet_beckmann_isotropic)
BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann,
microfacet_beckmann,
MicrofacetBsdf,
LABEL_GLOSSY | LABEL_REFLECT)
CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, params.T),
CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, params.alpha_y),
BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann)
- BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso,
- microfacet_beckmann_aniso,
- MicrofacetBsdf,
- LABEL_GLOSSY | LABEL_REFLECT)
- CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, params.N),
- CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, params.T),
- CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, params.alpha_x),
- CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, params.alpha_y),
- BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso)
-
BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction,
microfacet_ggx_refraction,
MicrofacetBsdf,
@@ -197,15 +199,32 @@ CLOSURE_FLOAT3_PARAM(DiffuseClosure, params.N),
CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness),
BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse)
- BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen,
- principled_sheen,
- PrincipledSheenBsdf,
- LABEL_DIFFUSE)
- CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
- BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen)
+ class PrincipledSheenClosure : public CBSDFClosure {
+ public:
+ PrincipledSheenBsdf params;
- /* PRINCIPLED HAIR BSDF */
- class PrincipledHairClosure : public CBSDFClosure {
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ if (!skip(sd, path_flag, LABEL_DIFFUSE)) {
+ PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)bsdf_alloc_osl(
+ sd, sizeof(PrincipledSheenBsdf), weight, &params);
+ sd->flag |= (bsdf) ? bsdf_principled_sheen_setup(sd, bsdf) : 0;
+ }
+ }
+};
+
+static ClosureParam *bsdf_principled_sheen_params()
+{
+ static ClosureParam params[] = {CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
+ CLOSURE_STRING_KEYPARAM(PrincipledSheenClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(PrincipledSheenClosure)};
+ return params;
+}
+
+CCLOSURE_PREPARE_STATIC(closure_bsdf_principled_sheen_prepare, PrincipledSheenClosure)
+
+/* PRINCIPLED HAIR BSDF */
+class PrincipledHairClosure : public CBSDFClosure {
public:
PrincipledHairBSDF params;
@@ -343,13 +362,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
id++,
closure_bsdf_transparent_params(),
closure_bsdf_transparent_prepare);
+
register_closure(
- ss, "microfacet_ggx", id++, bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare);
+ ss, "microfacet", id++, closure_bsdf_microfacet_params(), closure_bsdf_microfacet_prepare);
register_closure(ss,
- "microfacet_ggx_aniso",
+ "microfacet_ggx",
id++,
- bsdf_microfacet_ggx_aniso_params(),
- bsdf_microfacet_ggx_aniso_prepare);
+ bsdf_microfacet_ggx_isotropic_params(),
+ bsdf_microfacet_ggx_isotropic_prepare);
+ register_closure(
+ ss, "microfacet_ggx_aniso", id++, bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare);
register_closure(ss,
"microfacet_ggx_refraction",
id++,
@@ -398,13 +420,13 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
register_closure(ss,
"microfacet_beckmann",
id++,
- bsdf_microfacet_beckmann_params(),
- bsdf_microfacet_beckmann_prepare);
+ bsdf_microfacet_beckmann_isotropic_params(),
+ bsdf_microfacet_beckmann_isotropic_prepare);
register_closure(ss,
"microfacet_beckmann_aniso",
id++,
- bsdf_microfacet_beckmann_aniso_params(),
- bsdf_microfacet_beckmann_aniso_prepare);
+ bsdf_microfacet_beckmann_params(),
+ bsdf_microfacet_beckmann_prepare);
register_closure(ss,
"microfacet_beckmann_refraction",
id++,
@@ -413,8 +435,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
register_closure(ss,
"ashikhmin_shirley",
id++,
- bsdf_ashikhmin_shirley_aniso_params(),
- bsdf_ashikhmin_shirley_aniso_prepare);
+ bsdf_ashikhmin_shirley_params(),
+ bsdf_ashikhmin_shirley_prepare);
register_closure(
ss, "ashikhmin_velvet", id++, bsdf_ashikhmin_velvet_params(), bsdf_ashikhmin_velvet_prepare);
register_closure(
@@ -425,8 +447,11 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
id++,
bsdf_principled_diffuse_params(),
bsdf_principled_diffuse_prepare);
- register_closure(
- ss, "principled_sheen", id++, bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare);
+ register_closure(ss,
+ "principled_sheen",
+ id++,
+ bsdf_principled_sheen_params(),
+ closure_bsdf_principled_sheen_prepare);
register_closure(ss,
"principled_clearcoat",
id++,
@@ -486,6 +511,82 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
return false;
}
+/* Standard Microfacet Closure */
+
+class MicrofacetClosure : public CBSDFClosure {
+ public:
+ MicrofacetBsdf params;
+ ustring distribution;
+ int refract;
+
+ void setup(ShaderData *sd, int path_flag, float3 weight)
+ {
+ static ustring u_ggx("ggx");
+ static ustring u_default("default");
+
+ const int label = (refract) ? LABEL_TRANSMIT : LABEL_REFLECT;
+ if (skip(sd, path_flag, LABEL_GLOSSY | label)) {
+ return;
+ }
+
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
+ sd, sizeof(MicrofacetBsdf), weight, &params);
+
+ if (!bsdf) {
+ return;
+ }
+
+ /* GGX */
+ if (distribution == u_ggx || distribution == u_default) {
+ if (!refract) {
+ if (params.alpha_x == params.alpha_y) {
+ /* Isotropic */
+ sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
+ }
+ else {
+ /* Anisotropic */
+ sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
+ }
+ }
+ else {
+ sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+ }
+ }
+ /* Beckmann */
+ else {
+ if (!refract) {
+ if (params.alpha_x == params.alpha_y) {
+ /* Isotropic */
+ sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
+ }
+ else {
+ /* Anisotropic */
+ sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
+ }
+ }
+ else {
+ sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+ }
+ }
+ }
+};
+
+ClosureParam *closure_bsdf_microfacet_params()
+{
+ static ClosureParam params[] = {CLOSURE_STRING_PARAM(MicrofacetClosure, distribution),
+ CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_x),
+ CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_y),
+ CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.ior),
+ CLOSURE_INT_PARAM(MicrofacetClosure, refract),
+ CLOSURE_STRING_KEYPARAM(MicrofacetClosure, label, "label"),
+ CLOSURE_FINISH_PARAM(MicrofacetClosure)};
+
+ return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_prepare, MicrofacetClosure)
+
/* GGX closures with Fresnel */
class MicrofacetFresnelClosure : public CBSDFClosure {
@@ -497,8 +598,8 @@ class MicrofacetFresnelClosure : public CBSDFClosure {
MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
{
/* Technically, the MultiGGX Glass closure may also transmit. However,
- * since this is set statically and only used for caustic flags, this
- * is probably as good as it gets. */
+ * since this is set statically and only used for caustic flags, this
+ * is probably as good as it gets. */
if (skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
return NULL;
}
@@ -560,7 +661,7 @@ class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
return;
}
- sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
+ sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
}
};
@@ -654,7 +755,7 @@ class MicrofacetMultiGGXAnisoClosure : public MicrofacetMultiClosure {
}
bsdf->ior = 0.0f;
- sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+ sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
}
};
@@ -715,8 +816,8 @@ class MicrofacetMultiFresnelClosure : public CBSDFClosure {
MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
{
/* Technically, the MultiGGX closure may also transmit. However,
- * since this is set statically and only used for caustic flags, this
- * is probably as good as it gets. */
+ * since this is set statically and only used for caustic flags, this
+ * is probably as good as it gets. */
if (skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
return NULL;
}
@@ -779,7 +880,7 @@ class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosu
return;
}
- sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
+ sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd);
}
};
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index d3db6b71f5c..e4058e3a746 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -33,12 +33,12 @@
#ifndef __OSL_CLOSURES_H__
#define __OSL_CLOSURES_H__
-#include "util/util_types.h"
#include "kernel/kernel_types.h"
+#include "util/util_types.h"
+#include <OSL/genclosure.h>
#include <OSL/oslclosure.h>
#include <OSL/oslexec.h>
-#include <OSL/genclosure.h>
CCL_NAMESPACE_BEGIN
@@ -51,6 +51,7 @@ OSL::ClosureParam *closure_bsdf_transparent_params();
OSL::ClosureParam *closure_bssrdf_params();
OSL::ClosureParam *closure_absorption_params();
OSL::ClosureParam *closure_henyey_greenstein_params();
+OSL::ClosureParam *closure_bsdf_microfacet_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
@@ -70,6 +71,7 @@ void closure_bsdf_transparent_prepare(OSL::RendererServices *, int id, void *dat
void closure_bssrdf_prepare(OSL::RendererServices *, int id, void *data);
void closure_absorption_prepare(OSL::RendererServices *, int id, void *data);
void closure_henyey_greenstein_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 641c9967586..c06c9abd4c1 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -21,9 +21,13 @@
# include <OSL/oslexec.h>
+# include <OpenImageIO/refcnt.h>
+# include <OpenImageIO/unordered_map_concurrent.h>
+
# include "util/util_map.h"
# include "util/util_param.h"
# include "util/util_thread.h"
+# include "util/util_unique_ptr.h"
# include "util/util_vector.h"
# ifndef WIN32
@@ -33,6 +37,13 @@ using std::isfinite;
CCL_NAMESPACE_BEGIN
class OSLRenderServices;
+class ColorSpaceProcessor;
+
+/* OSL Globals
+ *
+ * Data needed by the OSL render services that is global to a rendering session.
+ * This includes all OSL shaders, the name-to-attribute mapping and texture handles.
+ */
struct OSLGlobals {
OSLGlobals()
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index eb9f672fd8a..5292b5f8055 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -25,6 +25,7 @@
#include <string.h>
+#include "render/colorspace.h"
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
@@ -38,11 +39,13 @@
#include "util/util_logging.h"
#include "util/util_string.h"
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#include "kernel/split/kernel_split_data_types.h"
#include "kernel/kernel_globals.h"
#include "kernel/kernel_color.h"
#include "kernel/kernel_random.h"
+#include "kernel/kernel_write_passes.h"
#include "kernel/kernel_projection.h"
#include "kernel/kernel_differential.h"
#include "kernel/kernel_montecarlo.h"
@@ -54,10 +57,7 @@
#include "kernel/kernel_projection.h"
#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
-
-#ifdef WITH_PTEX
-# include <Ptexture.h>
-#endif
+// clang-format on
CCL_NAMESPACE_BEGIN
@@ -84,6 +84,7 @@ ustring OSLRenderServices::u_screen("screen");
ustring OSLRenderServices::u_raster("raster");
ustring OSLRenderServices::u_ndc("NDC");
ustring OSLRenderServices::u_object_location("object:location");
+ustring OSLRenderServices::u_object_color("object:color");
ustring OSLRenderServices::u_object_index("object:index");
ustring OSLRenderServices::u_geom_dupli_generated("geom:dupli_generated");
ustring OSLRenderServices::u_geom_dupli_uv("geom:dupli_uv");
@@ -124,34 +125,17 @@ ustring OSLRenderServices::u_I("I");
ustring OSLRenderServices::u_u("u");
ustring OSLRenderServices::u_v("v");
ustring OSLRenderServices::u_empty;
-ustring OSLRenderServices::u_at_bevel("@bevel");
-ustring OSLRenderServices::u_at_ao("@ao");
-OSLRenderServices::OSLRenderServices()
+OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
+ : texture_system(texture_system)
{
- kernel_globals = NULL;
- osl_ts = NULL;
-
-#ifdef WITH_PTEX
- size_t maxmem = 16384 * 1024;
- ptex_cache = PtexCache::create(0, maxmem);
-#endif
}
OSLRenderServices::~OSLRenderServices()
{
- if (osl_ts) {
- VLOG(2) << "OSL texture system stats:\n" << osl_ts->getstats();
+ if (texture_system) {
+ VLOG(2) << "OSL texture system stats:\n" << texture_system->getstats();
}
-#ifdef WITH_PTEX
- ptex_cache->release();
-#endif
-}
-
-void OSLRenderServices::thread_init(KernelGlobals *kernel_globals_, OSL::TextureSystem *osl_ts_)
-{
- kernel_globals = kernel_globals_;
- osl_ts = osl_ts_;
}
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
@@ -233,7 +217,8 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
ustring from,
float time)
{
- KernelGlobals *kg = kernel_globals;
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -264,7 +249,8 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
ustring to,
float time)
{
- KernelGlobals *kg = kernel_globals;
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -354,7 +340,8 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
{
- KernelGlobals *kg = kernel_globals;
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -380,7 +367,8 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
OSL::Matrix44 &result,
ustring to)
{
- KernelGlobals *kg = kernel_globals;
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -498,6 +486,65 @@ static bool set_attribute_float3(float3 f, TypeDesc type, bool derivatives, void
return set_attribute_float3(fv, type, derivatives, val);
}
+/* Attributes with the TypeRGBA type descriptor should be retrieved and stored
+ * in a float array of size 4 (e.g. node_vertex_color.osl), which has the
+ * type descriptor TypeFloatArray4. If the storage is not a TypeFloatArray4,
+ * we either store the first three components in a vector, store the average of
+ * the components in a float, or fail the retrieval and do nothing. We allow
+ * this for the correct operation of the Attribute node.
+ */
+
+static bool set_attribute_float4(float4 f[3], TypeDesc type, bool derivatives, void *val)
+{
+ float *fval = (float *)val;
+ if (type == TypeFloatArray4) {
+ fval[0] = f[0].x;
+ fval[1] = f[0].y;
+ fval[2] = f[0].z;
+ fval[3] = f[0].w;
+
+ if (derivatives) {
+ fval[4] = f[1].x;
+ fval[5] = f[1].y;
+ fval[6] = f[1].z;
+ fval[7] = f[1].w;
+
+ fval[8] = f[2].x;
+ fval[9] = f[2].y;
+ fval[10] = f[2].z;
+ fval[11] = f[2].w;
+ }
+ return true;
+ }
+ else if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+ type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor) {
+ fval[0] = f[0].x;
+ fval[1] = f[0].y;
+ fval[2] = f[0].z;
+
+ if (derivatives) {
+ fval[3] = f[1].x;
+ fval[4] = f[1].y;
+ fval[5] = f[1].z;
+
+ fval[6] = f[2].x;
+ fval[7] = f[2].y;
+ fval[8] = f[2].z;
+ }
+ return true;
+ }
+ else if (type == TypeDesc::TypeFloat) {
+ fval[0] = average(float4_to_float3(f[0]));
+
+ if (derivatives) {
+ fval[1] = average(float4_to_float3(f[1]));
+ fval[2] = average(float4_to_float3(f[2]));
+ }
+ return true;
+ }
+ return false;
+}
+
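For illustration, a minimal standalone C++ sketch of the TypeFloatArray4 packing described in the comment above; the float4 struct and the helper name are stand-ins for this example, not the Cycles types. The value fills the first four floats, followed by the two derivative sets when derivatives are requested.

    #include <cstdio>

    struct float4 {
      float x, y, z, w;
    };

    /* Pack a value and its two derivatives into the 12-float layout expected
     * for a TypeFloatArray4 output with derivatives: value, d/ds, d/dt. */
    static void pack_float4_with_derivatives(const float4 f[3], bool derivatives, float *out)
    {
      const int n = derivatives ? 3 : 1;
      for (int i = 0; i < n; i++) {
        out[i * 4 + 0] = f[i].x;
        out[i * 4 + 1] = f[i].y;
        out[i * 4 + 2] = f[i].z;
        out[i * 4 + 3] = f[i].w;
      }
    }

    int main()
    {
      const float4 f[3] = {{0.2f, 0.4f, 0.6f, 1.0f}, {0.01f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.02f, 0.0f, 0.0f}};
      float out[12] = {0};
      pack_float4_with_derivatives(f, true, out);
      printf("value = (%g, %g, %g, %g), d/ds = (%g, %g, %g, %g)\n",
             out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
      return 0;
    }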
static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, void *val)
{
if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
@@ -633,7 +680,7 @@ static bool get_primitive_attribute(KernelGlobals *kg,
return set_attribute_float3(fval, type, derivatives, val);
}
else if (attr.type == TypeFloat2) {
- float2 fval[2];
+ float2 fval[3];
fval[0] = primitive_attribute_float2(
kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
return set_attribute_float2(fval, type, derivatives, val);
@@ -644,6 +691,12 @@ static bool get_primitive_attribute(KernelGlobals *kg,
kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
return set_attribute_float(fval, type, derivatives, val);
}
+ else if (attr.type == TypeRGBA) {
+ float4 fval[3];
+ fval[0] = primitive_attribute_float4(
+ kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
+ return set_attribute_float4(fval, type, derivatives, val);
+ }
else {
return false;
}
@@ -684,6 +737,10 @@ bool OSLRenderServices::get_object_standard_attribute(
float3 f = object_location(kg, sd);
return set_attribute_float3(f, type, derivatives, val);
}
+ else if (name == u_object_color) {
+ float3 f = object_color(kg, sd->object);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
else if (name == u_object_index) {
float f = object_pass_id(kg, sd->object);
return set_attribute_float(f, type, derivatives, val);
@@ -713,7 +770,7 @@ bool OSLRenderServices::get_object_standard_attribute(
}
else if (name == u_particle_random) {
int particle_id = object_particle_id(kg, sd->object);
- float f = hash_int_01(particle_index(kg, particle_id));
+ float f = hash_uint2_to_float(particle_index(kg, particle_id), 0);
return set_attribute_float(f, type, derivatives, val);
}
@@ -733,7 +790,7 @@ bool OSLRenderServices::get_object_standard_attribute(
return set_attribute_float3(f, type, derivatives, val);
}
#if 0 /* unsupported */
- else if(name == u_particle_rotation) {
+ else if (name == u_particle_rotation) {
int particle_id = object_particle_id(kg, sd->object);
float4 f = particle_rotation(kg, particle_id);
return set_attribute_float4(f, type, derivatives, val);
@@ -954,21 +1011,52 @@ bool OSLRenderServices::get_userdata(
return false; /* disabled by lockgeom */
}
+#if OSL_LIBRARY_VERSION_CODE >= 11100
+TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring filename,
+ OSL::ShadingContext *)
+#else
+
TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring filename)
+#endif
{
- if (filename.length() && filename[0] == '@') {
- /* Dummy, we don't use texture handles for builtin textures but need
- * to tell the OSL runtime optimizer that this is a valid texture. */
+ OSLTextureHandleMap::iterator it = textures.find(filename);
+
+ /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
+ if (it != textures.end()) {
+ if (it->second->type != OSLTextureHandle::OIIO) {
+ return (TextureSystem::TextureHandle *)it->second.get();
+ }
+ }
+
+ /* Get handle from OpenImageIO. */
+ OSL::TextureSystem *ts = texture_system;
+ TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
+ if (handle == NULL) {
return NULL;
}
- else {
- return texturesys()->get_texture_handle(filename);
+
+ /* Insert new OSLTextureHandle if needed. */
+ if (it == textures.end()) {
+ textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
+ it = textures.find(filename);
}
+
+ /* Assign OIIO texture handle and return. */
+ it->second->oiio_handle = handle;
+ return (TextureSystem::TextureHandle *)it->second.get();
}
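A simplified standalone sketch of the find-or-insert caching pattern used by get_texture_handle() above, written with std::unordered_map and a mutex instead of OIIO's unordered_map_concurrent and intrusive_ptr; all names here are illustrative, not the Cycles or OIIO API.

    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    enum class HandleType { OIIO, SVM, IES, BEVEL, AO };

    struct Handle {
      HandleType type = HandleType::OIIO;
      void *oiio_handle = nullptr; /* Stand-in for a TextureSystem handle pointer. */
    };

    class HandleCache {
     public:
      /* Return the cached handle for a filename, creating an OIIO-typed entry if missing. */
      Handle *get(const std::string &filename)
      {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = handles_.find(filename);
        if (it == handles_.end()) {
          it = handles_.emplace(filename, std::make_unique<Handle>()).first;
        }
        return it->second.get();
      }

     private:
      std::mutex mutex_;
      std::unordered_map<std::string, std::unique_ptr<Handle>> handles_;
    };

    int main()
    {
      HandleCache cache;
      Handle *a = cache.get("texture.png");
      Handle *b = cache.get("texture.png");
      return (a == b) ? 0 : 1; /* The same filename yields the same cached handle. */
    }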
bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
{
- return texturesys()->good(texture_handle);
+ OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
+
+ if (handle->oiio_handle) {
+ OSL::TextureSystem *ts = texture_system;
+ return ts->good(handle->oiio_handle);
+ }
+ else {
+ return true;
+ }
}
bool OSLRenderServices::texture(ustring filename,
@@ -988,69 +1076,28 @@ bool OSLRenderServices::texture(ustring filename,
float *dresultdt,
ustring *errormessage)
{
- OSL::TextureSystem *ts = osl_ts;
+ OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
+ OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
-
- if (texture_thread_info == NULL) {
- OSLThreadData *tdata = kg->osl_tdata;
- texture_thread_info = tdata->oiio_thread_info;
- }
-
-#ifdef WITH_PTEX
- /* todo: this is just a quick hack, only works with particular files and options */
- if (string_endswith(filename.string(), ".ptx")) {
- float2 uv;
- int faceid;
-
- if (!primitive_ptex(kg, sd, &uv, &faceid))
- return false;
-
- float u = uv.x;
- float v = uv.y;
- float dudx = 0.0f;
- float dvdx = 0.0f;
- float dudy = 0.0f;
- float dvdy = 0.0f;
-
- Ptex::String error;
- PtexPtr<PtexTexture> r(ptex_cache->get(filename.c_str(), error));
-
- if (!r) {
- //std::cerr << error.c_str() << std::endl;
- return false;
- }
-
- bool mipmaplerp = false;
- float sharpness = 1.0f;
- PtexFilter::Options opts(PtexFilter::f_bicubic, mipmaplerp, sharpness);
- PtexPtr<PtexFilter> f(PtexFilter::getFilter(r, opts));
-
- f->eval(result, options.firstchannel, nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
-
- for (int c = r->numChannels(); c < nchannels; c++)
- result[c] = result[0];
-
- return true;
- }
-#endif
+ KernelGlobals *kernel_globals = sd->osl_globals;
bool status = false;
- if (filename.length() && filename[0] == '@') {
- if (filename == u_at_bevel) {
+ switch (texture_type) {
+ case OSLTextureHandle::BEVEL: {
/* Bevel shader hack. */
if (nchannels >= 3) {
PathState *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
- float3 N = svm_bevel(kg, sd, state, radius, num_samples);
+ float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
status = true;
}
+ break;
}
- else if (filename == u_at_ao) {
+ case OSLTextureHandle::AO: {
/* AO shader hack. */
PathState *state = sd->osl_path_state;
int num_samples = (int)s;
@@ -1066,19 +1113,13 @@ bool OSLRenderServices::texture(ustring filename,
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
- result[0] = svm_ao(kg, sd, N, state, radius, num_samples, flags);
- status = true;
- }
- else if (filename[1] == 'l') {
- /* IES light. */
- int slot = atoi(filename.c_str() + 2);
- result[0] = kernel_ies_interp(kg, slot, s, t);
+ result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags);
status = true;
+ break;
}
- else {
+ case OSLTextureHandle::SVM: {
/* Packed texture. */
- int slot = atoi(filename.c_str() + 2);
- float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t);
+ float4 rgba = kernel_tex_image_interp(kernel_globals, handle->svm_slot, s, 1.0f - t);
result[0] = rgba[0];
if (nchannels > 1)
@@ -1088,37 +1129,62 @@ bool OSLRenderServices::texture(ustring filename,
if (nchannels > 3)
result[3] = rgba[3];
status = true;
+ break;
}
- }
- else {
- if (texture_handle != NULL) {
- status = ts->texture(texture_handle,
- texture_thread_info,
- options,
- s,
- t,
- dsdx,
- dtdx,
- dsdy,
- dtdy,
- nchannels,
- result,
- dresultds,
- dresultdt);
+ case OSLTextureHandle::IES: {
+ /* IES light. */
+ result[0] = kernel_ies_interp(kernel_globals, handle->svm_slot, s, t);
+ status = true;
+ break;
}
- else {
- status = ts->texture(filename,
- options,
- s,
- t,
- dsdx,
- dtdx,
- dsdy,
- dtdy,
- nchannels,
- result,
- dresultds,
- dresultdt);
+ case OSLTextureHandle::OIIO: {
+ /* OpenImageIO texture cache. */
+ OSL::TextureSystem *ts = texture_system;
+
+ if (handle && handle->oiio_handle) {
+ if (texture_thread_info == NULL) {
+ OSLThreadData *tdata = kernel_globals->osl_tdata;
+ texture_thread_info = tdata->oiio_thread_info;
+ }
+
+ status = ts->texture(handle->oiio_handle,
+ texture_thread_info,
+ options,
+ s,
+ t,
+ dsdx,
+ dtdx,
+ dsdy,
+ dtdy,
+ nchannels,
+ result,
+ dresultds,
+ dresultdt);
+ }
+ else {
+ status = ts->texture(filename,
+ options,
+ s,
+ t,
+ dsdx,
+ dtdx,
+ dsdy,
+ dtdy,
+ nchannels,
+ result,
+ dresultds,
+ dresultdt);
+ }
+
+ if (!status) {
+ /* This might be slow, but prevents error messages from leaking and
+ * other nasty stuff from happening. */
+ ts->geterror();
+ }
+ else if (handle && handle->processor) {
+ ColorSpaceManager::to_scene_linear(handle->processor, result, nchannels);
+ }
+ break;
}
}
@@ -1131,11 +1197,6 @@ bool OSLRenderServices::texture(ustring filename,
if (nchannels == 4)
result[3] = 1.0f;
}
- /* This might be slow, but prevents error messages leak and
- * other nasty stuff happening.
- */
- string err = ts->geterror();
- (void)err;
}
return status;
@@ -1157,56 +1218,83 @@ bool OSLRenderServices::texture3d(ustring filename,
float *dresultdr,
ustring *errormessage)
{
- OSL::TextureSystem *ts = osl_ts;
- ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
+ OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
+ bool status = false;
- if (texture_thread_info == NULL) {
- OSLThreadData *tdata = kg->osl_tdata;
- texture_thread_info = tdata->oiio_thread_info;
- }
+ switch (texture_type) {
+ case OSLTextureHandle::SVM: {
+ /* Packed texture. */
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kernel_globals = sd->osl_globals;
+ int slot = handle->svm_slot;
+ float3 P_float3 = make_float3(P.x, P.y, P.z);
+ float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
- bool status;
- if (filename.length() && filename[0] == '@') {
- int slot = atoi(filename.c_str() + 1);
- float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z, INTERPOLATION_NONE);
+ result[0] = rgba[0];
+ if (nchannels > 1)
+ result[1] = rgba[1];
+ if (nchannels > 2)
+ result[2] = rgba[2];
+ if (nchannels > 3)
+ result[3] = rgba[3];
+ status = true;
+ break;
+ }
+ case OSLTextureHandle::OIIO: {
+ /* OpenImageIO texture cache. */
+ OSL::TextureSystem *ts = texture_system;
+
+ if (handle && handle->oiio_handle) {
+ if (texture_thread_info == NULL) {
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kernel_globals = sd->osl_globals;
+ OSLThreadData *tdata = kernel_globals->osl_tdata;
+ texture_thread_info = tdata->oiio_thread_info;
+ }
- result[0] = rgba[0];
- if (nchannels > 1)
- result[1] = rgba[1];
- if (nchannels > 2)
- result[2] = rgba[2];
- if (nchannels > 3)
- result[3] = rgba[3];
- status = true;
- }
- else {
- if (texture_handle != NULL) {
- status = ts->texture3d(texture_handle,
- texture_thread_info,
- options,
- P,
- dPdx,
- dPdy,
- dPdz,
- nchannels,
- result,
- dresultds,
- dresultdt,
- dresultdr);
+ status = ts->texture3d(handle->oiio_handle,
+ texture_thread_info,
+ options,
+ P,
+ dPdx,
+ dPdy,
+ dPdz,
+ nchannels,
+ result,
+ dresultds,
+ dresultdt,
+ dresultdr);
+ }
+ else {
+ status = ts->texture3d(filename,
+ options,
+ P,
+ dPdx,
+ dPdy,
+ dPdz,
+ nchannels,
+ result,
+ dresultds,
+ dresultdt,
+ dresultdr);
+ }
+
+ if (!status) {
+ /* This might be slow, but prevents error messages from leaking and
+ * other nasty stuff from happening. */
+ ts->geterror();
+ }
+ else if (handle && handle->processor) {
+ ColorSpaceManager::to_scene_linear(handle->processor, result, nchannels);
+ }
+ break;
}
- else {
- status = ts->texture3d(filename,
- options,
- P,
- dPdx,
- dPdy,
- dPdz,
- nchannels,
- result,
- dresultds,
- dresultdt,
- dresultdr);
+ case OSLTextureHandle::IES:
+ case OSLTextureHandle::AO:
+ case OSLTextureHandle::BEVEL: {
+ status = false;
+ break;
}
}
@@ -1219,18 +1307,13 @@ bool OSLRenderServices::texture3d(ustring filename,
if (nchannels == 4)
result[3] = 1.0f;
}
- /* This might be slow, but prevents error messages leak and
- * other nasty stuff happening.
- */
- string err = ts->geterror();
- (void)err;
}
return status;
}
bool OSLRenderServices::environment(ustring filename,
- TextureHandle *th,
+ TextureHandle *texture_handle,
TexturePerthread *thread_info,
TextureOpt &options,
OSL::ShaderGlobals *sg,
@@ -1243,21 +1326,33 @@ bool OSLRenderServices::environment(ustring filename,
float *dresultdt,
ustring *errormessage)
{
- OSL::TextureSystem *ts = osl_ts;
+ OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
+ OSL::TextureSystem *ts = texture_system;
+ bool status = false;
- if (thread_info == NULL) {
- ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
- OSLThreadData *tdata = kg->osl_tdata;
- thread_info = tdata->oiio_thread_info;
- }
+ if (handle && handle->oiio_handle) {
+ if (thread_info == NULL) {
+ ShaderData *sd = (ShaderData *)(sg->renderstate);
+ KernelGlobals *kernel_globals = sd->osl_globals;
+ OSLThreadData *tdata = kernel_globals->osl_tdata;
+ thread_info = tdata->oiio_thread_info;
+ }
- if (th == NULL) {
- th = ts->get_texture_handle(filename, thread_info);
+ status = ts->environment(handle->oiio_handle,
+ thread_info,
+ options,
+ R,
+ dRdx,
+ dRdy,
+ nchannels,
+ result,
+ dresultds,
+ dresultdt);
+ }
+ else {
+ status = ts->environment(
+ filename, options, R, dRdx, dRdy, nchannels, result, dresultds, dresultdt);
}
-
- bool status = ts->environment(
- th, thread_info, options, R, dRdx, dRdy, nchannels, result, dresultds, dresultdt);
if (!status) {
if (nchannels == 3 || nchannels == 4) {
@@ -1269,26 +1364,43 @@ bool OSLRenderServices::environment(ustring filename,
result[3] = 1.0f;
}
}
+ else if (handle && handle->processor) {
+ ColorSpaceManager::to_scene_linear(handle->processor, result, nchannels);
+ }
return status;
}
+#if OSL_LIBRARY_VERSION_CODE >= 11100
+bool OSLRenderServices::get_texture_info(ustring filename,
+ TextureHandle *texture_handle,
+ TexturePerthread *,
+ OSL::ShadingContext *,
+ int subimage,
+ ustring dataname,
+ TypeDesc datatype,
+ void *data,
+ ustring *)
+#else
bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg,
ustring filename,
- TextureHandle *th,
+ TextureHandle *texture_handle,
int subimage,
ustring dataname,
TypeDesc datatype,
void *data)
+#endif
{
- OSL::TextureSystem *ts = osl_ts;
- if (filename.length() && filename[0] == '@') {
- /* Special builtin textures. */
+ OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
+
+ /* No texture info for non-OIIO texture types. */
+ if (handle && handle->type != OSLTextureHandle::OIIO) {
return false;
}
- else {
- return ts->get_texture_info(filename, subimage, dataname, datatype, data);
- }
+
+ /* Get texture info from OpenImageIO. */
+ OSL::TextureSystem *ts = texture_system;
+ return ts->get_texture_info(filename, subimage, dataname, datatype, data);
}
int OSLRenderServices::pointcloud_search(OSL::ShaderGlobals *sg,
@@ -1371,9 +1483,16 @@ bool OSLRenderServices::trace(TraceOpt &options,
tracedata->init = true;
tracedata->sd.osl_globals = sd->osl_globals;
+ KernelGlobals *kg = sd->osl_globals;
+
+ /* Can't raytrace from shaders like displacement before the BVH exists. */
+ if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
+ return false;
+ }
+
/* Raytrace, leaving out shadow opaque to avoid early exit. */
uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE;
- return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f);
+ return scene_intersect(kg, &ray, visibility, &tracedata->isect);
}
bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 2fad5833fc9..894d6e471ba 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -25,8 +25,8 @@
* attributes.
*/
-#include <OSL/oslexec.h>
#include <OSL/oslclosure.h>
+#include <OSL/oslexec.h>
#ifdef WITH_PTEX
class PtexCache;
@@ -40,13 +40,46 @@ class Shader;
struct ShaderData;
struct float3;
struct KernelGlobals;
+
+/* OSL Texture Handle
+ *
+ * OSL texture lookups are string based. If those strings are known at compile
+ * time, the OSL compiler can cache a texture handle to use instead of a string.
+ *
+ * By default it uses TextureSystem::TextureHandle. But since we want to support
+ * different kinds of textures and color space conversions, this is our own handle
+ * with additional data.
+ *
+ * These are stored in a concurrent hash map, because OSL can compile multiple
+ * shaders in parallel. */
+
+struct OSLTextureHandle : public OIIO::RefCnt {
+ enum Type { OIIO, SVM, IES, BEVEL, AO };
+
+ OSLTextureHandle(Type type = OIIO, int svm_slot = -1)
+ : type(type), svm_slot(svm_slot), oiio_handle(NULL), processor(NULL)
+ {
+ }
+
+ Type type;
+ int svm_slot;
+ OSL::TextureSystem::TextureHandle *oiio_handle;
+ ColorSpaceProcessor *processor;
+};
+
+typedef OIIO::intrusive_ptr<OSLTextureHandle> OSLTextureHandleRef;
+typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash>
+ OSLTextureHandleMap;
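To make the comment above concrete, here is a hedged standalone C++ sketch of how a type-tagged handle lets a single texture entry point dispatch to different backends; the enum values mirror OSLTextureHandle::Type, everything else is invented for the example.

    #include <cstdio>

    enum class TextureType { OIIO, SVM, IES, BEVEL, AO };

    struct TaggedHandle {
      TextureType type;
      int svm_slot; /* Only meaningful for SVM and IES lookups. */
    };

    /* Dispatch a lookup based on the handle's tag; a null handle falls back to OIIO. */
    static const char *lookup_backend(const TaggedHandle *handle)
    {
      const TextureType type = handle ? handle->type : TextureType::OIIO;
      switch (type) {
        case TextureType::SVM:
          return "packed SVM image slot";
        case TextureType::IES:
          return "IES light profile slot";
        case TextureType::BEVEL:
          return "bevel shader hack";
        case TextureType::AO:
          return "ambient occlusion shader hack";
        case TextureType::OIIO:
        default:
          return "OpenImageIO texture cache";
      }
    }

    int main()
    {
      TaggedHandle svm = {TextureType::SVM, 7};
      printf("%s\n", lookup_backend(&svm));    /* packed SVM image slot */
      printf("%s\n", lookup_backend(nullptr)); /* OpenImageIO texture cache */
      return 0;
    }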
+
+/* OSL Render Services
+ *
+ * Interface for OSL to access attributes, textures and other scene data. */
+
class OSLRenderServices : public OSL::RendererServices {
public:
- OSLRenderServices();
+ OSLRenderServices(OSL::TextureSystem *texture_system);
~OSLRenderServices();
- void thread_init(KernelGlobals *kernel_globals, OSL::TextureSystem *ts);
-
bool get_matrix(OSL::ShaderGlobals *sg,
OSL::Matrix44 &result,
OSL::TransformationPtr xform,
@@ -140,7 +173,12 @@ class OSLRenderServices : public OSL::RendererServices {
void *val,
bool derivatives) override;
+#if OSL_LIBRARY_VERSION_CODE >= 11100
+ TextureSystem::TextureHandle *get_texture_handle(ustring filename,
+ OSL::ShadingContext *context) override;
+#else
TextureSystem::TextureHandle *get_texture_handle(ustring filename) override;
+#endif
bool good(TextureSystem::TextureHandle *texture_handle) override;
@@ -191,6 +229,17 @@ class OSLRenderServices : public OSL::RendererServices {
float *dresultdt,
ustring *errormessage) override;
+#if OSL_LIBRARY_VERSION_CODE >= 11100
+ bool get_texture_info(ustring filename,
+ TextureHandle *texture_handle,
+ TexturePerthread *texture_thread_info,
+ OSL::ShadingContext *shading_context,
+ int subimage,
+ ustring dataname,
+ TypeDesc datatype,
+ void *data,
+ ustring *errormessage) override;
+#else
bool get_texture_info(OSL::ShaderGlobals *sg,
ustring filename,
TextureHandle *texture_handle,
@@ -198,6 +247,7 @@ class OSLRenderServices : public OSL::RendererServices {
ustring dataname,
TypeDesc datatype,
void *data) override;
+#endif
static bool get_background_attribute(
KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
@@ -212,6 +262,7 @@ class OSLRenderServices : public OSL::RendererServices {
static ustring u_raster;
static ustring u_ndc;
static ustring u_object_location;
+ static ustring u_object_color;
static ustring u_object_index;
static ustring u_geom_dupli_generated;
static ustring u_geom_dupli_uv;
@@ -255,12 +306,12 @@ class OSLRenderServices : public OSL::RendererServices {
static ustring u_at_bevel;
static ustring u_at_ao;
- private:
- KernelGlobals *kernel_globals;
- OSL::TextureSystem *osl_ts;
-#ifdef WITH_PTEX
- PtexCache *ptex_cache;
-#endif
+ /* The texture system and texture handle map are part of the services instead of
+ * the globals, so they can be shared between different render sessions. This saves memory,
+ * and is required because texture handles are cached as part of the shared
+ * shading system. */
+ OSL::TextureSystem *texture_system;
+ OSLTextureHandleMap textures;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 3d9c579c9ff..2318813949e 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -16,6 +16,7 @@
#include <OSL/oslexec.h>
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_types.h"
@@ -28,6 +29,7 @@
#include "kernel/osl/osl_globals.h"
#include "kernel/osl/osl_services.h"
#include "kernel/osl/osl_shader.h"
+// clang-format on
#include "util/util_foreach.h"
@@ -49,7 +51,6 @@ void OSLShader::thread_init(KernelGlobals *kg,
/* per thread kernel data init*/
kg->osl = osl_globals;
- kg->osl->services->thread_init(kernel_globals, osl_globals->ts);
OSL::ShadingSystem *ss = kg->osl->ss;
OSLThreadData *tdata = new OSLThreadData();
@@ -383,10 +384,6 @@ int OSLShader::find_attribute(KernelGlobals *kg,
{
/* for OSL, a hash map is used to lookup the attribute by name. */
int object = sd->object * ATTR_PRIM_TYPES;
-#ifdef __HAIR__
- if (sd->type & PRIMITIVE_ALL_CURVE)
- object += ATTR_PRIM_CURVE;
-#endif
OSLGlobals::AttributeMap &attr_map = kg->osl->attribute_map[object];
ustring stdname(std::string("geom:") +
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index b42b9b2fe64..9dcedc9ba19 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -13,6 +13,7 @@ set(SRC_OSL
node_bump.osl
node_camera.osl
node_checker_texture.osl
+ node_clamp.osl
node_combine_rgb.osl
node_combine_hsv.osl
node_combine_xyz.osl
@@ -46,6 +47,7 @@ set(SRC_OSL
node_light_falloff.osl
node_light_path.osl
node_magic_texture.osl
+ node_map_range.osl
node_mapping.osl
node_math.osl
node_mix.osl
@@ -76,13 +78,16 @@ set(SRC_OSL
node_value.osl
node_vector_curves.osl
node_vector_math.osl
+ node_vector_rotate.osl
node_vector_transform.osl
node_velvet_bsdf.osl
+ node_vertex_color.osl
node_voronoi_texture.osl
node_voxel_texture.osl
node_wavelength.osl
node_blackbody.osl
node_wave_texture.osl
+ node_white_noise_texture.osl
node_wireframe.osl
node_hair_bsdf.osl
node_principled_hair_bsdf.osl
@@ -91,13 +96,19 @@ set(SRC_OSL
node_rgb_to_bw.osl
)
+# The headers that OSL ships differ per release, so we cannot
+# hardcode this list.
+file(GLOB SRC_OSL_HEADER_DIST ${OSL_SHADER_DIR}/*.h)
+
set(SRC_OSL_HEADERS
node_color.h
node_fresnel.h
+ node_hash.h
+ node_math.h
+ node_noise.h
node_ramp_util.h
- node_texture.h
- stdosl.h
- oslutil.h
+ stdcycles.h
+ ${SRC_OSL_HEADER_DIST}
)
set(SRC_OSO
@@ -112,7 +123,7 @@ foreach(_file ${SRC_OSL})
string(REPLACE ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} _OSO_FILE ${_OSO_FILE})
add_custom_command(
OUTPUT ${_OSO_FILE}
- COMMAND ${OSL_COMPILER} -q -O2 -I"${CMAKE_CURRENT_SOURCE_DIR}" -o ${_OSO_FILE} ${_OSL_FILE}
+ COMMAND ${OSL_COMPILER} -q -O2 -I"${CMAKE_CURRENT_SOURCE_DIR}" -I"${OSL_SHADER_DIR}" -o ${_OSO_FILE} ${_OSL_FILE}
DEPENDS ${_OSL_FILE} ${SRC_OSL_HEADERS} ${OSL_COMPILER})
list(APPEND SRC_OSO
${_OSO_FILE}
diff --git a/intern/cycles/kernel/shaders/node_absorption_volume.osl b/intern/cycles/kernel/shaders/node_absorption_volume.osl
index e99bd254666..37ccc4c969f 100644
--- a/intern/cycles/kernel/shaders/node_absorption_volume.osl
+++ b/intern/cycles/kernel/shaders/node_absorption_volume.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_absorption_volume(color Color = color(0.8, 0.8, 0.8),
float Density = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_add_closure.osl b/intern/cycles/kernel/shaders/node_add_closure.osl
index 077e2735e61..27ecc9ef0c2 100644
--- a/intern/cycles/kernel/shaders/node_add_closure.osl
+++ b/intern/cycles/kernel/shaders/node_add_closure.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_add_closure(closure color Closure1 = 0,
closure color Closure2 = 0,
diff --git a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
index 7bf28719e78..22d245d0698 100644
--- a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
+++ b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_ambient_occlusion(color ColorIn = color(1.0, 1.0, 1.0),
int samples = 16,
diff --git a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index 165c09eb8e0..739cd375ab2 100644
--- a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -13,8 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_anisotropic_bsdf(color Color = 0.0,
string distribution = "GGX",
diff --git a/intern/cycles/kernel/shaders/node_attribute.osl b/intern/cycles/kernel/shaders/node_attribute.osl
index 336543cc130..abec8ebfbf0 100644
--- a/intern/cycles/kernel/shaders/node_attribute.osl
+++ b/intern/cycles/kernel/shaders/node_attribute.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_attribute(string bump_offset = "center",
string name = "",
diff --git a/intern/cycles/kernel/shaders/node_background.osl b/intern/cycles/kernel/shaders/node_background.osl
index 6ded0d2c65c..3f45db751b3 100644
--- a/intern/cycles/kernel/shaders/node_background.osl
+++ b/intern/cycles/kernel/shaders/node_background.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_background(color Color = 0.8,
float Strength = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_bevel.osl b/intern/cycles/kernel/shaders/node_bevel.osl
index 189c20c52e7..e87ddab716d 100644
--- a/intern/cycles/kernel/shaders/node_bevel.osl
+++ b/intern/cycles/kernel/shaders/node_bevel.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_bevel(int samples = 4,
float Radius = 0.05,
diff --git a/intern/cycles/kernel/shaders/node_blackbody.osl b/intern/cycles/kernel/shaders/node_blackbody.osl
index 8a24bf1e28b..741efae755d 100644
--- a/intern/cycles/kernel/shaders/node_blackbody.osl
+++ b/intern/cycles/kernel/shaders/node_blackbody.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_blackbody(float Temperature = 1200.0, output color Color = 0.0)
{
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index 0abc3574c48..075a324c730 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "stdcycles.h"
/* Brick */
diff --git a/intern/cycles/kernel/shaders/node_brightness.osl b/intern/cycles/kernel/shaders/node_brightness.osl
index 2defbc4b1db..019edfb79a3 100644
--- a/intern/cycles/kernel/shaders/node_brightness.osl
+++ b/intern/cycles/kernel/shaders/node_brightness.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_brightness(color ColorIn = 0.8,
float Bright = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_bump.osl b/intern/cycles/kernel/shaders/node_bump.osl
index 3697bb37fd9..811182f40b5 100644
--- a/intern/cycles/kernel/shaders/node_bump.osl
+++ b/intern/cycles/kernel/shaders/node_bump.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
/* "Bump Mapping Unparameterized Surfaces on the GPU"
* Morten S. Mikkelsen, 2010 */
diff --git a/intern/cycles/kernel/shaders/node_camera.osl b/intern/cycles/kernel/shaders/node_camera.osl
index 833e9e775fe..45ca50c6e1e 100644
--- a/intern/cycles/kernel/shaders/node_camera.osl
+++ b/intern/cycles/kernel/shaders/node_camera.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_camera(output vector ViewVector = vector(0.0, 0.0, 0.0),
output float ViewZDepth = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl
index e068f7952ed..d6a30dbdb40 100644
--- a/intern/cycles/kernel/shaders/node_checker_texture.osl
+++ b/intern/cycles/kernel/shaders/node_checker_texture.osl
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "stdcycles.h"
/* Checker */
diff --git a/intern/cycles/kernel/shaders/node_clamp.osl b/intern/cycles/kernel/shaders/node_clamp.osl
new file mode 100644
index 00000000000..ce9392a0d98
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_clamp.osl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdcycles.h"
+
+shader node_clamp(string type = "minmax",
+ float Value = 1.0,
+ float Min = 0.0,
+ float Max = 1.0,
+ output float Result = 0.0)
+{
+ Result = (type == "range" && (Min > Max)) ? clamp(Value, Max, Min) : clamp(Value, Min, Max);
+}
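A quick standalone C++ equivalent of the "range" branch above, which accepts the bounds in either order; purely illustrative.

    #include <algorithm>
    #include <cstdio>

    /* Clamp to the interval spanned by a and b, even when they are passed in reverse order. */
    static float clamp_range(float value, float a, float b)
    {
      return (a > b) ? std::min(std::max(value, b), a) : std::min(std::max(value, a), b);
    }

    int main()
    {
      printf("%g\n", clamp_range(0.75f, 1.0f, 0.0f)); /* 0.75: bounds given reversed. */
      printf("%g\n", clamp_range(1.25f, 0.0f, 1.0f)); /* 1.0 */
      return 0;
    }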
diff --git a/intern/cycles/kernel/shaders/node_combine_hsv.osl b/intern/cycles/kernel/shaders/node_combine_hsv.osl
index 1658cf3d774..05e502b5bc1 100644
--- a/intern/cycles/kernel/shaders/node_combine_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_combine_hsv.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_combine_hsv(float H = 0.0, float S = 0.0, float V = 0.0, output color Color = 0.8)
{
diff --git a/intern/cycles/kernel/shaders/node_combine_rgb.osl b/intern/cycles/kernel/shaders/node_combine_rgb.osl
index aaa95e9c5af..036f371eb5c 100644
--- a/intern/cycles/kernel/shaders/node_combine_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_combine_rgb.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_combine_rgb(float R = 0.0, float G = 0.0, float B = 0.0, output color Image = 0.8)
{
diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl
index 4ab49168704..4ebd86b605c 100644
--- a/intern/cycles/kernel/shaders/node_combine_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_combine_xyz(float X = 0.0, float Y = 0.0, float Z = 0.0, output vector Vector = 0.8)
{
diff --git a/intern/cycles/kernel/shaders/node_convert_from_color.osl b/intern/cycles/kernel/shaders/node_convert_from_color.osl
index 7ea9a1e4fb3..c3f0e118844 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_color.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_color.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_color(color value_color = 0.0,
output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_float.osl b/intern/cycles/kernel/shaders/node_convert_from_float.osl
index 13b5dea0838..61a15a1c2b0 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_float.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_float.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_float(float value_float = 0.0,
output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_int.osl b/intern/cycles/kernel/shaders/node_convert_from_int.osl
index a59e025d822..2e6a99b2765 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_int.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_int.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_int(int value_int = 0,
output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_normal.osl b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
index 7bdd94d1941..64201d63190 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_normal.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_normal(normal value_normal = normal(0.0, 0.0, 0.0),
output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_point.osl b/intern/cycles/kernel/shaders/node_convert_from_point.osl
index 79c1719e7a7..11d64f76d6f 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_point.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_point.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_point(point value_point = point(0.0, 0.0, 0.0),
output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_string.osl b/intern/cycles/kernel/shaders/node_convert_from_string.osl
index 48d894a6b3e..b496c4e6d05 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_string.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_string.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_string(string value_string = "",
output color value_color = color(0.0, 0.0, 0.0),
diff --git a/intern/cycles/kernel/shaders/node_convert_from_vector.osl b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
index 92ab2313bcb..820faabd32b 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_vector.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_convert_from_vector(vector value_vector = vector(0.0, 0.0, 0.0),
output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
index bd5554b838a..f5886f534eb 100644
--- a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_diffuse_bsdf(color Color = 0.8,
float Roughness = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_displacement.osl b/intern/cycles/kernel/shaders/node_displacement.osl
index a1f3b7b7737..44a4828d511 100644
--- a/intern/cycles/kernel/shaders/node_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_displacement.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_displacement(string space = "object",
float Height = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl
index 57973f57ac6..f289a9711d9 100644
--- a/intern/cycles/kernel/shaders/node_emission.osl
+++ b/intern/cycles/kernel/shaders/node_emission.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_emission(color Color = 0.8, float Strength = 1.0, output closure color Emission = 0)
{
diff --git a/intern/cycles/kernel/shaders/node_environment_texture.osl b/intern/cycles/kernel/shaders/node_environment_texture.osl
index eb32dad392f..d04743eb368 100644
--- a/intern/cycles/kernel/shaders/node_environment_texture.osl
+++ b/intern/cycles/kernel/shaders/node_environment_texture.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_color.h"
+#include "stdcycles.h"
vector environment_texture_direction_to_equirectangular(vector dir)
{
@@ -47,9 +47,10 @@ shader node_environment_texture(
string filename = "",
string projection = "equirectangular",
string interpolation = "linear",
- string color_space = "sRGB",
+ int compress_as_srgb = 0,
+ int ignore_alpha = 0,
+ int unassociate_alpha = 0,
int is_float = 1,
- int use_alpha = 1,
output color Color = 0.0,
output float Alpha = 1.0)
{
@@ -69,13 +70,16 @@ shader node_environment_texture(
Color = (color)texture(
filename, p[0], 1.0 - p[1], "wrap", "periodic", "interp", interpolation, "alpha", Alpha);
- if (use_alpha) {
+ if (ignore_alpha) {
+ Alpha = 1.0;
+ }
+ else if (unassociate_alpha) {
Color = color_unpremultiply(Color, Alpha);
if (!is_float)
Color = min(Color, 1.0);
}
- if (color_space == "sRGB")
+ if (compress_as_srgb)
Color = color_srgb_to_scene_linear(Color);
}
diff --git a/intern/cycles/kernel/shaders/node_fresnel.osl b/intern/cycles/kernel/shaders/node_fresnel.osl
index 89250db40f3..cff084c344d 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.osl
+++ b/intern/cycles/kernel/shaders/node_fresnel.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_fresnel.h"
+#include "stdcycles.h"
shader node_fresnel(float IOR = 1.45, normal Normal = N, output float Fac = 0.0)
{
diff --git a/intern/cycles/kernel/shaders/node_gamma.osl b/intern/cycles/kernel/shaders/node_gamma.osl
index 9b9c17dc8af..0816df64fe8 100644
--- a/intern/cycles/kernel/shaders/node_gamma.osl
+++ b/intern/cycles/kernel/shaders/node_gamma.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_gamma(color ColorIn = 0.8, float Gamma = 1.0, output color ColorOut = 0.0)
{
diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl
index b5c1c6611c1..55cda71db1b 100644
--- a/intern/cycles/kernel/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/shaders/node_geometry.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_geometry(normal NormalIn = N,
string bump_offset = "center",
@@ -26,7 +26,8 @@ shader node_geometry(normal NormalIn = N,
output vector Incoming = vector(0.0, 0.0, 0.0),
output point Parametric = point(0.0, 0.0, 0.0),
output float Backfacing = 0.0,
- output float Pointiness = 0.0)
+ output float Pointiness = 0.0,
+ output float RandomPerIsland = 0.0)
{
Position = P;
Normal = NormalIn;
@@ -65,4 +66,6 @@ shader node_geometry(normal NormalIn = N,
else if (bump_offset == "dy") {
Pointiness += Dy(Pointiness);
}
+
+ getattribute("geom:random_per_island", RandomPerIsland);
}
diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
index c0b8a002536..0042d573f8d 100644
--- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_fresnel.h"
+#include "stdcycles.h"
shader node_glass_bsdf(color Color = 0.8,
string distribution = "sharp",
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index 2d40ee8d3f6..c73604d3650 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_fresnel.h"
+#include "stdcycles.h"
shader node_glossy_bsdf(color Color = 0.8,
string distribution = "GGX",
diff --git a/intern/cycles/kernel/shaders/node_gradient_texture.osl b/intern/cycles/kernel/shaders/node_gradient_texture.osl
index 52bf466673d..e9acebc0572 100644
--- a/intern/cycles/kernel/shaders/node_gradient_texture.osl
+++ b/intern/cycles/kernel/shaders/node_gradient_texture.osl
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "stdcycles.h"
/* Gradient */
@@ -57,7 +56,7 @@ float gradient(point p, string type)
result = r;
}
- return result;
+ return clamp(result, 0.0, 1.0);
}
shader node_gradient_texture(
diff --git a/intern/cycles/kernel/shaders/node_hair_bsdf.osl b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
index bc912087666..3e0ac7af2e0 100644
--- a/intern/cycles/kernel/shaders/node_hair_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
@@ -16,7 +16,7 @@
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_hair_bsdf(color Color = 0.8,
string component = "reflection",
diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl
index 991a27c4103..ee08ea57e68 100644
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_hair_info(output float IsStrand = 0.0,
output float Intercept = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_hash.h b/intern/cycles/kernel/shaders/node_hash.h
new file mode 100644
index 00000000000..b42e42ff910
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_hash.h
@@ -0,0 +1,81 @@
+#include "stdcycles.h"
+#include "vector2.h"
+#include "vector4.h"
+
+#define vector3 point
+
+/* **** Hash a float or vector[234] into a float [0, 1] **** */
+
+float hash_float_to_float(float k)
+{
+ return hashnoise(k);
+}
+
+float hash_vector2_to_float(vector2 k)
+{
+ return hashnoise(k.x, k.y);
+}
+
+float hash_vector3_to_float(vector3 k)
+{
+ return hashnoise(k);
+}
+
+float hash_vector4_to_float(vector4 k)
+{
+ return hashnoise(vector3(k.x, k.y, k.z), k.w);
+}
+
+/* **** Hash a vector[234] into a vector[234] [0, 1] **** */
+
+vector2 hash_vector2_to_vector2(vector2 k)
+{
+ return vector2(hash_vector2_to_float(k), hash_vector3_to_float(vector3(k.x, k.y, 1.0)));
+}
+
+vector3 hash_vector3_to_vector3(vector3 k)
+{
+ return vector3(hash_vector3_to_float(k),
+ hash_vector4_to_float(vector4(k[0], k[1], k[2], 1.0)),
+ hash_vector4_to_float(vector4(k[0], k[1], k[2], 2.0)));
+}
+
+vector4 hash_vector4_to_vector4(vector4 k)
+{
+ return vector4(hash_vector4_to_float(k),
+ hash_vector4_to_float(vector4(k.w, k.x, k.y, k.z)),
+ hash_vector4_to_float(vector4(k.z, k.w, k.x, k.y)),
+ hash_vector4_to_float(vector4(k.y, k.z, k.w, k.x)));
+}
+
+/* **** Hash a float or a vec[234] into a color [0, 1] **** */
+
+color hash_float_to_color(float k)
+{
+ return color(hash_float_to_float(k),
+ hash_vector2_to_float(vector2(k, 1.0)),
+ hash_vector2_to_float(vector2(k, 2.0)));
+}
+
+color hash_vector2_to_color(vector2 k)
+{
+ return color(hash_vector2_to_float(k),
+ hash_vector3_to_float(vector3(k.x, k.y, 1.0)),
+ hash_vector3_to_float(vector3(k.x, k.y, 2.0)));
+}
+
+color hash_vector3_to_color(vector3 k)
+{
+ return color(hash_vector3_to_float(k),
+ hash_vector4_to_float(vector4(k[0], k[1], k[2], 1.0)),
+ hash_vector4_to_float(vector4(k[0], k[1], k[2], 2.0)));
+}
+
+color hash_vector4_to_color(vector4 k)
+{
+ return color(hash_vector4_to_float(k),
+ hash_vector4_to_float(vector4(k.z, k.x, k.w, k.y)),
+ hash_vector4_to_float(vector4(k.w, k.z, k.y, k.x)));
+}
+
+#undef vector3
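The header above derives every vector and color hash from a scalar hash by appending small constant offsets as extra coordinates. The standalone C++ sketch below shows the same pattern with a stand-in bit-mixing hash mapped to [0, 1); it does not reproduce OSL's hashnoise().

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    /* Stand-in scalar hash: mix the bits of two floats into a value in [0, 1). */
    static float hash2(float a, float b)
    {
      uint32_t x, y;
      std::memcpy(&x, &a, sizeof(x));
      std::memcpy(&y, &b, sizeof(y));
      uint32_t h = x * 73856093u ^ y * 19349663u;
      h ^= h >> 16;
      h *= 0x7feb352du;
      h ^= h >> 15;
      return float(h & 0x00ffffffu) / float(0x01000000u);
    }

    static float hash3(float a, float b, float c)
    {
      return hash2(hash2(a, b), c);
    }

    struct float3 {
      float x, y, z;
    };

    /* Hash a 2D input into three channels by varying a constant extra coordinate,
     * mirroring the hash_vector2_to_color() pattern above. */
    static float3 hash_vec2_to_color(float x, float y)
    {
      return {hash2(x, y), hash3(x, y, 1.0f), hash3(x, y, 2.0f)};
    }

    int main()
    {
      float3 c = hash_vec2_to_color(0.3f, 0.7f);
      printf("%f %f %f\n", c.x, c.y, c.z);
      return 0;
    }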
diff --git a/intern/cycles/kernel/shaders/node_holdout.osl b/intern/cycles/kernel/shaders/node_holdout.osl
index b51bc0543a5..92e41c92f72 100644
--- a/intern/cycles/kernel/shaders/node_holdout.osl
+++ b/intern/cycles/kernel/shaders/node_holdout.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_holdout(output closure color Holdout = holdout())
{
diff --git a/intern/cycles/kernel/shaders/node_hsv.osl b/intern/cycles/kernel/shaders/node_hsv.osl
index 30c56a20a92..4417057b10f 100644
--- a/intern/cycles/kernel/shaders/node_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_hsv.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_color.h"
+#include "stdcycles.h"
shader node_hsv(float Hue = 0.5,
float Saturation = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_ies_light.osl b/intern/cycles/kernel/shaders/node_ies_light.osl
index ea8c44e09de..76348b4d758 100644
--- a/intern/cycles/kernel/shaders/node_ies_light.osl
+++ b/intern/cycles/kernel/shaders/node_ies_light.osl
@@ -14,14 +14,13 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "stdcycles.h"
/* IES Light */
shader node_ies_light(int use_mapping = 0,
matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
- int slot = 0,
+ string filename = "",
float Strength = 1.0,
point Vector = I,
output float Fac = 0.0)
@@ -32,10 +31,10 @@ shader node_ies_light(int use_mapping = 0,
p = transform(mapping, p);
}
- p = normalize(p);
+ p = normalize((vector)p);
float v_angle = acos(-p[2]);
float h_angle = atan2(p[0], p[1]) + M_PI;
- Fac = Strength * texture(format("@l%d", slot), h_angle, v_angle);
+ Fac = Strength * texture(filename, h_angle, v_angle);
}
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index df5eda39985..22d34a1082c 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_color.h"
+#include "stdcycles.h"
point texco_remap_square(point co)
{
@@ -56,26 +56,40 @@ point map_to_sphere(vector dir)
}
color image_texture_lookup(string filename,
- string color_space,
float u,
float v,
output float Alpha,
- int use_alpha,
+ int compress_as_srgb,
+ int ignore_alpha,
+ int unassociate_alpha,
int is_float,
+ int is_tiled,
string interpolation,
string extension)
{
+ /* Flip the y coordinate, but preserve UDIM tiles. */
+ float flip_v;
+ if (is_tiled) {
+ float v_i = (int)v;
+ flip_v = v_i + (1.0 - (v - v_i));
+ }
+ else {
+ flip_v = 1.0 - v;
+ }
color rgb = (color)texture(
- filename, u, 1.0 - v, "wrap", extension, "interp", interpolation, "alpha", Alpha);
+ filename, u, flip_v, "wrap", extension, "interp", interpolation, "alpha", Alpha);
- if (use_alpha) {
+ if (ignore_alpha) {
+ Alpha = 1.0;
+ }
+ else if (unassociate_alpha) {
rgb = color_unpremultiply(rgb, Alpha);
if (!is_float)
rgb = min(rgb, 1.0);
}
- if (color_space == "sRGB") {
+ if (compress_as_srgb) {
rgb = color_srgb_to_scene_linear(rgb);
}
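The UDIM-aware flip above inverts only the fractional part of v, so the lookup stays inside its original tile (for example, v = 2.3 in tile row 2 becomes 2.7). A small standalone C++ check of that arithmetic, for illustration only:

    #include <cstdio>

    /* Flip v inside its UDIM tile when tiled; otherwise flip the whole coordinate. */
    static float flip_v(float v, bool is_tiled)
    {
      if (is_tiled) {
        const float v_i = float(int(v));
        return v_i + (1.0f - (v - v_i));
      }
      return 1.0f - v;
    }

    int main()
    {
      printf("%g\n", flip_v(2.3f, true));  /* 2.7 */
      printf("%g\n", flip_v(0.3f, false)); /* 0.7 */
      return 0;
    }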
@@ -86,13 +100,15 @@ shader node_image_texture(int use_mapping = 0,
matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
point Vector = P,
string filename = "",
- string color_space = "sRGB",
string projection = "flat",
string interpolation = "smartcubic",
string extension = "periodic",
float projection_blend = 0.0,
+ int compress_as_srgb = 0,
+ int ignore_alpha = 0,
+ int unassociate_alpha = 0,
+ int is_tiled = 0,
int is_float = 1,
- int use_alpha = 1,
output color Color = 0.0,
output float Alpha = 1.0)
{
@@ -102,8 +118,17 @@ shader node_image_texture(int use_mapping = 0,
p = transform(mapping, p);
if (projection == "flat") {
- Color = image_texture_lookup(
- filename, color_space, p[0], p[1], Alpha, use_alpha, is_float, interpolation, extension);
+ Color = image_texture_lookup(filename,
+ p[0],
+ p[1],
+ Alpha,
+ compress_as_srgb,
+ ignore_alpha,
+ unassociate_alpha,
+ is_float,
+ is_tiled,
+ interpolation,
+ extension);
}
else if (projection == "box") {
/* object space normal */
@@ -173,36 +198,42 @@ shader node_image_texture(int use_mapping = 0,
if (weight[0] > 0.0) {
Color += weight[0] * image_texture_lookup(filename,
- color_space,
p[1],
p[2],
tmp_alpha,
- use_alpha,
+ compress_as_srgb,
+ ignore_alpha,
+ unassociate_alpha,
is_float,
+ 0,
interpolation,
extension);
Alpha += weight[0] * tmp_alpha;
}
if (weight[1] > 0.0) {
Color += weight[1] * image_texture_lookup(filename,
- color_space,
p[0],
p[2],
tmp_alpha,
- use_alpha,
+ compress_as_srgb,
+ ignore_alpha,
+ unassociate_alpha,
is_float,
+ 0,
interpolation,
extension);
Alpha += weight[1] * tmp_alpha;
}
if (weight[2] > 0.0) {
Color += weight[2] * image_texture_lookup(filename,
- color_space,
p[1],
p[0],
tmp_alpha,
- use_alpha,
+ compress_as_srgb,
+ ignore_alpha,
+ unassociate_alpha,
is_float,
+ 0,
interpolation,
extension);
Alpha += weight[2] * tmp_alpha;
@@ -211,24 +242,28 @@ shader node_image_texture(int use_mapping = 0,
else if (projection == "sphere") {
point projected = map_to_sphere(texco_remap_square(p));
Color = image_texture_lookup(filename,
- color_space,
projected[0],
projected[1],
Alpha,
- use_alpha,
+ compress_as_srgb,
+ ignore_alpha,
+ unassociate_alpha,
is_float,
+ 0,
interpolation,
extension);
}
else if (projection == "tube") {
point projected = map_to_tube(texco_remap_square(p));
Color = image_texture_lookup(filename,
- color_space,
projected[0],
projected[1],
Alpha,
- use_alpha,
+ compress_as_srgb,
+ ignore_alpha,
+ unassociate_alpha,
is_float,
+ 0,
interpolation,
extension);
}
diff --git a/intern/cycles/kernel/shaders/node_invert.osl b/intern/cycles/kernel/shaders/node_invert.osl
index c7d41e4e129..23c16935ca1 100644
--- a/intern/cycles/kernel/shaders/node_invert.osl
+++ b/intern/cycles/kernel/shaders/node_invert.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_invert(float Fac = 1.0, color ColorIn = 0.8, output color ColorOut = 0.8)
{
diff --git a/intern/cycles/kernel/shaders/node_layer_weight.osl b/intern/cycles/kernel/shaders/node_layer_weight.osl
index 7c46f28b41b..1662be2cad1 100644
--- a/intern/cycles/kernel/shaders/node_layer_weight.osl
+++ b/intern/cycles/kernel/shaders/node_layer_weight.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_fresnel.h"
+#include "stdcycles.h"
shader node_layer_weight(float Blend = 0.5,
normal Normal = N,
diff --git a/intern/cycles/kernel/shaders/node_light_falloff.osl b/intern/cycles/kernel/shaders/node_light_falloff.osl
index d0d7dd9c5aa..3f3c9444a5a 100644
--- a/intern/cycles/kernel/shaders/node_light_falloff.osl
+++ b/intern/cycles/kernel/shaders/node_light_falloff.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_light_falloff(float Strength = 0.0,
float Smooth = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index c4a3624a67f..4ff06915771 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_light_path(output float IsCameraRay = 0.0,
output float IsShadowRay = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_magic_texture.osl b/intern/cycles/kernel/shaders/node_magic_texture.osl
index aa700e575ef..476c6895f05 100644
--- a/intern/cycles/kernel/shaders/node_magic_texture.osl
+++ b/intern/cycles/kernel/shaders/node_magic_texture.osl
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "stdcycles.h"
/* Magic */
diff --git a/intern/cycles/kernel/shaders/node_map_range.osl b/intern/cycles/kernel/shaders/node_map_range.osl
new file mode 100644
index 00000000000..1c49027e6dd
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_map_range.osl
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdcycles.h"
+
+float safe_divide(float a, float b)
+{
+ return (b != 0.0) ? a / b : 0.0;
+}
+
+float smootherstep(float edge0, float edge1, float x)
+{
+ float t = clamp(safe_divide((x - edge0), (edge1 - edge0)), 0.0, 1.0);
+ return t * t * t * (t * (t * 6.0 - 15.0) + 10.0);
+}
+
+shader node_map_range(string type = "linear",
+ float Value = 1.0,
+ float FromMin = 0.0,
+ float FromMax = 1.0,
+ float ToMin = 0.0,
+ float ToMax = 1.0,
+ float Steps = 4.0,
+ output float Result = 0.0)
+{
+ if (FromMax != FromMin) {
+ float Factor = Value;
+ if (type == "stepped") {
+ Factor = (Value - FromMin) / (FromMax - FromMin);
+ Factor = (Steps > 0) ? floor(Factor * (Steps + 1.0)) / Steps : 0.0;
+ }
+ else if (type == "smoothstep") {
+ Factor = (FromMin > FromMax) ? 1.0 - smoothstep(FromMax, FromMin, Value) :
+ smoothstep(FromMin, FromMax, Value);
+ }
+ else if (type == "smootherstep") {
+ Factor = (FromMin > FromMax) ? 1.0 - smootherstep(FromMax, FromMin, Value) :
+ smootherstep(FromMin, FromMax, Value);
+ }
+ else {
+ Factor = (Value - FromMin) / (FromMax - FromMin);
+ }
+ Result = ToMin + Factor * (ToMax - ToMin);
+ }
+}
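
For reference, a minimal standalone C sketch of the linear and smootherstep variants of the map-range logic added above, using the same quintic 6t^5 - 15t^4 + 10t^3 interpolant; function names here are illustrative, not taken from the Cycles sources.

#include <math.h>
#include <stdio.h>

static float clampf(float x, float lo, float hi)
{
  return x < lo ? lo : (x > hi ? hi : x);
}

/* Quintic interpolant used by the smootherstep variant. */
static float smootherstep(float edge0, float edge1, float x)
{
  float d = edge1 - edge0;
  float t = (d != 0.0f) ? clampf((x - edge0) / d, 0.0f, 1.0f) : 0.0f;
  return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}

/* Remap value from [from_min, from_max] to [to_min, to_max]. */
static float map_range(float value, float from_min, float from_max,
                       float to_min, float to_max, int smoother)
{
  if (from_max == from_min)
    return 0.0f;
  float factor = smoother ? smootherstep(from_min, from_max, value) :
                            (value - from_min) / (from_max - from_min);
  return to_min + factor * (to_max - to_min);
}

int main(void)
{
  printf("%.3f\n", map_range(0.25f, 0.0f, 1.0f, 0.0f, 10.0f, 0)); /* 2.500 */
  printf("%.3f\n", map_range(0.25f, 0.0f, 1.0f, 0.0f, 10.0f, 1)); /* ~1.035 */
  return 0;
}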
diff --git a/intern/cycles/kernel/shaders/node_mapping.osl b/intern/cycles/kernel/shaders/node_mapping.osl
index f5cc2d1c5dd..8d204999630 100644
--- a/intern/cycles/kernel/shaders/node_mapping.osl
+++ b/intern/cycles/kernel/shaders/node_mapping.osl
@@ -14,19 +14,60 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
-shader node_mapping(matrix Matrix = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
- point mapping_min = point(0.0, 0.0, 0.0),
- point mapping_max = point(0.0, 0.0, 0.0),
- int use_minmax = 0,
- point VectorIn = point(0.0, 0.0, 0.0),
- output point VectorOut = point(0.0, 0.0, 0.0))
+point safe_divide(point a, point b)
+{
+ return point((b[0] != 0.0) ? a[0] / b[0] : 0.0,
+ (b[1] != 0.0) ? a[1] / b[1] : 0.0,
+ (b[2] != 0.0) ? a[2] / b[2] : 0.0);
+}
+
+matrix euler_to_mat(point euler)
{
- point p = transform(Matrix, VectorIn);
+ float cx = cos(euler[0]);
+ float cy = cos(euler[1]);
+ float cz = cos(euler[2]);
+ float sx = sin(euler[0]);
+ float sy = sin(euler[1]);
+ float sz = sin(euler[2]);
+
+ matrix mat = matrix(1.0);
+ mat[0][0] = cy * cz;
+ mat[0][1] = cy * sz;
+ mat[0][2] = -sy;
- if (use_minmax)
- p = min(max(mapping_min, p), mapping_max);
+ mat[1][0] = sy * sx * cz - cx * sz;
+ mat[1][1] = sy * sx * sz + cx * cz;
+ mat[1][2] = cy * sx;
- VectorOut = p;
+ mat[2][0] = sy * cx * cz + sx * sz;
+ mat[2][1] = sy * cx * sz - sx * cz;
+ mat[2][2] = cy * cx;
+ return mat;
+}
+
+shader node_mapping(string type = "point",
+ point VectorIn = point(0.0, 0.0, 0.0),
+ point Location = point(0.0, 0.0, 0.0),
+ point Rotation = point(0.0, 0.0, 0.0),
+ point Scale = point(1.0, 1.0, 1.0),
+ output point VectorOut = point(0.0, 0.0, 0.0))
+{
+ if (type == "point") {
+ VectorOut = transform(euler_to_mat(Rotation), (VectorIn * Scale)) + Location;
+ }
+ else if (type == "texture") {
+ VectorOut = safe_divide(transform(transpose(euler_to_mat(Rotation)), (VectorIn - Location)),
+ Scale);
+ }
+ else if (type == "vector") {
+ VectorOut = transform(euler_to_mat(Rotation), (VectorIn * Scale));
+ }
+ else if (type == "normal") {
+ VectorOut = normalize((vector)transform(euler_to_mat(Rotation), safe_divide(VectorIn, Scale)));
+ }
+ else {
+ warning("%s", "Unknown Mapping vector type!");
+ }
}
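
The mapping node now builds an XYZ Euler rotation matrix directly in the shader. Below is a small C sketch of the same euler_to_mat construction applied in the "point" mode order (scale, then rotate, then translate), assuming a row-vector transform convention as used by the shader's transform() call; everything here is illustrative.

#include <math.h>
#include <stdio.h>

typedef struct { float v[3]; } Vec3;
typedef struct { float m[3][3]; } Mat3;

/* Rotation matrix for XYZ Euler angles, matching the shader's euler_to_mat(). */
static Mat3 euler_to_mat(Vec3 e)
{
  float cx = cosf(e.v[0]), cy = cosf(e.v[1]), cz = cosf(e.v[2]);
  float sx = sinf(e.v[0]), sy = sinf(e.v[1]), sz = sinf(e.v[2]);
  Mat3 r;
  r.m[0][0] = cy * cz;                r.m[0][1] = cy * sz;                r.m[0][2] = -sy;
  r.m[1][0] = sy * sx * cz - cx * sz; r.m[1][1] = sy * sx * sz + cx * cz; r.m[1][2] = cy * sx;
  r.m[2][0] = sy * cx * cz + sx * sz; r.m[2][1] = sy * cx * sz - sx * cz; r.m[2][2] = cy * cx;
  return r;
}

/* Row-vector transform (p' = p * M), the convention assumed for transform() here. */
static Vec3 transform_point(Mat3 m, Vec3 p)
{
  Vec3 o;
  for (int j = 0; j < 3; j++)
    o.v[j] = p.v[0] * m.m[0][j] + p.v[1] * m.m[1][j] + p.v[2] * m.m[2][j];
  return o;
}

int main(void)
{
  Vec3 p = {{1.0f, 0.0f, 0.0f}};
  Vec3 rot = {{0.0f, 0.0f, 1.5707964f}}; /* 90 degrees around Z */
  Vec3 q = transform_point(euler_to_mat(rot), p);
  printf("%.3f %.3f %.3f\n", q.v[0], q.v[1], q.v[2]); /* ~0 1 0 */
  return 0;
}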
diff --git a/intern/cycles/kernel/shaders/node_math.h b/intern/cycles/kernel/shaders/node_math.h
new file mode 100644
index 00000000000..4b1a6c5bc16
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_math.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+float safe_divide(float a, float b)
+{
+ return (b != 0.0) ? a / b : 0.0;
+}
+
+vector safe_divide(vector a, vector b)
+{
+ return vector((b[0] != 0.0) ? a[0] / b[0] : 0.0,
+ (b[1] != 0.0) ? a[1] / b[1] : 0.0,
+ (b[2] != 0.0) ? a[2] / b[2] : 0.0);
+}
+
+float safe_modulo(float a, float b)
+{
+ return (b != 0.0) ? fmod(a, b) : 0.0;
+}
+
+float fract(float a)
+{
+ return a - floor(a);
+}
+
+/* See: https://www.iquilezles.org/www/articles/smin/smin.htm. */
+float smoothmin(float a, float b, float c)
+{
+ if (c != 0.0) {
+ float h = max(c - abs(a - b), 0.0) / c;
+ return min(a, b) - h * h * h * c * (1.0 / 6.0);
+ }
+ else {
+ return min(a, b);
+ }
+}
+
+float pingpong(float a, float b)
+{
+ return (b != 0.0) ? abs(fract((a - b) / (b * 2.0)) * b * 2.0 - b) : 0.0;
+}
+
+float safe_sqrt(float a)
+{
+ return (a > 0.0) ? sqrt(a) : 0.0;
+}
+
+float safe_log(float a, float b)
+{
+ return (a > 0.0 && b > 0.0) ? log(a) / log(b) : 0.0;
+}
+
+vector project(vector v, vector v_proj)
+{
+ float lenSquared = dot(v_proj, v_proj);
+ return (lenSquared != 0.0) ? (dot(v, v_proj) / lenSquared) * v_proj : vector(0.0);
+}
+
+vector snap(vector a, vector b)
+{
+ return floor(safe_divide(a, b)) * b;
+}
+
+/* Adapted from godotengine math_funcs.h. */
+float wrap(float value, float max, float min)
+{
+ float range = max - min;
+ return (range != 0.0) ? value - (range * floor((value - min) / range)) : min;
+}
+
+point wrap(point value, point max, point min)
+{
+ return point(wrap(value[0], max[0], min[0]),
+ wrap(value[1], max[1], min[1]),
+ wrap(value[2], max[2], min[2]));
+}
+
+matrix euler_to_mat(point euler)
+{
+ float cx = cos(euler[0]);
+ float cy = cos(euler[1]);
+ float cz = cos(euler[2]);
+ float sx = sin(euler[0]);
+ float sy = sin(euler[1]);
+ float sz = sin(euler[2]);
+ matrix mat = matrix(1.0);
+ mat[0][0] = cy * cz;
+ mat[0][1] = cy * sz;
+ mat[0][2] = -sy;
+ mat[1][0] = sy * sx * cz - cx * sz;
+ mat[1][1] = sy * sx * sz + cx * cz;
+ mat[1][2] = cy * sx;
+ mat[2][0] = sy * cx * cz + sx * sz;
+ mat[2][1] = sy * cx * sz - sx * cz;
+ mat[2][2] = cy * cx;
+ return mat;
+}
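
A few of the helpers above (wrap, pingpong, smoothmin) condensed into a runnable C sketch with sample values; the formulas follow the header, but the names and driver code are only an illustration.

#include <math.h>
#include <stdio.h>

/* Wrap value into [min, max), as in the header adapted from Godot. */
static float wrapf(float value, float max, float min)
{
  float range = max - min;
  return (range != 0.0f) ? value - range * floorf((value - min) / range) : min;
}

/* Triangle-wave ping-pong of a over period b. */
static float pingpong(float a, float b)
{
  if (b == 0.0f)
    return 0.0f;
  float f = (a - b) / (2.0f * b);
  return fabsf((f - floorf(f)) * 2.0f * b - b);
}

/* Polynomial smooth minimum (Inigo Quilez); c controls the blend width. */
static float smoothmin(float a, float b, float c)
{
  if (c == 0.0f)
    return fminf(a, b);
  float h = fmaxf(c - fabsf(a - b), 0.0f) / c;
  return fminf(a, b) - h * h * h * c * (1.0f / 6.0f);
}

int main(void)
{
  printf("wrap(5.5, 2, 0)        = %.2f\n", wrapf(5.5f, 2.0f, 0.0f)); /* 1.50 */
  printf("pingpong(2.5, 1)       = %.2f\n", pingpong(2.5f, 1.0f));    /* 0.50 */
  printf("smoothmin(1, 1.1, 0.5) = %.3f\n", smoothmin(1.0f, 1.1f, 0.5f)); /* ~0.957 */
  return 0;
}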
diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl
index 8830339e05f..dbaa7ccb60e 100644
--- a/intern/cycles/kernel/shaders/node_math.osl
+++ b/intern/cycles/kernel/shaders/node_math.osl
@@ -14,60 +14,16 @@
* limitations under the License.
*/
-#include "stdosl.h"
-
-float safe_divide(float a, float b)
-{
- float result;
-
- if (b == 0.0)
- result = 0.0;
- else
- result = a / b;
-
- return result;
-}
-
-float safe_modulo(float a, float b)
-{
- float result;
-
- if (b == 0.0)
- result = 0.0;
- else
- result = fmod(a, b);
-
- return result;
-}
-
-float safe_sqrt(float a)
-{
- float result;
-
- if (a > 0.0)
- result = sqrt(a);
- else
- result = 0.0;
-
- return result;
-}
-
-float safe_log(float a, float b)
-{
- if (a < 0.0 || b < 0.0)
- return 0.0;
-
- return log(a) / log(b);
-}
+#include "node_math.h"
+#include "stdcycles.h"
+/* OSL asin, acos, and pow functions are safe by default. */
shader node_math(string type = "add",
- int use_clamp = 0,
- float Value1 = 0.0,
- float Value2 = 0.0,
+ float Value1 = 0.5,
+ float Value2 = 0.5,
+ float Value3 = 0.5,
output float Value = 0.0)
{
- /* OSL asin, acos, pow check for values that could give rise to nan */
-
if (type == "add")
Value = Value1 + Value2;
else if (type == "subtract")
@@ -76,47 +32,78 @@ shader node_math(string type = "add",
Value = Value1 * Value2;
else if (type == "divide")
Value = safe_divide(Value1, Value2);
- else if (type == "sine")
- Value = sin(Value1);
- else if (type == "cosine")
- Value = cos(Value1);
- else if (type == "tangent")
- Value = tan(Value1);
- else if (type == "arcsine")
- Value = asin(Value1);
- else if (type == "arccosine")
- Value = acos(Value1);
- else if (type == "arctangent")
- Value = atan(Value1);
else if (type == "power")
Value = pow(Value1, Value2);
else if (type == "logarithm")
Value = safe_log(Value1, Value2);
+ else if (type == "sqrt")
+ Value = safe_sqrt(Value1);
+ else if (type == "inversesqrt")
+ Value = inversesqrt(Value1);
+ else if (type == "absolute")
+ Value = fabs(Value1);
+ else if (type == "radians")
+ Value = radians(Value1);
+ else if (type == "degrees")
+ Value = degrees(Value1);
else if (type == "minimum")
Value = min(Value1, Value2);
else if (type == "maximum")
Value = max(Value1, Value2);
- else if (type == "round")
- Value = floor(Value1 + 0.5);
else if (type == "less_than")
Value = Value1 < Value2;
else if (type == "greater_than")
Value = Value1 > Value2;
- else if (type == "modulo")
- Value = safe_modulo(Value1, Value2);
- else if (type == "absolute")
- Value = fabs(Value1);
- else if (type == "arctan2")
- Value = atan2(Value1, Value2);
+ else if (type == "round")
+ Value = floor(Value1 + 0.5);
else if (type == "floor")
Value = floor(Value1);
else if (type == "ceil")
Value = ceil(Value1);
- else if (type == "fract")
+ else if (type == "fraction")
Value = Value1 - floor(Value1);
- else if (type == "sqrt")
- Value = safe_sqrt(Value1);
-
- if (use_clamp)
- Value = clamp(Value, 0.0, 1.0);
+ else if (type == "modulo")
+ Value = safe_modulo(Value1, Value2);
+ else if (type == "trunc")
+ Value = trunc(Value1);
+ else if (type == "snap")
+ Value = floor(safe_divide(Value1, Value2)) * Value2;
+ else if (type == "wrap")
+ Value = wrap(Value1, Value2, Value3);
+ else if (type == "pingpong")
+ Value = pingpong(Value1, Value2);
+ else if (type == "sine")
+ Value = sin(Value1);
+ else if (type == "cosine")
+ Value = cos(Value1);
+ else if (type == "tangent")
+ Value = tan(Value1);
+ else if (type == "sinh")
+ Value = sinh(Value1);
+ else if (type == "cosh")
+ Value = cosh(Value1);
+ else if (type == "tanh")
+ Value = tanh(Value1);
+ else if (type == "arcsine")
+ Value = asin(Value1);
+ else if (type == "arccosine")
+ Value = acos(Value1);
+ else if (type == "arctangent")
+ Value = atan(Value1);
+ else if (type == "arctan2")
+ Value = atan2(Value1, Value2);
+ else if (type == "sign")
+ Value = sign(Value1);
+ else if (type == "exponent")
+ Value = exp(Value1);
+ else if (type == "compare")
+ Value = ((Value1 == Value2) || (abs(Value1 - Value2) <= max(Value3, 1e-5))) ? 1.0 : 0.0;
+ else if (type == "multiply_add")
+ Value = Value1 * Value2 + Value3;
+ else if (type == "smoothmin")
+ Value = smoothmin(Value1, Value2, Value3);
+ else if (type == "smoothmax")
+ Value = -(smoothmin(-Value1, -Value2, Value3));
+ else
+ warning("%s", "Unknown math operator!");
}
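
Among the new operators above, "compare" is the only one with a tolerance; it returns 1.0 when the inputs match within an epsilon that is never allowed to drop below 1e-5. A tiny C sketch of that rule (illustrative only):

#include <math.h>
#include <stdio.h>

/* "compare": 1.0 when the inputs are equal within an epsilon, else 0.0.
 * v3 is the user epsilon, clamped below by 1e-5 as in the shader. */
static float math_compare(float v1, float v2, float v3)
{
  return (v1 == v2 || fabsf(v1 - v2) <= fmaxf(v3, 1e-5f)) ? 1.0f : 0.0f;
}

int main(void)
{
  printf("%.0f\n", math_compare(0.30f, 0.31f, 0.02f)); /* 1 */
  printf("%.0f\n", math_compare(0.30f, 0.35f, 0.02f)); /* 0 */
  return 0;
}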
diff --git a/intern/cycles/kernel/shaders/node_mix.osl b/intern/cycles/kernel/shaders/node_mix.osl
index 8caea6803ed..a13b4bb7b96 100644
--- a/intern/cycles/kernel/shaders/node_mix.osl
+++ b/intern/cycles/kernel/shaders/node_mix.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_color.h"
+#include "stdcycles.h"
color node_mix_blend(float t, color col1, color col2)
{
@@ -91,12 +91,12 @@ color node_mix_diff(float t, color col1, color col2)
color node_mix_dark(float t, color col1, color col2)
{
- return min(col1, col2) * t + col1 * (1.0 - t);
+ return mix(col1, min(col1, col2), t);
}
color node_mix_light(float t, color col1, color col2)
{
- return max(col1, col2 * t);
+ return mix(col1, max(col1, col2), t);
}
color node_mix_dodge(float t, color col1, color col2)
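
The darken and lighten blend modes above are rewritten as a straight linear blend between col1 and the per-channel min/max of the two colors, instead of the older ad-hoc formulas. A tiny single-channel C comparison of the old and new lighten behavior (illustrative only):

#include <math.h>
#include <stdio.h>

static float mixf(float a, float b, float t) { return a + t * (b - a); }

/* Old lighten: max(col1, col2 * t). New lighten: mix(col1, max(col1, col2), t). */
int main(void)
{
  float col1 = 0.2f, col2 = 0.8f, t = 0.5f;
  printf("old: %.2f  new: %.2f\n",
         fmaxf(col1, col2 * t),                 /* 0.40 */
         mixf(col1, fmaxf(col1, col2), t));     /* 0.50: exact linear blend in t */
  return 0;
}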
diff --git a/intern/cycles/kernel/shaders/node_mix_closure.osl b/intern/cycles/kernel/shaders/node_mix_closure.osl
index 517c59c8786..94fc2171c44 100644
--- a/intern/cycles/kernel/shaders/node_mix_closure.osl
+++ b/intern/cycles/kernel/shaders/node_mix_closure.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_mix_closure(float Fac = 0.5,
closure color Closure1 = 0,
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index a7877c43d46..d03b84c1ab4 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -14,10 +14,344 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "node_noise.h"
+#include "stdcycles.h"
+#include "vector2.h"
+#include "vector4.h"
-/* Musgrave fBm
+#define vector3 point
+
+/* 1D Musgrave fBm
+ *
+ * H: fractal increment parameter
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ *
+ * from "Texturing and Modelling: A procedural approach"
+ */
+
+float noise_musgrave_fBm_1d(float co, float H, float lacunarity, float octaves)
+{
+ float p = co;
+ float value = 0.0;
+ float pwr = 1.0;
+ float pwHL = pow(lacunarity, -H);
+
+ for (int i = 0; i < (int)octaves; i++) {
+ value += safe_snoise(p) * pwr;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * safe_snoise(p) * pwr;
+ }
+
+ return value;
+}
+
+/* 1D Musgrave Multifractal
+ *
+ * H: highest fractal dimension
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ */
+
+float noise_musgrave_multi_fractal_1d(float co, float H, float lacunarity, float octaves)
+{
+ float p = co;
+ float value = 1.0;
+ float pwr = 1.0;
+ float pwHL = pow(lacunarity, -H);
+
+ for (int i = 0; i < (int)octaves; i++) {
+ value *= (pwr * safe_snoise(p) + 1.0);
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value *= (rmd * pwr * safe_snoise(p) + 1.0); /* correct? */
+ }
+
+ return value;
+}
+
+/* 1D Musgrave Heterogeneous Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_hetero_terrain_1d(
+ float co, float H, float lacunarity, float octaves, float offset)
+{
+ float p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ /* first unscaled octave of function; later octaves are scaled */
+ float value = offset + safe_snoise(p);
+ p *= lacunarity;
+
+ for (int i = 1; i < (int)octaves; i++) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
+ value += increment;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
+ value += rmd * increment;
+ }
+
+ return value;
+}
+
+/* 1D Hybrid Additive/Multiplicative Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_hybrid_multi_fractal_1d(
+ float co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ float value = safe_snoise(p) + offset;
+ float weight = gain * value;
+ p *= lacunarity;
+
+ for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+ if (weight > 1.0) {
+ weight = 1.0;
+ }
+
+ float signal = (safe_snoise(p) + offset) * pwr;
+ pwr *= pwHL;
+ value += weight * signal;
+ weight *= gain * signal;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * ((safe_snoise(p) + offset) * pwr);
+ }
+
+ return value;
+}
+
+/* 1D Ridged Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_ridged_multi_fractal_1d(
+ float co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ float signal = offset - fabs(safe_snoise(p));
+ signal *= signal;
+ float value = signal;
+ float weight = 1.0;
+
+ for (int i = 1; i < (int)octaves; i++) {
+ p *= lacunarity;
+ weight = clamp(signal * gain, 0.0, 1.0);
+ signal = offset - fabs(safe_snoise(p));
+ signal *= signal;
+ signal *= weight;
+ value += signal * pwr;
+ pwr *= pwHL;
+ }
+
+ return value;
+}
+
+/* 2D Musgrave fBm
+ *
+ * H: fractal increment parameter
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ *
+ * from "Texturing and Modelling: A procedural approach"
+ */
+
+float noise_musgrave_fBm_2d(vector2 co, float H, float lacunarity, float octaves)
+{
+ vector2 p = co;
+ float value = 0.0;
+ float pwr = 1.0;
+ float pwHL = pow(lacunarity, -H);
+
+ for (int i = 0; i < (int)octaves; i++) {
+ value += safe_snoise(p) * pwr;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * safe_snoise(p) * pwr;
+ }
+
+ return value;
+}
+
+/* 2D Musgrave Multifractal
+ *
+ * H: highest fractal dimension
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ */
+
+float noise_musgrave_multi_fractal_2d(vector2 co, float H, float lacunarity, float octaves)
+{
+ vector2 p = co;
+ float value = 1.0;
+ float pwr = 1.0;
+ float pwHL = pow(lacunarity, -H);
+
+ for (int i = 0; i < (int)octaves; i++) {
+ value *= (pwr * safe_snoise(p) + 1.0);
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value *= (rmd * pwr * safe_snoise(p) + 1.0); /* correct? */
+ }
+
+ return value;
+}
+
+/* 2D Musgrave Heterogeneous Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_hetero_terrain_2d(
+ vector2 co, float H, float lacunarity, float octaves, float offset)
+{
+ vector2 p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ /* first unscaled octave of function; later octaves are scaled */
+ float value = offset + safe_snoise(p);
+ p *= lacunarity;
+
+ for (int i = 1; i < (int)octaves; i++) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
+ value += increment;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
+ value += rmd * increment;
+ }
+
+ return value;
+}
+
+/* 2D Hybrid Additive/Multiplicative Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_hybrid_multi_fractal_2d(
+ vector2 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ vector2 p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ float value = safe_snoise(p) + offset;
+ float weight = gain * value;
+ p *= lacunarity;
+
+ for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+ if (weight > 1.0) {
+ weight = 1.0;
+ }
+
+ float signal = (safe_snoise(p) + offset) * pwr;
+ pwr *= pwHL;
+ value += weight * signal;
+ weight *= gain * signal;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * ((safe_snoise(p) + offset) * pwr);
+ }
+
+ return value;
+}
+
+/* 2D Ridged Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_ridged_multi_fractal_2d(
+ vector2 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ vector2 p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ float signal = offset - fabs(safe_snoise(p));
+ signal *= signal;
+ float value = signal;
+ float weight = 1.0;
+
+ for (int i = 1; i < (int)octaves; i++) {
+ p *= lacunarity;
+ weight = clamp(signal * gain, 0.0, 1.0);
+ signal = offset - fabs(safe_snoise(p));
+ signal *= signal;
+ signal *= weight;
+ value += signal * pwr;
+ pwr *= pwHL;
+ }
+
+ return value;
+}
+
+/* 3D Musgrave fBm
*
* H: fractal increment parameter
* lacunarity: gap between successive frequencies
@@ -26,58 +360,56 @@
* from "Texturing and Modelling: A procedural approach"
*/
-float noise_musgrave_fBm(point ip, float H, float lacunarity, float octaves)
+float noise_musgrave_fBm_3d(vector3 co, float H, float lacunarity, float octaves)
{
- float rmd;
+ vector3 p = co;
float value = 0.0;
float pwr = 1.0;
float pwHL = pow(lacunarity, -H);
- int i;
- point p = ip;
- for (i = 0; i < (int)octaves; i++) {
- value += safe_noise(p, "signed") * pwr;
+ for (int i = 0; i < (int)octaves; i++) {
+ value += safe_snoise(p) * pwr;
pwr *= pwHL;
p *= lacunarity;
}
- rmd = octaves - floor(octaves);
- if (rmd != 0.0)
- value += rmd * safe_noise(p, "signed") * pwr;
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * safe_snoise(p) * pwr;
+ }
return value;
}
-/* Musgrave Multifractal
+/* 3D Musgrave Multifractal
*
* H: highest fractal dimension
* lacunarity: gap between successive frequencies
* octaves: number of frequencies in the fBm
*/
-float noise_musgrave_multi_fractal(point ip, float H, float lacunarity, float octaves)
+float noise_musgrave_multi_fractal_3d(vector3 co, float H, float lacunarity, float octaves)
{
- float rmd;
+ vector3 p = co;
float value = 1.0;
float pwr = 1.0;
float pwHL = pow(lacunarity, -H);
- int i;
- point p = ip;
- for (i = 0; i < (int)octaves; i++) {
- value *= (pwr * safe_noise(p, "signed") + 1.0);
+ for (int i = 0; i < (int)octaves; i++) {
+ value *= (pwr * safe_snoise(p) + 1.0);
pwr *= pwHL;
p *= lacunarity;
}
- rmd = octaves - floor(octaves);
- if (rmd != 0.0)
- value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value *= (rmd * pwr * safe_snoise(p) + 1.0); /* correct? */
+ }
return value;
}
-/* Musgrave Heterogeneous Terrain
+/* 3D Musgrave Heterogeneous Terrain
*
* H: fractal dimension of the roughest area
* lacunarity: gap between successive frequencies
@@ -85,36 +417,34 @@ float noise_musgrave_multi_fractal(point ip, float H, float lacunarity, float oc
* offset: raises the terrain from `sea level'
*/
-float noise_musgrave_hetero_terrain(
- point ip, float H, float lacunarity, float octaves, float offset)
+float noise_musgrave_hetero_terrain_3d(
+ vector3 co, float H, float lacunarity, float octaves, float offset)
{
- float value, increment, rmd;
+ vector3 p = co;
float pwHL = pow(lacunarity, -H);
float pwr = pwHL;
- int i;
- point p = ip;
/* first unscaled octave of function; later octaves are scaled */
- value = offset + safe_noise(p, "signed");
+ float value = offset + safe_snoise(p);
p *= lacunarity;
- for (i = 1; i < (int)octaves; i++) {
- increment = (safe_noise(p, "signed") + offset) * pwr * value;
+ for (int i = 1; i < (int)octaves; i++) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
value += increment;
pwr *= pwHL;
p *= lacunarity;
}
- rmd = octaves - floor(octaves);
+ float rmd = octaves - floor(octaves);
if (rmd != 0.0) {
- increment = (safe_noise(p, "signed") + offset) * pwr * value;
+ float increment = (safe_snoise(p) + offset) * pwr * value;
value += rmd * increment;
}
return value;
}
-/* Hybrid Additive/Multiplicative Multifractal Terrain
+/* 3D Hybrid Additive/Multiplicative Multifractal Terrain
*
* H: fractal dimension of the roughest area
* lacunarity: gap between successive frequencies
@@ -122,38 +452,38 @@ float noise_musgrave_hetero_terrain(
* offset: raises the terrain from `sea level'
*/
-float noise_musgrave_hybrid_multi_fractal(
- point ip, float H, float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_hybrid_multi_fractal_3d(
+ vector3 co, float H, float lacunarity, float octaves, float offset, float gain)
{
- float result, signal, weight, rmd;
+ vector3 p = co;
float pwHL = pow(lacunarity, -H);
float pwr = pwHL;
- int i;
- point p = ip;
- result = safe_noise(p, "signed") + offset;
- weight = gain * result;
+ float value = safe_snoise(p) + offset;
+ float weight = gain * value;
p *= lacunarity;
- for (i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
- if (weight > 1.0)
+ for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+ if (weight > 1.0) {
weight = 1.0;
+ }
- signal = (safe_noise(p, "signed") + offset) * pwr;
+ float signal = (safe_snoise(p) + offset) * pwr;
pwr *= pwHL;
- result += weight * signal;
+ value += weight * signal;
weight *= gain * signal;
p *= lacunarity;
}
- rmd = octaves - floor(octaves);
- if (rmd != 0.0)
- result += rmd * ((safe_noise(p, "signed") + offset) * pwr);
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * ((safe_snoise(p) + offset) * pwr);
+ }
- return result;
+ return value;
}
-/* Ridged Multifractal Terrain
+/* 3D Ridged Multifractal Terrain
*
* H: fractal dimension of the roughest area
* lacunarity: gap between successive frequencies
@@ -161,73 +491,313 @@ float noise_musgrave_hybrid_multi_fractal(
* offset: raises the terrain from `sea level'
*/
-float noise_musgrave_ridged_multi_fractal(
- point ip, float H, float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_ridged_multi_fractal_3d(
+ vector3 co, float H, float lacunarity, float octaves, float offset, float gain)
{
- float result, signal, weight;
+ vector3 p = co;
float pwHL = pow(lacunarity, -H);
float pwr = pwHL;
- int i;
- point p = ip;
- signal = offset - fabs(safe_noise(p, "signed"));
+ float signal = offset - fabs(safe_snoise(p));
signal *= signal;
- result = signal;
- weight = 1.0;
+ float value = signal;
+ float weight = 1.0;
- for (i = 1; i < (int)octaves; i++) {
+ for (int i = 1; i < (int)octaves; i++) {
p *= lacunarity;
weight = clamp(signal * gain, 0.0, 1.0);
- signal = offset - fabs(safe_noise(p, "signed"));
+ signal = offset - fabs(safe_snoise(p));
signal *= signal;
signal *= weight;
- result += signal * pwr;
+ value += signal * pwr;
pwr *= pwHL;
}
- return result;
+ return value;
}
-/* Shader */
+/* 4D Musgrave fBm
+ *
+ * H: fractal increment parameter
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ *
+ * from "Texturing and Modelling: A procedural approach"
+ */
+
+float noise_musgrave_fBm_4d(vector4 co, float H, float lacunarity, float octaves)
+{
+ vector4 p = co;
+ float value = 0.0;
+ float pwr = 1.0;
+ float pwHL = pow(lacunarity, -H);
+
+ for (int i = 0; i < (int)octaves; i++) {
+ value += safe_snoise(p) * pwr;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * safe_snoise(p) * pwr;
+ }
+
+ return value;
+}
+
+/* 4D Musgrave Multifractal
+ *
+ * H: highest fractal dimension
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ */
+
+float noise_musgrave_multi_fractal_4d(vector4 co, float H, float lacunarity, float octaves)
+{
+ vector4 p = co;
+ float value = 1.0;
+ float pwr = 1.0;
+ float pwHL = pow(lacunarity, -H);
+
+ for (int i = 0; i < (int)octaves; i++) {
+ value *= (pwr * safe_snoise(p) + 1.0);
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value *= (rmd * pwr * safe_snoise(p) + 1.0); /* correct? */
+ }
+
+ return value;
+}
+
+/* 4D Musgrave Heterogeneous Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_hetero_terrain_4d(
+ vector4 co, float H, float lacunarity, float octaves, float offset)
+{
+ vector4 p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ /* first unscaled octave of function; later octaves are scaled */
+ float value = offset + safe_snoise(p);
+ p *= lacunarity;
+
+ for (int i = 1; i < (int)octaves; i++) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
+ value += increment;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float increment = (safe_snoise(p) + offset) * pwr * value;
+ value += rmd * increment;
+ }
+
+ return value;
+}
+
+/* 4D Hybrid Additive/Multiplicative Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_hybrid_multi_fractal_4d(
+ vector4 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ vector4 p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ float value = safe_snoise(p) + offset;
+ float weight = gain * value;
+ p *= lacunarity;
+
+ for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+ if (weight > 1.0) {
+ weight = 1.0;
+ }
+
+ float signal = (safe_snoise(p) + offset) * pwr;
+ pwr *= pwHL;
+ value += weight * signal;
+ weight *= gain * signal;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ value += rmd * ((safe_snoise(p) + offset) * pwr);
+ }
+
+ return value;
+}
+
+/* 4D Ridged Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+float noise_musgrave_ridged_multi_fractal_4d(
+ vector4 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ vector4 p = co;
+ float pwHL = pow(lacunarity, -H);
+ float pwr = pwHL;
+
+ float signal = offset - fabs(safe_snoise(p));
+ signal *= signal;
+ float value = signal;
+ float weight = 1.0;
+
+ for (int i = 1; i < (int)octaves; i++) {
+ p *= lacunarity;
+ weight = clamp(signal * gain, 0.0, 1.0);
+ signal = offset - fabs(safe_snoise(p));
+ signal *= signal;
+ signal *= weight;
+ value += signal * pwr;
+ pwr *= pwHL;
+ }
+
+ return value;
+}
shader node_musgrave_texture(
int use_mapping = 0,
matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
string type = "fBM",
+ string dimensions = "3D",
+ point Vector = P,
+ float W = 0.0,
float Dimension = 2.0,
- float Lacunarity = 1.0,
+ float Scale = 5.0,
float Detail = 2.0,
+ float Lacunarity = 2.0,
float Offset = 0.0,
float Gain = 1.0,
- float Scale = 5.0,
- point Vector = P,
- output float Fac = 0.0,
- output color Color = 0.0)
+ output float Fac = 0.0)
{
float dimension = max(Dimension, 1e-5);
float octaves = clamp(Detail, 0.0, 16.0);
float lacunarity = max(Lacunarity, 1e-5);
- float intensity = 1.0;
- point p = Vector;
+ vector3 s = Vector;
if (use_mapping)
- p = transform(mapping, p);
-
- p = p * Scale;
-
- if (type == "multifractal")
- Fac = intensity * noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
- else if (type == "fBM")
- Fac = intensity * noise_musgrave_fBm(p, dimension, lacunarity, octaves);
- else if (type == "hybrid_multifractal")
- Fac = intensity *
- noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
- else if (type == "ridged_multifractal")
- Fac = intensity *
- noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
- else if (type == "hetero_terrain")
- Fac = intensity * noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, Offset);
-
- Color = color(Fac, Fac, Fac);
+ s = transform(mapping, s);
+
+ if (dimensions == "1D") {
+ float p = W * Scale;
+ if (type == "multifractal") {
+ Fac = noise_musgrave_multi_fractal_1d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "fBM") {
+ Fac = noise_musgrave_fBm_1d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "hybrid_multifractal") {
+ Fac = noise_musgrave_hybrid_multi_fractal_1d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "ridged_multifractal") {
+ Fac = noise_musgrave_ridged_multi_fractal_1d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "hetero_terrain") {
+ Fac = noise_musgrave_hetero_terrain_1d(p, dimension, lacunarity, octaves, Offset);
+ }
+ else {
+ Fac = 0.0;
+ }
+ }
+ else if (dimensions == "2D") {
+ vector2 p = vector2(s[0], s[1]) * Scale;
+ if (type == "multifractal") {
+ Fac = noise_musgrave_multi_fractal_2d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "fBM") {
+ Fac = noise_musgrave_fBm_2d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "hybrid_multifractal") {
+ Fac = noise_musgrave_hybrid_multi_fractal_2d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "ridged_multifractal") {
+ Fac = noise_musgrave_ridged_multi_fractal_2d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "hetero_terrain") {
+ Fac = noise_musgrave_hetero_terrain_2d(p, dimension, lacunarity, octaves, Offset);
+ }
+ else {
+ Fac = 0.0;
+ }
+ }
+ else if (dimensions == "3D") {
+ vector3 p = s * Scale;
+ if (type == "multifractal") {
+ Fac = noise_musgrave_multi_fractal_3d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "fBM") {
+ Fac = noise_musgrave_fBm_3d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "hybrid_multifractal") {
+ Fac = noise_musgrave_hybrid_multi_fractal_3d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "ridged_multifractal") {
+ Fac = noise_musgrave_ridged_multi_fractal_3d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "hetero_terrain") {
+ Fac = noise_musgrave_hetero_terrain_3d(p, dimension, lacunarity, octaves, Offset);
+ }
+ else {
+ Fac = 0.0;
+ }
+ }
+ else if (dimensions == "4D") {
+ vector4 p = vector4(s[0], s[1], s[2], W) * Scale;
+ if (type == "multifractal") {
+ Fac = noise_musgrave_multi_fractal_4d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "fBM") {
+ Fac = noise_musgrave_fBm_4d(p, dimension, lacunarity, octaves);
+ }
+ else if (type == "hybrid_multifractal") {
+ Fac = noise_musgrave_hybrid_multi_fractal_4d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "ridged_multifractal") {
+ Fac = noise_musgrave_ridged_multi_fractal_4d(
+ p, dimension, lacunarity, octaves, Offset, Gain);
+ }
+ else if (type == "hetero_terrain") {
+ Fac = noise_musgrave_hetero_terrain_4d(p, dimension, lacunarity, octaves, Offset);
+ }
+ else {
+ Fac = 0.0;
+ }
+ }
+ else {
+ Fac = 0.0;
+ }
}
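
All the Musgrave variants added above share the same octave loop over a signed noise function, with each octave scaled by pow(lacunarity, -H) and a fractional last octave blended in. A compact C sketch of the fBm form follows; the hash-based value noise is a toy stand-in for Cycles' safe_snoise, and everything here is illustrative.

#include <math.h>
#include <stdio.h>

/* Cheap signed 1D value noise standing in for safe_snoise(); Cycles uses Perlin noise. */
static float snoise1(float p)
{
  float i = floorf(p), f = p - i;
  float h0 = sinf(i * 127.1f) * 43758.5453f;
  float h1 = sinf((i + 1.0f) * 127.1f) * 43758.5453f;
  float a = (h0 - floorf(h0)) * 2.0f - 1.0f;
  float b = (h1 - floorf(h1)) * 2.0f - 1.0f;
  float t = f * f * (3.0f - 2.0f * f);
  return a + t * (b - a);
}

/* 1D Musgrave fBm: sum octaves of signed noise, each scaled by pow(lacunarity, -H). */
static float musgrave_fbm_1d(float co, float H, float lacunarity, float octaves)
{
  float p = co, value = 0.0f, pwr = 1.0f;
  float pwHL = powf(lacunarity, -H);
  for (int i = 0; i < (int)octaves; i++) {
    value += snoise1(p) * pwr;
    pwr *= pwHL;
    p *= lacunarity;
  }
  float rmd = octaves - floorf(octaves);
  if (rmd != 0.0f)
    value += rmd * snoise1(p) * pwr;
  return value;
}

int main(void)
{
  for (float x = 0.0f; x < 2.0f; x += 0.5f)
    printf("fbm(%.1f) = %.4f\n", x, musgrave_fbm_1d(x, 1.0f, 2.0f, 4.0f));
  return 0;
}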
diff --git a/intern/cycles/kernel/shaders/node_noise.h b/intern/cycles/kernel/shaders/node_noise.h
new file mode 100644
index 00000000000..ab4cd7792cc
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_noise.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vector2.h"
+#include "vector4.h"
+
+#define vector3 point
+
+float safe_noise(float p)
+{
+ float f = noise("noise", p);
+ if (isinf(f))
+ return 0.5;
+ return f;
+}
+
+float safe_noise(vector2 p)
+{
+ float f = noise("noise", p.x, p.y);
+ if (isinf(f))
+ return 0.5;
+ return f;
+}
+
+float safe_noise(vector3 p)
+{
+ float f = noise("noise", p);
+ if (isinf(f))
+ return 0.5;
+ return f;
+}
+
+float safe_noise(vector4 p)
+{
+ float f = noise("noise", vector3(p.x, p.y, p.z), p.w);
+ if (isinf(f))
+ return 0.5;
+ return f;
+}
+
+float safe_snoise(float p)
+{
+ float f = noise("snoise", p);
+ if (isinf(f))
+ return 0.0;
+ return f;
+}
+
+float safe_snoise(vector2 p)
+{
+ float f = noise("snoise", p.x, p.y);
+ if (isinf(f))
+ return 0.0;
+ return f;
+}
+
+float safe_snoise(vector3 p)
+{
+ float f = noise("snoise", p);
+ if (isinf(f))
+ return 0.0;
+ return f;
+}
+
+float safe_snoise(vector4 p)
+{
+ float f = noise("snoise", vector3(p.x, p.y, p.z), p.w);
+ if (isinf(f))
+ return 0.0;
+ return f;
+}
+
+/* The fractal_noise functions are all exactly the same except for the input type. */
+float fractal_noise(float p, float details, float roughness)
+{
+ float fscale = 1.0;
+ float amp = 1.0;
+ float maxamp = 0.0;
+ float sum = 0.0;
+ float octaves = clamp(details, 0.0, 16.0);
+ int n = (int)octaves;
+ for (int i = 0; i <= n; i++) {
+ float t = safe_noise(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0, 1.0);
+ fscale *= 2.0;
+ }
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float t = safe_noise(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0 - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+/* The fractal_noise functions are all exactly the same except for the input type. */
+float fractal_noise(vector2 p, float details, float roughness)
+{
+ float fscale = 1.0;
+ float amp = 1.0;
+ float maxamp = 0.0;
+ float sum = 0.0;
+ float octaves = clamp(details, 0.0, 16.0);
+ int n = (int)octaves;
+ for (int i = 0; i <= n; i++) {
+ float t = safe_noise(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0, 1.0);
+ fscale *= 2.0;
+ }
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float t = safe_noise(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0 - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+/* The fractal_noise functions are all exactly the same except for the input type. */
+float fractal_noise(vector3 p, float details, float roughness)
+{
+ float fscale = 1.0;
+ float amp = 1.0;
+ float maxamp = 0.0;
+ float sum = 0.0;
+ float octaves = clamp(details, 0.0, 16.0);
+ int n = (int)octaves;
+ for (int i = 0; i <= n; i++) {
+ float t = safe_noise(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0, 1.0);
+ fscale *= 2.0;
+ }
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float t = safe_noise(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0 - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+/* The fractal_noise functions are all exactly the same except for the input type. */
+float fractal_noise(vector4 p, float details, float roughness)
+{
+ float fscale = 1.0;
+ float amp = 1.0;
+ float maxamp = 0.0;
+ float sum = 0.0;
+ float octaves = clamp(details, 0.0, 16.0);
+ int n = (int)octaves;
+ for (int i = 0; i <= n; i++) {
+ float t = safe_noise(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0, 1.0);
+ fscale *= 2.0;
+ }
+ float rmd = octaves - floor(octaves);
+ if (rmd != 0.0) {
+ float t = safe_noise(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0 - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+#undef vector3
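
The fractal_noise() overloads above blend the fractional last octave in, so the output varies smoothly as Detail changes. A C sketch of that octave-remainder blend follows; the noise source is passed in as a function pointer and the toy noise at the bottom exists only so the sketch runs standalone.

#include <math.h>
#include <stdio.h>

static float clampf(float x, float lo, float hi)
{
  return x < lo ? lo : (x > hi ? hi : x);
}

/* Fractal (fBm-style) sum of a caller-supplied unsigned noise function.
 * The fractional part of `details` blends between n and n+1 octaves so the
 * result changes smoothly as the Detail input varies. */
static float fractal_noise_1d(float (*noise1)(float), float p, float details, float roughness)
{
  float fscale = 1.0f, amp = 1.0f, maxamp = 0.0f, sum = 0.0f;
  float octaves = clampf(details, 0.0f, 16.0f);
  int n = (int)octaves;
  for (int i = 0; i <= n; i++) {
    sum += noise1(fscale * p) * amp;
    maxamp += amp;
    amp *= clampf(roughness, 0.0f, 1.0f);
    fscale *= 2.0f;
  }
  float rmd = octaves - floorf(octaves);
  if (rmd == 0.0f)
    return sum / maxamp;
  float sum2 = sum + noise1(fscale * p) * amp;
  return (1.0f - rmd) * (sum / maxamp) + rmd * (sum2 / (maxamp + amp));
}

/* Toy unsigned noise in [0, 1) so the sketch is self-contained. */
static float toy_noise(float p)
{
  float s = sinf(p * 12.9898f) * 43758.5453f;
  return s - floorf(s);
}

int main(void)
{
  printf("%.4f\n", fractal_noise_1d(toy_noise, 0.37f, 2.5f, 0.5f));
  return 0;
}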
diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl
index 2cbd571e206..61c0216910b 100644
--- a/intern/cycles/kernel/shaders/node_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_noise_texture.osl
@@ -14,47 +14,139 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "node_noise.h"
+#include "stdcycles.h"
+#include "vector2.h"
+#include "vector4.h"
-/* Noise */
+#define vector3 point
-float noise(point ip, float distortion, float detail, output color Color)
+/* The following offset functions generate random offsets to be added to texture
+ * coordinates to act as a seed since the noise functions don't have seed values.
+ * A seed value is needed for generating distortion textures and color outputs.
+ * The offset's components are in the range [100, 200], not too high to cause
+ * bad precision and not too small to be noticeable. We use float seed because
+ * OSL only supports float hashes.
+ */
+
+float random_float_offset(float seed)
+{
+ return 100.0 + noise("hash", seed) * 100.0;
+}
+
+vector2 random_vector2_offset(float seed)
+{
+ return vector2(100.0 + noise("hash", seed, 0.0) * 100.0,
+ 100.0 + noise("hash", seed, 1.0) * 100.0);
+}
+
+vector3 random_vector3_offset(float seed)
{
- point r;
- point p = ip;
- int hard = 0;
+ return vector3(100.0 + noise("hash", seed, 0.0) * 100.0,
+ 100.0 + noise("hash", seed, 1.0) * 100.0,
+ 100.0 + noise("hash", seed, 2.0) * 100.0);
+}
+
+vector4 random_vector4_offset(float seed)
+{
+ return vector4(100.0 + noise("hash", seed, 0.0) * 100.0,
+ 100.0 + noise("hash", seed, 1.0) * 100.0,
+ 100.0 + noise("hash", seed, 2.0) * 100.0,
+ 100.0 + noise("hash", seed, 3.0) * 100.0);
+}
+
+float noise_texture(float co, float detail, float roughness, float distortion, output color Color)
+{
+ float p = co;
+ if (distortion != 0.0) {
+ p += safe_snoise(p + random_float_offset(0.0)) * distortion;
+ }
+ float value = fractal_noise(p, detail, roughness);
+ Color = color(value,
+ fractal_noise(p + random_float_offset(1.0), detail, roughness),
+ fractal_noise(p + random_float_offset(2.0), detail, roughness));
+ return value;
+}
+
+float noise_texture(
+ vector2 co, float detail, float roughness, float distortion, output color Color)
+{
+ vector2 p = co;
if (distortion != 0.0) {
- r[0] = safe_noise(p + point(13.5), "unsigned") * distortion;
- r[1] = safe_noise(p, "unsigned") * distortion;
- r[2] = safe_noise(p - point(13.5), "unsigned") * distortion;
+ p += vector2(safe_snoise(p + random_vector2_offset(0.0)) * distortion,
+ safe_snoise(p + random_vector2_offset(1.0)) * distortion);
+ }
- p += r;
+ float value = fractal_noise(p, detail, roughness);
+ Color = color(value,
+ fractal_noise(p + random_vector2_offset(2.0), detail, roughness),
+ fractal_noise(p + random_vector2_offset(3.0), detail, roughness));
+ return value;
+}
+
+float noise_texture(
+ vector3 co, float detail, float roughness, float distortion, output color Color)
+{
+ vector3 p = co;
+ if (distortion != 0.0) {
+ p += vector3(safe_snoise(p + random_vector3_offset(0.0)) * distortion,
+ safe_snoise(p + random_vector3_offset(1.0)) * distortion,
+ safe_snoise(p + random_vector3_offset(2.0)) * distortion);
}
- float fac = noise_turbulence(p, detail, hard);
+ float value = fractal_noise(p, detail, roughness);
+ Color = color(value,
+ fractal_noise(p + random_vector3_offset(3.0), detail, roughness),
+ fractal_noise(p + random_vector3_offset(4.0), detail, roughness));
+ return value;
+}
- Color = color(fac,
- noise_turbulence(point(p[1], p[0], p[2]), detail, hard),
- noise_turbulence(point(p[1], p[2], p[0]), detail, hard));
+float noise_texture(
+ vector4 co, float detail, float roughness, float distortion, output color Color)
+{
+ vector4 p = co;
+ if (distortion != 0.0) {
+ p += vector4(safe_snoise(p + random_vector4_offset(0.0)) * distortion,
+ safe_snoise(p + random_vector4_offset(1.0)) * distortion,
+ safe_snoise(p + random_vector4_offset(2.0)) * distortion,
+ safe_snoise(p + random_vector4_offset(3.0)) * distortion);
+ }
- return fac;
+ float value = fractal_noise(p, detail, roughness);
+ Color = color(value,
+ fractal_noise(p + random_vector4_offset(4.0), detail, roughness),
+ fractal_noise(p + random_vector4_offset(5.0), detail, roughness));
+ return value;
}
shader node_noise_texture(int use_mapping = 0,
matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
- float Distortion = 0.0,
+ string dimensions = "3D",
+ vector3 Vector = vector3(0, 0, 0),
+ float W = 0.0,
float Scale = 5.0,
float Detail = 2.0,
- point Vector = P,
+ float Roughness = 0.5,
+ float Distortion = 0.0,
output float Fac = 0.0,
output color Color = 0.0)
{
- point p = Vector;
-
+ vector3 p = Vector;
if (use_mapping)
p = transform(mapping, p);
- Fac = noise(p * Scale, Distortion, Detail, Color);
+ p *= Scale;
+ float w = W * Scale;
+
+ if (dimensions == "1D")
+ Fac = noise_texture(w, Detail, Roughness, Distortion, Color);
+ else if (dimensions == "2D")
+ Fac = noise_texture(vector2(p[0], p[1]), Detail, Roughness, Distortion, Color);
+ else if (dimensions == "3D")
+ Fac = noise_texture(p, Detail, Roughness, Distortion, Color);
+ else if (dimensions == "4D")
+ Fac = noise_texture(vector4(p[0], p[1], p[2], w), Detail, Roughness, Distortion, Color);
+ else
+ error("Unknown dimension!");
}
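
The comment block above describes seeding by adding large per-channel offsets derived from a float hash, since the noise functions themselves take no seed. A compact C sketch of that idea for the 1D case follows; the hash and noise below are toy stand-ins, not OSL's noise("hash", ...) or the fractal_noise used by the shader.

#include <math.h>
#include <stdio.h>

/* Toy float hash in [0, 1) standing in for OSL's noise("hash", seed). */
static float hash1(float seed)
{
  float s = sinf(seed * 78.233f + 1.0f) * 43758.5453f;
  return s - floorf(s);
}

/* Offset in [100, 200]: large enough to decorrelate channels, small enough
 * to keep float precision. */
static float random_float_offset(float seed)
{
  return 100.0f + hash1(seed) * 100.0f;
}

/* Toy signed noise in [-1, 1) standing in for safe_snoise(). */
static float snoise1(float p)
{
  return 2.0f * hash1(floorf(p) * 0.123f) - 1.0f;
}

/* 1D noise texture: optional domain distortion, then three decorrelated
 * channels produced by evaluating the same field at offset coordinates. */
static void noise_texture_1d(float co, float distortion, float rgb[3])
{
  float p = co;
  if (distortion != 0.0f)
    p += snoise1(p + random_float_offset(0.0f)) * distortion;
  rgb[0] = 0.5f * snoise1(p) + 0.5f;
  rgb[1] = 0.5f * snoise1(p + random_float_offset(1.0f)) + 0.5f;
  rgb[2] = 0.5f * snoise1(p + random_float_offset(2.0f)) + 0.5f;
}

int main(void)
{
  float rgb[3];
  noise_texture_1d(3.7f, 1.0f, rgb);
  printf("%.3f %.3f %.3f\n", rgb[0], rgb[1], rgb[2]);
  return 0;
}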
diff --git a/intern/cycles/kernel/shaders/node_normal.osl b/intern/cycles/kernel/shaders/node_normal.osl
index 1d20c3e7cac..a0a88445427 100644
--- a/intern/cycles/kernel/shaders/node_normal.osl
+++ b/intern/cycles/kernel/shaders/node_normal.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_normal(normal direction = normal(0.0, 0.0, 0.0),
normal NormalIn = normal(0.0, 0.0, 0.0),
diff --git a/intern/cycles/kernel/shaders/node_normal_map.osl b/intern/cycles/kernel/shaders/node_normal_map.osl
index 90b593d00bc..912960f13ab 100644
--- a/intern/cycles/kernel/shaders/node_normal_map.osl
+++ b/intern/cycles/kernel/shaders/node_normal_map.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_normal_map(normal NormalIn = N,
float Strength = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_object_info.osl b/intern/cycles/kernel/shaders/node_object_info.osl
index 0904a30a53f..44513d9a1ba 100644
--- a/intern/cycles/kernel/shaders/node_object_info.osl
+++ b/intern/cycles/kernel/shaders/node_object_info.osl
@@ -14,14 +14,16 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_object_info(output point Location = point(0.0, 0.0, 0.0),
+ output color Color = color(1.0, 1.0, 1.0),
output float ObjectIndex = 0.0,
output float MaterialIndex = 0.0,
output float Random = 0.0)
{
getattribute("object:location", Location);
+ getattribute("object:color", Color);
getattribute("object:index", ObjectIndex);
getattribute("material:index", MaterialIndex);
getattribute("object:random", Random);
diff --git a/intern/cycles/kernel/shaders/node_output_displacement.osl b/intern/cycles/kernel/shaders/node_output_displacement.osl
index fa7f603980b..bd60fc2b7e1 100644
--- a/intern/cycles/kernel/shaders/node_output_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_output_displacement.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
displacement node_output_displacement(vector Displacement = 0.0)
{
diff --git a/intern/cycles/kernel/shaders/node_output_surface.osl b/intern/cycles/kernel/shaders/node_output_surface.osl
index 013666145da..cd746f79c4a 100644
--- a/intern/cycles/kernel/shaders/node_output_surface.osl
+++ b/intern/cycles/kernel/shaders/node_output_surface.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
surface node_output_surface(closure color Surface = 0)
{
diff --git a/intern/cycles/kernel/shaders/node_output_volume.osl b/intern/cycles/kernel/shaders/node_output_volume.osl
index dd479e751b3..4cc14cd6699 100644
--- a/intern/cycles/kernel/shaders/node_output_volume.osl
+++ b/intern/cycles/kernel/shaders/node_output_volume.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
volume node_output_volume(closure color Volume = 0)
{
diff --git a/intern/cycles/kernel/shaders/node_particle_info.osl b/intern/cycles/kernel/shaders/node_particle_info.osl
index e286c33a1ff..2dcdf3d0f3c 100644
--- a/intern/cycles/kernel/shaders/node_particle_info.osl
+++ b/intern/cycles/kernel/shaders/node_particle_info.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_particle_info(output float Index = 0.0,
output float Random = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 657ced9b6e6..1711811ac65 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_fresnel.h"
+#include "stdcycles.h"
shader node_principled_bsdf(string distribution = "Multiscatter GGX",
string subsurface_method = "burley",
diff --git a/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl
index bf986438fca..4cf17e0e703 100644
--- a/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
color log3(color a)
{
diff --git a/intern/cycles/kernel/shaders/node_principled_volume.osl b/intern/cycles/kernel/shaders/node_principled_volume.osl
index 39cf6837eb2..0cb4cdebdaa 100644
--- a/intern/cycles/kernel/shaders/node_principled_volume.osl
+++ b/intern/cycles/kernel/shaders/node_principled_volume.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_principled_volume(color Color = color(0.5, 0.5, 0.5),
float Density = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
index 941d99dd44d..9e9b31d9a87 100644
--- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_refraction_bsdf(color Color = 0.8,
string distribution = "sharp",
diff --git a/intern/cycles/kernel/shaders/node_rgb_curves.osl b/intern/cycles/kernel/shaders/node_rgb_curves.osl
index e34eb027cc3..8850040d580 100644
--- a/intern/cycles/kernel/shaders/node_rgb_curves.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_curves.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_ramp_util.h"
+#include "stdcycles.h"
shader node_rgb_curves(color ramp[] = {0.0},
float min_x = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_rgb_ramp.osl b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
index c9f9746a4fb..2131edb2688 100644
--- a/intern/cycles/kernel/shaders/node_rgb_ramp.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_ramp_util.h"
+#include "stdcycles.h"
shader node_rgb_ramp(color ramp_color[] = {0.0},
float ramp_alpha[] = {0.0},
diff --git a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl
index 837d6caf5fc..f0a094d5b57 100644
--- a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_rgb_to_bw(color Color = 0.0, output float Val = 0.0)
{
diff --git a/intern/cycles/kernel/shaders/node_scatter_volume.osl b/intern/cycles/kernel/shaders/node_scatter_volume.osl
index fce5716f372..36ad952dee6 100644
--- a/intern/cycles/kernel/shaders/node_scatter_volume.osl
+++ b/intern/cycles/kernel/shaders/node_scatter_volume.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_scatter_volume(color Color = color(0.8, 0.8, 0.8),
float Density = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_separate_hsv.osl b/intern/cycles/kernel/shaders/node_separate_hsv.osl
index c77ed1f3755..2f902b72dbc 100644
--- a/intern/cycles/kernel/shaders/node_separate_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_separate_hsv.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_color.h"
+#include "stdcycles.h"
shader node_separate_hsv(color Color = 0.8,
output float H = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_separate_rgb.osl b/intern/cycles/kernel/shaders/node_separate_rgb.osl
index ee64add27e2..62e4aedb879 100644
--- a/intern/cycles/kernel/shaders/node_separate_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_separate_rgb.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_separate_rgb(color Image = 0.8,
output float R = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl
index 8a563f5e920..acaf3942b6f 100644
--- a/intern/cycles/kernel/shaders/node_separate_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_separate_xyz(vector Vector = 0.8,
output float X = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_set_normal.osl b/intern/cycles/kernel/shaders/node_set_normal.osl
index 9541b829ef7..26a97e2b5d1 100644
--- a/intern/cycles/kernel/shaders/node_set_normal.osl
+++ b/intern/cycles/kernel/shaders/node_set_normal.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
surface node_set_normal(normal Direction = N, output normal Normal = N)
{
diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl
index 9b29e5489c2..a12e7a9dc17 100644
--- a/intern/cycles/kernel/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/shaders/node_sky_texture.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_color.h"
+#include "stdcycles.h"
float sky_angle_between(float thetav, float phiv, float theta, float phi)
{
@@ -44,13 +44,13 @@ float sky_perez_function(float lam[9], float theta, float gamma)
(1.0 + lam[2] * exp(lam[3] * gamma) + lam[4] * cgamma * cgamma);
}
-color sky_radiance_old(normal dir,
- float sunphi,
- float suntheta,
- color radiance,
- float config_x[9],
- float config_y[9],
- float config_z[9])
+color sky_radiance_preetham(normal dir,
+ float sunphi,
+ float suntheta,
+ color radiance,
+ float config_x[9],
+ float config_y[9],
+ float config_z[9])
{
/* convert vector to spherical coordinates */
vector spherical = sky_spherical_coordinates(dir);
@@ -88,13 +88,13 @@ float sky_radiance_internal(float config[9], float theta, float gamma)
(config[2] + config[3] * expM + config[5] * rayM + config[6] * mieM + config[7] * zenith);
}
-color sky_radiance_new(normal dir,
- float sunphi,
- float suntheta,
- color radiance,
- float config_x[9],
- float config_y[9],
- float config_z[9])
+color sky_radiance_hosek(normal dir,
+ float sunphi,
+ float suntheta,
+ color radiance,
+ float config_x[9],
+ float config_y[9],
+ float config_z[9])
{
/* convert vector to spherical coordinates */
vector spherical = sky_spherical_coordinates(dir);
@@ -116,25 +116,122 @@ color sky_radiance_new(normal dir,
return xyz_to_rgb(x, y, z) * (M_2PI / 683);
}
-shader node_sky_texture(int use_mapping = 0,
- matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
- vector Vector = P,
- string type = "hosek_wilkie",
- float theta = 0.0,
- float phi = 0.0,
- color radiance = color(0.0, 0.0, 0.0),
- float config_x[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
- float config_y[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
- float config_z[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
- output color Color = color(0.0, 0.0, 0.0))
+/* Nishita improved */
+vector geographical_to_direction(float lat, float lon)
+{
+ return vector(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
+}
+
+float precise_angle(vector a, vector b)
+{
+ return 2.0 * atan2(length(a - b), length(a + b));
+}
+
+color sky_radiance_nishita(vector dir, float nishita_data[10], string filename)
+{
+ /* definitions */
+ float sun_elevation = nishita_data[6];
+ float sun_rotation = nishita_data[7];
+ float angular_diameter = nishita_data[8];
+ float sun_intensity = nishita_data[9];
+ int sun_disc = angular_diameter > 0;
+ float alpha = 1.0;
+ color xyz;
+ /* convert dir to spherical coordinates */
+ vector direction = sky_spherical_coordinates(dir);
+
+ /* render above the horizon */
+ if (dir[2] >= 0.0) {
+ /* definitions */
+ vector sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2);
+ float sun_dir_angle = precise_angle(dir, sun_dir);
+ float half_angular = angular_diameter / 2.0;
+ float dir_elevation = M_PI_2 - direction[0];
+
+ /* if ray inside sun disc render it, otherwise render sky */
+ if (sun_dir_angle < half_angular && sun_disc == 1) {
+ /* get 2 pixels data */
+ color pixel_bottom = color(nishita_data[0], nishita_data[1], nishita_data[2]);
+ color pixel_top = color(nishita_data[3], nishita_data[4], nishita_data[5]);
+ float y;
+
+ /* sun interpolation */
+ if (sun_elevation - half_angular > 0.0) {
+ if ((sun_elevation + half_angular) > 0.0) {
+ y = ((dir_elevation - sun_elevation) / angular_diameter) + 0.5;
+ xyz = mix(pixel_bottom, pixel_top, y) * sun_intensity;
+ }
+ }
+ else {
+ if (sun_elevation + half_angular > 0.0) {
+ y = dir_elevation / (sun_elevation + half_angular);
+ xyz = mix(pixel_bottom, pixel_top, y) * sun_intensity;
+ }
+ }
+ /* limb darkening, coefficient is 0.6f */
+ float angle_fraction = sun_dir_angle / half_angular;
+ float limb_darkening = (1.0 - 0.6 * (1.0 - sqrt(1.0 - angle_fraction * angle_fraction)));
+ xyz *= limb_darkening;
+ }
+ /* sky */
+ else {
+ /* sky interpolation */
+ float x = (direction[1] + M_PI + sun_rotation) / M_2PI;
+ /* more pixels toward horizon compensation */
+ float y = 1.0 - sqrt(dir_elevation / M_PI_2);
+ if (x > 1.0) {
+ x = x - 1.0;
+ }
+ xyz = (color)texture(filename, x, y, "wrap", "clamp", "interp", "linear", "alpha", alpha);
+ }
+ }
+ /* ground */
+ else {
+ if (dir[2] < -0.4) {
+ xyz = color(0, 0, 0);
+ }
+ else {
+ /* black ground fade */
+ float mul = pow(1.0 + dir[2] * 2.5, 3.0);
+ /* interpolation */
+ float x = (direction[1] + M_PI + sun_rotation) / M_2PI;
+ float y = 1.5;
+ if (x > 1.0) {
+ x = x - 1.0;
+ }
+ xyz = (color)texture(
+ filename, x, y, "wrap", "periodic", "interp", "linear", "alpha", alpha) *
+ mul;
+ }
+ }
+ /* convert to RGB */
+ return xyz_to_rgb(xyz[0], xyz[1], xyz[2]);
+}
+
+shader node_sky_texture(
+ int use_mapping = 0,
+ matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
+ vector Vector = P,
+ string type = "hosek_wilkie",
+ float theta = 0.0,
+ float phi = 0.0,
+ string filename = "",
+ color radiance = color(0.0, 0.0, 0.0),
+ float config_x[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+ float config_y[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+ float config_z[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+ float nishita_data[10] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+ output color Color = color(0.0, 0.0, 0.0))
{
vector p = Vector;
if (use_mapping)
p = transform(mapping, p);
+ if (type == "nishita_improved")
+ Color = sky_radiance_nishita(p, nishita_data, filename);
if (type == "hosek_wilkie")
- Color = sky_radiance_new(p, phi, theta, radiance, config_x, config_y, config_z);
- else
- Color = sky_radiance_old(p, phi, theta, radiance, config_x, config_y, config_z);
+ Color = sky_radiance_hosek(p, phi, theta, radiance, config_x, config_y, config_z);
+ if (type == "preetham")
+ Color = sky_radiance_preetham(p, phi, theta, radiance, config_x, config_y, config_z);
}
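
The sun disc in sky_radiance_nishita() above is attenuated by a simple limb-darkening curve (per the comment, a fixed coefficient of 0.6). Below is a minimal standalone OSL sketch of just that falloff, compiled against a stock stdosl.h rather than the Cycles stdcycles.h; the shader and parameter names are illustrative only.

#include "stdosl.h"

/* Limb-darkening factor for a point on the sun disc, where AngleFraction is
 * 0.0 at the disc center and 1.0 at the rim. Illustrative sketch only. */
shader example_limb_darkening(float AngleFraction = 0.0,
                              float Coefficient = 0.6,
                              output float Factor = 1.0)
{
  float f = clamp(AngleFraction, 0.0, 1.0);
  /* 1.0 at the center, (1.0 - Coefficient) at the rim. */
  Factor = 1.0 - Coefficient * (1.0 - sqrt(1.0 - f * f));
}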
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index e12199d8c3d..b1e854150ab 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_subsurface_scattering(color Color = 0.8,
float Scale = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_tangent.osl b/intern/cycles/kernel/shaders/node_tangent.osl
index 44eb9973f3d..83f19a4610b 100644
--- a/intern/cycles/kernel/shaders/node_tangent.osl
+++ b/intern/cycles/kernel/shaders/node_tangent.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_tangent(normal NormalIn = N,
string attr_name = "geom:tangent",
diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h
deleted file mode 100644
index e1f3b900ee5..00000000000
--- a/intern/cycles/kernel/shaders/node_texture.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Voronoi / Worley like */
-
-color cellnoise_color(point p)
-{
- float r = cellnoise(p);
- float g = cellnoise(point(p[1], p[0], p[2]));
- float b = cellnoise(point(p[1], p[2], p[0]));
-
- return color(r, g, b);
-}
-
-void voronoi(point p, float e, float da[4], point pa[4])
-{
- /* returns distances in da and point coords in pa */
- int xx, yy, zz, xi, yi, zi;
-
- xi = (int)floor(p[0]);
- yi = (int)floor(p[1]);
- zi = (int)floor(p[2]);
-
- da[0] = 1e10;
- da[1] = 1e10;
- da[2] = 1e10;
- da[3] = 1e10;
-
- for (xx = xi - 1; xx <= xi + 1; xx++) {
- for (yy = yi - 1; yy <= yi + 1; yy++) {
- for (zz = zi - 1; zz <= zi + 1; zz++) {
- point ip = point(xx, yy, zz);
- point vp = (point)cellnoise_color(ip);
- point pd = p - (vp + ip);
- float d = dot(pd, pd);
-
- vp += point(xx, yy, zz);
-
- if (d < da[0]) {
- da[3] = da[2];
- da[2] = da[1];
- da[1] = da[0];
- da[0] = d;
-
- pa[3] = pa[2];
- pa[2] = pa[1];
- pa[1] = pa[0];
- pa[0] = vp;
- }
- else if (d < da[1]) {
- da[3] = da[2];
- da[2] = da[1];
- da[1] = d;
-
- pa[3] = pa[2];
- pa[2] = pa[1];
- pa[1] = vp;
- }
- else if (d < da[2]) {
- da[3] = da[2];
- da[2] = d;
-
- pa[3] = pa[2];
- pa[2] = vp;
- }
- else if (d < da[3]) {
- da[3] = d;
- pa[3] = vp;
- }
- }
- }
- }
-}
-
-/* Noise Bases */
-
-float safe_noise(point p, string type)
-{
- float f = 0.0;
-
- /* Perlin noise in range -1..1 */
- if (type == "signed")
- f = noise("perlin", p);
-
- /* Perlin noise in range 0..1 */
- else
- f = noise(p);
-
- /* can happen for big coordinates, things even out to 0.5 then anyway */
- if (!isfinite(f))
- return 0.5;
-
- return f;
-}
-
-/* Turbulence */
-
-float noise_turbulence(point p, float details, int hard)
-{
- float fscale = 1.0;
- float amp = 1.0;
- float sum = 0.0;
- int i, n;
-
- float octaves = clamp(details, 0.0, 16.0);
- n = (int)octaves;
-
- for (i = 0; i <= n; i++) {
- float t = safe_noise(fscale * p, "unsigned");
-
- if (hard)
- t = fabs(2.0 * t - 1.0);
-
- sum += t * amp;
- amp *= 0.5;
- fscale *= 2.0;
- }
-
- float rmd = octaves - floor(octaves);
-
- if (rmd != 0.0) {
- float t = safe_noise(fscale * p, "unsigned");
-
- if (hard)
- t = fabs(2.0 * t - 1.0);
-
- float sum2 = sum + t * amp;
-
- sum *= ((float)(1 << n) / (float)((1 << (n + 1)) - 1));
- sum2 *= ((float)(1 << (n + 1)) / (float)((1 << (n + 2)) - 1));
-
- return (1.0 - rmd) * sum + rmd * sum2;
- }
- else {
- sum *= ((float)(1 << n) / (float)((1 << (n + 1)) - 1));
- return sum;
- }
-}
-
-/* Utility */
-
-float nonzero(float f, float eps)
-{
- float r;
-
- if (abs(f) < eps)
- r = sign(f) * eps;
- else
- r = f;
-
- return r;
-}
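
The turbulence helper removed here is superseded by fractal_noise() from node_noise.h, which node_wave_texture.osl below now calls with a detail count, scale and roughness. The snippet that follows is only a generic fBm-style sketch of that kind of octave accumulation, under assumed semantics (detail = octave count, roughness = per-octave amplitude falloff); it is not the node_noise.h implementation, and all names are illustrative.

#include "stdosl.h"

/* Generic fBm sketch: sum progressively finer octaves of unsigned Perlin
 * noise, each scaled down by `roughness`, then normalize back to ~0..1. */
float fbm_sketch(point p, float detail, float roughness)
{
  float fscale = 1.0;
  float amp = 1.0;
  float maxamp = 0.0;
  float sum = 0.0;
  int n = (int)clamp(detail, 0.0, 16.0);

  for (int i = 0; i <= n; i++) {
    float t = noise(p * fscale); /* unsigned Perlin, roughly 0..1 */
    sum += t * amp;
    maxamp += amp;
    amp *= clamp(roughness, 0.0, 1.0);
    fscale *= 2.0;
  }
  return sum / maxamp;
}

shader example_fbm(point Vector = P,
                   float Detail = 2.0,
                   float Roughness = 0.5,
                   output float Fac = 0.0)
{
  Fac = fbm_sketch(Vector, Detail, Roughness);
}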
diff --git a/intern/cycles/kernel/shaders/node_texture_coordinate.osl b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
index 13861653d04..ac05e984af2 100644
--- a/intern/cycles/kernel/shaders/node_texture_coordinate.osl
+++ b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_texture_coordinate(
normal NormalIn = N,
diff --git a/intern/cycles/kernel/shaders/node_toon_bsdf.osl b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
index ed3a0b25c60..4a44730c70c 100644
--- a/intern/cycles/kernel/shaders/node_toon_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_toon_bsdf(color Color = 0.8,
string component = "diffuse",
diff --git a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
index 7ce1ab08c59..23a562bf34d 100644
--- a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_translucent_bsdf(color Color = 0.8, normal Normal = N, output closure color BSDF = 0)
{
diff --git a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
index a735513ba89..eb737a05c41 100644
--- a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_transparent_bsdf(color Color = 0.8, normal Normal = N, output closure color BSDF = 0)
{
diff --git a/intern/cycles/kernel/shaders/node_uv_map.osl b/intern/cycles/kernel/shaders/node_uv_map.osl
index 6f2887be63c..88d8c5ba394 100644
--- a/intern/cycles/kernel/shaders/node_uv_map.osl
+++ b/intern/cycles/kernel/shaders/node_uv_map.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_uv_map(int from_dupli = 0,
string attribute = "",
diff --git a/intern/cycles/kernel/shaders/node_value.osl b/intern/cycles/kernel/shaders/node_value.osl
index 398e2c0e392..13197b9a27a 100644
--- a/intern/cycles/kernel/shaders/node_value.osl
+++ b/intern/cycles/kernel/shaders/node_value.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_value(float value_value = 0.0,
vector vector_value = vector(0.0, 0.0, 0.0),
diff --git a/intern/cycles/kernel/shaders/node_vector_curves.osl b/intern/cycles/kernel/shaders/node_vector_curves.osl
index e8c8036b550..9d3a2b82b0a 100644
--- a/intern/cycles/kernel/shaders/node_vector_curves.osl
+++ b/intern/cycles/kernel/shaders/node_vector_curves.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_ramp_util.h"
+#include "stdcycles.h"
shader node_vector_curves(color ramp[] = {0.0},
float min_x = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_vector_displacement.osl b/intern/cycles/kernel/shaders/node_vector_displacement.osl
index e9bd336347f..7cd9c2a37f2 100644
--- a/intern/cycles/kernel/shaders/node_vector_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_vector_displacement.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_vector_displacement(color Vector = color(0.0, 0.0, 0.0),
float Midlevel = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_vector_math.osl b/intern/cycles/kernel/shaders/node_vector_math.osl
index 10bb0c7283c..218851598b4 100644
--- a/intern/cycles/kernel/shaders/node_vector_math.osl
+++ b/intern/cycles/kernel/shaders/node_vector_math.osl
@@ -14,36 +14,90 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "node_math.h"
+#include "stdcycles.h"
shader node_vector_math(string type = "add",
vector Vector1 = vector(0.0, 0.0, 0.0),
vector Vector2 = vector(0.0, 0.0, 0.0),
+ vector Vector3 = vector(0.0, 0.0, 0.0),
+ float Scale = 1.0,
output float Value = 0.0,
output vector Vector = vector(0.0, 0.0, 0.0))
{
if (type == "add") {
Vector = Vector1 + Vector2;
- Value = (abs(Vector[0]) + abs(Vector[1]) + abs(Vector[2])) / 3.0;
}
else if (type == "subtract") {
Vector = Vector1 - Vector2;
- Value = (abs(Vector[0]) + abs(Vector[1]) + abs(Vector[2])) / 3.0;
}
- else if (type == "average") {
- Value = length(Vector1 + Vector2);
- Vector = normalize(Vector1 + Vector2);
+ else if (type == "multiply") {
+ Vector = Vector1 * Vector2;
+ }
+ else if (type == "divide") {
+ Vector = safe_divide(Vector1, Vector2);
+ }
+ else if (type == "cross_product") {
+ Vector = cross(Vector1, Vector2);
+ }
+ else if (type == "project") {
+ Vector = project(Vector1, Vector2);
+ }
+ else if (type == "reflect") {
+ Vector = reflect(Vector1, normalize(Vector2));
}
else if (type == "dot_product") {
Value = dot(Vector1, Vector2);
}
- else if (type == "cross_product") {
- vector c = cross(Vector1, Vector2);
- Value = length(c);
- Vector = normalize(c);
+ else if (type == "distance") {
+ Value = distance(Vector1, Vector2);
}
- else if (type == "normalize") {
+ else if (type == "length") {
Value = length(Vector1);
+ }
+ else if (type == "scale") {
+ Vector = Vector1 * Scale;
+ }
+ else if (type == "normalize") {
Vector = normalize(Vector1);
}
+ else if (type == "snap") {
+ Vector = snap(Vector1, Vector2);
+ }
+ else if (type == "floor") {
+ Vector = floor(Vector1);
+ }
+ else if (type == "ceil") {
+ Vector = ceil(Vector1);
+ }
+ else if (type == "modulo") {
+ Vector = fmod(Vector1, Vector2);
+ }
+ else if (type == "wrap") {
+ Vector = wrap(Vector1, Vector2, Vector3);
+ }
+ else if (type == "fraction") {
+ Vector = Vector1 - floor(Vector1);
+ }
+ else if (type == "absolute") {
+ Vector = abs(Vector1);
+ }
+ else if (type == "minimum") {
+ Vector = min(Vector1, Vector2);
+ }
+ else if (type == "maximum") {
+ Vector = max(Vector1, Vector2);
+ }
+ else if (type == "sine") {
+ Vector = sin(Vector1);
+ }
+ else if (type == "cosine") {
+ Vector = cos(Vector1);
+ }
+ else if (type == "tangent") {
+ Vector = tan(Vector1);
+ }
+ else {
+ warning("%s", "Unknown vector math operator!");
+ }
}
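
Several of the new vector math operations delegate to helpers from node_math.h (safe_divide, snap, wrap). The sketch below illustrates only the zero-safe division convention, matching the behaviour of the vector2/vector4 safe_divide definitions added further down in node_voronoi_texture.osl (a zero divisor component yields 0.0); the actual node_math.h helper may differ in detail, and the names here are illustrative.

#include "stdosl.h"

/* Per-component division that maps zero-divisor components to 0.0. */
vector safe_divide_sketch(vector a, vector b)
{
  return vector((b[0] != 0.0) ? a[0] / b[0] : 0.0,
                (b[1] != 0.0) ? a[1] / b[1] : 0.0,
                (b[2] != 0.0) ? a[2] / b[2] : 0.0);
}

shader example_safe_divide(vector A = vector(1.0, 2.0, 3.0),
                           vector B = vector(2.0, 0.0, 4.0),
                           output vector Out = vector(0.0, 0.0, 0.0))
{
  Out = safe_divide_sketch(A, B); /* -> (0.5, 0.0, 0.75) */
}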
diff --git a/intern/cycles/kernel/shaders/node_vector_rotate.osl b/intern/cycles/kernel/shaders/node_vector_rotate.osl
new file mode 100644
index 00000000000..2efe3470ae2
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_vector_rotate.osl
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node_math.h"
+#include "stdcycles.h"
+
+shader node_vector_rotate(int invert = 0,
+ string type = "axis",
+ vector VectorIn = vector(0.0, 0.0, 0.0),
+ point Center = point(0.0, 0.0, 0.0),
+ point Rotation = point(0.0, 0.0, 0.0),
+ vector Axis = vector(0.0, 0.0, 1.0),
+ float Angle = 0.0,
+ output vector VectorOut = vector(0.0, 0.0, 0.0))
+{
+ if (type == "euler_xyz") {
+ matrix rmat = (invert) ? transpose(euler_to_mat(Rotation)) : euler_to_mat(Rotation);
+ VectorOut = transform(rmat, VectorIn - Center) + Center;
+ }
+ else {
+ float a = (invert) ? -Angle : Angle;
+ if (type == "x_axis") {
+ VectorOut = rotate(VectorIn - Center, a, point(0.0), vector(1.0, 0.0, 0.0)) + Center;
+ }
+ else if (type == "y_axis") {
+ VectorOut = rotate(VectorIn - Center, a, point(0.0), vector(0.0, 1.0, 0.0)) + Center;
+ }
+ else if (type == "z_axis") {
+ VectorOut = rotate(VectorIn - Center, a, point(0.0), vector(0.0, 0.0, 1.0)) + Center;
+ }
+ else { // axis
+ VectorOut = (length(Axis) != 0.0) ? rotate(VectorIn - Center, a, point(0.0), Axis) + Center :
+ VectorIn;
+ }
+ }
+}
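
Every mode of the new Vector Rotate node reduces to the same pivot pattern: translate so Center sits at the origin, rotate about an axis through the origin (either with euler_to_mat() or the built-in rotate()), then translate back. A minimal standalone sketch of that pattern using the same rotate() call, with illustrative names and a stock stdosl.h:

#include "stdosl.h"

shader example_rotate_about_pivot(vector In = vector(1.0, 0.0, 0.0),
                                  point Pivot = point(0.0, 0.0, 0.0),
                                  vector Axis = vector(0.0, 0.0, 1.0),
                                  float Angle = 1.5707963,
                                  output vector Out = vector(0.0, 0.0, 0.0))
{
  /* Fall back to the Z axis if a degenerate axis is supplied. */
  vector axis = (length(Axis) != 0.0) ? Axis : vector(0.0, 0.0, 1.0);
  /* Shift to the pivot, rotate about the axis through the origin, shift back. */
  Out = rotate(In - Pivot, Angle, point(0.0), axis) + Pivot;
}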
diff --git a/intern/cycles/kernel/shaders/node_vector_transform.osl b/intern/cycles/kernel/shaders/node_vector_transform.osl
index 22939577be0..1db799cfc9e 100644
--- a/intern/cycles/kernel/shaders/node_vector_transform.osl
+++ b/intern/cycles/kernel/shaders/node_vector_transform.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_vector_transform(string type = "vector",
string convert_from = "world",
diff --git a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
index 9290b845325..299acef35ee 100644
--- a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "node_fresnel.h"
+#include "stdcycles.h"
shader node_velvet_bsdf(color Color = 0.8,
float Sigma = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_vertex_color.osl b/intern/cycles/kernel/shaders/node_vertex_color.osl
new file mode 100644
index 00000000000..ffaf7a2f720
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_vertex_color.osl
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdcycles.h"
+
+shader node_vertex_color(string bump_offset = "center",
+ string layer_name = "",
+ output color Color = 0.0,
+ output float Alpha = 0.0)
+{
+ float vertex_color[4];
+ string vertex_color_layer;
+
+ if (layer_name == "") {
+ vertex_color_layer = "geom:vertex_color";
+ }
+ else {
+ vertex_color_layer = layer_name;
+ }
+
+ if (getattribute(vertex_color_layer, vertex_color)) {
+ Color = color(vertex_color[0], vertex_color[1], vertex_color[2]);
+ Alpha = vertex_color[3];
+
+ if (bump_offset == "dx") {
+ Color += Dx(Color);
+ Alpha += Dx(Alpha);
+ }
+ else if (bump_offset == "dy") {
+ Color += Dy(Color);
+ Alpha += Dy(Alpha);
+ }
+ }
+ else {
+ warning("%s", "Invalid attribute.");
+ }
+}
diff --git a/intern/cycles/kernel/shaders/node_voronoi_texture.osl b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
index 34c86d5b98d..04d61c32f8a 100644
--- a/intern/cycles/kernel/shaders/node_voronoi_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
@@ -14,151 +14,1014 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
-
-void voronoi_m(point p, string metric, float e, float da[4], point pa[4])
-{
- /* Compute the distance to and the position of the four closest neighbors to p.
- *
- * The neighbors are randomly placed, 1 each in a 3x3x3 grid (Worley pattern).
- * The distances and points are returned in ascending order, i.e. da[0] and pa[0] will
- * contain the distance to the closest point and its coordinates respectively.
- */
- int xx, yy, zz, xi, yi, zi;
-
- xi = (int)floor(p[0]);
- yi = (int)floor(p[1]);
- zi = (int)floor(p[2]);
-
- da[0] = 1e10;
- da[1] = 1e10;
- da[2] = 1e10;
- da[3] = 1e10;
-
- for (xx = xi - 1; xx <= xi + 1; xx++) {
- for (yy = yi - 1; yy <= yi + 1; yy++) {
- for (zz = zi - 1; zz <= zi + 1; zz++) {
- point ip = point(xx, yy, zz);
- point vp = (point)cellnoise_color(ip);
- point pd = p - (vp + ip);
-
- float d = 0.0;
- if (metric == "distance") {
- d = dot(pd, pd);
+#include "node_hash.h"
+#include "stdcycles.h"
+#include "vector2.h"
+#include "vector4.h"
+
+#define vector3 point
+
+/* **** Distance Functions **** */
+
+float distance(float a, float b)
+{
+ return abs(a - b);
+}
+
+float distance(vector2 a, vector2 b)
+{
+ return length(a - b);
+}
+
+float distance(vector4 a, vector4 b)
+{
+ return length(a - b);
+}
+
+/* **** Safe Division **** */
+
+vector2 safe_divide(vector2 a, float b)
+{
+ return vector2((b != 0.0) ? a.x / b : 0.0, (b != 0.0) ? a.y / b : 0.0);
+}
+
+vector4 safe_divide(vector4 a, float b)
+{
+ return vector4((b != 0.0) ? a.x / b : 0.0,
+ (b != 0.0) ? a.y / b : 0.0,
+ (b != 0.0) ? a.z / b : 0.0,
+ (b != 0.0) ? a.w / b : 0.0);
+}
+
+/*
+ * Smooth Voronoi:
+ *
+ * - https://wiki.blender.org/wiki/User:OmarSquircleArt/GSoC2019/Documentation/Smooth_Voronoi
+ *
+ * Distance To Edge:
+ *
+ * - https://www.shadertoy.com/view/llG3zy
+ *
+ */
+
+/* **** 1D Voronoi **** */
+
+float voronoi_distance(float a, float b, string metric, float exponent)
+{
+ return abs(a - b);
+}
+
+void voronoi_f1_1d(float w,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output float outW)
+{
+ float cellPosition = floor(w);
+ float localPosition = w - cellPosition;
+
+ float minDistance = 8.0;
+ float targetOffset, targetPosition;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = float(i);
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
+ }
+ outDistance = minDistance;
+ outColor = hash_float_to_color(cellPosition + targetOffset);
+ outW = targetPosition + cellPosition;
+}
+
+void voronoi_smooth_f1_1d(float w,
+ float smoothness,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output float outW)
+{
+ float cellPosition = floor(w);
+ float localPosition = w - cellPosition;
+
+ float smoothDistance = 8.0;
+ float smoothPosition = 0.0;
+ color smoothColor = color(0.0);
+ for (int i = -2; i <= 2; i++) {
+ float cellOffset = float(i);
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(0.0, 1.0, 0.5 + 0.5 * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0 - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0 + 3.0 * smoothness;
+ color cellColor = hash_float_to_color(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ outDistance = smoothDistance;
+ outColor = smoothColor;
+ outW = cellPosition + smoothPosition;
+}
+
+void voronoi_f2_1d(float w,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output float outW)
+{
+ float cellPosition = floor(w);
+ float localPosition = w - cellPosition;
+
+ float distanceF1 = 8.0;
+ float distanceF2 = 8.0;
+ float offsetF1 = 0.0;
+ float positionF1 = 0.0;
+ float offsetF2, positionF2;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = float(i);
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
+ }
+ }
+ outDistance = distanceF2;
+ outColor = hash_float_to_color(cellPosition + offsetF2);
+ outW = positionF2 + cellPosition;
+}
+
+void voronoi_distance_to_edge_1d(float w, float randomness, output float outDistance)
+{
+ float cellPosition = floor(w);
+ float localPosition = w - cellPosition;
+
+ float minDistance = 8.0;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = float(i);
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ minDistance = min(distanceToPoint, minDistance);
+ }
+ outDistance = minDistance;
+}
+
+void voronoi_n_sphere_radius_1d(float w, float randomness, output float outRadius)
+{
+ float cellPosition = floor(w);
+ float localPosition = w - cellPosition;
+
+ float closestPoint;
+ float closestPointOffset;
+ float minDistance = 8.0;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = float(i);
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
+ }
+
+ minDistance = 8.0;
+ float closestPointToClosestPoint;
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0) {
+ continue;
+ }
+ float cellOffset = float(i) + closestPointOffset;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0;
+}
+
+/* **** 2D Voronoi **** */
+
+float voronoi_distance(vector2 a, vector2 b, string metric, float exponent)
+{
+ if (metric == "euclidean") {
+ return distance(a, b);
+ }
+ else if (metric == "manhattan") {
+ return abs(a.x - b.x) + abs(a.y - b.y);
+ }
+ else if (metric == "chebychev") {
+ return max(abs(a.x - b.x), abs(a.y - b.y));
+ }
+ else if (metric == "minkowski") {
+ return pow(pow(abs(a.x - b.x), exponent) + pow(abs(a.y - b.y), exponent), 1.0 / exponent);
+ }
+ else {
+ return 0.0;
+ }
+}
+
+void voronoi_f1_2d(vector2 coord,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector2 outPosition)
+{
+ vector2 cellPosition = floor(coord);
+ vector2 localPosition = coord - cellPosition;
+
+ float minDistance = 8.0;
+ vector2 targetOffset, targetPosition;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector2 cellOffset = vector2(i, j);
+ vector2 pointPosition = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
+ }
+ }
+ outDistance = minDistance;
+ outColor = hash_vector2_to_color(cellPosition + targetOffset);
+ outPosition = targetPosition + cellPosition;
+}
+
+void voronoi_smooth_f1_2d(vector2 coord,
+ float smoothness,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector2 outPosition)
+{
+ vector2 cellPosition = floor(coord);
+ vector2 localPosition = coord - cellPosition;
+
+ float smoothDistance = 8.0;
+ color smoothColor = color(0.0);
+ vector2 smoothPosition = vector2(0.0, 0.0);
+ for (int j = -2; j <= 2; j++) {
+ for (int i = -2; i <= 2; i++) {
+ vector2 cellOffset = vector2(i, j);
+ vector2 pointPosition = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(0.0, 1.0, 0.5 + 0.5 * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0 - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0 + 3.0 * smoothness;
+ color cellColor = hash_vector2_to_color(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ }
+ outDistance = smoothDistance;
+ outColor = smoothColor;
+ outPosition = cellPosition + smoothPosition;
+}
+
+void voronoi_f2_2d(vector2 coord,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector2 outPosition)
+{
+ vector2 cellPosition = floor(coord);
+ vector2 localPosition = coord - cellPosition;
+
+ float distanceF1 = 8.0;
+ float distanceF2 = 8.0;
+ vector2 offsetF1 = vector2(0.0, 0.0);
+ vector2 positionF1 = vector2(0.0, 0.0);
+ vector2 offsetF2, positionF2;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector2 cellOffset = vector2(i, j);
+ vector2 pointPosition = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
+ }
+ }
+ }
+ outDistance = distanceF2;
+ outColor = hash_vector2_to_color(cellPosition + offsetF2);
+ outPosition = positionF2 + cellPosition;
+}
+
+void voronoi_distance_to_edge_2d(vector2 coord, float randomness, output float outDistance)
+{
+ vector2 cellPosition = floor(coord);
+ vector2 localPosition = coord - cellPosition;
+
+ vector2 vectorToClosest;
+ float minDistance = 8.0;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector2 cellOffset = vector2(i, j);
+ vector2 vectorToPoint = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float distanceToPoint = dot(vectorToPoint, vectorToPoint);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ vectorToClosest = vectorToPoint;
+ }
+ }
+ }
+
+ minDistance = 8.0;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector2 cellOffset = vector2(i, j);
+ vector2 vectorToPoint = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness -
+ localPosition;
+ vector2 perpendicularToEdge = vectorToPoint - vectorToClosest;
+ if (dot(perpendicularToEdge, perpendicularToEdge) > 0.0001) {
+ float distanceToEdge = dot((vectorToClosest + vectorToPoint) / 2.0,
+ normalize(perpendicularToEdge));
+ minDistance = min(minDistance, distanceToEdge);
+ }
+ }
+ }
+ outDistance = minDistance;
+}
+
+void voronoi_n_sphere_radius_2d(vector2 coord, float randomness, output float outRadius)
+{
+ vector2 cellPosition = floor(coord);
+ vector2 localPosition = coord - cellPosition;
+
+ vector2 closestPoint;
+ vector2 closestPointOffset;
+ float minDistance = 8.0;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector2 cellOffset = vector2(i, j);
+ vector2 pointPosition = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
+ }
+ }
+
+ minDistance = 8.0;
+ vector2 closestPointToClosestPoint;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0 && j == 0) {
+ continue;
+ }
+ vector2 cellOffset = vector2(i, j) + closestPointOffset;
+ vector2 pointPosition = cellOffset +
+ hash_vector2_to_vector2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ }
+ outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0;
+}
+
+/* **** 3D Voronoi **** */
+
+float voronoi_distance(vector3 a, vector3 b, string metric, float exponent)
+{
+ if (metric == "euclidean") {
+ return distance(a, b);
+ }
+ else if (metric == "manhattan") {
+ return abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]);
+ }
+ else if (metric == "chebychev") {
+ return max(abs(a[0] - b[0]), max(abs(a[1] - b[1]), abs(a[2] - b[2])));
+ }
+ else if (metric == "minkowski") {
+ return pow(pow(abs(a[0] - b[0]), exponent) + pow(abs(a[1] - b[1]), exponent) +
+ pow(abs(a[2] - b[2]), exponent),
+ 1.0 / exponent);
+ }
+ else {
+ return 0.0;
+ }
+}
+
+void voronoi_f1_3d(vector3 coord,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector3 outPosition)
+{
+ vector3 cellPosition = floor(coord);
+ vector3 localPosition = coord - cellPosition;
+
+ float minDistance = 8.0;
+ vector3 targetOffset, targetPosition;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector3 cellOffset = vector3(i, j, k);
+ vector3 pointPosition = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
}
- else if (metric == "manhattan") {
- d = fabs(pd[0]) + fabs(pd[1]) + fabs(pd[2]);
+ }
+ }
+ }
+ outDistance = minDistance;
+ outColor = hash_vector3_to_color(cellPosition + targetOffset);
+ outPosition = targetPosition + cellPosition;
+}
+
+void voronoi_smooth_f1_3d(vector3 coord,
+ float smoothness,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector3 outPosition)
+{
+ vector3 cellPosition = floor(coord);
+ vector3 localPosition = coord - cellPosition;
+
+ float smoothDistance = 8.0;
+ color smoothColor = color(0.0);
+ vector3 smoothPosition = vector3(0.0);
+ for (int k = -2; k <= 2; k++) {
+ for (int j = -2; j <= 2; j++) {
+ for (int i = -2; i <= 2; i++) {
+ vector3 cellOffset = vector3(i, j, k);
+ vector3 pointPosition = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(
+ 0.0, 1.0, 0.5 + 0.5 * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0 - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0 + 3.0 * smoothness;
+ color cellColor = hash_vector3_to_color(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ }
+ }
+ outDistance = smoothDistance;
+ outColor = smoothColor;
+ outPosition = cellPosition + smoothPosition;
+}
+
+void voronoi_f2_3d(vector3 coord,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector3 outPosition)
+{
+ vector3 cellPosition = floor(coord);
+ vector3 localPosition = coord - cellPosition;
+
+ float distanceF1 = 8.0;
+ float distanceF2 = 8.0;
+ vector3 offsetF1 = vector3(0.0);
+ vector3 positionF1 = vector3(0.0);
+ vector3 offsetF2, positionF2;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector3 cellOffset = vector3(i, j, k);
+ vector3 pointPosition = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
}
- else if (metric == "chebychev") {
- d = max(fabs(pd[0]), max(fabs(pd[1]), fabs(pd[2])));
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
}
- else if (metric == "minkowski") {
- d = pow(pow(fabs(pd[0]), e) + pow(fabs(pd[1]), e) + pow(fabs(pd[2]), e), 1.0 / e);
+ }
+ }
+ }
+ outDistance = distanceF2;
+ outColor = hash_vector3_to_color(cellPosition + offsetF2);
+ outPosition = positionF2 + cellPosition;
+}
+
+void voronoi_distance_to_edge_3d(vector3 coord, float randomness, output float outDistance)
+{
+ vector3 cellPosition = floor(coord);
+ vector3 localPosition = coord - cellPosition;
+
+ vector3 vectorToClosest;
+ float minDistance = 8.0;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector3 cellOffset = vector3(i, j, k);
+ vector3 vectorToPoint = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float distanceToPoint = dot(vectorToPoint, vectorToPoint);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ vectorToClosest = vectorToPoint;
}
+ }
+ }
+ }
- vp += point(xx, yy, zz);
+ minDistance = 8.0;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector3 cellOffset = vector3(i, j, k);
+ vector3 vectorToPoint = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness -
+ localPosition;
+ vector3 perpendicularToEdge = vectorToPoint - vectorToClosest;
+ if (dot(perpendicularToEdge, perpendicularToEdge) > 0.0001) {
+ float distanceToEdge = dot((vectorToClosest + vectorToPoint) / 2.0,
+ normalize((vector)perpendicularToEdge));
+ minDistance = min(minDistance, distanceToEdge);
+ }
+ }
+ }
+ }
+ outDistance = minDistance;
+}
- if (d < da[0]) {
- da[3] = da[2];
- da[2] = da[1];
- da[1] = da[0];
- da[0] = d;
+void voronoi_n_sphere_radius_3d(vector3 coord, float randomness, output float outRadius)
+{
+ vector3 cellPosition = floor(coord);
+ vector3 localPosition = coord - cellPosition;
- pa[3] = pa[2];
- pa[2] = pa[1];
- pa[1] = pa[0];
- pa[0] = vp;
+ vector3 closestPoint;
+ vector3 closestPointOffset;
+ float minDistance = 8.0;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector3 cellOffset = vector3(i, j, k);
+ vector3 pointPosition = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
}
- else if (d < da[1]) {
- da[3] = da[2];
- da[2] = da[1];
- da[1] = d;
-
- pa[3] = pa[2];
- pa[2] = pa[1];
- pa[1] = vp;
+ }
+ }
+ }
+
+ minDistance = 8.0;
+ vector3 closestPointToClosestPoint;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0 && j == 0 && k == 0) {
+ continue;
}
- else if (d < da[2]) {
- da[3] = da[2];
- da[2] = d;
+ vector3 cellOffset = vector3(i, j, k) + closestPointOffset;
+ vector3 pointPosition = cellOffset +
+ hash_vector3_to_vector3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ }
+ }
+ outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0;
+}
- pa[3] = pa[2];
- pa[2] = vp;
+/* **** 4D Voronoi **** */
+
+float voronoi_distance(vector4 a, vector4 b, string metric, float exponent)
+{
+ if (metric == "euclidean") {
+ return distance(a, b);
+ }
+ else if (metric == "manhattan") {
+ return abs(a.x - b.x) + abs(a.y - b.y) + abs(a.z - b.z) + abs(a.w - b.w);
+ }
+ else if (metric == "chebychev") {
+ return max(abs(a.x - b.x), max(abs(a.y - b.y), max(abs(a.z - b.z), abs(a.w - b.w))));
+ }
+ else if (metric == "minkowski") {
+ return pow(pow(abs(a.x - b.x), exponent) + pow(abs(a.y - b.y), exponent) +
+ pow(abs(a.z - b.z), exponent) + pow(abs(a.w - b.w), exponent),
+ 1.0 / exponent);
+ }
+ else {
+ return 0.0;
+ }
+}
+
+void voronoi_f1_4d(vector4 coord,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector4 outPosition)
+{
+ vector4 cellPosition = floor(coord);
+ vector4 localPosition = coord - cellPosition;
+
+ float minDistance = 8.0;
+ vector4 targetOffset, targetPosition;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector4 cellOffset = vector4(i, j, k, u);
+ vector4 pointPosition = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
}
- else if (d < da[3]) {
- da[3] = d;
- pa[3] = vp;
+ }
+ }
+ }
+ outDistance = minDistance;
+ outColor = hash_vector4_to_color(cellPosition + targetOffset);
+ outPosition = targetPosition + cellPosition;
+}
+
+void voronoi_smooth_f1_4d(vector4 coord,
+ float smoothness,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector4 outPosition)
+{
+ vector4 cellPosition = floor(coord);
+ vector4 localPosition = coord - cellPosition;
+
+ float smoothDistance = 8.0;
+ color smoothColor = color(0.0);
+ vector4 smoothPosition = vector4(0.0, 0.0, 0.0, 0.0);
+ for (int u = -2; u <= 2; u++) {
+ for (int k = -2; k <= 2; k++) {
+ for (int j = -2; j <= 2; j++) {
+ for (int i = -2; i <= 2; i++) {
+ vector4 cellOffset = vector4(i, j, k, u);
+ vector4 pointPosition = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(
+ 0.0, 1.0, 0.5 + 0.5 * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0 - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0 + 3.0 * smoothness;
+ color cellColor = hash_vector4_to_color(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ }
+ }
+ }
+ outDistance = smoothDistance;
+ outColor = smoothColor;
+ outPosition = cellPosition + smoothPosition;
+}
+
+void voronoi_f2_4d(vector4 coord,
+ float exponent,
+ float randomness,
+ string metric,
+ output float outDistance,
+ output color outColor,
+ output vector4 outPosition)
+{
+ vector4 cellPosition = floor(coord);
+ vector4 localPosition = coord - cellPosition;
+
+ float distanceF1 = 8.0;
+ float distanceF2 = 8.0;
+ vector4 offsetF1 = vector4(0.0, 0.0, 0.0, 0.0);
+ vector4 positionF1 = vector4(0.0, 0.0, 0.0, 0.0);
+ vector4 offsetF2, positionF2;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector4 cellOffset = vector4(i, j, k, u);
+ vector4 pointPosition = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
+ }
+ }
+ }
+ }
+ }
+ outDistance = distanceF2;
+ outColor = hash_vector4_to_color(cellPosition + offsetF2);
+ outPosition = positionF2 + cellPosition;
+}
+
+void voronoi_distance_to_edge_4d(vector4 coord, float randomness, output float outDistance)
+{
+ vector4 cellPosition = floor(coord);
+ vector4 localPosition = coord - cellPosition;
+
+ vector4 vectorToClosest;
+ float minDistance = 8.0;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector4 cellOffset = vector4(i, j, k, u);
+ vector4 vectorToPoint = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float distanceToPoint = dot(vectorToPoint, vectorToPoint);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ vectorToClosest = vectorToPoint;
+ }
}
}
}
}
+
+ minDistance = 8.0;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector4 cellOffset = vector4(i, j, k, u);
+ vector4 vectorToPoint = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness -
+ localPosition;
+ vector4 perpendicularToEdge = vectorToPoint - vectorToClosest;
+ if (dot(perpendicularToEdge, perpendicularToEdge) > 0.0001) {
+ float distanceToEdge = dot((vectorToClosest + vectorToPoint) / 2.0,
+ normalize(perpendicularToEdge));
+ minDistance = min(minDistance, distanceToEdge);
+ }
+ }
+ }
+ }
+ }
+ outDistance = minDistance;
}
-/* Voronoi */
+void voronoi_n_sphere_radius_4d(vector4 coord, float randomness, output float outRadius)
+{
+ vector4 cellPosition = floor(coord);
+ vector4 localPosition = coord - cellPosition;
+
+ vector4 closestPoint;
+ vector4 closestPointOffset;
+ float minDistance = 8.0;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ vector4 cellOffset = vector4(i, j, k, u);
+ vector4 pointPosition = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
+ }
+ }
+ }
+ }
+
+ minDistance = 8.0;
+ vector4 closestPointToClosestPoint;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0 && j == 0 && k == 0 && u == 0) {
+ continue;
+ }
+ vector4 cellOffset = vector4(i, j, k, u) + closestPointOffset;
+ vector4 pointPosition = cellOffset +
+ hash_vector4_to_vector4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ }
+ }
+ }
+ outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0;
+}
shader node_voronoi_texture(
int use_mapping = 0,
matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
- string coloring = "intensity",
- string metric = "distance",
- string feature = "F1",
- float Exponent = 1.0,
+ string dimensions = "3D",
+ string feature = "f1",
+ string metric = "euclidean",
+ vector3 Vector = P,
+ float WIn = 0.0,
float Scale = 5.0,
- point Vector = P,
- output float Fac = 0.0,
- output color Color = 0.0)
+ float Smoothness = 5.0,
+ float Exponent = 1.0,
+ float Randomness = 1.0,
+ output float Distance = 0.0,
+ output color Color = 0.0,
+ output vector3 Position = P,
+ output float WOut = 0.0,
+ output float Radius = 0.0)
{
- point p = Vector;
+ float randomness = clamp(Randomness, 0.0, 1.0);
+ float smoothness = clamp(Smoothness / 2.0, 0.0, 0.5);
+ vector3 coord = Vector;
if (use_mapping)
- p = transform(mapping, p);
-
- /* compute distance and point coordinate of 4 nearest neighbours */
- float da[4];
- point pa[4];
+ coord = transform(mapping, coord);
- /* compute distance and point coordinate of 4 nearest neighbours */
- voronoi_m(p * Scale, metric, Exponent, da, pa);
+ float w = WIn * Scale;
+ coord *= Scale;
- if (coloring == "intensity") {
- /* Intensity output */
- if (feature == "F1") {
- Fac = fabs(da[0]);
+ if (dimensions == "1D") {
+ if (feature == "f1") {
+ voronoi_f1_1d(w, Exponent, randomness, metric, Distance, Color, WOut);
}
- else if (feature == "F2") {
- Fac = fabs(da[1]);
+ else if (feature == "smooth_f1") {
+ voronoi_smooth_f1_1d(w, smoothness, Exponent, randomness, metric, Distance, Color, WOut);
}
- else if (feature == "F3") {
- Fac = fabs(da[2]);
+ else if (feature == "f2") {
+ voronoi_f2_1d(w, Exponent, randomness, metric, Distance, Color, WOut);
}
- else if (feature == "F4") {
- Fac = fabs(da[3]);
+ else if (feature == "distance_to_edge") {
+ voronoi_distance_to_edge_1d(w, randomness, Distance);
}
- else if (feature == "F2F1") {
- Fac = fabs(da[1] - da[0]);
+ else if (feature == "n_sphere_radius") {
+ voronoi_n_sphere_radius_1d(w, randomness, Radius);
}
- Color = color(Fac);
+ else {
+ error("Unknown feature!");
+ }
+ WOut = (Scale != 0.0) ? WOut / Scale : 0.0;
}
- else {
- /* Color output */
- if (feature == "F1") {
- Color = pa[0];
+ else if (dimensions == "2D") {
+ vector2 coord2D = vector2(coord[0], coord[1]);
+ vector2 outPosition2D;
+ if (feature == "f1") {
+ voronoi_f1_2d(coord2D, Exponent, randomness, metric, Distance, Color, outPosition2D);
}
- else if (feature == "F2") {
- Color = pa[1];
+ else if (feature == "smooth_f1") {
+ voronoi_smooth_f1_2d(
+ coord2D, smoothness, Exponent, randomness, metric, Distance, Color, outPosition2D);
}
- else if (feature == "F3") {
- Color = pa[2];
+ else if (feature == "f2") {
+ voronoi_f2_2d(coord2D, Exponent, randomness, metric, Distance, Color, outPosition2D);
}
- else if (feature == "F4") {
- Color = pa[3];
+ else if (feature == "distance_to_edge") {
+ voronoi_distance_to_edge_2d(coord2D, randomness, Distance);
}
- else if (feature == "F2F1") {
- Color = fabs(pa[1] - pa[0]);
+ else if (feature == "n_sphere_radius") {
+ voronoi_n_sphere_radius_2d(coord2D, randomness, Radius);
}
-
- Color = cellnoise_color(Color);
- Fac = (Color[0] + Color[1] + Color[2]) * (1.0 / 3.0);
+ else {
+ error("Unknown feature!");
+ }
+ outPosition2D = safe_divide(outPosition2D, Scale);
+ Position = vector3(outPosition2D.x, outPosition2D.y, 0.0);
+ }
+ else if (dimensions == "3D") {
+ if (feature == "f1") {
+ voronoi_f1_3d(coord, Exponent, randomness, metric, Distance, Color, Position);
+ }
+ else if (feature == "smooth_f1") {
+ voronoi_smooth_f1_3d(
+ coord, smoothness, Exponent, randomness, metric, Distance, Color, Position);
+ }
+ else if (feature == "f2") {
+ voronoi_f2_3d(coord, Exponent, randomness, metric, Distance, Color, Position);
+ }
+ else if (feature == "distance_to_edge") {
+ voronoi_distance_to_edge_3d(coord, randomness, Distance);
+ }
+ else if (feature == "n_sphere_radius") {
+ voronoi_n_sphere_radius_3d(coord, randomness, Radius);
+ }
+ else {
+ error("Unknown feature!");
+ }
+ Position = (Scale != 0.0) ? Position / Scale : vector3(0.0);
+ }
+ else if (dimensions == "4D") {
+ vector4 coord4D = vector4(coord[0], coord[1], coord[2], w);
+ vector4 outPosition4D;
+ if (feature == "f1") {
+ voronoi_f1_4d(coord4D, Exponent, randomness, metric, Distance, Color, outPosition4D);
+ }
+ else if (feature == "smooth_f1") {
+ voronoi_smooth_f1_4d(
+ coord4D, smoothness, Exponent, randomness, metric, Distance, Color, outPosition4D);
+ }
+ else if (feature == "f2") {
+ voronoi_f2_4d(coord4D, Exponent, randomness, metric, Distance, Color, outPosition4D);
+ }
+ else if (feature == "distance_to_edge") {
+ voronoi_distance_to_edge_4d(coord4D, randomness, Distance);
+ }
+ else if (feature == "n_sphere_radius") {
+ voronoi_n_sphere_radius_4d(coord4D, randomness, Radius);
+ }
+ else {
+ error("Unknown feature!");
+ }
+ outPosition4D = safe_divide(outPosition4D, Scale);
+ Position = vector3(outPosition4D.x, outPosition4D.y, outPosition4D.z);
+ WOut = outPosition4D.w;
+ }
+ else {
+ error("Unknown dimension!");
}
}
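
The Smooth F1 variants above accumulate candidate distances with a polynomial smooth minimum (the "Smooth Voronoi" approach linked in the comment), and blend colors and positions with the same weight h. A standalone sketch of that two-value distance blend, mirroring the h / correctionFactor arithmetic; function, shader and parameter names are illustrative, and a stock stdosl.h is assumed:

#include "stdosl.h"

/* Polynomial smooth minimum of a and b: behaves like min(a, b) when the
 * values are far apart, and forms a smooth valley when they are close. */
float smooth_min_sketch(float a, float b, float smoothness)
{
  float h = smoothstep(0.0, 1.0, 0.5 + 0.5 * (a - b) / smoothness);
  return mix(a, b, h) - smoothness * h * (1.0 - h);
}

shader example_smooth_min(float A = 0.3,
                          float B = 0.4,
                          float Smoothness = 0.25,
                          output float Out = 0.0)
{
  /* Guard against division by zero for very small smoothness values. */
  Out = smooth_min_sketch(A, B, max(Smoothness, 1e-4));
}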
diff --git a/intern/cycles/kernel/shaders/node_voxel_texture.osl b/intern/cycles/kernel/shaders/node_voxel_texture.osl
index 0e4484561d8..14489298367 100644
--- a/intern/cycles/kernel/shaders/node_voxel_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voxel_texture.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_voxel_texture(string filename = "",
string interpolation = "linear",
diff --git a/intern/cycles/kernel/shaders/node_wave_texture.osl b/intern/cycles/kernel/shaders/node_wave_texture.osl
index dfc2dbfb800..874bfb8d3af 100644
--- a/intern/cycles/kernel/shaders/node_wave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_wave_texture.osl
@@ -14,45 +14,88 @@
* limitations under the License.
*/
-#include "stdosl.h"
-#include "node_texture.h"
+#include "node_noise.h"
+#include "stdcycles.h"
/* Wave */
-float wave(point p, string type, string profile, float detail, float distortion, float dscale)
+float wave(point p_input,
+ string type,
+ string bands_direction,
+ string rings_direction,
+ string profile,
+ float distortion,
+ float detail,
+ float dscale,
+ float droughness,
+ float phase)
{
+ /* Prevent precision issues on unit coordinates. */
+ point p = (p_input + 0.000001) * 0.999999;
+
float n = 0.0;
if (type == "bands") {
- n = (p[0] + p[1] + p[2]) * 10.0;
+ if (bands_direction == "x") {
+ n = p[0] * 20.0;
+ }
+ else if (bands_direction == "y") {
+ n = p[1] * 20.0;
+ }
+ else if (bands_direction == "z") {
+ n = p[2] * 20.0;
+ }
+ else { /* diagonal */
+ n = (p[0] + p[1] + p[2]) * 10.0;
+ }
}
else if (type == "rings") {
- n = length(p) * 20.0;
+ point rp = p;
+ if (rings_direction == "x") {
+ rp *= point(0.0, 1.0, 1.0);
+ }
+ else if (rings_direction == "y") {
+ rp *= point(1.0, 0.0, 1.0);
+ }
+ else if (rings_direction == "z") {
+ rp *= point(1.0, 1.0, 0.0);
+ }
+ /* else: "spherical" */
+
+ n = length(rp) * 20.0;
}
+ n += phase;
+
if (distortion != 0.0) {
- n = n + (distortion * noise_turbulence(p * dscale, detail, 0));
+ n = n + (distortion * (fractal_noise(p * dscale, detail, droughness) * 2.0 - 1.0));
}
if (profile == "sine") {
- return 0.5 + 0.5 * sin(n);
+ return 0.5 + 0.5 * sin(n - M_PI_2);
+ }
+ else if (profile == "saw") {
+ n /= M_2PI;
+ return n - floor(n);
}
- else {
- /* Saw profile */
+ else { /* profile tri */
n /= M_2PI;
- n -= (int)n;
- return (n < 0.0) ? n + 1.0 : n;
+ return abs(n - floor(n + 0.5)) * 2.0;
}
}
shader node_wave_texture(int use_mapping = 0,
matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
string type = "bands",
+ string bands_direction = "x",
+ string rings_direction = "x",
string profile = "sine",
float Scale = 5.0,
float Distortion = 0.0,
float Detail = 2.0,
float DetailScale = 1.0,
+ float DetailRoughness = 0.5,
+ float PhaseOffset = 0.0,
point Vector = P,
output float Fac = 0.0,
output color Color = 0.0)
@@ -62,6 +105,15 @@ shader node_wave_texture(int use_mapping = 0,
if (use_mapping)
p = transform(mapping, p);
- Fac = wave(p * Scale, type, profile, Detail, Distortion, DetailScale);
+ Fac = wave(p * Scale,
+ type,
+ bands_direction,
+ rings_direction,
+ profile,
+ Distortion,
+ Detail,
+ DetailScale,
+ DetailRoughness,
+ PhaseOffset);
Color = Fac;
}
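
The reworked wave() reduces each profile to a 0..1 curve over the phase value n: a sine shifted so the wave starts at 0, a sawtooth, and a triangle. A compact standalone sketch of just those three profiles, with illustrative names and a stock stdosl.h:

#include "stdosl.h"

shader example_wave_profiles(float N = 0.0,
                             output float Sine = 0.0,
                             output float Saw = 0.0,
                             output float Tri = 0.0)
{
  /* N plays the role of the phase variable n in wave() above. */
  Sine = 0.5 + 0.5 * sin(N - M_PI_2);

  float t = N / M_2PI;
  Saw = t - floor(t);
  Tri = abs(t - floor(t + 0.5)) * 2.0;
}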
diff --git a/intern/cycles/kernel/shaders/node_wavelength.osl b/intern/cycles/kernel/shaders/node_wavelength.osl
index c8c6eecb171..f484c4b4788 100644
--- a/intern/cycles/kernel/shaders/node_wavelength.osl
+++ b/intern/cycles/kernel/shaders/node_wavelength.osl
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "stdosl.h"
+#include "stdcycles.h"
shader node_wavelength(float Wavelength = 500.0, output color Color = 0.0)
{
diff --git a/intern/cycles/kernel/shaders/node_white_noise_texture.osl b/intern/cycles/kernel/shaders/node_white_noise_texture.osl
new file mode 100644
index 00000000000..94735a019d5
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_white_noise_texture.osl
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node_hash.h"
+#include "stdcycles.h"
+#include "vector2.h"
+#include "vector4.h"
+
+#define vector3 point
+
+shader node_white_noise_texture(string dimensions = "3D",
+ point Vector = point(0.0, 0.0, 0.0),
+ float W = 0.0,
+ output float Value = 0.0,
+ output color Color = 0.0)
+{
+ if (dimensions == "1D") {
+ Value = noise("hash", W);
+ Color = hash_float_to_color(W);
+ }
+ else if (dimensions == "2D") {
+ Value = noise("hash", Vector[0], Vector[1]);
+ Color = hash_vector2_to_color(vector2(Vector[0], Vector[1]));
+ }
+ else if (dimensions == "3D") {
+ Value = noise("hash", Vector);
+ Color = hash_vector3_to_color(vector3(Vector[0], Vector[1], Vector[2]));
+ }
+ else if (dimensions == "4D") {
+ Value = noise("hash", Vector, W);
+ Color = hash_vector4_to_color(vector4(Vector[0], Vector[1], Vector[2], W));
+ }
+ else {
+ warning("%s", "Unknown dimension!");
+ }
+}
diff --git a/intern/cycles/kernel/shaders/node_wireframe.osl b/intern/cycles/kernel/shaders/node_wireframe.osl
index ea4bd3a4c87..673a451c928 100644
--- a/intern/cycles/kernel/shaders/node_wireframe.osl
+++ b/intern/cycles/kernel/shaders/node_wireframe.osl
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "stdosl.h"
#include "oslutil.h"
+#include "stdcycles.h"
shader node_wireframe(string bump_offset = "center",
int use_pixel_size = 0,
diff --git a/intern/cycles/kernel/shaders/oslutil.h b/intern/cycles/kernel/shaders/oslutil.h
deleted file mode 100644
index d48bfa4a665..00000000000
--- a/intern/cycles/kernel/shaders/oslutil.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef CCL_OSLUTIL_H
-#define CCL_OSLUTIL_H
-
-// Return wireframe opacity factor [0, 1] given a geometry type in
-// ("triangles", "polygons" or "patches"), and a line_width in raster
-// or world space depending on the last (raster) boolean argument.
-//
-float wireframe(string edge_type, float line_width, int raster)
-{
- // ray differentials are so big in diffuse context that this function would always return "wire"
- if (raytype("path:diffuse"))
- return 0.0;
-
- int np = 0;
- point p[64];
- float pixelWidth = 1;
-
- if (edge_type == "triangles") {
- np = 3;
- if (!getattribute("geom:trianglevertices", p))
- return 0.0;
- }
- else if (edge_type == "polygons" || edge_type == "patches") {
- getattribute("geom:numpolyvertices", np);
- if (np < 3 || !getattribute("geom:polyvertices", p))
- return 0.0;
- }
-
- if (raster) {
- // Project the derivatives of P to the viewing plane defined
- // by I so we have a measure of how big is a pixel at this point
- float pixelWidthX = length(Dx(P) - dot(Dx(P), I) * I);
- float pixelWidthY = length(Dy(P) - dot(Dy(P), I) * I);
- // Take the average of both axis' length
- pixelWidth = (pixelWidthX + pixelWidthY) / 2;
- }
-
- // Use half the width as the neighbor face will render the
- // other half. And take the square for fast comparison
- pixelWidth *= 0.5 * line_width;
- pixelWidth *= pixelWidth;
- for (int i = 0; i < np; i++) {
- int i2 = i ? i - 1 : np - 1;
- vector dir = P - p[i];
- vector edge = p[i] - p[i2];
- vector crs = cross(edge, dir);
- // At this point dot(crs, crs) / dot(edge, edge) is
- // the square of area / length(edge) == square of the
- // distance to the edge.
- if (dot(crs, crs) < (dot(edge, edge) * pixelWidth))
- return 1;
- }
- return 0;
-}
-
-float wireframe(string edge_type, float line_width)
-{
- return wireframe(edge_type, line_width, 1);
-}
-float wireframe(string edge_type)
-{
- return wireframe(edge_type, 1.0, 1);
-}
-float wireframe()
-{
- return wireframe("polygons", 1.0, 1);
-}
-
-#endif /* CCL_OSLUTIL_H */
diff --git a/intern/cycles/kernel/shaders/stdcycles.h b/intern/cycles/kernel/shaders/stdcycles.h
new file mode 100644
index 00000000000..dd604da68ce
--- /dev/null
+++ b/intern/cycles/kernel/shaders/stdcycles.h
@@ -0,0 +1,150 @@
+/////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of Sony Pictures Imageworks nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef CCL_STDCYCLESOSL_H
+#define CCL_STDCYCLESOSL_H
+
+#include "stdosl.h"
+
+// Declaration of built-in functions and closures; stdosl.h does not make
+// these available, so we have to redefine them.
+#define BUILTIN [[int builtin = 1]]
+#define BUILTIN_DERIV [[ int builtin = 1, int deriv = 1 ]]
+
+closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
+closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
+closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
+closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
+closure color microfacet_ggx(normal N, float ag) BUILTIN;
+closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN;
+closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
+closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
+closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
+closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
+closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_ggx_aniso_fresnel(
+ normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color
+microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_aniso_fresnel(
+ normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color
+microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_beckmann(normal N, float ab) BUILTIN;
+closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
+closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
+closure color ashikhmin_shirley(normal N, vector T, float ax, float ay) BUILTIN;
+closure color ashikhmin_velvet(normal N, float sigma) BUILTIN;
+closure color ambient_occlusion() BUILTIN;
+closure color principled_diffuse(normal N, float roughness) BUILTIN;
+closure color principled_sheen(normal N) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
+
+// BSSRDF
+closure color bssrdf(string method, normal N, vector radius, color albedo) BUILTIN;
+
+// Hair
+closure color
+hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
+closure color
+hair_transmission(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
+closure color principled_hair(normal N,
+ color sigma,
+ float roughnessu,
+ float roughnessv,
+ float coat,
+ float alpha,
+ float eta) BUILTIN;
+
+// Volume
+closure color henyey_greenstein(float g) BUILTIN;
+closure color absorption() BUILTIN;
+
+normal ensure_valid_reflection(normal Ng, vector I, normal N)
+{
+ /* The implementation here mirrors the one in kernel_montecarlo.h,
+ * check there for an explanation of the algorithm. */
+
+ float sqr(float x)
+ {
+ return x * x;
+ }
+
+ vector R = 2 * dot(N, I) * N - I;
+
+ float threshold = min(0.9 * dot(Ng, I), 0.01);
+ if (dot(Ng, R) >= threshold) {
+ return N;
+ }
+
+ float NdotNg = dot(N, Ng);
+ vector X = normalize(N - NdotNg * Ng);
+
+ float Ix = dot(I, X), Iz = dot(I, Ng);
+ float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+ float a = Ix2 + Iz2;
+
+ float b = sqrt(Ix2 * (a - sqr(threshold)));
+ float c = Iz * threshold + a;
+
+ float fac = 0.5 / a;
+ float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c);
+ int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5));
+ int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
+
+ float N_new_x, N_new_z;
+ if (valid1 && valid2) {
+ float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
+ float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
+
+ float R1 = 2 * (N1_x * Ix + N1_z * Iz) * N1_z - Iz;
+ float R2 = 2 * (N2_x * Ix + N2_z * Iz) * N2_z - Iz;
+
+ valid1 = (R1 >= 1e-5);
+ valid2 = (R2 >= 1e-5);
+ if (valid1 && valid2) {
+ N_new_x = (R1 < R2) ? N1_x : N2_x;
+ N_new_z = (R1 < R2) ? N1_z : N2_z;
+ }
+ else {
+ N_new_x = (R1 > R2) ? N1_x : N2_x;
+ N_new_z = (R1 > R2) ? N1_z : N2_z;
+ }
+ }
+ else if (valid1 || valid2) {
+ float Nz2 = valid1 ? N1_z2 : N2_z2;
+ N_new_x = sqrt(1.0 - Nz2);
+ N_new_z = sqrt(Nz2);
+ }
+ else {
+ return Ng;
+ }
+
+ return N_new_x * X + N_new_z * Ng;
+}
+
+#endif /* CCL_STDCYCLESOSL_H */
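
The corrective branch in ensure_valid_reflection() above only runs when reflecting I about the shading normal N would dip below the geometric normal Ng. The following standalone C++ sketch reproduces just that early-out test, using a hypothetical float3 helper type rather than the Cycles vector types; the full quadratic solve is omitted.

#include <algorithm>

struct float3 { float x, y, z; };

static float dot(const float3 &a, const float3 &b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z;
}
static float3 operator*(float s, const float3 &v) { return {s * v.x, s * v.y, s * v.z}; }
static float3 operator-(const float3 &a, const float3 &b) { return {a.x - b.x, a.y - b.y, a.z - b.z}; }

/* Keep the shading normal N unchanged if the reflection of I about N stays
 * sufficiently above the geometric normal Ng; otherwise the caller would have
 * to bend N as in ensure_valid_reflection() above. */
static bool reflection_is_valid(const float3 &Ng, const float3 &I, const float3 &N)
{
  float3 R = 2.0f * dot(N, I) * N - I;
  float threshold = std::min(0.9f * dot(Ng, I), 0.01f);
  return dot(Ng, R) >= threshold;
}

int main()
{
  float3 Ng = {0.0f, 0.0f, 1.0f};
  float3 I = {0.0f, 0.6f, 0.8f};       /* direction toward the viewer */
  float3 N_ok = {0.0f, 0.0f, 1.0f};    /* shading normal equal to Ng: valid */
  float3 N_bent = {0.0f, 0.98f, 0.2f}; /* strongly bumped normal: reflection dips below Ng */
  return (reflection_is_valid(Ng, I, N_ok) && !reflection_is_valid(Ng, I, N_bent)) ? 0 : 1;
}
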
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
deleted file mode 100644
index 9b9720ffff9..00000000000
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ /dev/null
@@ -1,853 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-// * Neither the name of Sony Pictures Imageworks nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-/////////////////////////////////////////////////////////////////////////////
-
-#ifndef CCL_STDOSL_H
-#define CCL_STDOSL_H
-
-#ifndef M_PI
-# define M_PI 3.1415926535897932 /* pi */
-# define M_PI_2 1.5707963267948966 /* pi/2 */
-# define M_PI_4 0.7853981633974483 /* pi/4 */
-# define M_2_PI 0.6366197723675813 /* 2/pi */
-# define M_2PI 6.2831853071795865 /* 2*pi */
-# define M_4PI 12.566370614359173 /* 4*pi */
-# define M_2_SQRTPI 1.1283791670955126 /* 2/sqrt(pi) */
-# define M_E 2.7182818284590452 /* e (Euler's number) */
-# define M_LN2 0.6931471805599453 /* ln(2) */
-# define M_LN10 2.3025850929940457 /* ln(10) */
-# define M_LOG2E 1.4426950408889634 /* log_2(e) */
-# define M_LOG10E 0.4342944819032518 /* log_10(e) */
-# define M_SQRT2 1.4142135623730950 /* sqrt(2) */
-# define M_SQRT1_2 0.7071067811865475 /* 1/sqrt(2) */
-#endif
-
-// Declaration of built-in functions and closures
-#define BUILTIN [[int builtin = 1]]
-#define BUILTIN_DERIV [[ int builtin = 1, int deriv = 1 ]]
-
-#define PERCOMP1(name) \
- normal name(normal x) BUILTIN; \
- vector name(vector x) BUILTIN; \
- point name(point x) BUILTIN; \
- color name(color x) BUILTIN; \
- float name(float x) BUILTIN;
-
-#define PERCOMP2(name) \
- normal name(normal x, normal y) BUILTIN; \
- vector name(vector x, vector y) BUILTIN; \
- point name(point x, point y) BUILTIN; \
- color name(color x, color y) BUILTIN; \
- float name(float x, float y) BUILTIN;
-
-#define PERCOMP2F(name) \
- normal name(normal x, float y) BUILTIN; \
- vector name(vector x, float y) BUILTIN; \
- point name(point x, float y) BUILTIN; \
- color name(color x, float y) BUILTIN; \
- float name(float x, float y) BUILTIN;
-
-// Basic math
-normal degrees(normal x)
-{
- return x * (180.0 / M_PI);
-}
-vector degrees(vector x)
-{
- return x * (180.0 / M_PI);
-}
-point degrees(point x)
-{
- return x * (180.0 / M_PI);
-}
-color degrees(color x)
-{
- return x * (180.0 / M_PI);
-}
-float degrees(float x)
-{
- return x * (180.0 / M_PI);
-}
-normal radians(normal x)
-{
- return x * (M_PI / 180.0);
-}
-vector radians(vector x)
-{
- return x * (M_PI / 180.0);
-}
-point radians(point x)
-{
- return x * (M_PI / 180.0);
-}
-color radians(color x)
-{
- return x * (M_PI / 180.0);
-}
-float radians(float x)
-{
- return x * (M_PI / 180.0);
-}
-PERCOMP1(cos)
-PERCOMP1(sin)
-PERCOMP1(tan)
-PERCOMP1(acos)
-PERCOMP1(asin)
-PERCOMP1(atan)
-PERCOMP2(atan2)
-PERCOMP1(cosh)
-PERCOMP1(sinh)
-PERCOMP1(tanh)
-PERCOMP2F(pow)
-PERCOMP1(exp)
-PERCOMP1(exp2)
-PERCOMP1(expm1)
-PERCOMP1(log)
-point log(point a, float b)
-{
- return log(a) / log(b);
-}
-vector log(vector a, float b)
-{
- return log(a) / log(b);
-}
-color log(color a, float b)
-{
- return log(a) / log(b);
-}
-float log(float a, float b)
-{
- return log(a) / log(b);
-}
-PERCOMP1(log2)
-PERCOMP1(log10)
-PERCOMP1(logb)
-PERCOMP1(sqrt)
-PERCOMP1(inversesqrt)
-float hypot(float a, float b)
-{
- return sqrt(a * a + b * b);
-}
-float hypot(float a, float b, float c)
-{
- return sqrt(a * a + b * b + c * c);
-}
-PERCOMP1(abs)
-int abs(int x) BUILTIN;
-PERCOMP1(fabs)
-int fabs(int x) BUILTIN;
-PERCOMP1(sign)
-PERCOMP1(floor)
-PERCOMP1(ceil)
-PERCOMP1(round)
-PERCOMP1(trunc)
-PERCOMP2(fmod)
-PERCOMP2F(fmod)
-int mod(int a, int b)
-{
- return a - b * (int)floor(a / b);
-}
-point mod(point a, point b)
-{
- return a - b * floor(a / b);
-}
-vector mod(vector a, vector b)
-{
- return a - b * floor(a / b);
-}
-normal mod(normal a, normal b)
-{
- return a - b * floor(a / b);
-}
-color mod(color a, color b)
-{
- return a - b * floor(a / b);
-}
-point mod(point a, float b)
-{
- return a - b * floor(a / b);
-}
-vector mod(vector a, float b)
-{
- return a - b * floor(a / b);
-}
-normal mod(normal a, float b)
-{
- return a - b * floor(a / b);
-}
-color mod(color a, float b)
-{
- return a - b * floor(a / b);
-}
-float mod(float a, float b)
-{
- return a - b * floor(a / b);
-}
-PERCOMP2(min)
-int min(int a, int b) BUILTIN;
-PERCOMP2(max)
-int max(int a, int b) BUILTIN;
-normal clamp(normal x, normal minval, normal maxval)
-{
- return max(min(x, maxval), minval);
-}
-vector clamp(vector x, vector minval, vector maxval)
-{
- return max(min(x, maxval), minval);
-}
-point clamp(point x, point minval, point maxval)
-{
- return max(min(x, maxval), minval);
-}
-color clamp(color x, color minval, color maxval)
-{
- return max(min(x, maxval), minval);
-}
-float clamp(float x, float minval, float maxval)
-{
- return max(min(x, maxval), minval);
-}
-int clamp(int x, int minval, int maxval)
-{
- return max(min(x, maxval), minval);
-}
-#if 0
-normal mix (normal x, normal y, normal a) { return x*(1-a) + y*a; }
-normal mix (normal x, normal y, float a) { return x*(1-a) + y*a; }
-vector mix (vector x, vector y, vector a) { return x*(1-a) + y*a; }
-vector mix (vector x, vector y, float a) { return x*(1-a) + y*a; }
-point mix (point x, point y, point a) { return x*(1-a) + y*a; }
-point mix (point x, point y, float a) { return x*(1-a) + y*a; }
-color mix (color x, color y, color a) { return x*(1-a) + y*a; }
-color mix (color x, color y, float a) { return x*(1-a) + y*a; }
-float mix (float x, float y, float a) { return x*(1-a) + y*a; }
-#else
-normal mix(normal x, normal y, normal a) BUILTIN;
-normal mix(normal x, normal y, float a) BUILTIN;
-vector mix(vector x, vector y, vector a) BUILTIN;
-vector mix(vector x, vector y, float a) BUILTIN;
-point mix(point x, point y, point a) BUILTIN;
-point mix(point x, point y, float a) BUILTIN;
-color mix(color x, color y, color a) BUILTIN;
-color mix(color x, color y, float a) BUILTIN;
-float mix(float x, float y, float a) BUILTIN;
-#endif
-int isnan(float x) BUILTIN;
-int isinf(float x) BUILTIN;
-int isfinite(float x) BUILTIN;
-float erf(float x) BUILTIN;
-float erfc(float x) BUILTIN;
-
-// Vector functions
-
-vector cross(vector a, vector b) BUILTIN;
-float dot(vector a, vector b) BUILTIN;
-float length(vector v) BUILTIN;
-float distance(point a, point b) BUILTIN;
-float distance(point a, point b, point q)
-{
- vector d = b - a;
- float dd = dot(d, d);
- if (dd == 0.0)
- return distance(q, a);
- float t = dot(q - a, d) / dd;
- return distance(q, a + clamp(t, 0.0, 1.0) * d);
-}
-normal normalize(normal v) BUILTIN;
-vector normalize(vector v) BUILTIN;
-vector faceforward(vector N, vector I, vector Nref) BUILTIN;
-vector faceforward(vector N, vector I) BUILTIN;
-vector reflect(vector I, vector N)
-{
- return I - 2 * dot(N, I) * N;
-}
-vector refract(vector I, vector N, float eta)
-{
- float IdotN = dot(I, N);
- float k = 1 - eta * eta * (1 - IdotN * IdotN);
- return (k < 0) ? vector(0, 0, 0) : (eta * I - N * (eta * IdotN + sqrt(k)));
-}
-void fresnel(vector I,
- normal N,
- float eta,
- output float Kr,
- output float Kt,
- output vector R,
- output vector T)
-{
- float sqr(float x)
- {
- return x * x;
- }
- float c = dot(I, N);
- if (c < 0)
- c = -c;
- R = reflect(I, N);
- float g = 1.0 / sqr(eta) - 1.0 + c * c;
- if (g >= 0.0) {
- g = sqrt(g);
- float beta = g - c;
- float F = (c * (g + c) - 1.0) / (c * beta + 1.0);
- F = 0.5 * (1.0 + sqr(F));
- F *= sqr(beta / (g + c));
- Kr = F;
- Kt = (1.0 - Kr) * eta * eta;
- // OPT: the following recomputes some of the above values, but it
- // gives us the same result as if the shader-writer called refract()
- T = refract(I, N, eta);
- }
- else {
- // total internal reflection
- Kr = 1.0;
- Kt = 0.0;
- T = vector(0, 0, 0);
- }
-}
-
-void fresnel(vector I, normal N, float eta, output float Kr, output float Kt)
-{
- vector R, T;
- fresnel(I, N, eta, Kr, Kt, R, T);
-}
-
-normal transform(matrix Mto, normal p) BUILTIN;
-vector transform(matrix Mto, vector p) BUILTIN;
-point transform(matrix Mto, point p) BUILTIN;
-normal transform(string from, string to, normal p) BUILTIN;
-vector transform(string from, string to, vector p) BUILTIN;
-point transform(string from, string to, point p) BUILTIN;
-normal transform(string to, normal p)
-{
- return transform("common", to, p);
-}
-vector transform(string to, vector p)
-{
- return transform("common", to, p);
-}
-point transform(string to, point p)
-{
- return transform("common", to, p);
-}
-
-float transformu(string tounits, float x) BUILTIN;
-float transformu(string fromunits, string tounits, float x) BUILTIN;
-
-point rotate(point p, float angle, point a, point b)
-{
- vector axis = normalize(b - a);
- float cosang, sinang;
- /* Older OSX has major issues with sincos() function,
- * it's likely a bug in OSL or LLVM. Until we've
- * updated to new versions of these libraries we'll
- * use a workaround to prevent possible crashes on all
- * the platforms.
- *
- * Shouldn't be that bad because it's mainly used for
- * anisotropic shader where angle is usually constant.
- */
-#if 0
- sincos (angle, sinang, cosang);
-#else
- sinang = sin(angle);
- cosang = cos(angle);
-#endif
- float cosang1 = 1.0 - cosang;
- float x = axis[0], y = axis[1], z = axis[2];
- matrix M = matrix(x * x + (1.0 - x * x) * cosang,
- x * y * cosang1 + z * sinang,
- x * z * cosang1 - y * sinang,
- 0.0,
- x * y * cosang1 - z * sinang,
- y * y + (1.0 - y * y) * cosang,
- y * z * cosang1 + x * sinang,
- 0.0,
- x * z * cosang1 + y * sinang,
- y * z * cosang1 - x * sinang,
- z * z + (1.0 - z * z) * cosang,
- 0.0,
- 0.0,
- 0.0,
- 0.0,
- 1.0);
- return transform(M, p - a) + a;
-}
-
-normal ensure_valid_reflection(normal Ng, vector I, normal N)
-{
- /* The implementation here mirrors the one in kernel_montecarlo.h,
- * check there for an explanation of the algorithm. */
-
- float sqr(float x)
- {
- return x * x;
- }
-
- vector R = 2 * dot(N, I) * N - I;
-
- float threshold = min(0.9 * dot(Ng, I), 0.01);
- if (dot(Ng, R) >= threshold) {
- return N;
- }
-
- float NdotNg = dot(N, Ng);
- vector X = normalize(N - NdotNg * Ng);
-
- float Ix = dot(I, X), Iz = dot(I, Ng);
- float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
- float a = Ix2 + Iz2;
-
- float b = sqrt(Ix2 * (a - sqr(threshold)));
- float c = Iz * threshold + a;
-
- float fac = 0.5 / a;
- float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c);
- int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5));
- int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
-
- float N_new_x, N_new_z;
- if (valid1 && valid2) {
- float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
- float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
-
- float R1 = 2 * (N1_x * Ix + N1_z * Iz) * N1_z - Iz;
- float R2 = 2 * (N2_x * Ix + N2_z * Iz) * N2_z - Iz;
-
- valid1 = (R1 >= 1e-5);
- valid2 = (R2 >= 1e-5);
- if (valid1 && valid2) {
- N_new_x = (R1 < R2) ? N1_x : N2_x;
- N_new_z = (R1 < R2) ? N1_z : N2_z;
- }
- else {
- N_new_x = (R1 > R2) ? N1_x : N2_x;
- N_new_z = (R1 > R2) ? N1_z : N2_z;
- }
- }
- else if (valid1 || valid2) {
- float Nz2 = valid1 ? N1_z2 : N2_z2;
- N_new_x = sqrt(1.0 - Nz2);
- N_new_z = sqrt(Nz2);
- }
- else {
- return Ng;
- }
-
- return N_new_x * X + N_new_z * Ng;
-}
-
-// Color functions
-
-float luminance(color c) BUILTIN;
-color blackbody(float temperatureK) BUILTIN;
-color wavelength_color(float wavelength_nm) BUILTIN;
-
-color transformc(string to, color x)
-{
- color rgb_to_hsv(color rgb)
- { // See Foley & van Dam
- float r = rgb[0], g = rgb[1], b = rgb[2];
- float mincomp = min(r, min(g, b));
- float maxcomp = max(r, max(g, b));
- float delta = maxcomp - mincomp; // chroma
- float h, s, v;
- v = maxcomp;
- if (maxcomp > 0)
- s = delta / maxcomp;
- else
- s = 0;
- if (s <= 0)
- h = 0;
- else {
- if (r >= maxcomp)
- h = (g - b) / delta;
- else if (g >= maxcomp)
- h = 2 + (b - r) / delta;
- else
- h = 4 + (r - g) / delta;
- h /= 6;
- if (h < 0)
- h += 1;
- }
- return color(h, s, v);
- }
-
- color rgb_to_hsl(color rgb)
- { // See Foley & van Dam
- // First convert rgb to hsv, then to hsl
- float minval = min(rgb[0], min(rgb[1], rgb[2]));
- color hsv = rgb_to_hsv(rgb);
- float maxval = hsv[2]; // v == maxval
- float h = hsv[0], s, l = (minval + maxval) / 2;
- if (minval == maxval)
- s = 0; // special 'achromatic' case, hue is 0
- else if (l <= 0.5)
- s = (maxval - minval) / (maxval + minval);
- else
- s = (maxval - minval) / (2 - maxval - minval);
- return color(h, s, l);
- }
-
- color r;
- if (to == "rgb" || to == "RGB")
- r = x;
- else if (to == "hsv")
- r = rgb_to_hsv(x);
- else if (to == "hsl")
- r = rgb_to_hsl(x);
- else if (to == "YIQ")
- r = color(dot(vector(0.299, 0.587, 0.114), (vector)x),
- dot(vector(0.596, -0.275, -0.321), (vector)x),
- dot(vector(0.212, -0.523, 0.311), (vector)x));
- else if (to == "XYZ")
- r = color(dot(vector(0.412453, 0.357580, 0.180423), (vector)x),
- dot(vector(0.212671, 0.715160, 0.072169), (vector)x),
- dot(vector(0.019334, 0.119193, 0.950227), (vector)x));
- else {
- error("Unknown color space \"%s\"", to);
- r = x;
- }
- return r;
-}
-
-color transformc(string from, string to, color x)
-{
- color hsv_to_rgb(color c)
- { // Reference: Foley & van Dam
- float h = c[0], s = c[1], v = c[2];
- color r;
- if (s < 0.0001) {
- r = v;
- }
- else {
- h = 6 * (h - floor(h)); // expand to [0..6)
- int hi = (int)h;
- float f = h - hi;
- float p = v * (1 - s);
- float q = v * (1 - s * f);
- float t = v * (1 - s * (1 - f));
- if (hi == 0)
- r = color(v, t, p);
- else if (hi == 1)
- r = color(q, v, p);
- else if (hi == 2)
- r = color(p, v, t);
- else if (hi == 3)
- r = color(p, q, v);
- else if (hi == 4)
- r = color(t, p, v);
- else
- r = color(v, p, q);
- }
- return r;
- }
-
- color hsl_to_rgb(color c)
- {
- float h = c[0], s = c[1], l = c[2];
- // Easiest to convert hsl -> hsv, then hsv -> RGB (per Foley & van Dam)
- float v = (l <= 0.5) ? (l * (1 + s)) : (l * (1 - s) + s);
- color r;
- if (v <= 0) {
- r = 0;
- }
- else {
- float min = 2 * l - v;
- s = (v - min) / v;
- r = hsv_to_rgb(color(h, s, v));
- }
- return r;
- }
-
- color r;
- if (from == "rgb" || from == "RGB")
- r = x;
- else if (from == "hsv")
- r = hsv_to_rgb(x);
- else if (from == "hsl")
- r = hsl_to_rgb(x);
- else if (from == "YIQ")
- r = color(dot(vector(1, 0.9557, 0.6199), (vector)x),
- dot(vector(1, -0.2716, -0.6469), (vector)x),
- dot(vector(1, -1.1082, 1.7051), (vector)x));
- else if (from == "XYZ")
- r = color(dot(vector(3.240479, -1.537150, -0.498535), (vector)x),
- dot(vector(-0.969256, 1.875991, 0.041556), (vector)x),
- dot(vector(0.055648, -0.204043, 1.057311), (vector)x));
- else {
- error("Unknown color space \"%s\"", to);
- r = x;
- }
- return transformc(to, r);
-}
-
-// Matrix functions
-
-float determinant(matrix m) BUILTIN;
-matrix transpose(matrix m) BUILTIN;
-
-// Pattern generation
-
-color step(color edge, color x) BUILTIN;
-point step(point edge, point x) BUILTIN;
-vector step(vector edge, vector x) BUILTIN;
-normal step(normal edge, normal x) BUILTIN;
-float step(float edge, float x) BUILTIN;
-float smoothstep(float edge0, float edge1, float x) BUILTIN;
-
-float linearstep(float edge0, float edge1, float x)
-{
- float result;
- if (edge0 != edge1) {
- float xclamped = clamp(x, edge0, edge1);
- result = (xclamped - edge0) / (edge1 - edge0);
- }
- else { // special case: edges coincide
- result = step(edge0, x);
- }
- return result;
-}
-
-float smooth_linearstep(float edge0, float edge1, float x_, float eps_)
-{
- float result;
- if (edge0 != edge1) {
- float rampup(float x, float r)
- {
- return 0.5 / r * x * x;
- }
- float width_inv = 1.0 / (edge1 - edge0);
- float eps = eps_ * width_inv;
- float x = (x_ - edge0) * width_inv;
- if (x <= -eps)
- result = 0;
- else if (x >= eps && x <= 1.0 - eps)
- result = x;
- else if (x >= 1.0 + eps)
- result = 1;
- else if (x < eps)
- result = rampup(x + eps, 2.0 * eps);
- else /* if (x < 1.0+eps) */
- result = 1.0 - rampup(1.0 + eps - x, 2.0 * eps);
- }
- else {
- result = step(edge0, x_);
- }
- return result;
-}
-
-float aastep(float edge, float s, float dedge, float ds)
-{
- // Box filtered AA step
- float width = fabs(dedge) + fabs(ds);
- float halfwidth = 0.5 * width;
- float e1 = edge - halfwidth;
- return (s <= e1) ? 0.0 : ((s >= (edge + halfwidth)) ? 1.0 : (s - e1) / width);
-}
-float aastep(float edge, float s, float ds)
-{
- return aastep(edge, s, filterwidth(edge), ds);
-}
-float aastep(float edge, float s)
-{
- return aastep(edge, s, filterwidth(edge), filterwidth(s));
-}
-
-// Derivatives and area operators
-
-// Displacement functions
-
-// String functions
-int strlen(string s) BUILTIN;
-int hash(string s) BUILTIN;
-int getchar(string s, int index) BUILTIN;
-int startswith(string s, string prefix) BUILTIN;
-int endswith(string s, string suffix) BUILTIN;
-string substr(string s, int start, int len) BUILTIN;
-string substr(string s, int start)
-{
- return substr(s, start, strlen(s));
-}
-float stof(string str) BUILTIN;
-int stoi(string str) BUILTIN;
-
-// Define concat in terms of shorter concat
-string concat(string a, string b, string c)
-{
- return concat(concat(a, b), c);
-}
-string concat(string a, string b, string c, string d)
-{
- return concat(concat(a, b, c), d);
-}
-string concat(string a, string b, string c, string d, string e)
-{
- return concat(concat(a, b, c, d), e);
-}
-string concat(string a, string b, string c, string d, string e, string f)
-{
- return concat(concat(a, b, c, d, e), f);
-}
-
-// Texture
-
-// Closures
-
-closure color diffuse(normal N) BUILTIN;
-closure color oren_nayar(normal N, float sigma) BUILTIN;
-closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
-closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
-closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
-closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
-closure color translucent(normal N) BUILTIN;
-closure color reflection(normal N) BUILTIN;
-closure color refraction(normal N, float eta) BUILTIN;
-closure color transparent() BUILTIN;
-closure color microfacet_ggx(normal N, float ag) BUILTIN;
-closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN;
-closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
-closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
-closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
-closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
-closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
-closure color microfacet_ggx_aniso_fresnel(
- normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
-closure color
-microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
-closure color microfacet_multi_ggx_aniso_fresnel(
- normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
-closure color
-microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
-closure color microfacet_beckmann(normal N, float ab) BUILTIN;
-closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
-closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
-closure color ashikhmin_shirley(normal N, vector T, float ax, float ay) BUILTIN;
-closure color ashikhmin_velvet(normal N, float sigma) BUILTIN;
-closure color emission() BUILTIN;
-closure color background() BUILTIN;
-closure color holdout() BUILTIN;
-closure color ambient_occlusion() BUILTIN;
-closure color principled_diffuse(normal N, float roughness) BUILTIN;
-closure color principled_sheen(normal N) BUILTIN;
-closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
-
-// BSSRDF
-closure color bssrdf(string method, normal N, vector radius, color albedo) BUILTIN;
-
-// Hair
-closure color
-hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
-closure color
-hair_transmission(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
-closure color principled_hair(normal N,
- color sigma,
- float roughnessu,
- float roughnessv,
- float coat,
- float alpha,
- float eta) BUILTIN;
-
-// Volume
-closure color henyey_greenstein(float g) BUILTIN;
-closure color absorption() BUILTIN;
-
-// OSL 1.5 Microfacet functions
-closure color microfacet(
- string distribution, normal N, vector U, float xalpha, float yalpha, float eta, int refract)
-{
- /* GGX */
- if (distribution == "ggx" || distribution == "default") {
- if (!refract) {
- if (xalpha == yalpha) {
- /* Isotropic */
- return microfacet_ggx(N, xalpha);
- }
- else {
- /* Anisotropic */
- return microfacet_ggx_aniso(N, U, xalpha, yalpha);
- }
- }
- else {
- return microfacet_ggx_refraction(N, xalpha, eta);
- }
- }
- /* Beckmann */
- else {
- if (!refract) {
- if (xalpha == yalpha) {
- /* Isotropic */
- return microfacet_beckmann(N, xalpha);
- }
- else {
- /* Anisotropic */
- return microfacet_beckmann_aniso(N, U, xalpha, yalpha);
- }
- }
- else {
- return microfacet_beckmann_refraction(N, xalpha, eta);
- }
- }
-}
-
-closure color microfacet(string distribution, normal N, float alpha, float eta, int refract)
-{
- return microfacet(distribution, N, vector(0), alpha, alpha, eta, refract);
-}
-
-// Renderer state
-int backfacing() BUILTIN;
-int raytype(string typename) BUILTIN;
-// the individual 'isFOOray' functions are deprecated
-int iscameraray()
-{
- return raytype("camera");
-}
-int isdiffuseray()
-{
- return raytype("diffuse");
-}
-int isglossyray()
-{
- return raytype("glossy");
-}
-int isshadowray()
-{
- return raytype("shadow");
-}
-int getmatrix(string fromspace, string tospace, output matrix M) BUILTIN;
-int getmatrix(string fromspace, output matrix M)
-{
- return getmatrix(fromspace, "common", M);
-}
-
-// Miscellaneous
-
-#undef BUILTIN
-#undef BUILTIN_DERIV
-#undef PERCOMP1
-#undef PERCOMP2
-#undef PERCOMP2F
-
-#endif /* CCL_STDOSL_H */
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
new file mode 100644
index 00000000000..60ebf415970
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
+ int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+ int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+ int buffer_offset = (kernel_split_params.tile.offset + x +
+ y * kernel_split_params.tile.stride) *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
+ buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
+ }
+ }
+}
+
+CCL_NAMESPACE_END
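
The per-pixel rescaling above boils down to a small piece of arithmetic: a negated sample count marks a pixel that converged early and is rescaled against max(start_sample + 1, its own count), while pixels that stayed active get the fixed sample / (sample - 1) factor. A standalone C++ sketch of that arithmetic, with hypothetical variable names in place of the tile and buffer plumbing:

#include <algorithm>
#include <cstdio>

/* Return the factor applied to one pixel's accumulated passes, mirroring the
 * branch structure of kernel_adaptive_adjust_samples() above. 'sample' is the
 * tile's start_sample + num_samples; a negative stored count flags a pixel
 * that stopped sampling early. */
static float adjust_multiplier(int start_sample, int sample, float stored_sample_count)
{
  if (stored_sample_count < 0.0f) {
    float actual_samples = -stored_sample_count; /* remove the convergence flag */
    return sample / std::max((float)start_sample + 1.0f, actual_samples);
  }
  /* Pixel stayed active to the end: apply the fixed catch-up factor. */
  return sample / (sample - 1.0f);
}

int main()
{
  const int start_sample = 0, sample = 128;
  std::printf("converged after 64 samples -> x%.3f\n",
              adjust_multiplier(start_sample, sample, -64.0f));
  std::printf("still active at the end    -> x%.3f\n",
              adjust_multiplier(start_sample, sample, 96.0f));
  return 0;
}
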
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
new file mode 100644
index 00000000000..93f41f7ced4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.h &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int y = kernel_split_params.tile.y + pixel_index;
+ kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
new file mode 100644
index 00000000000..eca53d079ec
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int x = kernel_split_params.tile.x + pixel_index;
+ kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
+ }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
new file mode 100644
index 00000000000..c8eb1ebd705
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+ int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+ int buffer_offset = (kernel_split_params.tile.offset + x +
+ y * kernel_split_params.tile.stride) *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ kernel_do_adaptive_stopping(kg,
+ buffer,
+ kernel_split_params.tile.start_sample +
+ kernel_split_params.tile.num_samples - 1);
+ }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index e08d87ab618..bfcd21baac4 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -106,7 +106,7 @@ ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
- path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+ path_radiance_init(kg, inactive_L);
path_radiance_copy_indirect(inactive_L, L);
ray_state[inactive_ray] = RAY_REGENERATED;
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index e77743350dc..dba1768f03f 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -132,10 +132,10 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
if (ray->t != 0.0f) {
/* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
+ * These rays proceed with path-iteration.
+ */
*throughput = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(L, kernel_data.film.use_light_pass);
+ path_radiance_init(kg, L);
path_state_init(kg,
AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
state,
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 52930843f56..2f83a10316d 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -46,10 +46,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
int sh,
int offset,
int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *
- use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+ ccl_global int *Queue_index, /* Tracks the number of elements in queues */
+ int queuesize, /* size (capacity) of the queue */
+ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues
+ to fetch ray index */
ccl_global unsigned int *work_pools, /* Work pool for each work group */
unsigned int num_samples,
ccl_global float *buffer)
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index dd3ffe3f52c..d0c91d43eed 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -88,7 +88,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
LightSample ls;
if (light_sample(
kg, light_u, light_v, sd->time, sd->P_pick, sd->N_pick, state->bounce, &ls, false)) {
-
Ray light_ray;
light_ray.time = sd->time;
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index bdadfb45d74..99a7973019e 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -44,7 +44,7 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
branched_state->isect.t :
FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+ float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack);
for (int j = branched_state->next_sample; j < num_samples; j++) {
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
@@ -61,7 +61,7 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, ps, sd, &volume_ray, L, tp, heterogeneous);
+ kg, ps, sd, &volume_ray, L, tp, step_size);
kernel_update_light_picking(sd, ps);
@@ -166,12 +166,12 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
if (!kernel_data.integrator.branched ||
IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
# endif /* __BRANCHED_PATH__ */
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+ float step_size = volume_stack_step_size(kg, state->volume_stack);
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+ kg, state, sd, &volume_ray, L, throughput, step_size);
# ifdef __VOLUME_SCATTER__
if (result == VOLUME_PATH_SCATTERED) {
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 6eb9e1815c4..630625fa2ac 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -123,9 +123,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* Path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate.
- */
+ * mainly due to the mixed in MIS that we use. gives too many unneeded
+ * shader evaluations, only need emission if we are going to terminate.
+ */
float probability = path_state_continuation_probability(kg, state, throughput);
if (probability == 0.0f) {
@@ -141,10 +141,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
}
}
+#ifdef __DENOISING_FEATURES__
if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
kernel_update_denoising_features(kg, sd, state, L);
}
+#endif
}
#ifdef __AO__
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index b1c65f61e2c..6d500650cc0 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -58,8 +58,10 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];
ShaderData *sd = kernel_split_sd(sd, ray_index);
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- kernel_path_background(kg, state, ray, throughput, sd, L);
+ kernel_path_background(kg, state, ray, throughput, sd, buffer, L);
kernel_split_path_end(kg, ray_index);
}
}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 781ce869374..320f6a414bf 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -109,9 +109,9 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
/* If we are here, then it means that scene-intersect kernel
- * has already been executed atleast once. From the next time,
- * scene-intersect kernel may operate on queues to fetch ray index
- */
+ * has already been executed at least once. From the next time,
+ * scene-intersect kernel may operate on queues to fetch ray index
+ */
*kernel_split_params.use_queues_flag = 1;
/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index 3faa3208341..82b0f583d8d 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -59,8 +59,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg)
* These rays proceed with path-iteration.
*/
kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(&kernel_split_state.path_radiance[ray_index],
- kernel_data.film.use_light_pass);
+ path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]);
path_state_init(kg,
AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
&kernel_split_state.path_state[ray_index],
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 8e39c9797e5..c760a2b2049 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -50,8 +50,10 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
ccl_global char *ray_state = kernel_split_state.ray_state;
if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, state->flag);
+ shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag);
#ifdef __BRANCHED_PATH__
if (kernel_data.integrator.branched) {
shader_merge_closures(kernel_split_sd(sd, ray_index));
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 82990ce9fae..5e46d300bca 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -87,7 +87,7 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+ path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, state, throughput, &L_light);
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 384bc952460..5114f2b03e5 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,6 +17,7 @@
#ifndef __KERNEL_SPLIT_H__
#define __KERNEL_SPLIT_H__
+// clang-format off
#include "kernel/kernel_math.h"
#include "kernel/kernel_types.h"
@@ -52,6 +53,7 @@
#ifdef __BRANCHED_PATH__
# include "kernel/split/kernel_branched.h"
#endif
+// clang-format on
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 433b1221a37..decc537b39b 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -18,6 +18,7 @@
#define __KERNEL_SPLIT_DATA_H__
#include "kernel/split/kernel_split_data_types.h"
+
#include "kernel/kernel_globals.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 6ff3f5bdb55..ac4a450ca2b 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -19,7 +19,8 @@
CCL_NAMESPACE_BEGIN
-/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */
+/* Parameters used by the split kernels; we use a single struct to avoid passing these to each
+ * kernel. */
typedef struct SplitParams {
WorkTile tile;
@@ -112,7 +113,8 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+/* Entries to be copied to inactive rays when sharing branched samples
+ * (TODO: which are actually needed?) */
#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
@@ -134,8 +136,9 @@ typedef struct SplitData {
SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
- /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
- * the host easily) but is still used the same as the other data so we have it here in this struct as well
+ /* this is actually in a separate buffer from the rest of the split state data (so it can be read
+ * back from the host easily) but is still used the same as the other data so we have it here in
+ * this struct as well
*/
ccl_global char *ray_state;
} SplitData;
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 4a386afa5de..abeb8fa7457 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -132,16 +132,25 @@ ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
__uint_as_float(node.w));
}
-ccl_device_inline void decode_node_uchar4(uint i, uint *x, uint *y, uint *z, uint *w)
+ccl_device_forceinline void svm_unpack_node_uchar2(uint i, uint *x, uint *y)
{
- if (x)
- *x = (i & 0xFF);
- if (y)
- *y = ((i >> 8) & 0xFF);
- if (z)
- *z = ((i >> 16) & 0xFF);
- if (w)
- *w = ((i >> 24) & 0xFF);
+ *x = (i & 0xFF);
+ *y = ((i >> 8) & 0xFF);
+}
+
+ccl_device_forceinline void svm_unpack_node_uchar3(uint i, uint *x, uint *y, uint *z)
+{
+ *x = (i & 0xFF);
+ *y = ((i >> 8) & 0xFF);
+ *z = ((i >> 16) & 0xFF);
+}
+
+ccl_device_forceinline void svm_unpack_node_uchar4(uint i, uint *x, uint *y, uint *z, uint *w)
+{
+ *x = (i & 0xFF);
+ *y = ((i >> 8) & 0xFF);
+ *z = ((i >> 16) & 0xFF);
+ *w = ((i >> 24) & 0xFF);
}
CCL_NAMESPACE_END
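
The renamed svm_unpack_node_uchar2/3/4 helpers above simply split a 32-bit SVM node word into byte-sized fields. As a round-trip illustration, here is a standalone C++ sketch with a hypothetical pack helper added as the inverse; it is not part of the kernel sources.

#include <cassert>
#include <cstdint>

/* Pack four 8-bit values into one 32-bit node word (low byte first),
 * the inverse of the svm_unpack_node_uchar4() layout shown above. */
static uint32_t pack_node_uchar4(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
  return (x & 0xFF) | ((y & 0xFF) << 8) | ((z & 0xFF) << 16) | ((w & 0xFF) << 24);
}

static void unpack_node_uchar4(uint32_t i, uint32_t *x, uint32_t *y, uint32_t *z, uint32_t *w)
{
  *x = (i & 0xFF);
  *y = ((i >> 8) & 0xFF);
  *z = ((i >> 16) & 0xFF);
  *w = ((i >> 24) & 0xFF);
}

int main()
{
  uint32_t x, y, z, w;
  unpack_node_uchar4(pack_node_uchar4(3, 7, 31, 255), &x, &y, &z, &w);
  assert(x == 3 && y == 7 && z == 31 && w == 255);
  return 0;
}
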
@@ -149,49 +158,56 @@ CCL_NAMESPACE_END
/* Nodes */
#include "kernel/svm/svm_noise.h"
-#include "svm_texture.h"
+#include "svm_fractal_noise.h"
#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_mapping_util.h"
#include "kernel/svm/svm_math_util.h"
+#include "kernel/svm/svm_aov.h"
#include "kernel/svm/svm_attribute.h"
-#include "kernel/svm/svm_gradient.h"
#include "kernel/svm/svm_blackbody.h"
+#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_brightness.h"
+#include "kernel/svm/svm_bump.h"
+#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_checker.h"
+#include "kernel/svm/svm_clamp.h"
#include "kernel/svm/svm_closure.h"
-#include "kernel/svm/svm_noisetex.h"
#include "kernel/svm/svm_convert.h"
#include "kernel/svm/svm_displace.h"
#include "kernel/svm/svm_fresnel.h"
-#include "kernel/svm/svm_wireframe.h"
-#include "kernel/svm/svm_wavelength.h"
-#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_gamma.h"
#include "kernel/svm/svm_geometry.h"
+#include "kernel/svm/svm_gradient.h"
#include "kernel/svm/svm_hsv.h"
#include "kernel/svm/svm_ies.h"
#include "kernel/svm/svm_image.h"
-#include "kernel/svm/svm_gamma.h"
-#include "kernel/svm/svm_brightness.h"
#include "kernel/svm/svm_invert.h"
#include "kernel/svm/svm_light_path.h"
#include "kernel/svm/svm_magic.h"
+#include "kernel/svm/svm_map_range.h"
#include "kernel/svm/svm_mapping.h"
-#include "kernel/svm/svm_normal.h"
-#include "kernel/svm/svm_wave.h"
#include "kernel/svm/svm_math.h"
#include "kernel/svm/svm_mix.h"
+#include "kernel/svm/svm_musgrave.h"
+#include "kernel/svm/svm_noisetex.h"
+#include "kernel/svm/svm_normal.h"
#include "kernel/svm/svm_ramp.h"
#include "kernel/svm/svm_sepcomb_hsv.h"
#include "kernel/svm/svm_sepcomb_vector.h"
-#include "kernel/svm/svm_musgrave.h"
#include "kernel/svm/svm_sky.h"
#include "kernel/svm/svm_tex_coord.h"
#include "kernel/svm/svm_value.h"
-#include "kernel/svm/svm_voronoi.h"
-#include "kernel/svm/svm_checker.h"
-#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_vector_rotate.h"
#include "kernel/svm/svm_vector_transform.h"
+#include "kernel/svm/svm_vertex_color.h"
+#include "kernel/svm/svm_voronoi.h"
#include "kernel/svm/svm_voxel.h"
-#include "kernel/svm/svm_bump.h"
+#include "kernel/svm/svm_wave.h"
+#include "kernel/svm/svm_wavelength.h"
+#include "kernel/svm/svm_white_noise.h"
+#include "kernel/svm/svm_wireframe.h"
#ifdef __SHADER_RAYTRACE__
# include "kernel/svm/svm_ao.h"
@@ -200,13 +216,11 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
-#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
-#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
-
/* Main Interpreter Loop */
ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
ShaderData *sd,
ccl_addr_space PathState *state,
+ ccl_global float *buffer,
ShaderType type,
int path_flag)
{
@@ -217,6 +231,8 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
uint4 node = read_node(kg, &offset);
switch (node.x) {
+ case NODE_END:
+ return;
#if NODES_GROUP(NODE_GROUP_LEVEL_0)
case NODE_SHADER_JUMP: {
if (type == SHADER_TYPE_SURFACE)
@@ -276,6 +292,9 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_ATTR:
svm_node_attr(kg, sd, stack, node);
break;
+ case NODE_VERTEX_COLOR:
+ svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
+ break;
# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_GEOMETRY_BUMP_DX:
svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
@@ -293,19 +312,16 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
svm_node_vector_displacement(kg, sd, stack, node, &offset);
break;
# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
-# ifdef __TEXTURES__
case NODE_TEX_IMAGE:
- svm_node_tex_image(kg, sd, stack, node);
+ svm_node_tex_image(kg, sd, stack, node, &offset);
break;
case NODE_TEX_IMAGE_BOX:
svm_node_tex_image_box(kg, sd, stack, node);
break;
case NODE_TEX_NOISE:
- svm_node_tex_noise(kg, sd, stack, node, &offset);
+ svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
-# endif /* __TEXTURES__ */
-# ifdef __EXTRA_NODES__
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
+# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_SET_BUMP:
svm_node_set_bump(kg, sd, stack, node);
break;
@@ -315,6 +331,12 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_ATTR_BUMP_DY:
svm_node_attr_bump_dy(kg, sd, stack, node);
break;
+ case NODE_VERTEX_COLOR_BUMP_DX:
+ svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ break;
+ case NODE_VERTEX_COLOR_BUMP_DY:
+ svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ break;
case NODE_TEX_COORD_BUMP_DX:
svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
break;
@@ -324,20 +346,19 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_CLOSURE_SET_NORMAL:
svm_node_set_normal(kg, sd, stack, node.y, node.z);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
+# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
case NODE_ENTER_BUMP_EVAL:
svm_node_enter_bump_eval(kg, sd, stack, node.y);
break;
case NODE_LEAVE_BUMP_EVAL:
svm_node_leave_bump_eval(kg, sd, stack, node.y);
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
+# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_HSV:
svm_node_hsv(kg, sd, stack, node, &offset);
break;
-# endif /* __EXTRA_NODES__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
#if NODES_GROUP(NODE_GROUP_LEVEL_1)
case NODE_CLOSURE_HOLDOUT:
@@ -357,7 +378,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
break;
# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
-# ifdef __EXTRA_NODES__
case NODE_MATH:
svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
@@ -382,19 +402,19 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_PARTICLE_INFO:
svm_node_particle_info(kg, sd, stack, node.y, node.z);
break;
-# ifdef __HAIR__
-# if NODES_FEATURE(NODE_FEATURE_HAIR)
+# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR)
case NODE_HAIR_INFO:
svm_node_hair_info(kg, sd, stack, node.y, node.z);
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
-# endif /* __HAIR__ */
-# endif /* __EXTRA_NODES__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
#if NODES_GROUP(NODE_GROUP_LEVEL_2)
+ case NODE_TEXTURE_MAPPING:
+ svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset);
+ break;
case NODE_MAPPING:
- svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
+ svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
case NODE_MIN_MAX:
svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
@@ -402,7 +422,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_CAMERA:
svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
break;
-# ifdef __TEXTURES__
case NODE_TEX_ENVIRONMENT:
svm_node_tex_environment(kg, sd, stack, node);
break;
@@ -413,10 +432,10 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
svm_node_tex_gradient(sd, stack, node);
break;
case NODE_TEX_VORONOI:
- svm_node_tex_voronoi(kg, sd, stack, node, &offset);
+ svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
case NODE_TEX_MUSGRAVE:
- svm_node_tex_musgrave(kg, sd, stack, node, &offset);
+ svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
case NODE_TEX_WAVE:
svm_node_tex_wave(kg, sd, stack, node, &offset);
@@ -430,8 +449,9 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_TEX_BRICK:
svm_node_tex_brick(kg, sd, stack, node, &offset);
break;
-# endif /* __TEXTURES__ */
-# ifdef __EXTRA_NODES__
+ case NODE_TEX_WHITE_NOISE:
+ svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ break;
case NODE_NORMAL:
svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
@@ -441,8 +461,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_IES:
svm_node_ies(kg, sd, stack, node, &offset);
break;
-# endif /* __EXTRA_NODES__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
#if NODES_GROUP(NODE_GROUP_LEVEL_3)
case NODE_RGB_CURVES:
@@ -455,7 +474,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_NORMAL_MAP:
svm_node_normal_map(kg, sd, stack, node);
break;
-# ifdef __EXTRA_NODES__
case NODE_INVERT:
svm_node_invert(sd, stack, node.y, node.z, node.w);
break;
@@ -474,6 +492,9 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_COMBINE_HSV:
svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
+ case NODE_VECTOR_ROTATE:
+ svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
+ break;
case NODE_VECTOR_TRANSFORM:
svm_node_vector_transform(kg, sd, stack, node);
break;
@@ -486,12 +507,12 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
case NODE_BLACKBODY:
svm_node_blackbody(kg, sd, stack, node.y, node.z);
break;
-# endif /* __EXTRA_NODES__ */
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
- case NODE_TEX_VOXEL:
- svm_node_tex_voxel(kg, sd, stack, node, &offset);
+ case NODE_MAP_RANGE:
+ svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset);
+ break;
+ case NODE_CLAMP:
+ svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
# ifdef __SHADER_RAYTRACE__
case NODE_BEVEL:
svm_node_bevel(kg, sd, state, stack, node);
@@ -501,8 +522,25 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
break;
# endif /* __SHADER_RAYTRACE__ */
#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
- case NODE_END:
- return;
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_4)
+# if NODES_FEATURE(NODE_FEATURE_VOLUME)
+ case NODE_TEX_VOXEL:
+ svm_node_tex_voxel(kg, sd, stack, node, &offset);
+ break;
+# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
+ case NODE_AOV_START:
+ if (!svm_node_aov_check(state, buffer)) {
+ return;
+ }
+ break;
+ case NODE_AOV_COLOR:
+ svm_node_aov_color(kg, sd, stack, node, buffer);
+ break;
+ case NODE_AOV_VALUE:
+ svm_node_aov_value(kg, sd, stack, node, buffer);
+ break;
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */
default:
kernel_assert(!"Unknown node type was passed to the SVM machine");
return;
@@ -510,9 +548,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
}
}
-#undef NODES_GROUP
-#undef NODES_FEATURE
-
CCL_NAMESPACE_END
#endif /* __SVM_H__ */
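With this change NODE_END is handled as the first case of the switch rather than at the bottom, svm_eval_nodes() gains a render-buffer pointer for the new AOV nodes, and the NODES_GROUP/NODES_FEATURE macros are no longer #undef'd at the end of the header. Stripped to its skeleton, the interpreter after the patch looks roughly like this (a sketch only; the program start offset and the argument handling are simplified assumptions):

  /* Rough shape of the SVM interpreter loop after this patch. */
  ccl_device_noinline void svm_eval_nodes_sketch(
      KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
      ccl_global float *buffer, ShaderType type, int path_flag)
  {
    float stack[SVM_STACK_SIZE];
    int offset = sd->shader & SHADER_MASK;  /* assumed: program start taken from the shader id */

    while (1) {
      uint4 node = read_node(kg, &offset);  /* fetch the next packed instruction */
      switch (node.x) {
        case NODE_END:
          return;                           /* shader program finished */
        /* ... NODES_GROUP()/NODES_FEATURE()-guarded cases dispatch to svm_node_*() handlers ... */
        default:
          kernel_assert(!"Unknown node type was passed to the SVM machine");
          return;
      }
    }
  }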
diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h
index 06076175c40..4cb986b897a 100644
--- a/intern/cycles/kernel/svm/svm_ao.h
+++ b/intern/cycles/kernel/svm/svm_ao.h
@@ -1,21 +1,23 @@
/*
-* Copyright 2011-2018 Blender Foundation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
CCL_NAMESPACE_BEGIN
+#ifdef __SHADER_RAYTRACE__
+
ccl_device_noinline float svm_ao(KernelGlobals *kg,
ShaderData *sd,
float3 N,
@@ -64,13 +66,13 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
ray.dD = differential3_zero();
if (flags & NODE_AO_ONLY_LOCAL) {
- if (!scene_intersect_local(kg, ray, NULL, sd->object, NULL, 0)) {
+ if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) {
unoccluded++;
}
}
else {
Intersection isect;
- if (!scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f)) {
+ if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
unoccluded++;
}
}
@@ -83,10 +85,10 @@ ccl_device void svm_node_ao(
KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
{
uint flags, dist_offset, normal_offset, out_ao_offset;
- decode_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset);
+ svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset);
uint color_offset, out_color_offset, samples;
- decode_node_uchar4(node.z, &color_offset, &out_color_offset, &samples, NULL);
+ svm_unpack_node_uchar3(node.z, &color_offset, &out_color_offset, &samples);
float dist = stack_load_float_default(stack, dist_offset, node.w);
float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
@@ -102,4 +104,6 @@ ccl_device void svm_node_ao(
}
}
+#endif /* __SHADER_RAYTRACE__ */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h
new file mode 100644
index 00000000000..899e466d099
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_aov.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state,
+ ccl_global float *buffer)
+{
+ int path_flag = state->flag;
+
+ bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
+
+ return ((buffer != NULL) && is_primary);
+}
+
+ccl_device void svm_node_aov_color(
+ KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+{
+ float3 val = stack_load_float3(stack, node.y);
+
+ if (buffer) {
+ kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z,
+ make_float4(val.x, val.y, val.z, 1.0f));
+ }
+}
+
+ccl_device void svm_node_aov_value(
+ KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+{
+ float val = stack_load_float(stack, node.y);
+
+ if (buffer) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val);
+ }
+}
+CCL_NAMESPACE_END
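The AOV writes are plain offset arithmetic on the per-pixel render buffer: color AOVs are stored as float4 values strided by 4 floats, value AOVs as single floats, with node.z carrying the AOV index. With a hypothetical film layout of kernel_data.film.pass_aov_color = 32 and pass_aov_value = 48, an AOV color node with node.z = 2 accumulates its float4 at buffer + 32 + 4 * 2 = buffer + 40, and an AOV value node with node.z = 1 accumulates one float at buffer + 49.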
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index a67cfe91a30..fc7a3ba3f5a 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -46,8 +46,8 @@ ccl_device AttributeDescriptor svm_node_attr_init(
ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
- NodeAttributeType type;
- uint out_offset;
+ NodeAttributeType type = NODE_ATTR_FLOAT;
+ uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
/* fetch and store attribute */
@@ -69,6 +69,15 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, 0.0f));
}
}
+ else if (desc.type == NODE_ATTR_RGBA) {
+ float4 f = primitive_attribute_float4(kg, sd, desc, NULL, NULL);
+ if (type == NODE_ATTR_FLOAT) {
+ stack_store_float(stack, out_offset, average(float4_to_float3(f)));
+ }
+ else {
+ stack_store_float3(stack, out_offset, float4_to_float3(f));
+ }
+ }
else {
float3 f = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
if (type == NODE_ATTR_FLOAT) {
@@ -80,16 +89,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
- NodeAttributeType type;
- uint out_offset;
+ NodeAttributeType type = NODE_ATTR_FLOAT;
+ uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
/* fetch and store attribute */
@@ -113,6 +116,16 @@ ccl_device_noinline
stack_store_float3(stack, out_offset, make_float3(f.x + dx.x, f.y + dx.y, 0.0f));
}
}
+ else if (desc.type == NODE_ATTR_RGBA) {
+ float4 dx;
+ float4 f = primitive_attribute_float4(kg, sd, desc, &dx, NULL);
+ if (type == NODE_ATTR_FLOAT) {
+ stack_store_float(stack, out_offset, average(float4_to_float3(f + dx)));
+ }
+ else {
+ stack_store_float3(stack, out_offset, float4_to_float3(f + dx));
+ }
+ }
else {
float3 dx;
float3 f = primitive_surface_attribute_float3(kg, sd, desc, &dx, NULL);
@@ -125,16 +138,10 @@ ccl_device_noinline
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
- NodeAttributeType type;
- uint out_offset;
+ NodeAttributeType type = NODE_ATTR_FLOAT;
+ uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
/* fetch and store attribute */
@@ -158,6 +165,16 @@ ccl_device_noinline
stack_store_float3(stack, out_offset, make_float3(f.x + dy.x, f.y + dy.y, 0.0f));
}
}
+ else if (desc.type == NODE_ATTR_RGBA) {
+ float4 dy;
+ float4 f = primitive_attribute_float4(kg, sd, desc, NULL, &dy);
+ if (type == NODE_ATTR_FLOAT) {
+ stack_store_float(stack, out_offset, average(float4_to_float3(f + dy)));
+ }
+ else {
+ stack_store_float3(stack, out_offset, float4_to_float3(f + dy));
+ }
+ }
else {
float3 dy;
float3 f = primitive_surface_attribute_float3(kg, sd, desc, NULL, &dy);
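The new NODE_ATTR_RGBA branches handle float4 attributes (for example vertex colors with alpha): the float output takes the average of the RGB channels and the vector output drops alpha via float4_to_float3(). A standalone sketch of that reduction, assuming average() is the arithmetic mean of the three components as in the Cycles math utilities:

  /* Sketch of the float4 -> float / float3 reduction used for RGBA attributes above. */
  static inline float3 rgba_to_float3(float4 f)
  {
    return make_float3(f.x, f.y, f.z);              /* float4_to_float3(): drop alpha */
  }

  static inline float rgba_to_float(float4 f)
  {
    float3 rgb = rgba_to_float3(f);
    return (rgb.x + rgb.y + rgb.z) * (1.0f / 3.0f); /* average(): assumed arithmetic mean */
  }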
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index fcf28e96e98..bf5957ec9e4 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -16,6 +16,8 @@
CCL_NAMESPACE_BEGIN
+#ifdef __SHADER_RAYTRACE__
+
/* Bevel shader averaging normals from nearby surfaces.
*
* Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013
@@ -110,7 +112,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Intersect with the same object. if multiple intersections are found it
* will use at most LOCAL_MAX_HITS hits, a random subset of all hits. */
- scene_intersect_local(kg, *ray, &isect, sd->object, &lcg_state, LOCAL_MAX_HITS);
+ scene_intersect_local(kg, ray, &isect, sd->object, &lcg_state, LOCAL_MAX_HITS);
int num_eval_hits = min(isect.num_hits, LOCAL_MAX_HITS);
@@ -120,14 +122,14 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
if (sd->type & PRIMITIVE_TRIANGLE) {
hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray);
}
-#ifdef __OBJECT_MOTION__
+# ifdef __OBJECT_MOTION__
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(
kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts);
hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts);
}
-#endif /* __OBJECT_MOTION__ */
+# endif /* __OBJECT_MOTION__ */
/* Get geometric normal. */
float3 hit_Ng = isect.Ng[hit];
@@ -151,11 +153,11 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
if (sd->type & PRIMITIVE_TRIANGLE) {
N = triangle_smooth_normal(kg, N, prim, u, v);
}
-#ifdef __OBJECT_MOTION__
+# ifdef __OBJECT_MOTION__
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
N = motion_triangle_smooth_normal(kg, N, sd->object, prim, u, v, sd->time);
}
-#endif /* __OBJECT_MOTION__ */
+# endif /* __OBJECT_MOTION__ */
}
/* Transform normals to world space. */
@@ -200,7 +202,7 @@ ccl_device void svm_node_bevel(
KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
{
uint num_samples, radius_offset, normal_offset, out_offset;
- decode_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
+ svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
float radius = stack_load_float(stack, radius_offset);
float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples);
@@ -214,4 +216,6 @@ ccl_device void svm_node_bevel(
stack_store_float3(stack, out_offset, bevel_N);
}
+#endif /* __SHADER_RAYTRACE__ */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index b5cbfcc72df..6984afa30a5 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Brick */
-ccl_device_noinline float brick_noise(uint n) /* fast integer noise */
+ccl_device_inline float brick_noise(uint n) /* fast integer noise */
{
uint nn;
n = (n + 1013) & 0x7fffffff;
@@ -27,16 +27,16 @@ ccl_device_noinline float brick_noise(uint n) /* fast integer noise */
return 0.5f * ((float)nn / 1073741824.0f);
}
-ccl_device_noinline float2 svm_brick(float3 p,
- float mortar_size,
- float mortar_smooth,
- float bias,
- float brick_width,
- float row_height,
- float offset_amount,
- int offset_frequency,
- float squash_amount,
- int squash_frequency)
+ccl_device_noinline_cpu float2 svm_brick(float3 p,
+ float mortar_size,
+ float mortar_smooth,
+ float bias,
+ float brick_width,
+ float row_height,
+ float offset_amount,
+ int offset_frequency,
+ float squash_amount,
+ int squash_frequency)
{
int bricknum, rownum;
float offset = 0.0f;
@@ -87,13 +87,13 @@ ccl_device void svm_node_tex_brick(
/* RNA properties */
uint offset_frequency, squash_frequency;
- decode_node_uchar4(node.y, &co_offset, &color1_offset, &color2_offset, &mortar_offset);
- decode_node_uchar4(
+ svm_unpack_node_uchar4(node.y, &co_offset, &color1_offset, &color2_offset, &mortar_offset);
+ svm_unpack_node_uchar4(
node.z, &scale_offset, &mortar_size_offset, &bias_offset, &brick_width_offset);
- decode_node_uchar4(
+ svm_unpack_node_uchar4(
node.w, &row_height_offset, &color_offset, &fac_offset, &mortar_smooth_offset);
- decode_node_uchar4(node2.x, &offset_frequency, &squash_frequency, NULL, NULL);
+ svm_unpack_node_uchar2(node2.x, &offset_frequency, &squash_frequency);
float3 co = stack_load_float3(stack, co_offset);
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index dcd75a2fe8f..9554b5946fb 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -22,7 +22,7 @@ ccl_device void svm_node_brightness(
uint bright_offset, contrast_offset;
float3 color = stack_load_float3(stack, in_color);
- decode_node_uchar4(node, &bright_offset, &contrast_offset, NULL, NULL);
+ svm_unpack_node_uchar2(node, &bright_offset, &contrast_offset);
float brightness = stack_load_float(stack, bright_offset);
float contrast = stack_load_float(stack, contrast_offset);
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index 63b4d1e149b..d54cb73df91 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Checker */
-ccl_device_noinline float svm_checker(float3 p)
+ccl_device float svm_checker(float3 p)
{
/* avoid precision issues on unit coordinates */
p.x = (p.x + 0.000001f) * 0.999999f;
@@ -37,8 +37,8 @@ ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *s
uint co_offset, color1_offset, color2_offset, scale_offset;
uint color_offset, fac_offset;
- decode_node_uchar4(node.y, &co_offset, &color1_offset, &color2_offset, &scale_offset);
- decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
+ svm_unpack_node_uchar4(node.y, &co_offset, &color1_offset, &color2_offset, &scale_offset);
+ svm_unpack_node_uchar2(node.z, &color_offset, &fac_offset);
float3 co = stack_load_float3(stack, co_offset);
float3 color1 = stack_load_float3(stack, color1_offset);
diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h
new file mode 100644
index 00000000000..a85fd82754e
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_clamp.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Clamp Node */
+
+ccl_device void svm_node_clamp(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint result_stack_offset,
+ int *offset)
+{
+ uint min_stack_offset, max_stack_offset, type;
+ svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type);
+
+ uint4 defaults = read_node(kg, offset);
+
+ float value = stack_load_float(stack, value_stack_offset);
+ float min = stack_load_float_default(stack, min_stack_offset, defaults.x);
+ float max = stack_load_float_default(stack, max_stack_offset, defaults.y);
+
+ if (type == NODE_CLAMP_RANGE && (min > max)) {
+ stack_store_float(stack, result_stack_offset, clamp(value, max, min));
+ }
+ else {
+ stack_store_float(stack, result_stack_offset, clamp(value, min, max));
+ }
+}
+
+CCL_NAMESPACE_END
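The Range mode keeps the node well defined when Min is wired above Max: instead of evaluating the degenerate clamp(value, min, max), it clamps into the swapped interval. For example, with type == NODE_CLAMP_RANGE, min = 0.7 and max = 0.3, a value of 0.9 comes out as 0.7 and a value of 0.1 as 0.3; in the default Min/Max mode the same inputs would pin every value to 0.3, assuming clamp() applies the lower bound first in the usual min(max(value, lo), hi) form.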
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 270fe4c8615..1ae94f1d766 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -16,23 +16,6 @@
CCL_NAMESPACE_BEGIN
-/* Hair Melanin */
-
-ccl_device_inline float3 sigma_from_concentration(float eumelanin, float pheomelanin)
-{
- return eumelanin * make_float3(0.506f, 0.841f, 1.653f) +
- pheomelanin * make_float3(0.343f, 0.733f, 1.924f);
-}
-
-ccl_device_inline float3 sigma_from_reflectance(float3 color, float azimuthal_roughness)
-{
- float x = azimuthal_roughness;
- float roughness_fac = (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x +
- 5.969f;
- float3 sigma = log3(color) / roughness_fac;
- return sigma * sigma;
-}
-
/* Closure Nodes */
ccl_device void svm_node_glass_setup(
@@ -85,7 +68,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
uint type, param1_offset, param2_offset;
uint mix_weight_offset;
- decode_node_uchar4(node.y, &type, &param1_offset, &param2_offset, &mix_weight_offset);
+ svm_unpack_node_uchar4(node.y, &type, &param1_offset, &param2_offset, &mix_weight_offset);
float mix_weight = (stack_valid(mix_weight_offset) ? stack_load_float(stack, mix_weight_offset) :
1.0f);
@@ -122,21 +105,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
uint4 data_node2 = read_node(kg, offset);
float3 T = stack_load_float3(stack, data_node.y);
- decode_node_uchar4(data_node.z,
- &specular_offset,
- &roughness_offset,
- &specular_tint_offset,
- &anisotropic_offset);
- decode_node_uchar4(data_node.w,
- &sheen_offset,
- &sheen_tint_offset,
- &clearcoat_offset,
- &clearcoat_roughness_offset);
- decode_node_uchar4(data_node2.x,
- &eta_offset,
- &transmission_offset,
- &anisotropic_rotation_offset,
- &transmission_roughness_offset);
+ svm_unpack_node_uchar4(data_node.z,
+ &specular_offset,
+ &roughness_offset,
+ &specular_tint_offset,
+ &anisotropic_offset);
+ svm_unpack_node_uchar4(data_node.w,
+ &sheen_offset,
+ &sheen_tint_offset,
+ &clearcoat_offset,
+ &clearcoat_roughness_offset);
+ svm_unpack_node_uchar4(data_node2.x,
+ &eta_offset,
+ &transmission_offset,
+ &anisotropic_rotation_offset,
+ &transmission_roughness_offset);
// get Disney principled parameters
float metallic = param1;
@@ -290,7 +273,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
bsdf->N = N;
/* setup bsdf */
- sd->flag |= bsdf_principled_sheen_setup(bsdf);
+ sd->flag |= bsdf_principled_sheen_setup(sd, bsdf);
}
}
@@ -337,9 +320,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
/* setup bsdf */
if (distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID ||
roughness <= 0.075f) /* use single-scatter GGX */
- sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
+ sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
else /* use multi-scatter GGX */
- sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
+ sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd);
}
}
# ifdef __CAUSTICS_TRICKS__
@@ -532,12 +515,34 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
float roughness = sqr(param1);
bsdf->N = N;
- bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
- bsdf->alpha_x = roughness;
- bsdf->alpha_y = roughness;
bsdf->ior = 0.0f;
bsdf->extra = NULL;
+ if (data_node.y == SVM_STACK_INVALID) {
+ bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+ bsdf->alpha_x = roughness;
+ bsdf->alpha_y = roughness;
+ }
+ else {
+ bsdf->T = stack_load_float3(stack, data_node.y);
+
+ /* rotate tangent */
+ float rotation = stack_load_float(stack, data_node.z);
+ if (rotation != 0.0f)
+ bsdf->T = rotate_around_axis(bsdf->T, bsdf->N, rotation * M_2PI_F);
+
+ /* compute roughness */
+ float anisotropy = clamp(param2, -0.99f, 0.99f);
+ if (anisotropy < 0.0f) {
+ bsdf->alpha_x = roughness / (1.0f + anisotropy);
+ bsdf->alpha_y = roughness * (1.0f + anisotropy);
+ }
+ else {
+ bsdf->alpha_x = roughness * (1.0f - anisotropy);
+ bsdf->alpha_y = roughness / (1.0f - anisotropy);
+ }
+ }
+
/* setup bsdf */
if (type == CLOSURE_BSDF_REFLECTION_ID)
sd->flag |= bsdf_reflection_setup(bsdf);
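The anisotropic handling that previously lived in the dedicated *_ANISO_ID closure cases (removed further down in this file) is now folded into this branch, keyed on whether data_node.y carries a valid tangent. The roughness split follows directly from the code above: with squared roughness r and anisotropy a clamped to [-0.99, 0.99], a >= 0 gives alpha_x = r * (1 - a) and alpha_y = r / (1 - a), while a < 0 mirrors the two axes. For example, r = 0.25 and a = 0.5 yield alpha_x = 0.125 and alpha_y = 0.5, a four-fold roughness ratio between the two tangent directions.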
@@ -546,10 +551,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
else if (type == CLOSURE_BSDF_MICROFACET_GGX_ID)
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
else if (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
- kernel_assert(stack_valid(data_node.z));
+ kernel_assert(stack_valid(data_node.w));
bsdf->extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
if (bsdf->extra) {
- bsdf->extra->color = stack_load_float3(stack, data_node.z);
+ bsdf->extra->color = stack_load_float3(stack, data_node.w);
bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
bsdf->extra->clearcoat = 0.0f;
sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
@@ -692,64 +697,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
break;
}
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: {
-#ifdef __CAUSTICS_TRICKS__
- if (!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
- break;
-#endif
- float3 weight = sd->svm_closure_weight * mix_weight;
- MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- bsdf->extra = NULL;
- bsdf->T = stack_load_float3(stack, data_node.y);
-
- /* rotate tangent */
- float rotation = stack_load_float(stack, data_node.z);
-
- if (rotation != 0.0f)
- bsdf->T = rotate_around_axis(bsdf->T, bsdf->N, rotation * M_2PI_F);
-
- /* compute roughness */
- float roughness = sqr(param1);
- float anisotropy = clamp(param2, -0.99f, 0.99f);
-
- if (anisotropy < 0.0f) {
- bsdf->alpha_x = roughness / (1.0f + anisotropy);
- bsdf->alpha_y = roughness * (1.0f + anisotropy);
- }
- else {
- bsdf->alpha_x = roughness * (1.0f - anisotropy);
- bsdf->alpha_y = roughness / (1.0f - anisotropy);
- }
-
- bsdf->ior = 0.0f;
-
- if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
- sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
- }
- else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
- sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf);
- }
- else if (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
- kernel_assert(stack_valid(data_node.w));
- bsdf->extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
- if (bsdf->extra) {
- bsdf->extra->color = stack_load_float3(stack, data_node.w);
- bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
- bsdf->extra->clearcoat = 0.0f;
- sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
- }
- }
- else
- sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
- }
- break;
- }
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
VelvetBsdf *bsdf = (VelvetBsdf *)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
@@ -793,19 +740,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
float3 weight = sd->svm_closure_weight * mix_weight;
uint offset_ofs, ior_ofs, color_ofs, parametrization;
- decode_node_uchar4(data_node.y, &offset_ofs, &ior_ofs, &color_ofs, &parametrization);
+ svm_unpack_node_uchar4(data_node.y, &offset_ofs, &ior_ofs, &color_ofs, &parametrization);
float alpha = stack_load_float_default(stack, offset_ofs, data_node.z);
float ior = stack_load_float_default(stack, ior_ofs, data_node.w);
uint coat_ofs, melanin_ofs, melanin_redness_ofs, absorption_coefficient_ofs;
- decode_node_uchar4(data_node2.x,
- &coat_ofs,
- &melanin_ofs,
- &melanin_redness_ofs,
- &absorption_coefficient_ofs);
+ svm_unpack_node_uchar4(data_node2.x,
+ &coat_ofs,
+ &melanin_ofs,
+ &melanin_redness_ofs,
+ &absorption_coefficient_ofs);
uint tint_ofs, random_ofs, random_color_ofs, random_roughness_ofs;
- decode_node_uchar4(
+ svm_unpack_node_uchar4(
data_node3.x, &tint_ofs, &random_ofs, &random_color_ofs, &random_roughness_ofs);
const AttributeDescriptor attr_descr_random = find_attribute(kg, sd, data_node4.y);
@@ -868,24 +815,26 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
/* Benedikt Bitterli's melanin ratio remapping. */
float eumelanin = melanin * (1.0f - melanin_redness);
float pheomelanin = melanin * melanin_redness;
- float3 melanin_sigma = sigma_from_concentration(eumelanin, pheomelanin);
+ float3 melanin_sigma = bsdf_principled_hair_sigma_from_concentration(eumelanin,
+ pheomelanin);
/* Optional tint. */
float3 tint = stack_load_float3(stack, tint_ofs);
- float3 tint_sigma = sigma_from_reflectance(tint, radial_roughness);
+ float3 tint_sigma = bsdf_principled_hair_sigma_from_reflectance(tint,
+ radial_roughness);
bsdf->sigma = melanin_sigma + tint_sigma;
break;
}
case NODE_PRINCIPLED_HAIR_REFLECTANCE: {
float3 color = stack_load_float3(stack, color_ofs);
- bsdf->sigma = sigma_from_reflectance(color, radial_roughness);
+ bsdf->sigma = bsdf_principled_hair_sigma_from_reflectance(color, radial_roughness);
break;
}
default: {
/* Fallback to brownish hair, same as defaults for melanin. */
kernel_assert(!"Invalid Principled Hair parametrization!");
- bsdf->sigma = sigma_from_concentration(0.0f, 0.8054375f);
+ bsdf->sigma = bsdf_principled_hair_sigma_from_concentration(0.0f, 0.8054375f);
break;
}
}
@@ -898,39 +847,29 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
- if (sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
- /* todo: giving a fixed weight here will cause issues when
- * mixing multiple BSDFS. energy will not be conserved and
- * the throughput can blow up after multiple bounces. we
- * better figure out a way to skip backfaces from rays
- * spawned by transmission from the front */
- bsdf_transparent_setup(sd, make_float3(1.0f, 1.0f, 1.0f), path_flag);
- }
- else {
- HairBsdf *bsdf = (HairBsdf *)bsdf_alloc(sd, sizeof(HairBsdf), weight);
+ HairBsdf *bsdf = (HairBsdf *)bsdf_alloc(sd, sizeof(HairBsdf), weight);
- if (bsdf) {
- bsdf->N = N;
- bsdf->roughness1 = param1;
- bsdf->roughness2 = param2;
- bsdf->offset = -stack_load_float(stack, data_node.z);
+ if (bsdf) {
+ bsdf->N = N;
+ bsdf->roughness1 = param1;
+ bsdf->roughness2 = param2;
+ bsdf->offset = -stack_load_float(stack, data_node.z);
- if (stack_valid(data_node.y)) {
- bsdf->T = normalize(stack_load_float3(stack, data_node.y));
- }
- else if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
- bsdf->T = normalize(sd->dPdv);
- bsdf->offset = 0.0f;
- }
- else
- bsdf->T = normalize(sd->dPdu);
+ if (stack_valid(data_node.y)) {
+ bsdf->T = normalize(stack_load_float3(stack, data_node.y));
+ }
+ else if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
+ bsdf->T = normalize(sd->dPdv);
+ bsdf->offset = 0.0f;
+ }
+ else
+ bsdf->T = normalize(sd->dPdu);
- if (type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
- sd->flag |= bsdf_hair_reflection_setup(bsdf);
- }
- else {
- sd->flag |= bsdf_hair_transmission_setup(bsdf);
- }
+ if (type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
+ sd->flag |= bsdf_hair_reflection_setup(bsdf);
+ }
+ else {
+ sd->flag |= bsdf_hair_transmission_setup(bsdf);
}
}
@@ -982,7 +921,7 @@ ccl_device void svm_node_closure_volume(
uint type, density_offset, anisotropy_offset;
uint mix_weight_offset;
- decode_node_uchar4(node.y, &type, &density_offset, &anisotropy_offset, &mix_weight_offset);
+ svm_unpack_node_uchar4(node.y, &type, &density_offset, &anisotropy_offset, &mix_weight_offset);
float mix_weight = (stack_valid(mix_weight_offset) ? stack_load_float(stack, mix_weight_offset) :
1.0f);
@@ -1040,7 +979,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
}
uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset;
- decode_node_uchar4(
+ svm_unpack_node_uchar4(
node.y, &density_offset, &anisotropy_offset, &absorption_color_offset, &mix_weight_offset);
float mix_weight = (stack_valid(mix_weight_offset) ? stack_load_float(stack, mix_weight_offset) :
1.0f);
@@ -1099,7 +1038,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
}
uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset;
- decode_node_uchar4(
+ svm_unpack_node_uchar4(
node.z, &emission_offset, &emission_color_offset, &blackbody_offset, &temperature_offset);
float emission = (stack_valid(emission_offset)) ? stack_load_float(stack, emission_offset) :
__uint_as_float(value_node.z);
@@ -1229,7 +1168,8 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
/* fetch weight from blend input, previous mix closures,
* and write to stack to be used by closure nodes later */
uint weight_offset, in_weight_offset, weight1_offset, weight2_offset;
- decode_node_uchar4(node.y, &weight_offset, &in_weight_offset, &weight1_offset, &weight2_offset);
+ svm_unpack_node_uchar4(
+ node.y, &weight_offset, &in_weight_offset, &weight1_offset, &weight2_offset);
float weight = stack_load_float(stack, weight_offset);
weight = saturate(weight);
diff --git a/intern/cycles/kernel/svm/svm_color_util.h b/intern/cycles/kernel/svm/svm_color_util.h
index 12b59d2616b..1a0fa03305e 100644
--- a/intern/cycles/kernel/svm/svm_color_util.h
+++ b/intern/cycles/kernel/svm/svm_color_util.h
@@ -92,12 +92,12 @@ ccl_device float3 svm_mix_diff(float t, float3 col1, float3 col2)
ccl_device float3 svm_mix_dark(float t, float3 col1, float3 col2)
{
- return min(col1, col2) * t + col1 * (1.0f - t);
+ return interp(col1, min(col1, col2), t);
}
ccl_device float3 svm_mix_light(float t, float3 col1, float3 col2)
{
- return max(col1, col2 * t);
+ return interp(col1, max(col1, col2), t);
}
ccl_device float3 svm_mix_dodge(float t, float3 col1, float3 col2)
@@ -255,16 +255,10 @@ ccl_device float3 svm_mix_linear(float t, float3 col1, float3 col2)
ccl_device float3 svm_mix_clamp(float3 col)
{
- float3 outcol = col;
-
- outcol.x = saturate(col.x);
- outcol.y = saturate(col.y);
- outcol.z = saturate(col.z);
-
- return outcol;
+ return saturate3(col);
}
-ccl_device_noinline float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
+ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
{
float t = saturate(fac);
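The Darken and Lighten mix modes are reformulated through interp(); assuming interp(a, b, t) is the usual linear blend a * (1 - t) + b * t, Darken now fades from col1 at factor 0 to min(col1, col2) at factor 1, and Lighten fades the same way toward max(col1, col2). The previous Lighten, max(col1, col2 * t), scaled the second color by the factor before taking the maximum, so results at intermediate factors differ after this patch.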
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index f16664a684c..250fac6bcb8 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
#ifdef __RAY_DIFFERENTIALS__
/* get normal input */
uint normal_offset, scale_offset, invert, use_object_space;
- decode_node_uchar4(node.y, &normal_offset, &scale_offset, &invert, &use_object_space);
+ svm_unpack_node_uchar4(node.y, &normal_offset, &scale_offset, &invert, &use_object_space);
float3 normal_in = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
@@ -42,7 +42,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
/* get bump values */
uint c_offset, x_offset, y_offset, strength_offset;
- decode_node_uchar4(node.z, &c_offset, &x_offset, &y_offset, &strength_offset);
+ svm_unpack_node_uchar4(node.z, &c_offset, &x_offset, &y_offset, &strength_offset);
float h_c = stack_load_float(stack, c_offset);
float h_x = stack_load_float(stack, x_offset);
@@ -95,7 +95,7 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg,
ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
uint height_offset, midlevel_offset, scale_offset, normal_offset;
- decode_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
+ svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
float height = stack_load_float(stack, height_offset);
float midlevel = stack_load_float(stack, midlevel_offset);
@@ -126,7 +126,7 @@ ccl_device void svm_node_vector_displacement(
uint space = data_node.x;
uint vector_offset, midlevel_offset, scale_offset, displacement_offset;
- decode_node_uchar4(
+ svm_unpack_node_uchar4(
node.y, &vector_offset, &midlevel_offset, &scale_offset, &displacement_offset);
float3 vector = stack_load_float3(stack, vector_offset);
diff --git a/intern/cycles/kernel/svm/svm_fractal_noise.h b/intern/cycles/kernel/svm/svm_fractal_noise.h
new file mode 100644
index 00000000000..57fa8c690ac
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_fractal_noise.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* The fractal_noise_[1-4] functions are all exactly the same except for the input type. */
+ccl_device_noinline float fractal_noise_1d(float p, float octaves, float roughness)
+{
+ float fscale = 1.0f;
+ float amp = 1.0f;
+ float maxamp = 0.0f;
+ float sum = 0.0f;
+ octaves = clamp(octaves, 0.0f, 16.0f);
+ int n = float_to_int(octaves);
+ for (int i = 0; i <= n; i++) {
+ float t = noise_1d(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0f, 1.0f);
+ fscale *= 2.0f;
+ }
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float t = noise_1d(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0f - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+/* The fractal_noise_[1-4] functions are all exactly the same except for the input type. */
+ccl_device_noinline float fractal_noise_2d(float2 p, float octaves, float roughness)
+{
+ float fscale = 1.0f;
+ float amp = 1.0f;
+ float maxamp = 0.0f;
+ float sum = 0.0f;
+ octaves = clamp(octaves, 0.0f, 16.0f);
+ int n = float_to_int(octaves);
+ for (int i = 0; i <= n; i++) {
+ float t = noise_2d(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0f, 1.0f);
+ fscale *= 2.0f;
+ }
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float t = noise_2d(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0f - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+/* The fractal_noise_[1-4] functions are all exactly the same except for the input type. */
+ccl_device_noinline float fractal_noise_3d(float3 p, float octaves, float roughness)
+{
+ float fscale = 1.0f;
+ float amp = 1.0f;
+ float maxamp = 0.0f;
+ float sum = 0.0f;
+ octaves = clamp(octaves, 0.0f, 16.0f);
+ int n = float_to_int(octaves);
+ for (int i = 0; i <= n; i++) {
+ float t = noise_3d(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0f, 1.0f);
+ fscale *= 2.0f;
+ }
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float t = noise_3d(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0f - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+/* The fractal_noise_[1-4] functions are all exactly the same except for the input type. */
+ccl_device_noinline float fractal_noise_4d(float4 p, float octaves, float roughness)
+{
+ float fscale = 1.0f;
+ float amp = 1.0f;
+ float maxamp = 0.0f;
+ float sum = 0.0f;
+ octaves = clamp(octaves, 0.0f, 16.0f);
+ int n = float_to_int(octaves);
+ for (int i = 0; i <= n; i++) {
+ float t = noise_4d(fscale * p);
+ sum += t * amp;
+ maxamp += amp;
+ amp *= clamp(roughness, 0.0f, 1.0f);
+ fscale *= 2.0f;
+ }
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float t = noise_4d(fscale * p);
+ float sum2 = sum + t * amp;
+ sum /= maxamp;
+ sum2 /= maxamp + amp;
+ return (1.0f - rmd) * sum + rmd * sum2;
+ }
+ else {
+ return sum / maxamp;
+ }
+}
+
+CCL_NAMESPACE_END
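Each fractal_noise_*() variant sums octaves of the base noise with amplitude roughness^i and frequency 2^i, normalizes by the accumulated amplitude, and uses the fractional part of octaves to blend linearly between the n-octave and (n+1)-octave results. Worked through for octaves = 2.5 and roughness = 0.5: the loop adds three octaves with amplitudes 1, 0.5 and 0.25 (maxamp = 1.75), rmd = 0.5, and the return value is 0.5 * sum / 1.75 + 0.5 * (sum + 0.125 * t) / 1.875, where t is the noise sample of the extra fourth octave at fscale = 8.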
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 03119991597..96d602e35bf 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -22,7 +22,7 @@ ccl_device void svm_node_fresnel(
ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
{
uint normal_offset, out_offset;
- decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
+ svm_unpack_node_uchar2(node, &normal_offset, &out_offset);
float eta = (stack_valid(ior_offset)) ? stack_load_float(stack, ior_offset) :
__uint_as_float(ior_value);
float3 normal_in = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
@@ -43,7 +43,7 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
uint blend_value = node.z;
uint type, normal_offset, out_offset;
- decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
+ svm_unpack_node_uchar3(node.w, &type, &normal_offset, &out_offset);
float blend = (stack_valid(blend_offset)) ? stack_load_float(stack, blend_offset) :
__uint_as_float(blend_value);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index a9104643299..77df19b2298 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -41,11 +41,9 @@ ccl_device_inline void svm_node_geometry(
case NODE_GEOM_Ng:
data = sd->Ng;
break;
-#ifdef __UV__
case NODE_GEOM_uv:
data = make_float3(sd->u, sd->v, 0.0f);
break;
-#endif
default:
data = make_float3(0.0f, 0.0f, 0.0f);
}
@@ -113,6 +111,10 @@ ccl_device void svm_node_object_info(
stack_store_float3(stack, out_offset, object_location(kg, sd));
return;
}
+ case NODE_INFO_OB_COLOR: {
+ stack_store_float3(stack, out_offset, object_color(kg, sd->object));
+ return;
+ }
case NODE_INFO_OB_INDEX:
data = object_pass_id(kg, sd->object);
break;
@@ -149,7 +151,7 @@ ccl_device void svm_node_particle_info(
}
case NODE_INFO_PAR_RANDOM: {
int particle_id = object_particle_id(kg, sd->object);
- float random = hash_int_01(particle_index(kg, particle_id));
+ float random = hash_uint2_to_float(particle_index(kg, particle_id), 0);
stack_store_float(stack, out_offset, random);
break;
}
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index c315564fbc2..08304bc47e8 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -64,7 +64,7 @@ ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
{
uint type, co_offset, color_offset, fac_offset;
- decode_node_uchar4(node.y, &type, &co_offset, &fac_offset, &color_offset);
+ svm_unpack_node_uchar4(node.y, &type, &co_offset, &fac_offset, &color_offset);
float3 co = stack_load_float3(stack, co_offset);
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index 72379fba870..1f7bd421869 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -24,8 +24,8 @@ ccl_device void svm_node_hsv(
{
uint in_color_offset, fac_offset, out_color_offset;
uint hue_offset, sat_offset, val_offset;
- decode_node_uchar4(node.y, &in_color_offset, &fac_offset, &out_color_offset, NULL);
- decode_node_uchar4(node.z, &hue_offset, &sat_offset, &val_offset, NULL);
+ svm_unpack_node_uchar3(node.y, &in_color_offset, &fac_offset, &out_color_offset);
+ svm_unpack_node_uchar3(node.z, &hue_offset, &sat_offset, &val_offset);
float fac = stack_load_float(stack, fac_offset);
float3 in_color = stack_load_float3(stack, in_color_offset);
diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h
index 9434c0c5505..56c804b44d0 100644
--- a/intern/cycles/kernel/svm/svm_ies.h
+++ b/intern/cycles/kernel/svm/svm_ies.h
@@ -21,12 +21,12 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline float interpolate_ies_vertical(
KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
{
- /* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end of v
- * (corresponding to the north pole) would result in artifacts.
- * The proper way of dealing with this would be to lookup the corresponding value on the other side of the pole,
- * but since the horizontal coordinates might be nonuniform, this would require yet another interpolation.
- * Therefore, the assumtion is made that the light is going to be symmetrical, which means that we can just take
- * the corresponding value at the current horizontal coordinate. */
+ /* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end
+ * of v (corresponding to the north pole) would result in artifacts. The proper way of dealing
+ * with this would be to lookup the corresponding value on the other side of the pole, but since
+ * the horizontal coordinates might be nonuniform, this would require yet another interpolation.
+ * Therefore, the assumption is made that the light is going to be symmetrical, which means that
+ * we can just take the corresponding value at the current horizontal coordinate. */
#define IES_LOOKUP(v) kernel_tex_fetch(__ies, ofs + h * v_num + (v))
/* If v is zero, assume symmetry and read at v=1 instead of v=-1. */
@@ -66,7 +66,8 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
/* Lookup the angles to find the table position. */
int h_i, v_i;
- /* TODO(lukas): Consider using bisection. Probably not worth it for the vast majority of IES files. */
+ /* TODO(lukas): Consider using bisection.
+ * Probably not worth it for the vast majority of IES files. */
for (h_i = 0; IES_LOOKUP_ANGLE_H(h_i + 1) < h_angle; h_i++)
;
for (v_i = 0; IES_LOOKUP_ANGLE_V(v_i + 1) < v_angle; v_i++)
@@ -83,7 +84,8 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
/* Perform cubic interpolation along the horizontal coordinate to get the intensity value.
* If h_i is zero, just wrap around since the horizontal angles always go over the full circle.
- * However, the last entry (360°) equals the first one, so we need to wrap around to the one before that. */
+ * However, the last entry (360°) equals the first one, so we need to wrap around to the one
+ * before that. */
float a = interpolate_ies_vertical(
kg, ofs, v_i, v_num, v_frac, (h_i == 0) ? h_num - 2 : h_i - 1);
float b = interpolate_ies_vertical(kg, ofs, v_i, v_num, v_frac, h_i);
@@ -99,8 +101,8 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
ccl_device void svm_node_ies(
KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
{
- uint vector_offset, strength_offset, fac_offset, dummy, slot = node.z;
- decode_node_uchar4(node.y, &strength_offset, &vector_offset, &fac_offset, &dummy);
+ uint vector_offset, strength_offset, fac_offset, slot = node.z;
+ svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset);
float3 vector = stack_load_float3(stack, vector_offset);
float strength = stack_load_float_default(stack, strength_offset, node.w);
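The wrap-around in the horizontal interpolation relies on the IES convention that the last horizontal entry (360°) duplicates the first (0°). For example, with h_num = 13 columns at 30° steps, the neighbour "before" column 0 is taken at index h_num - 2 = 11 (the 330° column) rather than at index 12, which would just repeat the 0° value.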
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index ee4b8b6e50c..f57c85fc23e 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,23 +16,22 @@
CCL_NAMESPACE_BEGIN
-ccl_device float4
-svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
+ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags)
{
+ if (id == -1) {
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+
float4 r = kernel_tex_image_interp(kg, id, x, y);
const float alpha = r.w;
- if (use_alpha && alpha != 1.0f && alpha != 0.0f) {
+ if ((flags & NODE_IMAGE_ALPHA_UNASSOCIATE) && alpha != 1.0f && alpha != 0.0f) {
r /= alpha;
- const int texture_type = kernel_tex_type(id);
- if (texture_type == IMAGE_DATA_TYPE_BYTE4 || texture_type == IMAGE_DATA_TYPE_BYTE) {
- r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
- }
r.w = alpha;
}
- if (srgb) {
- /* TODO(lukas): Implement proper conversion for image textures. */
+ if (flags & NODE_IMAGE_COMPRESS_AS_SRGB) {
r = color_srgb_to_linear_v4(r);
}
@@ -45,16 +44,15 @@ ccl_device_inline float3 texco_remap_square(float3 co)
return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
}
-ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_image(
+ KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
{
- uint id = node.y;
- uint co_offset, out_offset, alpha_offset, srgb;
+ uint co_offset, out_offset, alpha_offset, flags;
- decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
+ svm_unpack_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &flags);
float3 co = stack_load_float3(stack, co_offset);
float2 tex_co;
- uint use_alpha = stack_valid(alpha_offset);
if (node.w == NODE_IMAGE_PROJ_SPHERE) {
co = texco_remap_square(co);
tex_co = map_to_sphere(co);
@@ -66,7 +64,51 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
else {
tex_co = make_float2(co.x, co.y);
}
- float4 f = svm_image_texture(kg, id, tex_co.x, tex_co.y, srgb, use_alpha);
+
+ /* TODO(lukas): Consider moving tile information out of the SVM node.
+ * TextureInfo seems a reasonable candidate. */
+ int id = -1;
+ int num_nodes = (int)node.y;
+ if (num_nodes > 0) {
+ /* Remember the offset of the node following the tile nodes. */
+ int next_offset = (*offset) + num_nodes;
+
+ /* Find the tile that the UV lies in. */
+ int tx = (int)tex_co.x;
+ int ty = (int)tex_co.y;
+
+ /* Check that we're within a legitimate tile. */
+ if (tx >= 0 && ty >= 0 && tx < 10) {
+ int tile = 1001 + 10 * ty + tx;
+
+ /* Find the index of the tile. */
+ for (int i = 0; i < num_nodes; i++) {
+ uint4 tile_node = read_node(kg, offset);
+ if (tile_node.x == tile) {
+ id = tile_node.y;
+ break;
+ }
+ if (tile_node.z == tile) {
+ id = tile_node.w;
+ break;
+ }
+ }
+
+ /* If we found the tile, offset the UVs to be relative to it. */
+ if (id != -1) {
+ tex_co.x -= tx;
+ tex_co.y -= ty;
+ }
+ }
+
+ /* Skip over the remaining nodes. */
+ *offset = next_offset;
+ }
+ else {
+ id = -num_nodes;
+ }
+
+ float4 f = svm_image_texture(kg, id, tex_co.x, tex_co.y, flags);
if (stack_valid(out_offset))
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
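The UDIM lookup is plain tile arithmetic on the integer part of the UV. For example, a UV of (2.3, 1.7) gives tx = 2 and ty = 1, so the loop scans the following tile nodes for UDIM tile 1001 + 10 * 1 + 2 = 1013; each tile node carries up to two (tile, slot) pairs in its x/y and z/w components, and on a match the lookup continues with the tile-local coordinates (0.3, 0.7). When node.y cast to int is not positive, num_nodes <= 0 and the node falls back to the old single-image behaviour, with the negated value used directly as the image slot.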
@@ -145,27 +187,26 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
}
/* now fetch textures */
- uint co_offset, out_offset, alpha_offset, srgb;
- decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
+ uint co_offset, out_offset, alpha_offset, flags;
+ svm_unpack_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &flags);
float3 co = stack_load_float3(stack, co_offset);
uint id = node.y;
float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- uint use_alpha = stack_valid(alpha_offset);
/* Map so that no textures are flipped, rotation is somewhat arbitrary. */
if (weight.x > 0.0f) {
float2 uv = make_float2((signed_N.x < 0.0f) ? 1.0f - co.y : co.y, co.z);
- f += weight.x * svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ f += weight.x * svm_image_texture(kg, id, uv.x, uv.y, flags);
}
if (weight.y > 0.0f) {
float2 uv = make_float2((signed_N.y > 0.0f) ? 1.0f - co.x : co.x, co.z);
- f += weight.y * svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ f += weight.y * svm_image_texture(kg, id, uv.x, uv.y, flags);
}
if (weight.z > 0.0f) {
float2 uv = make_float2((signed_N.z > 0.0f) ? 1.0f - co.y : co.y, co.x);
- f += weight.z * svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ f += weight.z * svm_image_texture(kg, id, uv.x, uv.y, flags);
}
if (stack_valid(out_offset))
@@ -180,10 +221,10 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg,
uint4 node)
{
uint id = node.y;
- uint co_offset, out_offset, alpha_offset, srgb;
+ uint co_offset, out_offset, alpha_offset, flags;
uint projection = node.w;
- decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
+ svm_unpack_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &flags);
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
@@ -195,8 +236,7 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg,
else
uv = direction_to_mirrorball(co);
- uint use_alpha = stack_valid(alpha_offset);
- float4 f = svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ float4 f = svm_image_texture(kg, id, uv.x, uv.y, flags);
if (stack_valid(out_offset))
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
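The tile loop added to svm_node_tex_image above reduces to mapping a UV into a UDIM tile number (1001 + 10 * ty + tx, with at most ten tiles per row) plus a tile-local UV. A minimal standalone sketch of that mapping (plain C; udim_tile_from_uv is a hypothetical helper, not the kernel function):

#include <stdio.h>

/* Map a UV coordinate to a UDIM tile number and a tile-local UV, mirroring
 * the lookup above. Returns -1 when the UV falls outside the ten-column grid. */
static int udim_tile_from_uv(float u, float v, float *local_u, float *local_v)
{
  int tx = (int)u;
  int ty = (int)v;
  if (tx < 0 || ty < 0 || tx >= 10) {
    return -1;
  }
  *local_u = u - (float)tx;
  *local_v = v - (float)ty;
  return 1001 + 10 * ty + tx;
}

int main(void)
{
  float lu, lv;
  int tile = udim_tile_from_uv(2.25f, 1.5f, &lu, &lv);
  printf("tile %d, local UV (%.2f, %.2f)\n", tile, lu, lv); /* tile 1013, (0.25, 0.50) */
  return 0;
}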
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 65a9a284a17..768c65918cd 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -84,7 +84,7 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
{
uint strength_offset, out_offset, smooth_offset;
- decode_node_uchar4(node.z, &strength_offset, &smooth_offset, &out_offset, NULL);
+ svm_unpack_node_uchar3(node.z, &strength_offset, &smooth_offset, &out_offset);
float strength = stack_load_float(stack, strength_offset);
uint type = node.y;
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index 115d2e2fe4b..9c160e6d8cc 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Magic */
-ccl_device_noinline float3 svm_magic(float3 p, int n, float distortion)
+ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
{
float x = sinf((p.x + p.y + p.z) * 5.0f);
float y = cosf((-p.x + p.y - p.z) * 5.0f);
@@ -93,8 +93,8 @@ ccl_device void svm_node_tex_magic(
uint depth;
uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset;
- decode_node_uchar4(node.y, &depth, &color_offset, &fac_offset, NULL);
- decode_node_uchar4(node.z, &co_offset, &scale_offset, &distortion_offset, NULL);
+ svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset);
+ svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset);
uint4 node2 = read_node(kg, offset);
float3 co = stack_load_float3(stack, co_offset);
diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h
new file mode 100644
index 00000000000..533a631c837
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_map_range.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Map Range Node */
+
+ccl_device_inline float smootherstep(float edge0, float edge1, float x)
+{
+ x = clamp(safe_divide((x - edge0), (edge1 - edge0)), 0.0f, 1.0f);
+ return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
+}
+
+ccl_device void svm_node_map_range(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint results_stack_offsets,
+ int *offset)
+{
+ uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset;
+ uint type_stack_offset, steps_stack_offset, result_stack_offset;
+ svm_unpack_node_uchar4(parameters_stack_offsets,
+ &from_min_stack_offset,
+ &from_max_stack_offset,
+ &to_min_stack_offset,
+ &to_max_stack_offset);
+ svm_unpack_node_uchar3(
+ results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset);
+
+ uint4 defaults = read_node(kg, offset);
+ uint4 defaults2 = read_node(kg, offset);
+
+ float value = stack_load_float(stack, value_stack_offset);
+ float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x);
+ float from_max = stack_load_float_default(stack, from_max_stack_offset, defaults.y);
+ float to_min = stack_load_float_default(stack, to_min_stack_offset, defaults.z);
+ float to_max = stack_load_float_default(stack, to_max_stack_offset, defaults.w);
+ float steps = stack_load_float_default(stack, steps_stack_offset, defaults2.x);
+
+ float result;
+
+ if (from_max != from_min) {
+ float factor = value;
+ switch (type_stack_offset) {
+ default:
+ case NODE_MAP_RANGE_LINEAR:
+ factor = (value - from_min) / (from_max - from_min);
+ break;
+ case NODE_MAP_RANGE_STEPPED: {
+ factor = (value - from_min) / (from_max - from_min);
+ factor = (steps > 0.0f) ? floorf(factor * (steps + 1.0f)) / steps : 0.0f;
+ break;
+ }
+ case NODE_MAP_RANGE_SMOOTHSTEP: {
+ factor = (from_min > from_max) ? 1.0f - smoothstep(from_max, from_min, factor) :
+ smoothstep(from_min, from_max, factor);
+ break;
+ }
+ case NODE_MAP_RANGE_SMOOTHERSTEP: {
+ factor = (from_min > from_max) ? 1.0f - smootherstep(from_max, from_min, factor) :
+ smootherstep(from_min, from_max, factor);
+ break;
+ }
+ }
+ result = to_min + factor * (to_max - to_min);
+ }
+ else {
+ result = 0.0f;
+ }
+ stack_store_float(stack, result_stack_offset, result);
+}
+
+CCL_NAMESPACE_END
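To make the remapping in svm_node_map_range concrete: the linear mode rescales value from [from_min, from_max] into [to_min, to_max], and the stepped mode first quantizes the factor into steps + 1 discrete levels. A standalone sketch of those two branches (plain C, hypothetical names; the kernel version above additionally guards from_max == from_min and handles the smoothstep/smootherstep variants):

#include <math.h>
#include <stdio.h>

/* Linear branch: rescale value from [from_min, from_max] into [to_min, to_max]. */
static float map_range_linear(float value, float from_min, float from_max,
                              float to_min, float to_max)
{
  float factor = (value - from_min) / (from_max - from_min);
  return to_min + factor * (to_max - to_min);
}

/* Stepped branch: quantize the factor into steps + 1 discrete levels,
 * matching the NODE_MAP_RANGE_STEPPED case above. */
static float map_range_stepped(float value, float from_min, float from_max,
                               float to_min, float to_max, float steps)
{
  float factor = (value - from_min) / (from_max - from_min);
  factor = (steps > 0.0f) ? floorf(factor * (steps + 1.0f)) / steps : 0.0f;
  return to_min + factor * (to_max - to_min);
}

int main(void)
{
  printf("%.3f\n", map_range_linear(25.0f, 0.0f, 100.0f, 0.0f, 1.0f));        /* 0.250 */
  printf("%.3f\n", map_range_stepped(25.0f, 0.0f, 100.0f, 0.0f, 1.0f, 3.0f)); /* 0.333 */
  return 0;
}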
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index 998a29912d4..6e19c859e19 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -18,7 +18,33 @@ CCL_NAMESPACE_BEGIN
/* Mapping Node */
-ccl_device void svm_node_mapping(
+ccl_device void svm_node_mapping(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset,
+ int *offset)
+{
+ uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset;
+ svm_unpack_node_uchar4(inputs_stack_offsets,
+ &vector_stack_offset,
+ &location_stack_offset,
+ &rotation_stack_offset,
+ &scale_stack_offset);
+
+ float3 vector = stack_load_float3(stack, vector_stack_offset);
+ float3 location = stack_load_float3(stack, location_stack_offset);
+ float3 rotation = stack_load_float3(stack, rotation_stack_offset);
+ float3 scale = stack_load_float3(stack, scale_stack_offset);
+
+ float3 result = svm_mapping((NodeMappingType)type, vector, location, rotation, scale);
+ stack_store_float3(stack, result_stack_offset, result);
+}
+
+/* Texture Mapping */
+
+ccl_device void svm_node_texture_mapping(
KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
{
float3 v = stack_load_float3(stack, vec_offset);
diff --git a/intern/cycles/kernel/svm/svm_mapping_util.h b/intern/cycles/kernel/svm/svm_mapping_util.h
new file mode 100644
index 00000000000..ec2c84e0791
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_mapping_util.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float3
+svm_mapping(NodeMappingType type, float3 vector, float3 location, float3 rotation, float3 scale)
+{
+ Transform rotationTransform = euler_to_transform(rotation);
+ switch (type) {
+ case NODE_MAPPING_TYPE_POINT:
+ return transform_direction(&rotationTransform, (vector * scale)) + location;
+ case NODE_MAPPING_TYPE_TEXTURE:
+ return safe_divide_float3_float3(
+ transform_direction_transposed(&rotationTransform, (vector - location)), scale);
+ case NODE_MAPPING_TYPE_VECTOR:
+ return transform_direction(&rotationTransform, (vector * scale));
+ case NODE_MAPPING_TYPE_NORMAL:
+ return safe_normalize(
+ transform_direction(&rotationTransform, safe_divide_float3_float3(vector, scale)));
+ default:
+ return make_float3(0.0f, 0.0f, 0.0f);
+ }
+}
+
+CCL_NAMESPACE_END
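The point and texture modes of svm_mapping above are inverse transforms: texture undoes what point applies, so a texture appears to follow the mapping. Ignoring rotation, a minimal sketch of that round trip (plain C; vec3, map_point and map_texture are illustrative names, and the kernel uses safe division to guard zero scale):

#include <stdio.h>

typedef struct {
  float x, y, z;
} vec3;

/* Point mapping without rotation: scale, then translate. */
static vec3 map_point(vec3 v, vec3 loc, vec3 scale)
{
  vec3 r = {v.x * scale.x + loc.x, v.y * scale.y + loc.y, v.z * scale.z + loc.z};
  return r;
}

/* Texture mapping without rotation: the inverse (untranslate, then unscale). */
static vec3 map_texture(vec3 v, vec3 loc, vec3 scale)
{
  vec3 r = {(v.x - loc.x) / scale.x, (v.y - loc.y) / scale.y, (v.z - loc.z) / scale.z};
  return r;
}

int main(void)
{
  vec3 loc = {1.0f, 2.0f, 3.0f}, scale = {2.0f, 2.0f, 2.0f};
  vec3 p = {0.5f, 0.5f, 0.5f};
  vec3 q = map_texture(map_point(p, loc, scale), loc, scale);
  printf("%.2f %.2f %.2f\n", q.x, q.y, q.z); /* 0.50 0.50 0.50: round trip back to p. */
  return 0;
}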
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 5920913825b..12a2bbdeb9b 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,48 +16,59 @@
CCL_NAMESPACE_BEGIN
-/* Nodes */
-
ccl_device void svm_node_math(KernelGlobals *kg,
ShaderData *sd,
float *stack,
- uint itype,
- uint f1_offset,
- uint f2_offset,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset,
int *offset)
{
- NodeMath type = (NodeMath)itype;
- float f1 = stack_load_float(stack, f1_offset);
- float f2 = stack_load_float(stack, f2_offset);
- float f = svm_math(type, f1, f2);
+ uint a_stack_offset, b_stack_offset, c_stack_offset;
+ svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset);
- uint4 node1 = read_node(kg, offset);
+ float a = stack_load_float(stack, a_stack_offset);
+ float b = stack_load_float(stack, b_stack_offset);
+ float c = stack_load_float(stack, c_stack_offset);
+ float result = svm_math((NodeMathType)type, a, b, c);
- stack_store_float(stack, node1.y, f);
+ stack_store_float(stack, result_stack_offset, result);
}
ccl_device void svm_node_vector_math(KernelGlobals *kg,
ShaderData *sd,
float *stack,
- uint itype,
- uint v1_offset,
- uint v2_offset,
+ uint type,
+ uint inputs_stack_offsets,
+ uint outputs_stack_offsets,
int *offset)
{
- NodeVectorMath type = (NodeVectorMath)itype;
- float3 v1 = stack_load_float3(stack, v1_offset);
- float3 v2 = stack_load_float3(stack, v2_offset);
- float f;
- float3 v;
+ uint value_stack_offset, vector_stack_offset;
+ uint a_stack_offset, b_stack_offset, scale_stack_offset;
+ svm_unpack_node_uchar3(
+ inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &scale_stack_offset);
+ svm_unpack_node_uchar2(outputs_stack_offsets, &value_stack_offset, &vector_stack_offset);
+
+ float3 a = stack_load_float3(stack, a_stack_offset);
+ float3 b = stack_load_float3(stack, b_stack_offset);
+ float3 c = make_float3(0.0f, 0.0f, 0.0f);
+ float scale = stack_load_float(stack, scale_stack_offset);
+
+ float value;
+ float3 vector;
- svm_vector_math(&f, &v, type, v1, v2);
+  /* Operators with a third vector input read an extra node for it. */
+ if (type == NODE_VECTOR_MATH_WRAP) {
+ uint4 extra_node = read_node(kg, offset);
+ c = stack_load_float3(stack, extra_node.x);
+ }
- uint4 node1 = read_node(kg, offset);
+ svm_vector_math(&value, &vector, (NodeVectorMathType)type, a, b, c, scale);
- if (stack_valid(node1.y))
- stack_store_float(stack, node1.y, f);
- if (stack_valid(node1.z))
- stack_store_float3(stack, node1.z, v);
+ if (stack_valid(value_stack_offset))
+ stack_store_float(stack, value_stack_offset, value);
+ if (stack_valid(vector_stack_offset))
+ stack_store_float3(stack, vector_stack_offset, vector);
}
CCL_NAMESPACE_END
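The rewritten math nodes above pack several 8-bit stack offsets into a single uint and recover them with svm_unpack_node_uchar3/4. A standalone sketch of that packing under an assumed low-byte-first layout (plain C; pack_uchar4 and unpack_uchar3 are hypothetical stand-ins for the kernel/compiler helpers):

#include <stdint.h>
#include <stdio.h>

/* Pack four 8-bit stack offsets into one 32-bit word, low byte first. */
static uint32_t pack_uchar4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  return (a & 0xFF) | ((b & 0xFF) << 8) | ((c & 0xFF) << 16) | ((d & 0xFF) << 24);
}

/* Recover the first three bytes again, in the spirit of svm_unpack_node_uchar3. */
static void unpack_uchar3(uint32_t data, uint32_t *a, uint32_t *b, uint32_t *c)
{
  *a = data & 0xFF;
  *b = (data >> 8) & 0xFF;
  *c = (data >> 16) & 0xFF;
}

int main(void)
{
  uint32_t a, b, c;
  unpack_uchar3(pack_uchar4(3, 7, 11, 0), &a, &b, &c);
  printf("%u %u %u\n", a, b, c); /* 3 7 11 */
  return 0;
}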
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index e3544515f1b..d1e1fa87e53 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -16,99 +16,179 @@
CCL_NAMESPACE_BEGIN
-ccl_device float average_fac(float3 v)
+ccl_device void svm_vector_math(float *value,
+ float3 *vector,
+ NodeVectorMathType type,
+ float3 a,
+ float3 b,
+ float3 c,
+ float scale)
{
- return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z)) / 3.0f;
-}
-
-ccl_device void svm_vector_math(
- float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2)
-{
- if (type == NODE_VECTOR_MATH_ADD) {
- *Vector = Vector1 + Vector2;
- *Fac = average_fac(*Vector);
- }
- else if (type == NODE_VECTOR_MATH_SUBTRACT) {
- *Vector = Vector1 - Vector2;
- *Fac = average_fac(*Vector);
- }
- else if (type == NODE_VECTOR_MATH_AVERAGE) {
- *Vector = safe_normalize_len(Vector1 + Vector2, Fac);
- }
- else if (type == NODE_VECTOR_MATH_DOT_PRODUCT) {
- *Fac = dot(Vector1, Vector2);
- *Vector = make_float3(0.0f, 0.0f, 0.0f);
- }
- else if (type == NODE_VECTOR_MATH_CROSS_PRODUCT) {
- *Vector = safe_normalize_len(cross(Vector1, Vector2), Fac);
- }
- else if (type == NODE_VECTOR_MATH_NORMALIZE) {
- *Vector = safe_normalize_len(Vector1, Fac);
- }
- else {
- *Fac = 0.0f;
- *Vector = make_float3(0.0f, 0.0f, 0.0f);
+ switch (type) {
+ case NODE_VECTOR_MATH_ADD:
+ *vector = a + b;
+ break;
+ case NODE_VECTOR_MATH_SUBTRACT:
+ *vector = a - b;
+ break;
+ case NODE_VECTOR_MATH_MULTIPLY:
+ *vector = a * b;
+ break;
+ case NODE_VECTOR_MATH_DIVIDE:
+ *vector = safe_divide_float3_float3(a, b);
+ break;
+ case NODE_VECTOR_MATH_CROSS_PRODUCT:
+ *vector = cross(a, b);
+ break;
+ case NODE_VECTOR_MATH_PROJECT:
+ *vector = project(a, b);
+ break;
+ case NODE_VECTOR_MATH_REFLECT:
+ *vector = reflect(a, b);
+ break;
+ case NODE_VECTOR_MATH_DOT_PRODUCT:
+ *value = dot(a, b);
+ break;
+ case NODE_VECTOR_MATH_DISTANCE:
+ *value = distance(a, b);
+ break;
+ case NODE_VECTOR_MATH_LENGTH:
+ *value = len(a);
+ break;
+ case NODE_VECTOR_MATH_SCALE:
+ *vector = a * scale;
+ break;
+ case NODE_VECTOR_MATH_NORMALIZE:
+ *vector = safe_normalize(a);
+ break;
+ case NODE_VECTOR_MATH_SNAP:
+ *vector = floor(safe_divide_float3_float3(a, b)) * b;
+ break;
+ case NODE_VECTOR_MATH_FLOOR:
+ *vector = floor(a);
+ break;
+ case NODE_VECTOR_MATH_CEIL:
+ *vector = ceil(a);
+ break;
+ case NODE_VECTOR_MATH_MODULO:
+ *vector = make_float3(safe_modulo(a.x, b.x), safe_modulo(a.y, b.y), safe_modulo(a.z, b.z));
+ break;
+ case NODE_VECTOR_MATH_WRAP:
+ *vector = make_float3(wrapf(a.x, b.x, c.x), wrapf(a.y, b.y, c.y), wrapf(a.z, b.z, c.z));
+ break;
+ case NODE_VECTOR_MATH_FRACTION:
+ *vector = a - floor(a);
+ break;
+ case NODE_VECTOR_MATH_ABSOLUTE:
+ *vector = fabs(a);
+ break;
+ case NODE_VECTOR_MATH_MINIMUM:
+ *vector = min(a, b);
+ break;
+ case NODE_VECTOR_MATH_MAXIMUM:
+ *vector = max(a, b);
+ break;
+ case NODE_VECTOR_MATH_SINE:
+ *vector = make_float3(sinf(a.x), sinf(a.y), sinf(a.z));
+ break;
+ case NODE_VECTOR_MATH_COSINE:
+ *vector = make_float3(cosf(a.x), cosf(a.y), cosf(a.z));
+ break;
+ case NODE_VECTOR_MATH_TANGENT:
+ *vector = make_float3(tanf(a.x), tanf(a.y), tanf(a.z));
+ break;
+ default:
+ *vector = make_float3(0.0f, 0.0f, 0.0f);
+ *value = 0.0f;
}
}
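NODE_VECTOR_MATH_PROJECT above relies on ordinary vector projection. A standalone scalar sketch under that assumption (plain C; vec3, dot3 and project3 are hypothetical names, and the kernel's project() may guard a zero-length b differently):

#include <stdio.h>

typedef struct {
  float x, y, z;
} vec3;

static float dot3(vec3 a, vec3 b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z;
}

/* Projection of a onto b, falling back to zero for a zero-length b. */
static vec3 project3(vec3 a, vec3 b)
{
  float len_sq = dot3(b, b);
  float t = (len_sq != 0.0f) ? dot3(a, b) / len_sq : 0.0f;
  vec3 r = {b.x * t, b.y * t, b.z * t};
  return r;
}

int main(void)
{
  vec3 a = {1.0f, 2.0f, 0.0f};
  vec3 b = {1.0f, 0.0f, 0.0f};
  vec3 p = project3(a, b);
  printf("%.1f %.1f %.1f\n", p.x, p.y, p.z); /* 1.0 0.0 0.0 */
  return 0;
}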
-ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
+ccl_device float svm_math(NodeMathType type, float a, float b, float c)
{
- float Fac;
-
- if (type == NODE_MATH_ADD)
- Fac = Fac1 + Fac2;
- else if (type == NODE_MATH_SUBTRACT)
- Fac = Fac1 - Fac2;
- else if (type == NODE_MATH_MULTIPLY)
- Fac = Fac1 * Fac2;
- else if (type == NODE_MATH_DIVIDE)
- Fac = safe_divide(Fac1, Fac2);
- else if (type == NODE_MATH_SINE)
- Fac = sinf(Fac1);
- else if (type == NODE_MATH_COSINE)
- Fac = cosf(Fac1);
- else if (type == NODE_MATH_TANGENT)
- Fac = tanf(Fac1);
- else if (type == NODE_MATH_ARCSINE)
- Fac = safe_asinf(Fac1);
- else if (type == NODE_MATH_ARCCOSINE)
- Fac = safe_acosf(Fac1);
- else if (type == NODE_MATH_ARCTANGENT)
- Fac = atanf(Fac1);
- else if (type == NODE_MATH_POWER)
- Fac = safe_powf(Fac1, Fac2);
- else if (type == NODE_MATH_LOGARITHM)
- Fac = safe_logf(Fac1, Fac2);
- else if (type == NODE_MATH_MINIMUM)
- Fac = fminf(Fac1, Fac2);
- else if (type == NODE_MATH_MAXIMUM)
- Fac = fmaxf(Fac1, Fac2);
- else if (type == NODE_MATH_ROUND)
- Fac = floorf(Fac1 + 0.5f);
- else if (type == NODE_MATH_LESS_THAN)
- Fac = Fac1 < Fac2;
- else if (type == NODE_MATH_GREATER_THAN)
- Fac = Fac1 > Fac2;
- else if (type == NODE_MATH_MODULO)
- Fac = safe_modulo(Fac1, Fac2);
- else if (type == NODE_MATH_ABSOLUTE)
- Fac = fabsf(Fac1);
- else if (type == NODE_MATH_ARCTAN2)
- Fac = atan2f(Fac1, Fac2);
- else if (type == NODE_MATH_FLOOR)
- Fac = floorf(Fac1);
- else if (type == NODE_MATH_CEIL)
- Fac = ceilf(Fac1);
- else if (type == NODE_MATH_FRACT)
- Fac = Fac1 - floorf(Fac1);
- else if (type == NODE_MATH_SQRT)
- Fac = safe_sqrtf(Fac1);
- else if (type == NODE_MATH_CLAMP)
- Fac = saturate(Fac1);
- else
- Fac = 0.0f;
-
- return Fac;
+ switch (type) {
+ case NODE_MATH_ADD:
+ return a + b;
+ case NODE_MATH_SUBTRACT:
+ return a - b;
+ case NODE_MATH_MULTIPLY:
+ return a * b;
+ case NODE_MATH_DIVIDE:
+ return safe_divide(a, b);
+ case NODE_MATH_POWER:
+ return safe_powf(a, b);
+ case NODE_MATH_LOGARITHM:
+ return safe_logf(a, b);
+ case NODE_MATH_SQRT:
+ return safe_sqrtf(a);
+ case NODE_MATH_INV_SQRT:
+ return inversesqrtf(a);
+ case NODE_MATH_ABSOLUTE:
+ return fabsf(a);
+ case NODE_MATH_RADIANS:
+ return a * (M_PI_F / 180.0f);
+ case NODE_MATH_DEGREES:
+ return a * (180.0f / M_PI_F);
+ case NODE_MATH_MINIMUM:
+ return fminf(a, b);
+ case NODE_MATH_MAXIMUM:
+ return fmaxf(a, b);
+ case NODE_MATH_LESS_THAN:
+ return a < b;
+ case NODE_MATH_GREATER_THAN:
+ return a > b;
+ case NODE_MATH_ROUND:
+ return floorf(a + 0.5f);
+ case NODE_MATH_FLOOR:
+ return floorf(a);
+ case NODE_MATH_CEIL:
+ return ceilf(a);
+ case NODE_MATH_FRACTION:
+ return a - floorf(a);
+ case NODE_MATH_MODULO:
+ return safe_modulo(a, b);
+ case NODE_MATH_TRUNC:
+ return a >= 0.0f ? floorf(a) : ceilf(a);
+ case NODE_MATH_SNAP:
+ return floorf(safe_divide(a, b)) * b;
+ case NODE_MATH_WRAP:
+ return wrapf(a, b, c);
+ case NODE_MATH_PINGPONG:
+ return pingpongf(a, b);
+ case NODE_MATH_SINE:
+ return sinf(a);
+ case NODE_MATH_COSINE:
+ return cosf(a);
+ case NODE_MATH_TANGENT:
+ return tanf(a);
+ case NODE_MATH_SINH:
+ return sinhf(a);
+ case NODE_MATH_COSH:
+ return coshf(a);
+ case NODE_MATH_TANH:
+ return tanhf(a);
+ case NODE_MATH_ARCSINE:
+ return safe_asinf(a);
+ case NODE_MATH_ARCCOSINE:
+ return safe_acosf(a);
+ case NODE_MATH_ARCTANGENT:
+ return atanf(a);
+ case NODE_MATH_ARCTAN2:
+ return atan2f(a, b);
+ case NODE_MATH_SIGN:
+ return compatible_signf(a);
+ case NODE_MATH_EXPONENT:
+ return expf(a);
+ case NODE_MATH_COMPARE:
+ return ((a == b) || (fabsf(a - b) <= fmaxf(c, FLT_EPSILON))) ? 1.0f : 0.0f;
+ case NODE_MATH_MULTIPLY_ADD:
+ return a * b + c;
+ case NODE_MATH_SMOOTH_MIN:
+ return smoothminf(a, b, c);
+ case NODE_MATH_SMOOTH_MAX:
+ return -smoothminf(-a, -b, c);
+ default:
+ return 0.0f;
+ }
}
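One of the new operators above is defined indirectly: NODE_MATH_SMOOTH_MAX uses the identity -smoothmin(-a, -b, c). A standalone sketch using a common cubic smooth-minimum (plain C; assuming the kernel's smoothminf behaves like this is an assumption, so treat smooth_min here as illustrative):

#include <math.h>
#include <stdio.h>

/* A common cubic polynomial smooth-minimum; k controls the blend width. */
static float smooth_min(float a, float b, float k)
{
  if (k != 0.0f) {
    float h = fmaxf(k - fabsf(a - b), 0.0f) / k;
    return fminf(a, b) - h * h * h * k * (1.0f / 6.0f);
  }
  return fminf(a, b);
}

/* Smooth maximum via the identity used by NODE_MATH_SMOOTH_MAX above. */
static float smooth_max(float a, float b, float k)
{
  return -smooth_min(-a, -b, k);
}

int main(void)
{
  /* With k = 0 these reduce to plain min/max. */
  printf("%.3f %.3f\n", smooth_min(1.0f, 2.0f, 0.0f), smooth_max(1.0f, 2.0f, 0.0f));
  /* With k > 0 the transition near a == b is rounded off. */
  printf("%.3f %.3f\n", smooth_min(1.0f, 1.1f, 0.5f), smooth_max(1.0f, 1.1f, 0.5f));
  return 0;
}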
/* Calculate color in range 800..12000 using an approximation
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 67fb5ca6241..571f62fe27f 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-/* Musgrave fBm
+/* 1D Musgrave fBm
*
* H: fractal increment parameter
* lacunarity: gap between successive frequencies
@@ -25,59 +25,404 @@ CCL_NAMESPACE_BEGIN
* from "Texturing and Modelling: A procedural approach"
*/
-ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves)
+ccl_device_noinline_cpu float noise_musgrave_fBm_1d(float co,
+ float H,
+ float lacunarity,
+ float octaves)
{
- float rmd;
+ float p = co;
float value = 0.0f;
float pwr = 1.0f;
float pwHL = powf(lacunarity, -H);
- int i;
- for (i = 0; i < float_to_int(octaves); i++) {
- value += snoise(p) * pwr;
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value += snoise_1d(p) * pwr;
pwr *= pwHL;
p *= lacunarity;
}
- rmd = octaves - floorf(octaves);
- if (rmd != 0.0f)
- value += rmd * snoise(p) * pwr;
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * snoise_1d(p) * pwr;
+ }
+
+ return value;
+}
+
+/* 1D Musgrave Multifractal
+ *
+ * H: highest fractal dimension
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_multi_fractal_1d(float co,
+ float H,
+ float lacunarity,
+ float octaves)
+{
+ float p = co;
+ float value = 1.0f;
+ float pwr = 1.0f;
+ float pwHL = powf(lacunarity, -H);
+
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value *= (pwr * snoise_1d(p) + 1.0f);
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value *= (rmd * pwr * snoise_1d(p) + 1.0f); /* correct? */
+ }
+
+ return value;
+}
+
+/* 1D Musgrave Heterogeneous Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_hetero_terrain_1d(
+ float co, float H, float lacunarity, float octaves, float offset)
+{
+ float p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ /* first unscaled octave of function; later octaves are scaled */
+ float value = offset + snoise_1d(p);
+ p *= lacunarity;
+
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ float increment = (snoise_1d(p) + offset) * pwr * value;
+ value += increment;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float increment = (snoise_1d(p) + offset) * pwr * value;
+ value += rmd * increment;
+ }
+
+ return value;
+}
+
+/* 1D Hybrid Additive/Multiplicative Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_1d(
+ float co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ float value = snoise_1d(p) + offset;
+ float weight = gain * value;
+ p *= lacunarity;
+
+ for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+ if (weight > 1.0f) {
+ weight = 1.0f;
+ }
+
+ float signal = (snoise_1d(p) + offset) * pwr;
+ pwr *= pwHL;
+ value += weight * signal;
+ weight *= gain * signal;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * ((snoise_1d(p) + offset) * pwr);
+ }
+
+ return value;
+}
+
+/* 1D Ridged Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_1d(
+ float co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ float signal = offset - fabsf(snoise_1d(p));
+ signal *= signal;
+ float value = signal;
+ float weight = 1.0f;
+
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ p *= lacunarity;
+ weight = saturate(signal * gain);
+ signal = offset - fabsf(snoise_1d(p));
+ signal *= signal;
+ signal *= weight;
+ value += signal * pwr;
+ pwr *= pwHL;
+ }
+
+ return value;
+}
+
+/* 2D Musgrave fBm
+ *
+ * H: fractal increment parameter
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ *
+ * from "Texturing and Modelling: A procedural approach"
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_fBm_2d(float2 co,
+ float H,
+ float lacunarity,
+ float octaves)
+{
+ float2 p = co;
+ float value = 0.0f;
+ float pwr = 1.0f;
+ float pwHL = powf(lacunarity, -H);
+
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value += snoise_2d(p) * pwr;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * snoise_2d(p) * pwr;
+ }
+
+ return value;
+}
+
+/* 2D Musgrave Multifractal
+ *
+ * H: highest fractal dimension
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_multi_fractal_2d(float2 co,
+ float H,
+ float lacunarity,
+ float octaves)
+{
+ float2 p = co;
+ float value = 1.0f;
+ float pwr = 1.0f;
+ float pwHL = powf(lacunarity, -H);
+
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value *= (pwr * snoise_2d(p) + 1.0f);
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value *= (rmd * pwr * snoise_2d(p) + 1.0f); /* correct? */
+ }
+
+ return value;
+}
+
+/* 2D Musgrave Heterogeneous Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_hetero_terrain_2d(
+ float2 co, float H, float lacunarity, float octaves, float offset)
+{
+ float2 p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ /* first unscaled octave of function; later octaves are scaled */
+ float value = offset + snoise_2d(p);
+ p *= lacunarity;
+
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ float increment = (snoise_2d(p) + offset) * pwr * value;
+ value += increment;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float increment = (snoise_2d(p) + offset) * pwr * value;
+ value += rmd * increment;
+ }
+
+ return value;
+}
+
+/* 2D Hybrid Additive/Multiplicative Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_2d(
+ float2 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float2 p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ float value = snoise_2d(p) + offset;
+ float weight = gain * value;
+ p *= lacunarity;
+
+ for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+ if (weight > 1.0f) {
+ weight = 1.0f;
+ }
+
+ float signal = (snoise_2d(p) + offset) * pwr;
+ pwr *= pwHL;
+ value += weight * signal;
+ weight *= gain * signal;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * ((snoise_2d(p) + offset) * pwr);
+ }
+
+ return value;
+}
+
+/* 2D Ridged Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_2d(
+ float2 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float2 p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ float signal = offset - fabsf(snoise_2d(p));
+ signal *= signal;
+ float value = signal;
+ float weight = 1.0f;
+
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ p *= lacunarity;
+ weight = saturate(signal * gain);
+ signal = offset - fabsf(snoise_2d(p));
+ signal *= signal;
+ signal *= weight;
+ value += signal * pwr;
+ pwr *= pwHL;
+ }
+
+ return value;
+}
+
+/* 3D Musgrave fBm
+ *
+ * H: fractal increment parameter
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ *
+ * from "Texturing and Modelling: A procedural approach"
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_fBm_3d(float3 co,
+ float H,
+ float lacunarity,
+ float octaves)
+{
+ float3 p = co;
+ float value = 0.0f;
+ float pwr = 1.0f;
+ float pwHL = powf(lacunarity, -H);
+
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value += snoise_3d(p) * pwr;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * snoise_3d(p) * pwr;
+ }
return value;
}
-/* Musgrave Multifractal
+/* 3D Musgrave Multifractal
*
* H: highest fractal dimension
* lacunarity: gap between successive frequencies
* octaves: number of frequencies in the fBm
*/
-ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
- float H,
- float lacunarity,
- float octaves)
+ccl_device_noinline_cpu float noise_musgrave_multi_fractal_3d(float3 co,
+ float H,
+ float lacunarity,
+ float octaves)
{
- float rmd;
+ float3 p = co;
float value = 1.0f;
float pwr = 1.0f;
float pwHL = powf(lacunarity, -H);
- int i;
- for (i = 0; i < float_to_int(octaves); i++) {
- value *= (pwr * snoise(p) + 1.0f);
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value *= (pwr * snoise_3d(p) + 1.0f);
pwr *= pwHL;
p *= lacunarity;
}
- rmd = octaves - floorf(octaves);
- if (rmd != 0.0f)
- value *= (rmd * pwr * snoise(p) + 1.0f); /* correct? */
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value *= (rmd * pwr * snoise_3d(p) + 1.0f); /* correct? */
+ }
return value;
}
-/* Musgrave Heterogeneous Terrain
+/* 3D Musgrave Heterogeneous Terrain
*
* H: fractal dimension of the roughest area
* lacunarity: gap between successive frequencies
@@ -85,35 +430,34 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
* offset: raises the terrain from `sea level'
*/
-ccl_device_noinline float noise_musgrave_hetero_terrain(
- float3 p, float H, float lacunarity, float octaves, float offset)
+ccl_device_noinline_cpu float noise_musgrave_hetero_terrain_3d(
+ float3 co, float H, float lacunarity, float octaves, float offset)
{
- float value, increment, rmd;
+ float3 p = co;
float pwHL = powf(lacunarity, -H);
float pwr = pwHL;
- int i;
/* first unscaled octave of function; later octaves are scaled */
- value = offset + snoise(p);
+ float value = offset + snoise_3d(p);
p *= lacunarity;
- for (i = 1; i < float_to_int(octaves); i++) {
- increment = (snoise(p) + offset) * pwr * value;
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ float increment = (snoise_3d(p) + offset) * pwr * value;
value += increment;
pwr *= pwHL;
p *= lacunarity;
}
- rmd = octaves - floorf(octaves);
+ float rmd = octaves - floorf(octaves);
if (rmd != 0.0f) {
- increment = (snoise(p) + offset) * pwr * value;
+ float increment = (snoise_3d(p) + offset) * pwr * value;
value += rmd * increment;
}
return value;
}
-/* Hybrid Additive/Multiplicative Multifractal Terrain
+/* 3D Hybrid Additive/Multiplicative Multifractal Terrain
*
* H: fractal dimension of the roughest area
* lacunarity: gap between successive frequencies
@@ -121,37 +465,38 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(
* offset: raises the terrain from `sea level'
*/
-ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
- float3 p, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_3d(
+ float3 co, float H, float lacunarity, float octaves, float offset, float gain)
{
- float result, signal, weight, rmd;
+ float3 p = co;
float pwHL = powf(lacunarity, -H);
float pwr = pwHL;
- int i;
- result = snoise(p) + offset;
- weight = gain * result;
+ float value = snoise_3d(p) + offset;
+ float weight = gain * value;
p *= lacunarity;
- for (i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
- if (weight > 1.0f)
+ for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+ if (weight > 1.0f) {
weight = 1.0f;
+ }
- signal = (snoise(p) + offset) * pwr;
+ float signal = (snoise_3d(p) + offset) * pwr;
pwr *= pwHL;
- result += weight * signal;
+ value += weight * signal;
weight *= gain * signal;
p *= lacunarity;
}
- rmd = octaves - floorf(octaves);
- if (rmd != 0.0f)
- result += rmd * ((snoise(p) + offset) * pwr);
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * ((snoise_3d(p) + offset) * pwr);
+ }
- return result;
+ return value;
}
-/* Ridged Multifractal Terrain
+/* 3D Ridged Multifractal Terrain
*
* H: fractal dimension of the roughest area
* lacunarity: gap between successive frequencies
@@ -159,93 +504,346 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
* offset: raises the terrain from `sea level'
*/
-ccl_device_noinline float noise_musgrave_ridged_multi_fractal(
- float3 p, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_3d(
+ float3 co, float H, float lacunarity, float octaves, float offset, float gain)
{
- float result, signal, weight;
+ float3 p = co;
float pwHL = powf(lacunarity, -H);
float pwr = pwHL;
- int i;
- signal = offset - fabsf(snoise(p));
+ float signal = offset - fabsf(snoise_3d(p));
signal *= signal;
- result = signal;
- weight = 1.0f;
+ float value = signal;
+ float weight = 1.0f;
- for (i = 1; i < float_to_int(octaves); i++) {
+ for (int i = 1; i < float_to_int(octaves); i++) {
p *= lacunarity;
weight = saturate(signal * gain);
- signal = offset - fabsf(snoise(p));
+ signal = offset - fabsf(snoise_3d(p));
signal *= signal;
signal *= weight;
- result += signal * pwr;
+ value += signal * pwr;
pwr *= pwHL;
}
- return result;
+ return value;
}
-/* Shader */
+/* 4D Musgrave fBm
+ *
+ * H: fractal increment parameter
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ *
+ * from "Texturing and Modelling: A procedural approach"
+ */
-ccl_device float svm_musgrave(NodeMusgraveType type,
- float dimension,
- float lacunarity,
- float octaves,
- float offset,
- float intensity,
- float gain,
- float3 p)
+ccl_device_noinline_cpu float noise_musgrave_fBm_4d(float4 co,
+ float H,
+ float lacunarity,
+ float octaves)
{
- if (type == NODE_MUSGRAVE_MULTIFRACTAL)
- return intensity * noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
- else if (type == NODE_MUSGRAVE_FBM)
- return intensity * noise_musgrave_fBm(p, dimension, lacunarity, octaves);
- else if (type == NODE_MUSGRAVE_HYBRID_MULTIFRACTAL)
- return intensity *
- noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
- else if (type == NODE_MUSGRAVE_RIDGED_MULTIFRACTAL)
- return intensity *
- noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
- else if (type == NODE_MUSGRAVE_HETERO_TERRAIN)
- return intensity * noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, offset);
+ float4 p = co;
+ float value = 0.0f;
+ float pwr = 1.0f;
+ float pwHL = powf(lacunarity, -H);
+
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value += snoise_4d(p) * pwr;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * snoise_4d(p) * pwr;
+ }
- return 0.0f;
+ return value;
}
-ccl_device void svm_node_tex_musgrave(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+/* 4D Musgrave Multifractal
+ *
+ * H: highest fractal dimension
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_multi_fractal_4d(float4 co,
+ float H,
+ float lacunarity,
+ float octaves)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
+ float4 p = co;
+ float value = 1.0f;
+ float pwr = 1.0f;
+ float pwHL = powf(lacunarity, -H);
- uint type, co_offset, color_offset, fac_offset;
- uint dimension_offset, lacunarity_offset, detail_offset, offset_offset;
- uint gain_offset, scale_offset;
+ for (int i = 0; i < float_to_int(octaves); i++) {
+ value *= (pwr * snoise_4d(p) + 1.0f);
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
- decode_node_uchar4(node.y, &type, &co_offset, &color_offset, &fac_offset);
- decode_node_uchar4(
- node.z, &dimension_offset, &lacunarity_offset, &detail_offset, &offset_offset);
- decode_node_uchar4(node.w, &gain_offset, &scale_offset, NULL, NULL);
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value *= (rmd * pwr * snoise_4d(p) + 1.0f); /* correct? */
+ }
- float3 co = stack_load_float3(stack, co_offset);
- float dimension = stack_load_float_default(stack, dimension_offset, node2.x);
- float lacunarity = stack_load_float_default(stack, lacunarity_offset, node2.y);
- float detail = stack_load_float_default(stack, detail_offset, node2.z);
- float foffset = stack_load_float_default(stack, offset_offset, node2.w);
- float gain = stack_load_float_default(stack, gain_offset, node3.x);
- float scale = stack_load_float_default(stack, scale_offset, node3.y);
+ return value;
+}
+
+/* 4D Musgrave Heterogeneous Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_hetero_terrain_4d(
+ float4 co, float H, float lacunarity, float octaves, float offset)
+{
+ float4 p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ /* first unscaled octave of function; later octaves are scaled */
+ float value = offset + snoise_4d(p);
+ p *= lacunarity;
+
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ float increment = (snoise_4d(p) + offset) * pwr * value;
+ value += increment;
+ pwr *= pwHL;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ float increment = (snoise_4d(p) + offset) * pwr * value;
+ value += rmd * increment;
+ }
+
+ return value;
+}
+
+/* 4D Hybrid Additive/Multiplicative Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_4d(
+ float4 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float4 p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ float value = snoise_4d(p) + offset;
+ float weight = gain * value;
+ p *= lacunarity;
+
+ for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+ if (weight > 1.0f) {
+ weight = 1.0f;
+ }
+
+ float signal = (snoise_4d(p) + offset) * pwr;
+ pwr *= pwHL;
+ value += weight * signal;
+ weight *= gain * signal;
+ p *= lacunarity;
+ }
+
+ float rmd = octaves - floorf(octaves);
+ if (rmd != 0.0f) {
+ value += rmd * ((snoise_4d(p) + offset) * pwr);
+ }
+
+ return value;
+}
+
+/* 4D Ridged Multifractal Terrain
+ *
+ * H: fractal dimension of the roughest area
+ * lacunarity: gap between successive frequencies
+ * octaves: number of frequencies in the fBm
+ * offset: raises the terrain from `sea level'
+ */
+
+ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d(
+ float4 co, float H, float lacunarity, float octaves, float offset, float gain)
+{
+ float4 p = co;
+ float pwHL = powf(lacunarity, -H);
+ float pwr = pwHL;
+
+ float signal = offset - fabsf(snoise_4d(p));
+ signal *= signal;
+ float value = signal;
+ float weight = 1.0f;
+
+ for (int i = 1; i < float_to_int(octaves); i++) {
+ p *= lacunarity;
+ weight = saturate(signal * gain);
+ signal = offset - fabsf(snoise_4d(p));
+ signal *= signal;
+ signal *= weight;
+ value += signal * pwr;
+ pwr *= pwHL;
+ }
+
+ return value;
+}
+
+ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offsets1,
+ uint offsets2,
+ uint offsets3,
+ int *offset)
+{
+ uint type, dimensions, co_stack_offset, w_stack_offset;
+ uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset;
+ uint offset_stack_offset, gain_stack_offset, fac_stack_offset;
+
+ svm_unpack_node_uchar4(offsets1, &type, &dimensions, &co_stack_offset, &w_stack_offset);
+ svm_unpack_node_uchar4(offsets2,
+ &scale_stack_offset,
+ &detail_stack_offset,
+ &dimension_stack_offset,
+ &lacunarity_stack_offset);
+ svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset);
+
+ uint4 defaults1 = read_node(kg, offset);
+ uint4 defaults2 = read_node(kg, offset);
+
+ float3 co = stack_load_float3(stack, co_stack_offset);
+ float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
+ float scale = stack_load_float_default(stack, scale_stack_offset, defaults1.y);
+ float detail = stack_load_float_default(stack, detail_stack_offset, defaults1.z);
+ float dimension = stack_load_float_default(stack, dimension_stack_offset, defaults1.w);
+ float lacunarity = stack_load_float_default(stack, lacunarity_stack_offset, defaults2.x);
+ float foffset = stack_load_float_default(stack, offset_stack_offset, defaults2.y);
+ float gain = stack_load_float_default(stack, gain_stack_offset, defaults2.z);
dimension = fmaxf(dimension, 1e-5f);
detail = clamp(detail, 0.0f, 16.0f);
lacunarity = fmaxf(lacunarity, 1e-5f);
- float f = svm_musgrave(
- (NodeMusgraveType)type, dimension, lacunarity, detail, foffset, 1.0f, gain, co * scale);
+ float fac;
+
+ switch (dimensions) {
+ case 1: {
+ float p = w * scale;
+ switch ((NodeMusgraveType)type) {
+ case NODE_MUSGRAVE_MULTIFRACTAL:
+ fac = noise_musgrave_multi_fractal_1d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_FBM:
+ fac = noise_musgrave_fBm_1d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_HYBRID_MULTIFRACTAL:
+ fac = noise_musgrave_hybrid_multi_fractal_1d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_RIDGED_MULTIFRACTAL:
+ fac = noise_musgrave_ridged_multi_fractal_1d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_HETERO_TERRAIN:
+ fac = noise_musgrave_hetero_terrain_1d(p, dimension, lacunarity, detail, foffset);
+ break;
+ default:
+ fac = 0.0f;
+ }
+ break;
+ }
+ case 2: {
+ float2 p = make_float2(co.x, co.y) * scale;
+ switch ((NodeMusgraveType)type) {
+ case NODE_MUSGRAVE_MULTIFRACTAL:
+ fac = noise_musgrave_multi_fractal_2d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_FBM:
+ fac = noise_musgrave_fBm_2d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_HYBRID_MULTIFRACTAL:
+ fac = noise_musgrave_hybrid_multi_fractal_2d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_RIDGED_MULTIFRACTAL:
+ fac = noise_musgrave_ridged_multi_fractal_2d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_HETERO_TERRAIN:
+ fac = noise_musgrave_hetero_terrain_2d(p, dimension, lacunarity, detail, foffset);
+ break;
+ default:
+ fac = 0.0f;
+ }
+ break;
+ }
+ case 3: {
+ float3 p = co * scale;
+ switch ((NodeMusgraveType)type) {
+ case NODE_MUSGRAVE_MULTIFRACTAL:
+ fac = noise_musgrave_multi_fractal_3d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_FBM:
+ fac = noise_musgrave_fBm_3d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_HYBRID_MULTIFRACTAL:
+ fac = noise_musgrave_hybrid_multi_fractal_3d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_RIDGED_MULTIFRACTAL:
+ fac = noise_musgrave_ridged_multi_fractal_3d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_HETERO_TERRAIN:
+ fac = noise_musgrave_hetero_terrain_3d(p, dimension, lacunarity, detail, foffset);
+ break;
+ default:
+ fac = 0.0f;
+ }
+ break;
+ }
+ case 4: {
+ float4 p = make_float4(co.x, co.y, co.z, w) * scale;
+ switch ((NodeMusgraveType)type) {
+ case NODE_MUSGRAVE_MULTIFRACTAL:
+ fac = noise_musgrave_multi_fractal_4d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_FBM:
+ fac = noise_musgrave_fBm_4d(p, dimension, lacunarity, detail);
+ break;
+ case NODE_MUSGRAVE_HYBRID_MULTIFRACTAL:
+ fac = noise_musgrave_hybrid_multi_fractal_4d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_RIDGED_MULTIFRACTAL:
+ fac = noise_musgrave_ridged_multi_fractal_4d(
+ p, dimension, lacunarity, detail, foffset, gain);
+ break;
+ case NODE_MUSGRAVE_HETERO_TERRAIN:
+ fac = noise_musgrave_hetero_terrain_4d(p, dimension, lacunarity, detail, foffset);
+ break;
+ default:
+ fac = 0.0f;
+ }
+ break;
+ }
+ default:
+ fac = 0.0f;
+ }
- if (stack_valid(fac_offset))
- stack_store_float(stack, fac_offset, f);
- if (stack_valid(color_offset))
- stack_store_float3(stack, color_offset, make_float3(f, f, f));
+ stack_store_float(stack, fac_stack_offset, fac);
}
CCL_NAMESPACE_END
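All of the Musgrave variants above share the same octave loop: accumulate noise attenuated by pwr = lacunarity^(-H) per octave, then blend in a fractional last octave. A standalone sketch of that skeleton with a stand-in noise function (plain C; fake_snoise is a hypothetical placeholder for the kernel's signed Perlin noise):

#include <math.h>
#include <stdio.h>

/* Stand-in for the kernel's signed noise; anything returning values in
 * roughly [-1, 1] illustrates the loop structure. */
static float fake_snoise(float p)
{
  return sinf(p * 12.9898f) * 0.5f;
}

/* fBm skeleton matching noise_musgrave_fBm_1d above: integer octaves plus a
 * fractional remainder octave, each scaled by lacunarity^(-H). */
static float fbm_1d(float p, float H, float lacunarity, float octaves)
{
  float value = 0.0f;
  float pwr = 1.0f;
  float pwHL = powf(lacunarity, -H);

  for (int i = 0; i < (int)octaves; i++) {
    value += fake_snoise(p) * pwr;
    pwr *= pwHL;
    p *= lacunarity;
  }

  float rmd = octaves - floorf(octaves);
  if (rmd != 0.0f) {
    value += rmd * fake_snoise(p) * pwr;
  }
  return value;
}

int main(void)
{
  printf("%.4f\n", fbm_1d(0.37f, 1.0f, 2.0f, 2.5f));
  return 0;
}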
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 322579ccfe3..7db8ffcc6e1 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,304 +32,711 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssei quick_floor_sse(const ssef &x)
-{
- ssei b = truncatei(x);
- ssei isneg = cast((x < ssef(0.0f)).m128);
- return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1
-}
-#endif
+/* **** Perlin Noise **** */
-ccl_device uint hash(uint kx, uint ky, uint kz)
-{
- // define some handy macros
-#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
-#define final(a, b, c) \
- { \
- c ^= b; \
- c -= rot(b, 14); \
- a ^= c; \
- a -= rot(c, 11); \
- b ^= a; \
- b -= rot(a, 25); \
- c ^= b; \
- c -= rot(b, 16); \
- a ^= c; \
- a -= rot(c, 4); \
- b ^= a; \
- b -= rot(a, 14); \
- c ^= b; \
- c -= rot(b, 24); \
- }
- // now hash the data!
- uint a, b, c, len = 3;
- a = b = c = 0xdeadbeef + (len << 2) + 13;
-
- c += kz;
- b += ky;
- a += kx;
- final(a, b, c);
-
- return c;
- // macros not needed anymore
-#undef rot
-#undef final
-}
-
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssei hash_sse(const ssei &kx, const ssei &ky, const ssei &kz)
-{
-# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
-# define xor_rot(a, b, c) \
- do { \
- a = a ^ b; \
- a = a - rot(b, c); \
- } while (0)
-
- uint len = 3;
- ssei magic = ssei(0xdeadbeef + (len << 2) + 13);
- ssei a = magic + kx;
- ssei b = magic + ky;
- ssei c = magic + kz;
-
- xor_rot(c, b, 14);
- xor_rot(a, c, 11);
- xor_rot(b, a, 25);
- xor_rot(c, b, 16);
- xor_rot(a, c, 4);
- xor_rot(b, a, 14);
- xor_rot(c, b, 24);
-
- return c;
-# undef rot
-# undef xor_rot
-}
-#endif
-
-#if 0 // unused
-ccl_device int imod(int a, int b)
+ccl_device float fade(float t)
{
- a %= b;
- return a < 0 ? a + b : a;
+ return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}
-ccl_device uint phash(int kx, int ky, int kz, int3 p)
+ccl_device_inline float negate_if(float val, int condition)
{
- return hash(imod(kx, p.x), imod(ky, p.y), imod(kz, p.z));
+ return (condition) ? -val : val;
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float floorfrac(float x, int *i)
+ccl_device float grad1(int hash, float x)
{
- *i = quick_floor_to_int(x);
- return x - *i;
+ int h = hash & 15;
+ float g = 1 + (h & 7);
+ return negate_if(g, h & 8) * x;
}
-#else
-ccl_device_inline ssef floorfrac_sse(const ssef &x, ssei *i)
+
+ccl_device_noinline_cpu float perlin_1d(float x)
{
- *i = quick_floor_sse(x);
- return x - ssef(*i);
+ int X;
+ float fx = floorfrac(x, &X);
+ float u = fade(fx);
+
+ return mix(grad1(hash_uint(X), fx), grad1(hash_uint(X + 1), fx - 1.0f), u);
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float fade(float t)
+/* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
+ * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
+ * supported, we do a standard implementation, but if it is supported, we
+ * do an implementation using SSE intrinsics.
+ */
+#if !defined(__KERNEL_SSE2__)
+
+/* ** Standard Implementation ** */
+
+/* Bilinear Interpolation:
+ *
+ * v2 v3
+ * @ + + + + @ y
+ * + + ^
+ * + + |
+ * + + |
+ * @ + + + + @ @------> x
+ * v0 v1
+ *
+ */
+ccl_device float bi_mix(float v0, float v1, float v2, float v3, float x, float y)
{
- return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
+ float x1 = 1.0f - x;
+ return (1.0f - y) * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x);
}
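As a quick sanity check of bi_mix above: with corner values v0..v3 and fractional coordinates (x, y) it is ordinary bilinear interpolation. A tiny standalone sketch (plain C; bilerp is just the same formula under an illustrative name):

#include <stdio.h>

/* Same formula as bi_mix above: interpolate along x, then along y. */
static float bilerp(float v0, float v1, float v2, float v3, float x, float y)
{
  float x1 = 1.0f - x;
  return (1.0f - y) * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x);
}

int main(void)
{
  /* At the center of the cell the result is the average of the corners. */
  printf("%.2f\n", bilerp(0.0f, 1.0f, 2.0f, 3.0f, 0.5f, 0.5f)); /* 1.50 */
  /* At (x, y) = (1, 0) it returns v1 exactly. */
  printf("%.2f\n", bilerp(0.0f, 1.0f, 2.0f, 3.0f, 1.0f, 0.0f)); /* 1.00 */
  return 0;
}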
-#else
-ccl_device_inline ssef fade_sse(const ssef *t)
+
+/* Trilinear Interpolation:
+ *
+ * v6 v7
+ * @ + + + + + + @
+ * +\ +\
+ * + \ + \
+ * + \ + \
+ * + \ v4 + \ v5
+ * + @ + + + +++ + @ z
+ * + + + + y ^
+ * v2 @ + +++ + + + @ v3 + \ |
+ * \ + \ + \ |
+ * \ + \ + \|
+ * \ + \ + +---------> x
+ * \+ \+
+ * @ + + + + + + @
+ * v0 v1
+ */
+ccl_device float tri_mix(float v0,
+ float v1,
+ float v2,
+ float v3,
+ float v4,
+ float v5,
+ float v6,
+ float v7,
+ float x,
+ float y,
+ float z)
{
- ssef a = madd(*t, ssef(6.0f), ssef(-15.0f));
- ssef b = madd(*t, a, ssef(10.0f));
- return ((*t) * (*t)) * ((*t) * b);
+ float x1 = 1.0f - x;
+ float y1 = 1.0f - y;
+ float z1 = 1.0f - z;
+ return z1 * (y1 * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x)) +
+ z * (y1 * (v4 * x1 + v5 * x) + y * (v6 * x1 + v7 * x));
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float nerp(float t, float a, float b)
+ccl_device float quad_mix(float v0,
+ float v1,
+ float v2,
+ float v3,
+ float v4,
+ float v5,
+ float v6,
+ float v7,
+ float v8,
+ float v9,
+ float v10,
+ float v11,
+ float v12,
+ float v13,
+ float v14,
+ float v15,
+ float x,
+ float y,
+ float z,
+ float w)
{
- return (1.0f - t) * a + t * b;
+ return mix(tri_mix(v0, v1, v2, v3, v4, v5, v6, v7, x, y, z),
+ tri_mix(v8, v9, v10, v11, v12, v13, v14, v15, x, y, z),
+ w);
}
-#else
-ccl_device_inline ssef nerp_sse(const ssef &t, const ssef &a, const ssef &b)
+
+ccl_device float grad2(int hash, float x, float y)
{
- ssef x1 = (ssef(1.0f) - t) * a;
- return madd(t, b, x1);
+ int h = hash & 7;
+ float u = h < 4 ? x : y;
+ float v = 2.0f * (h < 4 ? y : x);
+ return negate_if(u, h & 1) + negate_if(v, h & 2);
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float grad(int hash, float x, float y, float z)
+ccl_device float grad3(int hash, float x, float y, float z)
{
- // use vectors pointing to the edges of the cube
int h = hash & 15;
float u = h < 8 ? x : y;
- float vt = ((h == 12) | (h == 14)) ? x : z;
+ float vt = ((h == 12) || (h == 14)) ? x : z;
float v = h < 4 ? y : vt;
- return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
+ return negate_if(u, h & 1) + negate_if(v, h & 2);
}
-#else
-ccl_device_inline ssef grad_sse(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
-{
- ssei c1 = ssei(1);
- ssei c2 = ssei(2);
- ssei h = hash & ssei(15); // h = hash & 15
+ccl_device float grad4(int hash, float x, float y, float z, float w)
+{
+ int h = hash & 31;
+ float u = h < 24 ? x : y;
+ float v = h < 16 ? y : z;
+ float s = h < 8 ? z : w;
+ return negate_if(u, h & 1) + negate_if(v, h & 2) + negate_if(s, h & 4);
+}
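The scalar grad2/grad3/grad4 above turn a lattice hash into a pseudo-random gradient and combine it with the offset from the lattice point. A standalone sketch enumerating the eight directions grad2 can produce (plain C; the formula is copied from above, the driver in main is illustrative only):

#include <stdio.h>

static float negate_if(float val, int condition)
{
  return condition ? -val : val;
}

/* Same formula as grad2 above: the low three hash bits select one of eight
 * fixed gradient directions. */
static float grad2(int hash, float x, float y)
{
  int h = hash & 7;
  float u = h < 4 ? x : y;
  float v = 2.0f * (h < 4 ? y : x);
  return negate_if(u, h & 1) + negate_if(v, h & 2);
}

int main(void)
{
  /* Evaluate the offset (1, 0) against each possible gradient. */
  for (int h = 0; h < 8; h++) {
    printf("h=%d -> %.1f\n", h, grad2(h, 1.0f, 0.0f));
  }
  return 0;
}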
- sseb case_ux = h < ssei(8); // 0xffffffff if h < 8 else 0
+ccl_device_noinline_cpu float perlin_2d(float x, float y)
+{
+ int X;
+ int Y;
- ssef u = select(case_ux, x, y); // u = h<8 ? x : y
+ float fx = floorfrac(x, &X);
+ float fy = floorfrac(y, &Y);
- sseb case_vy = h < ssei(4); // 0xffffffff if h < 4 else 0
+ float u = fade(fx);
+ float v = fade(fy);
- sseb case_h12 = h == ssei(12); // 0xffffffff if h == 12 else 0
- sseb case_h14 = h == ssei(14); // 0xffffffff if h == 14 else 0
+ float r = bi_mix(grad2(hash_uint2(X, Y), fx, fy),
+ grad2(hash_uint2(X + 1, Y), fx - 1.0f, fy),
+ grad2(hash_uint2(X, Y + 1), fx, fy - 1.0f),
+ grad2(hash_uint2(X + 1, Y + 1), fx - 1.0f, fy - 1.0f),
+ u,
+ v);
- sseb case_vx = case_h12 | case_h14; // 0xffffffff if h == 12 or h == 14 else 0
+ return r;
+}
- ssef v = select(case_vy, y, select(case_vx, x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
+{
+ int X;
+ int Y;
+ int Z;
- ssei case_uneg = (h & c1) << 31; // 1<<31 if h&1 else 0
- ssef case_uneg_mask = cast(case_uneg); // -0.0 if h&1 else +0.0
- ssef ru = u ^ case_uneg_mask; // -u if h&1 else u (copy float sign)
+ float fx = floorfrac(x, &X);
+ float fy = floorfrac(y, &Y);
+ float fz = floorfrac(z, &Z);
- ssei case_vneg = (h & c2) << 30; // 2<<30 if h&2 else 0
- ssef case_vneg_mask = cast(case_vneg); // -0.0 if h&2 else +0.0
- ssef rv = v ^ case_vneg_mask; // -v if h&2 else v (copy float sign)
+ float u = fade(fx);
+ float v = fade(fy);
+ float w = fade(fz);
- ssef r = ru + rv; // ((h&1) ? -u : u) + ((h&2) ? -v : v)
+ float r = tri_mix(grad3(hash_uint3(X, Y, Z), fx, fy, fz),
+ grad3(hash_uint3(X + 1, Y, Z), fx - 1.0f, fy, fz),
+ grad3(hash_uint3(X, Y + 1, Z), fx, fy - 1.0f, fz),
+ grad3(hash_uint3(X + 1, Y + 1, Z), fx - 1.0f, fy - 1.0f, fz),
+ grad3(hash_uint3(X, Y, Z + 1), fx, fy, fz - 1.0f),
+ grad3(hash_uint3(X + 1, Y, Z + 1), fx - 1.0f, fy, fz - 1.0f),
+ grad3(hash_uint3(X, Y + 1, Z + 1), fx, fy - 1.0f, fz - 1.0f),
+ grad3(hash_uint3(X + 1, Y + 1, Z + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f),
+ u,
+ v,
+ w);
return r;
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float scale3(float result)
-{
- return 0.9820f * result;
-}
-#else
-ccl_device_inline ssef scale3_sse(const ssef &result)
-{
- return ssef(0.9820f) * result;
-}
-#endif
-
-#ifndef __KERNEL_SSE2__
-ccl_device_noinline float perlin(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
int X;
- float fx = floorfrac(x, &X);
int Y;
- float fy = floorfrac(y, &Y);
int Z;
+ int W;
+
+ float fx = floorfrac(x, &X);
+ float fy = floorfrac(y, &Y);
float fz = floorfrac(z, &Z);
+ float fw = floorfrac(w, &W);
float u = fade(fx);
float v = fade(fy);
- float w = fade(fz);
+ float t = fade(fz);
+ float s = fade(fw);
+
+ float r = quad_mix(
+ grad4(hash_uint4(X, Y, Z, W), fx, fy, fz, fw),
+ grad4(hash_uint4(X + 1, Y, Z, W), fx - 1.0f, fy, fz, fw),
+ grad4(hash_uint4(X, Y + 1, Z, W), fx, fy - 1.0f, fz, fw),
+ grad4(hash_uint4(X + 1, Y + 1, Z, W), fx - 1.0f, fy - 1.0f, fz, fw),
+ grad4(hash_uint4(X, Y, Z + 1, W), fx, fy, fz - 1.0f, fw),
+ grad4(hash_uint4(X + 1, Y, Z + 1, W), fx - 1.0f, fy, fz - 1.0f, fw),
+ grad4(hash_uint4(X, Y + 1, Z + 1, W), fx, fy - 1.0f, fz - 1.0f, fw),
+ grad4(hash_uint4(X + 1, Y + 1, Z + 1, W), fx - 1.0f, fy - 1.0f, fz - 1.0f, fw),
+ grad4(hash_uint4(X, Y, Z, W + 1), fx, fy, fz, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y, Z, W + 1), fx - 1.0f, fy, fz, fw - 1.0f),
+ grad4(hash_uint4(X, Y + 1, Z, W + 1), fx, fy - 1.0f, fz, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y + 1, Z, W + 1), fx - 1.0f, fy - 1.0f, fz, fw - 1.0f),
+ grad4(hash_uint4(X, Y, Z + 1, W + 1), fx, fy, fz - 1.0f, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y, Z + 1, W + 1), fx - 1.0f, fy, fz - 1.0f, fw - 1.0f),
+ grad4(hash_uint4(X, Y + 1, Z + 1, W + 1), fx, fy - 1.0f, fz - 1.0f, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y + 1, Z + 1, W + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f, fw - 1.0f),
+ u,
+ v,
+ t,
+ s);
+
+ return r;
+}
+
+#else /* SSE is supported. */
+
+/* ** SSE Implementation ** */
+
+/* SSE Bilinear Interpolation:
+ *
+ * The function takes two ssef inputs:
+ * - p : Contains the values at the points (v0, v1, v2, v3).
+ * - f : Contains the values (x, y, _, _). The third and fourth values are unused.
+ *
+ * The interpolation is done in two steps:
+ * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
+ * (v2, v3) is generated by moving v2 and v3 to the first and second
+ * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * fourth values are unused.
+ * 2. Interpolate g0 and g1 along the y axis to get the final value.
+ * g1 is generated by populating an ssef with the second value of g.
+ * Only the first value is important in the final ssef.
+ *
+ * v1 v3 g1
+ * @ + + + + @ @ y
+ * + + (1) + (2) ^
+ * + + ---> + ---> final |
+ * + + + |
+ * @ + + + + @ @ @------> x
+ * v0 v2 g0
+ *
+ */
+ccl_device_inline ssef bi_mix(ssef p, ssef f)
+{
+ ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+ return mix(g, shuffle<1>(g), shuffle<1>(f));
+}
+
+ccl_device_inline ssef fade(const ssef &t)
+{
+ ssef a = madd(t, 6.0f, -15.0f);
+ ssef b = madd(t, a, 10.0f);
+ return (t * t) * (t * b);
+}
+
+/* Negate val if the nth bit of h is 1. */
+# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
+
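The negate_if_nth_bit macro above shifts the selected hash bit into the float sign-bit position and XORs it in, so the value is negated exactly when that bit is set. A scalar illustration of the same trick (plain C using memcpy type punning; the kernel macro does the equivalent per SSE lane via cast()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Flip the sign of val when bit n of h is set, by XOR-ing that bit (moved to
 * bit position 31) into the float's bit pattern. */
static float negate_if_nth_bit_scalar(float val, uint32_t h, int n)
{
  uint32_t bits;
  memcpy(&bits, &val, sizeof(bits));
  bits ^= (h & (1u << n)) << (31 - n);
  memcpy(&val, &bits, sizeof(val));
  return val;
}

int main(void)
{
  printf("%.1f\n", negate_if_nth_bit_scalar(2.5f, 0x2u, 1)); /* bit 1 set: -2.5 */
  printf("%.1f\n", negate_if_nth_bit_scalar(2.5f, 0x0u, 1)); /* bit 1 clear: 2.5 */
  return 0;
}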
+ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+{
+ ssei h = hash & 7;
+ ssef u = select(h < 4, x, y);
+ ssef v = 2.0f * select(h < 4, y, x);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
+}
+
+/* We use SSE to compute and interpolate 4 gradients at once:
+ *
+ * Point Offset from v0
+ * v0 (0, 0)
+ * v1 (0, 1)
+ * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1))
+ * v3 (1, 1) ^
+ * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ */
+ccl_device_noinline float perlin_2d(float x, float y)
+{
+ ssei XY;
+ ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
+ ssef uv = fade(fxy);
+
+ ssei XY1 = XY + 1;
+ ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
+ ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+
+ ssei h = hash_ssei2(X, Y);
+
+ ssef fxy1 = fxy - 1.0f;
+ ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+ ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+
+ ssef g = grad(h, fx, fy);
+
+ return extract<0>(bi_mix(g, uv));
+}
+
+/* SSE Trilinear Interpolation:
+ *
+ * The function takes three ssef inputs:
+ * - p : Contains the values at the points (v0, v1, v2, v3).
+ * - q : Contains the values at the points (v4, v5, v6, v7).
+ * - f : Contains the values (x, y, z, _). The fourth value is unused.
+ *
+ * The interpolation is done in three steps:
+ * 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
+ * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
+ * (s2, s3) is generated by moving v2 and v3 to the first and second
+ * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * fourth values are unused.
+ * 3. Interpolate g0 and g1 along the z axis to get the final value.
+ * g1 is generated by populating an ssef with the second value of g.
+ * Only the first value is important in the final ssef.
+ *
+ * v3 v7
+ * @ + + + + + + @ s3 @
+ * +\ +\ +\
+ * + \ + \ + \
+ * + \ + \ + \ g1
+ * + \ v1 + \ v5 + \ s1 @
+ * + @ + + + +++ + @ + @ + z
+ * + + + + (1) + + (2) + (3) y ^
+ * v2 @ + +++ + + + @ v6 + ---> s2 @ + ---> + ---> final \ |
+ * \ + \ + \ + + \ |
+ * \ + \ + \ + + \|
+ * \ + \ + \ + @ +---------> x
+ * \+ \+ \+ g0
+ * @ + + + + + + @ @
+ * v0 v4 s0
+ */
+ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+{
+ ssef s = mix(p, q, shuffle<0>(f));
+ ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+ return mix(g, shuffle<1>(g), shuffle<2>(f));
+}
+
+/* 3D and 4D noise can be accelerated using AVX, so we first check if AVX
+ * is supported, that is, if __KERNEL_AVX__ is defined. If it is not
+ * supported, we do an SSE implementation, but if it is supported,
+ * we do an implementation using AVX intrinsics.
+ */
+# if !defined(__KERNEL_AVX__)
+
+ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+{
+ ssei h = hash & 15;
+ ssef u = select(h < 8, x, y);
+ ssef vt = select((h == 12) | (h == 14), x, z);
+ ssef v = select(h < 4, y, vt);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
+}
+
+ccl_device_inline ssef
+grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+{
+ ssei h = hash & 31;
+ ssef u = select(h < 24, x, y);
+ ssef v = select(h < 16, y, z);
+ ssef s = select(h < 8, z, w);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
+}
- float result;
-
- result = nerp(
- w,
- nerp(v,
- nerp(u, grad(hash(X, Y, Z), fx, fy, fz), grad(hash(X + 1, Y, Z), fx - 1.0f, fy, fz)),
- nerp(u,
- grad(hash(X, Y + 1, Z), fx, fy - 1.0f, fz),
- grad(hash(X + 1, Y + 1, Z), fx - 1.0f, fy - 1.0f, fz))),
- nerp(v,
- nerp(u,
- grad(hash(X, Y, Z + 1), fx, fy, fz - 1.0f),
- grad(hash(X + 1, Y, Z + 1), fx - 1.0f, fy, fz - 1.0f)),
- nerp(u,
- grad(hash(X, Y + 1, Z + 1), fx, fy - 1.0f, fz - 1.0f),
- grad(hash(X + 1, Y + 1, Z + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f))));
- float r = scale3(result);
-
- /* can happen for big coordinates, things even out to 0.0 then anyway */
- return (isfinite(r)) ? r : 0.0f;
-}
-#else
-ccl_device_noinline float perlin(float x, float y, float z)
-{
- ssef xyz = ssef(x, y, z, 0.0f);
+/* SSE Quadrilinear Interpolation:
+ *
+ * Quadrilinear interpolation is as simple as a linear interpolation
+ * between two trilinear interpolations.
+ *
+ */
+ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+{
+ return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
+}
+
+/* We use SSE to compute and interpolate 4 gradients at once. Since we have 8
+ * gradients in 3D, we need to compute two sets of gradients at the points:
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0)
+ * v1 (0, 0, 1)
+ * v2 (0, 1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v3 (0, 1, 1) ^
+ * | |__________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ * Point Offset from v0
+ * v4 (1, 0, 0)
+ * v5 (1, 0, 1)
+ * v6 (1, 1, 0)
+ * v7 (1, 1, 1)
+ *
+ */
+ccl_device_noinline float perlin_3d(float x, float y, float z)
+{
ssei XYZ;
+ ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
+ ssef uvw = fade(fxyz);
+
+ ssei XYZ1 = XYZ + 1;
+ ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
- ssef fxyz = floorfrac_sse(xyz, &XYZ);
+ ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
+ ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
- ssef uvw = fade_sse(&fxyz);
- ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw);
+ ssef fxyz1 = fxyz - 1.0f;
+ ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
- ssei XYZ_ofc = XYZ + ssei(1);
- ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1
- ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
+ ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
+ ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
- ssei h1 = hash_sse(shuffle<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011
- ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111
+ return extract<0>(tri_mix(g1, g2, uvw));
+}
- ssef fxyz_ofc = fxyz - ssef(1.0f);
- ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
- ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
+/* We use SSE to compute and interpolate 4 gradients at once. Since we have 16
+ * gradients in 4D, we need to compute four sets of gradients at the points:
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0, 0)
+ * v1 (0, 0, 1, 0)
+ * v2 (0, 1, 0, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v3 (0, 1, 1, 0) ^
+ * | |________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |_______________________|
+ *
+ * Point Offset from v0
+ * v4 (1, 0, 0, 0)
+ * v5 (1, 0, 1, 0)
+ * v6 (1, 1, 0, 0)
+ * v7 (1, 1, 1, 0)
+ *
+ * Point Offset from v0
+ * v8 (0, 0, 0, 1)
+ * v9 (0, 0, 1, 1)
+ * v10 (0, 1, 0, 1)
+ * v11 (0, 1, 1, 1)
+ *
+ * Point Offset from v0
+ * v12 (1, 0, 0, 1)
+ * v13 (1, 0, 1, 1)
+ * v14 (1, 1, 0, 1)
+ * v15 (1, 1, 1, 1)
+ *
+ */
+ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+{
+ ssei XYZW;
+ ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
+ ssef uvws = fade(fxyzw);
+
+ ssei XYZW1 = XYZW + 1;
+ ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+
+ ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
+ ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
- ssef g1 = grad_sse(h1, shuffle<0>(fxyz), vfy, vfz);
- ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz);
- ssef n1 = nerp_sse(u, g1, g2);
+ ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
+ ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
- ssef n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector
- ssef n2 = nerp_sse(
- v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+ ssef fxyzw1 = fxyzw - 1.0f;
+ ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
- ssef n2_second = shuffle<1>(n2); // extract b to a separate vector
- ssef result = nerp_sse(
- w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+ ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
+ ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
- ssef r = scale3_sse(result);
+ ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
+ ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
- ssef infmask = cast(ssei(0x7f800000));
- ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0
- ssef rfinite = andnot(rinfmask, r); // 0 if r is inf/-inf/nan else r
- return extract<0>(rfinite);
+ return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
}
-#endif
-/* perlin noise in range 0..1 */
-ccl_device float noise(float3 p)
+# else /* AVX is supported. */
+
+/* AVX Implementation */
+
+ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
{
- float r = perlin(p.x, p.y, p.z);
- return 0.5f * r + 0.5f;
+ avxi h = hash & 15;
+ avxf u = select(h < 8, x, y);
+ avxf vt = select((h == 12) | (h == 14), x, z);
+ avxf v = select(h < 4, y, vt);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
-/* perlin noise in range -1..1 */
-ccl_device float snoise(float3 p)
+ccl_device_inline avxf
+grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
{
- return perlin(p.x, p.y, p.z);
+ avxi h = hash & 31;
+ avxf u = select(h < 24, x, y);
+ avxf v = select(h < 16, y, z);
+ avxf s = select(h < 8, z, w);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
-/* cell noise */
-ccl_device float cellnoise(float3 p)
+/* AVX Quadrilinear Interpolation:
+ *
+ * The interpolation is done in two steps:
+ * 1. Interpolate p and q along the w axis to get s.
+ * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
+ * value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
+ * low and high ssef from s.
+ *
+ */
+ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
{
- int3 ip = quick_floor_to_int3(p);
- return bits_to_01(hash(ip.x, ip.y, ip.z));
+ ssef fv = shuffle<3>(f);
+ avxf s = mix(p, q, avxf(fv, fv));
+ return tri_mix(low(s), high(s), f);
}
-ccl_device float3 cellnoise3(float3 p)
+/* We use AVX to compute and interpolate 8 gradients at once.
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0)
+ * v1 (0, 0, 1) The full AVX type is computed by inserting the following
+ * v2 (0, 1, 0) SSE types into both the low and high parts of the AVX.
+ * v3 (0, 1, 1)
+ * v4 (1, 0, 0)
+ * v5 (1, 0, 1) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v6 (1, 1, 0) ^
+ * v7 (1, 1, 1) |
+ * | |__________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ */
+ccl_device_noinline float perlin_3d(float x, float y, float z)
{
- int3 ip = quick_floor_to_int3(p);
-#ifndef __KERNEL_SSE__
- float r = bits_to_01(hash(ip.x, ip.y, ip.z));
- float g = bits_to_01(hash(ip.y, ip.x, ip.z));
- float b = bits_to_01(hash(ip.y, ip.z, ip.x));
- return make_float3(r, g, b);
-#else
- ssei ip_yxz = shuffle<1, 0, 2, 3>(ssei(ip.m128));
- ssei ip_xyy = shuffle<0, 1, 1, 3>(ssei(ip.m128));
- ssei ip_zzx = shuffle<2, 2, 0, 3>(ssei(ip.m128));
- ssei bits = hash_sse(ip_xyy, ip_yxz, ip_zzx);
- return float3(uint32_to_float(bits) * ssef(1.0f / (float)0xFFFFFFFF));
+ ssei XYZ;
+ ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
+ ssef uvw = fade(fxyz);
+
+ ssei XYZ1 = XYZ + 1;
+ ssei X = shuffle<0>(XYZ);
+ ssei X1 = shuffle<0>(XYZ1);
+ ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+
+ avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
+
+ ssef fxyz1 = fxyz - 1.0f;
+ ssef fx = shuffle<0>(fxyz);
+ ssef fx1 = shuffle<0>(fxyz1);
+ ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+
+ avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
+
+ return extract<0>(tri_mix(low(g), high(g), uvw));
+}
+
+/* We use AVX to compute and interpolate 8 gradients at once. Since we have 16
+ * gradients in 4D, we need to compute two sets of gradients at the points:
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0, 0)
+ * v1 (0, 0, 1, 0) The full avx type is computed by inserting the following
+ * v2 (0, 1, 0, 0) sse types into both the low and high parts of the avx.
+ * v3 (0, 1, 1, 0)
+ * v4 (1, 0, 0, 0)
+ * v5 (1, 0, 1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v6 (1, 1, 0, 0) ^
+ * v7 (1, 1, 1, 0) |
+ * | |________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |_______________________|
+ *
+ * Point Offset from v0
+ * v8 (0, 0, 0, 1)
+ * v9 (0, 0, 1, 1)
+ * v10 (0, 1, 0, 1)
+ * v11 (0, 1, 1, 1)
+ * v12 (1, 0, 0, 1)
+ * v13 (1, 0, 1, 1)
+ * v14 (1, 1, 0, 1)
+ * v15 (1, 1, 1, 1)
+ *
+ */
+ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+{
+ ssei XYZW;
+ ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
+ ssef uvws = fade(fxyzw);
+
+ ssei XYZW1 = XYZW + 1;
+ ssei X = shuffle<0>(XYZW);
+ ssei X1 = shuffle<0>(XYZW1);
+ ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+ ssei W = shuffle<3>(XYZW);
+ ssei W1 = shuffle<3>(XYZW1);
+
+ avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
+ avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
+
+ ssef fxyzw1 = fxyzw - 1.0f;
+ ssef fx = shuffle<0>(fxyzw);
+ ssef fx1 = shuffle<0>(fxyzw1);
+ ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+ ssef fw = shuffle<3>(fxyzw);
+ ssef fw1 = shuffle<3>(fxyzw1);
+
+ avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
+ avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
+
+ return extract<0>(quad_mix(g1, g2, uvws));
+}
+# endif
+
+# undef negate_if_nth_bit
+
#endif
+
+/* Remap the output of noise to a predictable range [-1, 1].
+ * The scale values were computed experimentally by the OSL developers.
+ */
+
+ccl_device_inline float noise_scale1(float result)
+{
+ return 0.2500f * result;
+}
+
+ccl_device_inline float noise_scale2(float result)
+{
+ return 0.6616f * result;
+}
+
+ccl_device_inline float noise_scale3(float result)
+{
+ return 0.9820f * result;
+}
+
+ccl_device_inline float noise_scale4(float result)
+{
+ return 0.8344f * result;
+}
+
+/* Safe Signed And Unsigned Noise */
+
+ccl_device_inline float snoise_1d(float p)
+{
+ return noise_scale1(ensure_finite(perlin_1d(p)));
+}
+
+ccl_device_inline float noise_1d(float p)
+{
+ return 0.5f * snoise_1d(p) + 0.5f;
+}
+
+ccl_device_inline float snoise_2d(float2 p)
+{
+ return noise_scale2(ensure_finite(perlin_2d(p.x, p.y)));
+}
+
+ccl_device_inline float noise_2d(float2 p)
+{
+ return 0.5f * snoise_2d(p) + 0.5f;
+}
+
+ccl_device_inline float snoise_3d(float3 p)
+{
+ return noise_scale3(ensure_finite(perlin_3d(p.x, p.y, p.z)));
+}
+
+ccl_device_inline float noise_3d(float3 p)
+{
+ return 0.5f * snoise_3d(p) + 0.5f;
+}
+
+ccl_device_inline float snoise_4d(float4 p)
+{
+ return noise_scale4(ensure_finite(perlin_4d(p.x, p.y, p.z, p.w)));
+}
+
+ccl_device_inline float noise_4d(float4 p)
+{
+ return 0.5f * snoise_4d(p) + 0.5f;
}
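
The snoise_*d functions return signed noise in approximately [-1, 1] once the experimental scale factors are applied, and each noise_*d counterpart remaps that to [0, 1] via 0.5 * s + 0.5. A trivial check of the remapping in plain C++ (with placeholder inputs rather than real noise values):

#include <cstdio>

// Remap a signed noise value in [-1, 1] to the unsigned range [0, 1].
static float signed_to_unsigned(float snoise) { return 0.5f * snoise + 0.5f; }

int main()
{
  printf("%f\n", signed_to_unsigned(-1.0f));  // 0.0
  printf("%f\n", signed_to_unsigned(0.0f));   // 0.5
  printf("%f\n", signed_to_unsigned(1.0f));   // 1.0
  return 0;
}
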
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 3324e86fcd8..920dd7d9d02 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -16,44 +16,201 @@
CCL_NAMESPACE_BEGIN
-/* Noise */
+/* The following offset functions generate random offsets to be added to texture
+ * coordinates to act as a seed since the noise functions don't have seed values.
+ * A seed value is needed for generating distortion textures and color outputs.
+ * The offset's components are in the range [100, 200]: not too high to cause
+ * bad precision, and not too small to be noticeable. We use a float seed because
+ * OSL only supports float hashes.
+ */
+
+ccl_device_inline float random_float_offset(float seed)
+{
+ return 100.0f + hash_float_to_float(seed) * 100.0f;
+}
+
+ccl_device_inline float2 random_float2_offset(float seed)
+{
+ return make_float2(100.0f + hash_float2_to_float(make_float2(seed, 0.0f)) * 100.0f,
+ 100.0f + hash_float2_to_float(make_float2(seed, 1.0f)) * 100.0f);
+}
-ccl_device void svm_node_tex_noise(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_inline float3 random_float3_offset(float seed)
{
- uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset;
+ return make_float3(100.0f + hash_float2_to_float(make_float2(seed, 0.0f)) * 100.0f,
+ 100.0f + hash_float2_to_float(make_float2(seed, 1.0f)) * 100.0f,
+ 100.0f + hash_float2_to_float(make_float2(seed, 2.0f)) * 100.0f);
+}
- decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset);
- decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
+ccl_device_inline float4 random_float4_offset(float seed)
+{
+ return make_float4(100.0f + hash_float2_to_float(make_float2(seed, 0.0f)) * 100.0f,
+ 100.0f + hash_float2_to_float(make_float2(seed, 1.0f)) * 100.0f,
+ 100.0f + hash_float2_to_float(make_float2(seed, 2.0f)) * 100.0f,
+ 100.0f + hash_float2_to_float(make_float2(seed, 3.0f)) * 100.0f);
+}
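
Each offset component is 100 + hash(seed) * 100, so with the hash in [0, 1] the offsets land in the [100, 200] range mentioned in the comment above. A standalone sketch with a toy integer-mixing hash standing in for the Cycles hash_float_to_float:

#include <cstdio>
#include <cstdint>
#include <cstring>

// Hypothetical stand-in for a float hash returning values in [0, 1).
static float toy_hash(float seed)
{
  uint32_t bits;
  std::memcpy(&bits, &seed, sizeof(bits));
  bits ^= bits >> 16;
  bits *= 0x7feb352du;                       // a common integer-mixing constant
  bits ^= bits >> 15;
  return (bits & 0xffffffu) / 16777216.0f;   // 24 random bits -> [0, 1)
}

// Mirror of random_float_offset: shift the hash into [100, 200).
static float random_offset(float seed) { return 100.0f + toy_hash(seed) * 100.0f; }

int main()
{
  for (int i = 0; i < 4; i++)
    printf("offset(%d) = %f\n", i, random_offset((float)i));
  return 0;
}
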
- uint4 node2 = read_node(kg, offset);
+ccl_device void noise_texture_1d(float co,
+ float detail,
+ float roughness,
+ float distortion,
+ bool color_is_needed,
+ float *value,
+ float3 *color)
+{
+ float p = co;
+ if (distortion != 0.0f) {
+ p += snoise_1d(p + random_float_offset(0.0f)) * distortion;
+ }
+
+ *value = fractal_noise_1d(p, detail, roughness);
+ if (color_is_needed) {
+ *color = make_float3(*value,
+ fractal_noise_1d(p + random_float_offset(1.0f), detail, roughness),
+ fractal_noise_1d(p + random_float_offset(2.0f), detail, roughness));
+ }
+}
+
+ccl_device void noise_texture_2d(float2 co,
+ float detail,
+ float roughness,
+ float distortion,
+ bool color_is_needed,
+ float *value,
+ float3 *color)
+{
+ float2 p = co;
+ if (distortion != 0.0f) {
+ p += make_float2(snoise_2d(p + random_float2_offset(0.0f)) * distortion,
+ snoise_2d(p + random_float2_offset(1.0f)) * distortion);
+ }
- float scale = stack_load_float_default(stack, scale_offset, node2.x);
- float detail = stack_load_float_default(stack, detail_offset, node2.y);
- float distortion = stack_load_float_default(stack, distortion_offset, node2.z);
- float3 p = stack_load_float3(stack, co_offset) * scale;
- int hard = 0;
+ *value = fractal_noise_2d(p, detail, roughness);
+ if (color_is_needed) {
+ *color = make_float3(*value,
+ fractal_noise_2d(p + random_float2_offset(2.0f), detail, roughness),
+ fractal_noise_2d(p + random_float2_offset(3.0f), detail, roughness));
+ }
+}
+ccl_device void noise_texture_3d(float3 co,
+ float detail,
+ float roughness,
+ float distortion,
+ bool color_is_needed,
+ float *value,
+ float3 *color)
+{
+ float3 p = co;
if (distortion != 0.0f) {
- float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
+ p += make_float3(snoise_3d(p + random_float3_offset(0.0f)) * distortion,
+ snoise_3d(p + random_float3_offset(1.0f)) * distortion,
+ snoise_3d(p + random_float3_offset(2.0f)) * distortion);
+ }
- r.x = noise(p + offset) * distortion;
- r.y = noise(p) * distortion;
- r.z = noise(p - offset) * distortion;
+ *value = fractal_noise_3d(p, detail, roughness);
+ if (color_is_needed) {
+ *color = make_float3(*value,
+ fractal_noise_3d(p + random_float3_offset(3.0f), detail, roughness),
+ fractal_noise_3d(p + random_float3_offset(4.0f), detail, roughness));
+ }
+}
- p += r;
+ccl_device void noise_texture_4d(float4 co,
+ float detail,
+ float roughness,
+ float distortion,
+ bool color_is_needed,
+ float *value,
+ float3 *color)
+{
+ float4 p = co;
+ if (distortion != 0.0f) {
+ p += make_float4(snoise_4d(p + random_float4_offset(0.0f)) * distortion,
+ snoise_4d(p + random_float4_offset(1.0f)) * distortion,
+ snoise_4d(p + random_float4_offset(2.0f)) * distortion,
+ snoise_4d(p + random_float4_offset(3.0f)) * distortion);
}
- float f = noise_turbulence(p, detail, hard);
+ *value = fractal_noise_4d(p, detail, roughness);
+ if (color_is_needed) {
+ *color = make_float3(*value,
+ fractal_noise_4d(p + random_float4_offset(4.0f), detail, roughness),
+ fractal_noise_4d(p + random_float4_offset(5.0f), detail, roughness));
+ }
+}
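
All four noise_texture_*d variants above share one pattern: optionally distort the coordinate by adding signed noise sampled at seeded offsets, then evaluate fractal noise once for the value output and twice more at offset coordinates for the extra color channels. A compact plain C++ sketch of that control flow, with toy stand-ins for the noise calls (the constants 150, 125 and 175 are placeholders for the random offsets, not the real values):

#include <cstdio>
#include <cmath>

// Trivial stand-ins, not the Cycles snoise_1d / fractal_noise_1d.
static float toy_snoise(float p) { return std::sin(12.9898f * p) * 0.5f; }
static float toy_fractal(float p) { return 0.5f * toy_snoise(p) + 0.5f; }

// Same control flow as noise_texture_1d above: distort the coordinate first,
// then evaluate the value channel plus two decorrelated channels for color.
static void toy_noise_texture(float co, float distortion, bool need_color,
                              float *value, float color[3])
{
  float p = co;
  if (distortion != 0.0f)
    p += toy_snoise(p + 150.0f) * distortion;  // 150.0f stands in for random_float_offset(0.0f)

  *value = toy_fractal(p);
  if (need_color) {
    color[0] = *value;
    color[1] = toy_fractal(p + 125.0f);  // offsets decorrelate the channels
    color[2] = toy_fractal(p + 175.0f);
  }
}

int main()
{
  float value, color[3];
  toy_noise_texture(0.37f, 1.0f, true, &value, color);
  printf("value = %f, color = (%f, %f, %f)\n", value, color[0], color[1], color[2]);
  return 0;
}
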
+
+ccl_device void svm_node_tex_noise(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint offsets1,
+ uint offsets2,
+ int *offset)
+{
+ uint vector_stack_offset, w_stack_offset, scale_stack_offset;
+ uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset;
+ uint value_stack_offset, color_stack_offset;
+
+ svm_unpack_node_uchar4(
+ offsets1, &vector_stack_offset, &w_stack_offset, &scale_stack_offset, &detail_stack_offset);
+ svm_unpack_node_uchar4(offsets2,
+ &roughness_stack_offset,
+ &distortion_stack_offset,
+ &value_stack_offset,
+ &color_stack_offset);
+
+ uint4 defaults1 = read_node(kg, offset);
+ uint4 defaults2 = read_node(kg, offset);
+
+ float3 vector = stack_load_float3(stack, vector_stack_offset);
+ float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
+ float scale = stack_load_float_default(stack, scale_stack_offset, defaults1.y);
+ float detail = stack_load_float_default(stack, detail_stack_offset, defaults1.z);
+ float roughness = stack_load_float_default(stack, roughness_stack_offset, defaults1.w);
+ float distortion = stack_load_float_default(stack, distortion_stack_offset, defaults2.x);
+
+ vector *= scale;
+ w *= scale;
+
+ float value;
+ float3 color;
+ switch (dimensions) {
+ case 1:
+ noise_texture_1d(
+ w, detail, roughness, distortion, stack_valid(color_stack_offset), &value, &color);
+ break;
+ case 2:
+ noise_texture_2d(make_float2(vector.x, vector.y),
+ detail,
+ roughness,
+ distortion,
+ stack_valid(color_stack_offset),
+ &value,
+ &color);
+ break;
+ case 3:
+ noise_texture_3d(
+ vector, detail, roughness, distortion, stack_valid(color_stack_offset), &value, &color);
+ break;
+ case 4:
+ noise_texture_4d(make_float4(vector.x, vector.y, vector.z, w),
+ detail,
+ roughness,
+ distortion,
+ stack_valid(color_stack_offset),
+ &value,
+ &color);
+ break;
+ default:
+ kernel_assert(0);
+ }
- if (stack_valid(fac_offset)) {
- stack_store_float(stack, fac_offset, f);
+ if (stack_valid(value_stack_offset)) {
+ stack_store_float(stack, value_stack_offset, value);
}
- if (stack_valid(color_offset)) {
- float3 color = make_float3(f,
- noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
- noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
- stack_store_float3(stack, color_offset, color);
+ if (stack_valid(color_stack_offset)) {
+ stack_store_float3(stack, color_stack_offset, color);
}
}
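
svm_node_tex_noise relies on svm_unpack_node_uchar4 and svm_unpack_node_uchar3 to recover several small stack offsets that were packed into a single uint. The helper itself is defined elsewhere in the kernel headers and is not part of the hunks shown here; the sketch below assumes the conventional one-byte-per-field packing that the uchar naming implies:

#include <cstdio>
#include <cstdint>

// Pack four small stack offsets (each < 256) into one 32-bit word, one per byte,
// and unpack them again (sketch only, not the real kernel helper).
static uint32_t pack_uchar4(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
  return (x & 0xFF) | ((y & 0xFF) << 8) | ((z & 0xFF) << 16) | ((w & 0xFF) << 24);
}

static void unpack_uchar4(uint32_t i, uint32_t *x, uint32_t *y, uint32_t *z, uint32_t *w)
{
  *x = i & 0xFF;
  *y = (i >> 8) & 0xFF;
  *z = (i >> 16) & 0xFF;
  *w = (i >> 24) & 0xFF;
}

int main()
{
  uint32_t packed = pack_uchar4(3, 7, 12, 255);
  uint32_t a, b, c, d;
  unpack_uchar4(packed, &a, &b, &c, &d);
  printf("%u %u %u %u\n", a, b, c, d);  // 3 7 12 255
  return 0;
}
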
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 6084ee35a1f..85ccf39144b 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -59,7 +59,7 @@ ccl_device void svm_node_rgb_ramp(
uint fac_offset, color_offset, alpha_offset;
uint interpolate = node.z;
- decode_node_uchar4(node.y, &fac_offset, &color_offset, &alpha_offset, NULL);
+ svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset);
uint table_size = read_node(kg, offset).x;
@@ -78,7 +78,7 @@ ccl_device void svm_node_curves(
KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
{
uint fac_offset, color_offset, out_offset;
- decode_node_uchar4(node.y, &fac_offset, &color_offset, &out_offset, NULL);
+ svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset);
uint table_size = read_node(kg, offset).x;
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index 50fe0c8232f..f824184c1d4 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -37,16 +37,16 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma)
(1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma);
}
-ccl_device float3 sky_radiance_old(KernelGlobals *kg,
- float3 dir,
- float sunphi,
- float suntheta,
- float radiance_x,
- float radiance_y,
- float radiance_z,
- float *config_x,
- float *config_y,
- float *config_z)
+ccl_device float3 sky_radiance_preetham(KernelGlobals *kg,
+ float3 dir,
+ float sunphi,
+ float suntheta,
+ float radiance_x,
+ float radiance_y,
+ float radiance_z,
+ float *config_x,
+ float *config_y,
+ float *config_z)
{
/* convert vector to spherical coordinates */
float2 spherical = direction_to_spherical(dir);
@@ -90,16 +90,16 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
configuration[6] * mieM + configuration[7] * zenith);
}
-ccl_device float3 sky_radiance_new(KernelGlobals *kg,
- float3 dir,
- float sunphi,
- float suntheta,
- float radiance_x,
- float radiance_y,
- float radiance_z,
- float *config_x,
- float *config_y,
- float *config_z)
+ccl_device float3 sky_radiance_hosek(KernelGlobals *kg,
+ float3 dir,
+ float sunphi,
+ float suntheta,
+ float radiance_x,
+ float radiance_y,
+ float radiance_z,
+ float *config_x,
+ float *config_y,
+ float *config_z)
{
/* convert vector to spherical coordinates */
float2 spherical = direction_to_spherical(dir);
@@ -121,93 +121,209 @@ ccl_device float3 sky_radiance_new(KernelGlobals *kg,
return xyz_to_rgb(kg, make_float3(x, y, z)) * (M_2PI_F / 683);
}
+/* Nishita improved sky model */
+ccl_device float3 geographical_to_direction(float lat, float lon)
+{
+ return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
+}
+
+ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
+ float3 dir,
+ float *nishita_data,
+ uint texture_id)
+{
+ /* definitions */
+ float sun_elevation = nishita_data[6];
+ float sun_rotation = nishita_data[7];
+ float angular_diameter = nishita_data[8];
+ float sun_intensity = nishita_data[9];
+ bool sun_disc = (angular_diameter > 0.0f);
+ float3 xyz;
+ /* convert dir to spherical coordinates */
+ float2 direction = direction_to_spherical(dir);
+
+ /* render above the horizon */
+ if (dir.z >= 0.0f) {
+ /* definitions */
+ float3 sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2_F);
+ float sun_dir_angle = precise_angle(dir, sun_dir);
+ float half_angular = angular_diameter / 2.0f;
+ float dir_elevation = M_PI_2_F - direction.x;
+
+ /* if ray inside sun disc render it, otherwise render sky */
+ if (sun_disc && sun_dir_angle < half_angular) {
+ /* get 2 pixels data */
+ float3 pixel_bottom = make_float3(nishita_data[0], nishita_data[1], nishita_data[2]);
+ float3 pixel_top = make_float3(nishita_data[3], nishita_data[4], nishita_data[5]);
+ float y;
+
+ /* sun interpolation */
+ if (sun_elevation - half_angular > 0.0f) {
+ if (sun_elevation + half_angular > 0.0f) {
+ y = ((dir_elevation - sun_elevation) / angular_diameter) + 0.5f;
+ xyz = interp(pixel_bottom, pixel_top, y) * sun_intensity;
+ }
+ }
+ else {
+ if (sun_elevation + half_angular > 0.0f) {
+ y = dir_elevation / (sun_elevation + half_angular);
+ xyz = interp(pixel_bottom, pixel_top, y) * sun_intensity;
+ }
+ }
+ /* limb darkening, coefficient is 0.6f */
+ float limb_darkening = (1.0f -
+ 0.6f * (1.0f - sqrtf(1.0f - sqr(sun_dir_angle / half_angular))));
+ xyz *= limb_darkening;
+ }
+ /* sky */
+ else {
+ /* sky interpolation */
+ float x = (direction.y + M_PI_F + sun_rotation) / M_2PI_F;
+ /* more pixels toward horizon compensation */
+ float y = safe_sqrtf(dir_elevation / M_PI_2_F);
+ if (x > 1.0f) {
+ x -= 1.0f;
+ }
+ xyz = float4_to_float3(kernel_tex_image_interp(kg, texture_id, x, y));
+ }
+ }
+ /* ground */
+ else {
+ if (dir.z < -0.4f) {
+ xyz = make_float3(0.0f, 0.0f, 0.0f);
+ }
+ else {
+ /* black ground fade */
+ float fade = 1.0f + dir.z * 2.5f;
+ fade = sqr(fade) * fade;
+ /* interpolation */
+ float x = (direction.y + M_PI_F + sun_rotation) / M_2PI_F;
+ if (x > 1.0f) {
+ x -= 1.0f;
+ }
+ xyz = float4_to_float3(kernel_tex_image_interp(kg, texture_id, x, -0.5)) * fade;
+ }
+ }
+
+ /* convert to RGB */
+ return xyz_to_rgb(kg, xyz);
+}
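
The limb darkening applied to the sun disc is the common single-coefficient (linear) model 1 - u * (1 - sqrt(1 - r^2)), with u = 0.6 and r the angular distance from the disc center divided by the disc radius. A short numeric check in plain C++:

#include <cstdio>
#include <cmath>

// Linear limb darkening: brightness relative to the disc center as a function of
// r, the angular distance from the sun center divided by the disc radius.
static float limb_darkening(float r, float u)
{
  return 1.0f - u * (1.0f - std::sqrt(1.0f - r * r));
}

int main()
{
  const float u = 0.6f;  // coefficient used by sky_radiance_nishita above
  printf("center r=0.0 : %f\n", limb_darkening(0.0f, u));  // 1.00
  printf("mid    r=0.5 : %f\n", limb_darkening(0.5f, u));  // ~0.92
  printf("edge   r=1.0 : %f\n", limb_darkening(1.0f, u));  // 0.40
  return 0;
}
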
+
ccl_device void svm_node_tex_sky(
KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
{
- /* Define variables */
- float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
- float config_x[9], config_y[9], config_z[9];
-
/* Load data */
uint dir_offset = node.y;
uint out_offset = node.z;
int sky_model = node.w;
- float4 data = read_node_float(kg, offset);
- sunphi = data.x;
- suntheta = data.y;
- radiance_x = data.z;
- radiance_y = data.w;
-
- data = read_node_float(kg, offset);
- radiance_z = data.x;
- config_x[0] = data.y;
- config_x[1] = data.z;
- config_x[2] = data.w;
-
- data = read_node_float(kg, offset);
- config_x[3] = data.x;
- config_x[4] = data.y;
- config_x[5] = data.z;
- config_x[6] = data.w;
-
- data = read_node_float(kg, offset);
- config_x[7] = data.x;
- config_x[8] = data.y;
- config_y[0] = data.z;
- config_y[1] = data.w;
-
- data = read_node_float(kg, offset);
- config_y[2] = data.x;
- config_y[3] = data.y;
- config_y[4] = data.z;
- config_y[5] = data.w;
-
- data = read_node_float(kg, offset);
- config_y[6] = data.x;
- config_y[7] = data.y;
- config_y[8] = data.z;
- config_z[0] = data.w;
-
- data = read_node_float(kg, offset);
- config_z[1] = data.x;
- config_z[2] = data.y;
- config_z[3] = data.z;
- config_z[4] = data.w;
-
- data = read_node_float(kg, offset);
- config_z[5] = data.x;
- config_z[6] = data.y;
- config_z[7] = data.z;
- config_z[8] = data.w;
-
float3 dir = stack_load_float3(stack, dir_offset);
float3 f;
- /* Compute Sky */
- if (sky_model == 0) {
- f = sky_radiance_old(kg,
- dir,
- sunphi,
- suntheta,
- radiance_x,
- radiance_y,
- radiance_z,
- config_x,
- config_y,
- config_z);
+ /* Preetham and Hosek share the same data */
+ if (sky_model == 0 || sky_model == 1) {
+ /* Define variables */
+ float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
+ float config_x[9], config_y[9], config_z[9];
+
+ float4 data = read_node_float(kg, offset);
+ sunphi = data.x;
+ suntheta = data.y;
+ radiance_x = data.z;
+ radiance_y = data.w;
+
+ data = read_node_float(kg, offset);
+ radiance_z = data.x;
+ config_x[0] = data.y;
+ config_x[1] = data.z;
+ config_x[2] = data.w;
+
+ data = read_node_float(kg, offset);
+ config_x[3] = data.x;
+ config_x[4] = data.y;
+ config_x[5] = data.z;
+ config_x[6] = data.w;
+
+ data = read_node_float(kg, offset);
+ config_x[7] = data.x;
+ config_x[8] = data.y;
+ config_y[0] = data.z;
+ config_y[1] = data.w;
+
+ data = read_node_float(kg, offset);
+ config_y[2] = data.x;
+ config_y[3] = data.y;
+ config_y[4] = data.z;
+ config_y[5] = data.w;
+
+ data = read_node_float(kg, offset);
+ config_y[6] = data.x;
+ config_y[7] = data.y;
+ config_y[8] = data.z;
+ config_z[0] = data.w;
+
+ data = read_node_float(kg, offset);
+ config_z[1] = data.x;
+ config_z[2] = data.y;
+ config_z[3] = data.z;
+ config_z[4] = data.w;
+
+ data = read_node_float(kg, offset);
+ config_z[5] = data.x;
+ config_z[6] = data.y;
+ config_z[7] = data.z;
+ config_z[8] = data.w;
+
+ /* Compute Sky */
+ if (sky_model == 0) {
+ f = sky_radiance_preetham(kg,
+ dir,
+ sunphi,
+ suntheta,
+ radiance_x,
+ radiance_y,
+ radiance_z,
+ config_x,
+ config_y,
+ config_z);
+ }
+ else {
+ f = sky_radiance_hosek(kg,
+ dir,
+ sunphi,
+ suntheta,
+ radiance_x,
+ radiance_y,
+ radiance_z,
+ config_x,
+ config_y,
+ config_z);
+ }
}
+ /* Nishita */
else {
- f = sky_radiance_new(kg,
- dir,
- sunphi,
- suntheta,
- radiance_x,
- radiance_y,
- radiance_z,
- config_x,
- config_y,
- config_z);
+ /* Define variables */
+ float nishita_data[10];
+
+ float4 data = read_node_float(kg, offset);
+ nishita_data[0] = data.x;
+ nishita_data[1] = data.y;
+ nishita_data[2] = data.z;
+ nishita_data[3] = data.w;
+
+ data = read_node_float(kg, offset);
+ nishita_data[4] = data.x;
+ nishita_data[5] = data.y;
+ nishita_data[6] = data.z;
+ nishita_data[7] = data.w;
+
+ data = read_node_float(kg, offset);
+ nishita_data[8] = data.x;
+ nishita_data[9] = data.y;
+ uint texture_id = __float_as_uint(data.z);
+
+ /* Compute Sky */
+ f = sky_radiance_nishita(kg, dir, nishita_data, texture_id);
}
stack_store_float3(stack, out_offset, f);
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 1fb3e20f9e0..a876d6bc916 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -257,7 +257,7 @@ ccl_device void svm_node_tex_coord_bump_dy(
ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
uint color_offset, strength_offset, normal_offset, space;
- decode_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
+ svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
float3 color = stack_load_float3(stack, color_offset);
color = 2.0f * make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f);
@@ -349,7 +349,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
uint tangent_offset, direction_type, axis;
- decode_node_uchar4(node.y, &tangent_offset, &direction_type, &axis, NULL);
+ svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis);
float3 tangent;
float3 attribute_value;
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
deleted file mode 100644
index 290aa85c831..00000000000
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Turbulence */
-
-ccl_device_noinline float noise_turbulence(float3 p, float octaves, int hard)
-{
- float fscale = 1.0f;
- float amp = 1.0f;
- float sum = 0.0f;
- int i, n;
-
- octaves = clamp(octaves, 0.0f, 16.0f);
- n = float_to_int(octaves);
-
- for (i = 0; i <= n; i++) {
- float t = noise(fscale * p);
-
- if (hard)
- t = fabsf(2.0f * t - 1.0f);
-
- sum += t * amp;
- amp *= 0.5f;
- fscale *= 2.0f;
- }
-
- float rmd = octaves - floorf(octaves);
-
- if (rmd != 0.0f) {
- float t = noise(fscale * p);
-
- if (hard)
- t = fabsf(2.0f * t - 1.0f);
-
- float sum2 = sum + t * amp;
-
- sum *= ((float)(1 << n) / (float)((1 << (n + 1)) - 1));
- sum2 *= ((float)(1 << (n + 1)) / (float)((1 << (n + 2)) - 1));
-
- return (1.0f - rmd) * sum + rmd * sum2;
- }
- else {
- sum *= ((float)(1 << n) / (float)((1 << (n + 1)) - 1));
- return sum;
- }
-}
-
-CCL_NAMESPACE_END
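
The deleted noise_turbulence above summed octaves of noise with a hardcoded 0.5 amplitude falloff per octave. Its replacements, the fractal_noise_*d functions called from svm_noisetex.h above, are not part of the hunks shown here; the sketch below assumes, as the new call sites suggest, that the roughness input takes over the role of the per-octave falloff, and it omits the fractional-octave blending the deleted code performed:

#include <cstdio>
#include <cmath>

// Stand-in 1D signed noise in [-1, 1] (not the kernel's snoise_1d).
static float toy_snoise(float p) { return std::sin(12.9898f * p + std::sin(4.1414f * p)); }

// Octave-summing fractal noise, normalized to roughly [0, 1]. The per-octave
// amplitude falloff comes from "roughness"; the deleted noise_turbulence
// hardcoded it to 0.5.
static float toy_fractal_noise(float p, float octaves, float roughness)
{
  float fscale = 1.0f, amp = 1.0f, sum = 0.0f, max_amp = 0.0f;
  int n = (int)octaves;
  for (int i = 0; i <= n; i++) {
    sum += amp * (0.5f * toy_snoise(fscale * p) + 0.5f);
    max_amp += amp;
    amp *= roughness;
    fscale *= 2.0f;
  }
  return sum / max_amp;
}

int main()
{
  printf("smooth  (roughness 0.2): %f\n", toy_fractal_noise(0.73f, 4.0f, 0.2f));
  printf("rougher (roughness 0.8): %f\n", toy_fractal_noise(0.73f, 4.0f, 0.8f));
  return 0;
}
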
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index d31e4f93696..f1ebb37e23e 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -42,108 +42,126 @@ CCL_NAMESPACE_BEGIN
#define NODE_GROUP_LEVEL_1 1
#define NODE_GROUP_LEVEL_2 2
#define NODE_GROUP_LEVEL_3 3
-#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_3
+#define NODE_GROUP_LEVEL_4 4
+#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4
#define NODE_FEATURE_VOLUME (1 << 0)
#define NODE_FEATURE_HAIR (1 << 1)
#define NODE_FEATURE_BUMP (1 << 2)
#define NODE_FEATURE_BUMP_STATE (1 << 3)
+#define NODE_FEATURE_VORONOI_EXTRA (1 << 4)
/* TODO(sergey): Consider using something like ((uint)(-1)).
* Need to check carefully operand types around usage of this
* define first.
*/
#define NODE_FEATURE_ALL \
- (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE)
+ (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \
+ NODE_FEATURE_VORONOI_EXTRA)
+
+#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
+#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
typedef enum ShaderNodeType {
NODE_END = 0,
+ NODE_SHADER_JUMP,
NODE_CLOSURE_BSDF,
NODE_CLOSURE_EMISSION,
NODE_CLOSURE_BACKGROUND,
NODE_CLOSURE_SET_WEIGHT,
NODE_CLOSURE_WEIGHT,
+ NODE_EMISSION_WEIGHT,
NODE_MIX_CLOSURE,
NODE_JUMP_IF_ZERO,
NODE_JUMP_IF_ONE,
- NODE_TEX_IMAGE,
- NODE_TEX_IMAGE_BOX,
- NODE_TEX_SKY,
NODE_GEOMETRY,
- NODE_GEOMETRY_DUPLI,
- NODE_LIGHT_PATH,
+ NODE_CONVERT,
+ NODE_TEX_COORD,
NODE_VALUE_F,
NODE_VALUE_V,
- NODE_MIX,
NODE_ATTR,
- NODE_CONVERT,
- NODE_FRESNEL,
- NODE_WIREFRAME,
- NODE_WAVELENGTH,
- NODE_BLACKBODY,
- NODE_EMISSION_WEIGHT,
- NODE_TEX_GRADIENT,
- NODE_TEX_VORONOI,
- NODE_TEX_MUSGRAVE,
- NODE_TEX_WAVE,
- NODE_TEX_MAGIC,
- NODE_TEX_NOISE,
- NODE_SHADER_JUMP,
- NODE_SET_DISPLACEMENT,
+ NODE_VERTEX_COLOR,
NODE_GEOMETRY_BUMP_DX,
NODE_GEOMETRY_BUMP_DY,
+ NODE_SET_DISPLACEMENT,
+ NODE_DISPLACEMENT,
+ NODE_VECTOR_DISPLACEMENT,
+ NODE_TEX_IMAGE,
+ NODE_TEX_IMAGE_BOX,
+ NODE_TEX_NOISE,
NODE_SET_BUMP,
- NODE_MATH,
- NODE_VECTOR_MATH,
- NODE_VECTOR_TRANSFORM,
- NODE_MAPPING,
- NODE_TEX_COORD,
- NODE_TEX_COORD_BUMP_DX,
- NODE_TEX_COORD_BUMP_DY,
NODE_ATTR_BUMP_DX,
NODE_ATTR_BUMP_DY,
- NODE_TEX_ENVIRONMENT,
+ NODE_VERTEX_COLOR_BUMP_DX,
+ NODE_VERTEX_COLOR_BUMP_DY,
+ NODE_TEX_COORD_BUMP_DX,
+ NODE_TEX_COORD_BUMP_DY,
+ NODE_CLOSURE_SET_NORMAL,
+ NODE_ENTER_BUMP_EVAL,
+ NODE_LEAVE_BUMP_EVAL,
+ NODE_HSV,
NODE_CLOSURE_HOLDOUT,
+ NODE_FRESNEL,
NODE_LAYER_WEIGHT,
NODE_CLOSURE_VOLUME,
- NODE_SEPARATE_VECTOR,
- NODE_COMBINE_VECTOR,
- NODE_SEPARATE_HSV,
- NODE_COMBINE_HSV,
- NODE_HSV,
- NODE_CAMERA,
- NODE_INVERT,
- NODE_NORMAL,
+ NODE_PRINCIPLED_VOLUME,
+ NODE_MATH,
+ NODE_VECTOR_MATH,
+ NODE_RGB_RAMP,
NODE_GAMMA,
- NODE_TEX_CHECKER,
NODE_BRIGHTCONTRAST,
- NODE_RGB_RAMP,
- NODE_RGB_CURVES,
- NODE_VECTOR_CURVES,
- NODE_MIN_MAX,
- NODE_LIGHT_FALLOFF,
+ NODE_LIGHT_PATH,
NODE_OBJECT_INFO,
NODE_PARTICLE_INFO,
+ NODE_HAIR_INFO,
+ NODE_TEXTURE_MAPPING,
+ NODE_MAPPING,
+ NODE_MIN_MAX,
+ NODE_CAMERA,
+ NODE_TEX_ENVIRONMENT,
+ NODE_TEX_SKY,
+ NODE_TEX_GRADIENT,
+ NODE_TEX_VORONOI,
+ NODE_TEX_MUSGRAVE,
+ NODE_TEX_WAVE,
+ NODE_TEX_MAGIC,
+ NODE_TEX_CHECKER,
NODE_TEX_BRICK,
- NODE_CLOSURE_SET_NORMAL,
- NODE_AMBIENT_OCCLUSION,
+ NODE_TEX_WHITE_NOISE,
+ NODE_NORMAL,
+ NODE_LIGHT_FALLOFF,
+ NODE_IES,
+ NODE_RGB_CURVES,
+ NODE_VECTOR_CURVES,
NODE_TANGENT,
NODE_NORMAL_MAP,
- NODE_HAIR_INFO,
- NODE_UVMAP,
- NODE_TEX_VOXEL,
- NODE_ENTER_BUMP_EVAL,
- NODE_LEAVE_BUMP_EVAL,
+ NODE_INVERT,
+ NODE_MIX,
+ NODE_SEPARATE_VECTOR,
+ NODE_COMBINE_VECTOR,
+ NODE_SEPARATE_HSV,
+ NODE_COMBINE_HSV,
+ NODE_VECTOR_ROTATE,
+ NODE_VECTOR_TRANSFORM,
+ NODE_WIREFRAME,
+ NODE_WAVELENGTH,
+ NODE_BLACKBODY,
+ NODE_MAP_RANGE,
+ NODE_CLAMP,
NODE_BEVEL,
- NODE_DISPLACEMENT,
- NODE_VECTOR_DISPLACEMENT,
- NODE_PRINCIPLED_VOLUME,
- NODE_IES,
+ NODE_AMBIENT_OCCLUSION,
+ NODE_TEX_VOXEL,
+ NODE_AOV_START,
+ NODE_AOV_COLOR,
+ NODE_AOV_VALUE,
+ /* NOTE: for best OpenCL performance, item definition in the enum must
+ * match the switch case order in svm.h. */
} ShaderNodeType;
typedef enum NodeAttributeType {
NODE_ATTR_FLOAT = 0,
NODE_ATTR_FLOAT2,
NODE_ATTR_FLOAT3,
+ NODE_ATTR_RGBA,
NODE_ATTR_MATRIX
} NodeAttributeType;
@@ -158,6 +176,7 @@ typedef enum NodeGeometry {
typedef enum NodeObjectInfo {
NODE_INFO_OB_LOCATION,
+ NODE_INFO_OB_COLOR,
NODE_INFO_OB_INDEX,
NODE_INFO_MAT_INDEX,
NODE_INFO_OB_RANDOM
@@ -242,7 +261,7 @@ typedef enum NodeMix {
NODE_MIX_CLAMP /* used for the clamp UI option */
} NodeMix;
-typedef enum NodeMath {
+typedef enum NodeMathType {
NODE_MATH_ADD,
NODE_MATH_SUBTRACT,
NODE_MATH_MULTIPLY,
@@ -265,19 +284,82 @@ typedef enum NodeMath {
NODE_MATH_ARCTAN2,
NODE_MATH_FLOOR,
NODE_MATH_CEIL,
- NODE_MATH_FRACT,
+ NODE_MATH_FRACTION,
NODE_MATH_SQRT,
- NODE_MATH_CLAMP /* used for the clamp UI option */
-} NodeMath;
-
-typedef enum NodeVectorMath {
+ NODE_MATH_INV_SQRT,
+ NODE_MATH_SIGN,
+ NODE_MATH_EXPONENT,
+ NODE_MATH_RADIANS,
+ NODE_MATH_DEGREES,
+ NODE_MATH_SINH,
+ NODE_MATH_COSH,
+ NODE_MATH_TANH,
+ NODE_MATH_TRUNC,
+ NODE_MATH_SNAP,
+ NODE_MATH_WRAP,
+ NODE_MATH_COMPARE,
+ NODE_MATH_MULTIPLY_ADD,
+ NODE_MATH_PINGPONG,
+ NODE_MATH_SMOOTH_MIN,
+ NODE_MATH_SMOOTH_MAX,
+} NodeMathType;
+
+typedef enum NodeVectorMathType {
NODE_VECTOR_MATH_ADD,
NODE_VECTOR_MATH_SUBTRACT,
- NODE_VECTOR_MATH_AVERAGE,
- NODE_VECTOR_MATH_DOT_PRODUCT,
+ NODE_VECTOR_MATH_MULTIPLY,
+ NODE_VECTOR_MATH_DIVIDE,
+
NODE_VECTOR_MATH_CROSS_PRODUCT,
- NODE_VECTOR_MATH_NORMALIZE
-} NodeVectorMath;
+ NODE_VECTOR_MATH_PROJECT,
+ NODE_VECTOR_MATH_REFLECT,
+ NODE_VECTOR_MATH_DOT_PRODUCT,
+
+ NODE_VECTOR_MATH_DISTANCE,
+ NODE_VECTOR_MATH_LENGTH,
+ NODE_VECTOR_MATH_SCALE,
+ NODE_VECTOR_MATH_NORMALIZE,
+
+ NODE_VECTOR_MATH_SNAP,
+ NODE_VECTOR_MATH_FLOOR,
+ NODE_VECTOR_MATH_CEIL,
+ NODE_VECTOR_MATH_MODULO,
+ NODE_VECTOR_MATH_FRACTION,
+ NODE_VECTOR_MATH_ABSOLUTE,
+ NODE_VECTOR_MATH_MINIMUM,
+ NODE_VECTOR_MATH_MAXIMUM,
+ NODE_VECTOR_MATH_WRAP,
+ NODE_VECTOR_MATH_SINE,
+ NODE_VECTOR_MATH_COSINE,
+ NODE_VECTOR_MATH_TANGENT,
+} NodeVectorMathType;
+
+typedef enum NodeClampType {
+ NODE_CLAMP_MINMAX,
+ NODE_CLAMP_RANGE,
+} NodeClampType;
+
+typedef enum NodeMapRangeType {
+ NODE_MAP_RANGE_LINEAR,
+ NODE_MAP_RANGE_STEPPED,
+ NODE_MAP_RANGE_SMOOTHSTEP,
+ NODE_MAP_RANGE_SMOOTHERSTEP,
+} NodeMapRangeType;
+
+typedef enum NodeMappingType {
+ NODE_MAPPING_TYPE_POINT,
+ NODE_MAPPING_TYPE_TEXTURE,
+ NODE_MAPPING_TYPE_VECTOR,
+ NODE_MAPPING_TYPE_NORMAL
+} NodeMappingType;
+
+typedef enum NodeVectorRotateType {
+ NODE_VECTOR_ROTATE_TYPE_AXIS,
+ NODE_VECTOR_ROTATE_TYPE_AXIS_X,
+ NODE_VECTOR_ROTATE_TYPE_AXIS_Y,
+ NODE_VECTOR_ROTATE_TYPE_AXIS_Z,
+ NODE_VECTOR_ROTATE_TYPE_EULER_XYZ,
+} NodeVectorRotateType;
typedef enum NodeVectorTransformType {
NODE_VECTOR_TRANSFORM_TYPE_VECTOR,
@@ -312,12 +394,27 @@ typedef enum NodeMusgraveType {
typedef enum NodeWaveType { NODE_WAVE_BANDS, NODE_WAVE_RINGS } NodeWaveType;
-typedef enum NodeWaveProfiles {
+typedef enum NodeWaveBandsDirection {
+ NODE_WAVE_BANDS_DIRECTION_X,
+ NODE_WAVE_BANDS_DIRECTION_Y,
+ NODE_WAVE_BANDS_DIRECTION_Z,
+ NODE_WAVE_BANDS_DIRECTION_DIAGONAL
+} NodeWaveBandsDirection;
+
+typedef enum NodeWaveRingsDirection {
+ NODE_WAVE_RINGS_DIRECTION_X,
+ NODE_WAVE_RINGS_DIRECTION_Y,
+ NODE_WAVE_RINGS_DIRECTION_Z,
+ NODE_WAVE_RINGS_DIRECTION_SPHERICAL
+} NodeWaveRingsDirection;
+
+typedef enum NodeWaveProfile {
NODE_WAVE_PROFILE_SIN,
NODE_WAVE_PROFILE_SAW,
+ NODE_WAVE_PROFILE_TRI,
} NodeWaveProfile;
-typedef enum NodeSkyType { NODE_SKY_OLD, NODE_SKY_NEW } NodeSkyType;
+typedef enum NodeSkyType { NODE_SKY_PREETHAM, NODE_SKY_HOSEK, NODE_SKY_NISHITA } NodeSkyType;
typedef enum NodeGradientType {
NODE_BLEND_LINEAR,
@@ -329,24 +426,19 @@ typedef enum NodeGradientType {
NODE_BLEND_SPHERICAL
} NodeGradientType;
-typedef enum NodeVoronoiColoring {
- NODE_VORONOI_INTENSITY,
- NODE_VORONOI_CELLS
-} NodeVoronoiColoring;
-
typedef enum NodeVoronoiDistanceMetric {
- NODE_VORONOI_DISTANCE,
+ NODE_VORONOI_EUCLIDEAN,
NODE_VORONOI_MANHATTAN,
NODE_VORONOI_CHEBYCHEV,
- NODE_VORONOI_MINKOWSKI
+ NODE_VORONOI_MINKOWSKI,
} NodeVoronoiDistanceMetric;
typedef enum NodeVoronoiFeature {
NODE_VORONOI_F1,
NODE_VORONOI_F2,
- NODE_VORONOI_F3,
- NODE_VORONOI_F4,
- NODE_VORONOI_F2F1
+ NODE_VORONOI_SMOOTH_F1,
+ NODE_VORONOI_DISTANCE_TO_EDGE,
+ NODE_VORONOI_N_SPHERE_RADIUS,
} NodeVoronoiFeature;
typedef enum NodeBlendWeightType {
@@ -373,11 +465,6 @@ typedef enum NodeNormalMapSpace {
NODE_NORMAL_MAP_BLENDER_WORLD,
} NodeNormalMapSpace;
-typedef enum NodeImageColorSpace {
- NODE_COLOR_SPACE_NONE = 0,
- NODE_COLOR_SPACE_COLOR = 1,
-} NodeImageColorSpace;
-
typedef enum NodeImageProjection {
NODE_IMAGE_PROJ_FLAT = 0,
NODE_IMAGE_PROJ_BOX = 1,
@@ -385,6 +472,11 @@ typedef enum NodeImageProjection {
NODE_IMAGE_PROJ_TUBE = 3,
} NodeImageProjection;
+typedef enum NodeImageFlags {
+ NODE_IMAGE_COMPRESS_AS_SRGB = 1,
+ NODE_IMAGE_ALPHA_UNASSOCIATE = 2,
+} NodeImageFlags;
+
typedef enum NodeEnvironmentProjection {
NODE_ENVIRONMENT_EQUIRECTANGULAR = 0,
NODE_ENVIRONMENT_MIRROR_BALL = 1,
@@ -436,6 +528,7 @@ typedef enum ClosureType {
CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID,
CLOSURE_BSDF_PRINCIPLED_SHEEN_ID,
CLOSURE_BSDF_DIFFUSE_TOON_ID,
+ CLOSURE_BSDF_TRANSLUCENT_ID,
/* Glossy */
CLOSURE_BSDF_REFLECTION_ID,
@@ -446,19 +539,12 @@ typedef enum ClosureType {
CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID,
CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
- CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
- CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID,
- CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID,
- CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID,
- CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
- CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
CLOSURE_BSDF_PHONG_RAMP_ID,
CLOSURE_BSDF_GLOSSY_TOON_ID,
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
- CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
@@ -499,12 +585,12 @@ typedef enum ClosureType {
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) \
- (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
+ (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_TRANSLUCENT_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) \
((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) || \
(type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID))
#define CLOSURE_IS_BSDF_TRANSMISSION(type) \
- (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+ (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) \
(type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
#define CLOSURE_IS_BSDF_SINGULAR(type) \
@@ -513,13 +599,17 @@ typedef enum ClosureType {
#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) \
(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID || \
- type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
#define CLOSURE_IS_BSDF_MICROFACET(type) \
- ((type >= CLOSURE_BSDF_MICROFACET_GGX_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) || \
+ ((type >= CLOSURE_BSDF_MICROFACET_GGX_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID) || \
(type >= CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID && \
type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) || \
(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID))
+#define CLOSURE_IS_BSDF_MICROFACET_FRESNEL(type) \
+ (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID || \
+ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
+ type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
+ type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
#define CLOSURE_IS_BSSRDF(type) \
(type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h
new file mode 100644
index 00000000000..79a4ec2c40e
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_vector_rotate.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Vector Rotate */
+
+ccl_device void svm_node_vector_rotate(ShaderData *sd,
+ float *stack,
+ uint input_stack_offsets,
+ uint axis_stack_offsets,
+ uint result_stack_offset)
+{
+ uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset,
+ angle_stack_offset, invert;
+
+ svm_unpack_node_uchar4(
+ input_stack_offsets, &type, &vector_stack_offset, &rotation_stack_offset, &invert);
+ svm_unpack_node_uchar3(
+ axis_stack_offsets, &center_stack_offset, &axis_stack_offset, &angle_stack_offset);
+
+ if (stack_valid(result_stack_offset)) {
+
+ float3 vector = stack_load_float3(stack, vector_stack_offset);
+ float3 center = stack_load_float3(stack, center_stack_offset);
+ float3 result = make_float3(0.0f, 0.0f, 0.0f);
+
+ if (type == NODE_VECTOR_ROTATE_TYPE_EULER_XYZ) {
+ float3 rotation = stack_load_float3(stack, rotation_stack_offset); // Default XYZ.
+ Transform rotationTransform = euler_to_transform(rotation);
+ if (invert) {
+ result = transform_direction_transposed(&rotationTransform, vector - center) + center;
+ }
+ else {
+ result = transform_direction(&rotationTransform, vector - center) + center;
+ }
+ }
+ else {
+ float3 axis;
+ switch (type) {
+ case NODE_VECTOR_ROTATE_TYPE_AXIS_X:
+ axis = make_float3(1.0f, 0.0f, 0.0f);
+ break;
+ case NODE_VECTOR_ROTATE_TYPE_AXIS_Y:
+ axis = make_float3(0.0f, 1.0f, 0.0f);
+ break;
+ case NODE_VECTOR_ROTATE_TYPE_AXIS_Z:
+ axis = make_float3(0.0f, 0.0f, 1.0f);
+ break;
+ default:
+ axis = normalize(stack_load_float3(stack, axis_stack_offset));
+ break;
+ }
+ float angle = stack_load_float(stack, angle_stack_offset);
+ angle = invert ? -angle : angle;
+ result = (len_squared(axis) != 0.0f) ?
+ rotate_around_axis(vector - center, axis, angle) + center :
+ vector;
+ }
+
+ stack_store_float3(stack, result_stack_offset, result);
+ }
+}
+
+CCL_NAMESPACE_END
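
rotate_around_axis used above is defined elsewhere in the kernel and is not shown in this patch. A standard way to implement axis/angle rotation is Rodrigues' formula, v' = v*cos(t) + (axis x v)*sin(t) + axis*(axis . v)*(1 - cos(t)) for a unit-length axis; the standalone sketch below is a hypothetical helper, not the Cycles function:

#include <cstdio>
#include <cmath>

struct Vec3 { float x, y, z; };

static Vec3 cross(Vec3 a, Vec3 b)
{
  return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}
static float dot(Vec3 a, Vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }

// Rodrigues' rotation of v around the unit-length axis by angle (radians).
static Vec3 rotate_around_axis_sketch(Vec3 v, Vec3 axis, float angle)
{
  float c = std::cos(angle), s = std::sin(angle);
  Vec3 axv = cross(axis, v);
  float ad = dot(axis, v) * (1.0f - c);
  return {v.x * c + axv.x * s + axis.x * ad,
          v.y * c + axv.y * s + axis.y * ad,
          v.z * c + axv.z * s + axis.z * ad};
}

int main()
{
  // Rotating +X by 90 degrees around +Z should give +Y.
  Vec3 r = rotate_around_axis_sketch({1.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 1.0f}, 1.5707963f);
  printf("(%f, %f, %f)\n", r.x, r.y, r.z);
  return 0;
}
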
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 7ec0f07f2e4..1e95492cf1b 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -26,8 +26,8 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg,
uint itype, ifrom, ito;
uint vector_in, vector_out;
- decode_node_uchar4(node.y, &itype, &ifrom, &ito, NULL);
- decode_node_uchar4(node.z, &vector_in, &vector_out, NULL, NULL);
+ svm_unpack_node_uchar3(node.y, &itype, &ifrom, &ito);
+ svm_unpack_node_uchar2(node.z, &vector_in, &vector_out);
float3 in = stack_load_float3(stack, vector_in);
diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h
new file mode 100644
index 00000000000..3c105b1cbfa
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_vertex_color.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void svm_node_vertex_color(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
+{
+ AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
+ if (descriptor.offset != ATTR_STD_NOT_FOUND) {
+ float4 vertex_color = primitive_attribute_float4(kg, sd, descriptor, NULL, NULL);
+ stack_store_float3(stack, color_offset, float4_to_float3(vertex_color));
+ stack_store_float(stack, alpha_offset, vertex_color.w);
+ }
+ else {
+ stack_store_float3(stack, color_offset, make_float3(0.0f, 0.0f, 0.0f));
+ stack_store_float(stack, alpha_offset, 0.0f);
+ }
+}
+
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_noinline
+#endif
+ void
+ svm_node_vertex_color_bump_dx(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
+{
+ AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
+ if (descriptor.offset != ATTR_STD_NOT_FOUND) {
+ float4 dx;
+ float4 vertex_color = primitive_attribute_float4(kg, sd, descriptor, &dx, NULL);
+ vertex_color += dx;
+ stack_store_float3(stack, color_offset, float4_to_float3(vertex_color));
+ stack_store_float(stack, alpha_offset, vertex_color.w);
+ }
+ else {
+ stack_store_float3(stack, color_offset, make_float3(0.0f, 0.0f, 0.0f));
+ stack_store_float(stack, alpha_offset, 0.0f);
+ }
+}
+
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_noinline
+#endif
+ void
+ svm_node_vertex_color_bump_dy(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
+{
+ AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
+ if (descriptor.offset != ATTR_STD_NOT_FOUND) {
+ float4 dy;
+ float4 vertex_color = primitive_attribute_float4(kg, sd, descriptor, NULL, &dy);
+ vertex_color += dy;
+ stack_store_float3(stack, color_offset, float4_to_float3(vertex_color));
+ stack_store_float(stack, alpha_offset, vertex_color.w);
+ }
+ else {
+ stack_store_float3(stack, color_offset, make_float3(0.0f, 0.0f, 0.0f));
+ stack_store_float(stack, alpha_offset, 0.0f);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index c311aefaf38..f0fc0068fa2 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -16,169 +16,1124 @@
CCL_NAMESPACE_BEGIN
-/* Voronoi */
-
-ccl_device void voronoi_neighbors(
- float3 p, NodeVoronoiDistanceMetric distance, float e, float da[4], float3 pa[4])
-{
- /* Compute the distance to and the position of the closest neighbors to p.
- *
- * The neighbors are randomly placed, 1 each in a 3x3x3 grid (Worley pattern).
- * The distances and points are returned in ascending order, i.e. da[0] and pa[0] will
- * contain the distance to the closest point and its coordinates respectively.
- */
-
- da[0] = 1e10f;
- da[1] = 1e10f;
- da[2] = 1e10f;
- da[3] = 1e10f;
-
- pa[0] = make_float3(0.0f, 0.0f, 0.0f);
- pa[1] = make_float3(0.0f, 0.0f, 0.0f);
- pa[2] = make_float3(0.0f, 0.0f, 0.0f);
- pa[3] = make_float3(0.0f, 0.0f, 0.0f);
-
- int3 xyzi = quick_floor_to_int3(p);
-
- for (int xx = -1; xx <= 1; xx++) {
- for (int yy = -1; yy <= 1; yy++) {
- for (int zz = -1; zz <= 1; zz++) {
- int3 ip = xyzi + make_int3(xx, yy, zz);
- float3 fp = make_float3(ip.x, ip.y, ip.z);
- float3 vp = fp + cellnoise3(fp);
-
- float d;
- switch (distance) {
- case NODE_VORONOI_DISTANCE:
- d = len_squared(p - vp);
- break;
- case NODE_VORONOI_MANHATTAN:
- d = reduce_add(fabs(vp - p));
- break;
- case NODE_VORONOI_CHEBYCHEV:
- d = max3(fabs(vp - p));
- break;
- case NODE_VORONOI_MINKOWSKI: {
- float3 n = fabs(vp - p);
- if (e == 0.5f) {
- d = sqr(reduce_add(sqrt(n)));
- }
- else {
- d = powf(reduce_add(pow3(n, e)), 1.0f / e);
- }
- break;
+/*
+ * Smooth Voronoi:
+ *
+ * - https://wiki.blender.org/wiki/User:OmarSquircleArt/GSoC2019/Documentation/Smooth_Voronoi
+ *
+ * Distance To Edge:
+ *
+ * - https://www.shadertoy.com/view/llG3zy
+ *
+ */
+
+/* **** 1D Voronoi **** */
+
+ccl_device float voronoi_distance_1d(float a,
+ float b,
+ NodeVoronoiDistanceMetric metric,
+ float exponent)
+{
+ return fabsf(b - a);
+}
+
+ccl_device void voronoi_f1_1d(float w,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float *outW)
+{
+ float cellPosition = floorf(w);
+ float localPosition = w - cellPosition;
+
+ float minDistance = 8.0f;
+ float targetOffset = 0.0f;
+ float targetPosition = 0.0f;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = i;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_1d(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
+ }
+ *outDistance = minDistance;
+ *outColor = hash_float_to_float3(cellPosition + targetOffset);
+ *outW = targetPosition + cellPosition;
+}
+
+ccl_device void voronoi_smooth_f1_1d(float w,
+ float smoothness,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float *outW)
+{
+ float cellPosition = floorf(w);
+ float localPosition = w - cellPosition;
+
+ float smoothDistance = 8.0f;
+ float smoothPosition = 0.0f;
+ float3 smoothColor = make_float3(0.0f, 0.0f, 0.0f);
+ for (int i = -2; i <= 2; i++) {
+ float cellOffset = i;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_1d(pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(
+ 0.0f, 1.0f, 0.5f + 0.5f * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0f - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0f + 3.0f * smoothness;
+ float3 cellColor = hash_float_to_float3(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ *outDistance = smoothDistance;
+ *outColor = smoothColor;
+ *outW = cellPosition + smoothPosition;
+}
+
+ccl_device void voronoi_f2_1d(float w,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float *outW)
+{
+ float cellPosition = floorf(w);
+ float localPosition = w - cellPosition;
+
+ float distanceF1 = 8.0f;
+ float distanceF2 = 8.0f;
+ float offsetF1 = 0.0f;
+ float positionF1 = 0.0f;
+ float offsetF2 = 0.0f;
+ float positionF2 = 0.0f;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = i;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_1d(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
+ }
+ }
+ *outDistance = distanceF2;
+ *outColor = hash_float_to_float3(cellPosition + offsetF2);
+ *outW = positionF2 + cellPosition;
+}
+
+ccl_device void voronoi_distance_to_edge_1d(float w, float randomness, float *outDistance)
+{
+ float cellPosition = floorf(w);
+ float localPosition = w - cellPosition;
+
+ float minDistance = 8.0f;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = i;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = fabsf(pointPosition - localPosition);
+ minDistance = min(distanceToPoint, minDistance);
+ }
+ *outDistance = minDistance;
+}
+
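+/* N-Sphere Radius: find the feature point closest to the evaluation point, then the feature
+ * point closest to that one. Half the distance between them approximates the radius of the
+ * largest sphere centered on the first point that still fits inside its Voronoi cell. */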
+ccl_device void voronoi_n_sphere_radius_1d(float w, float randomness, float *outRadius)
+{
+ float cellPosition = floorf(w);
+ float localPosition = w - cellPosition;
+
+ float closestPoint = 0.0f;
+ float closestPointOffset = 0.0f;
+ float minDistance = 8.0f;
+ for (int i = -1; i <= 1; i++) {
+ float cellOffset = i;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = fabsf(pointPosition - localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
+ }
+
+ minDistance = 8.0f;
+ float closestPointToClosestPoint = 0.0f;
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0) {
+ continue;
+ }
+ float cellOffset = i + closestPointOffset;
+ float pointPosition = cellOffset + hash_float_to_float(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = fabsf(closestPoint - pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ *outRadius = fabsf(closestPointToClosestPoint - closestPoint) / 2.0f;
+}
+
+/* **** 2D Voronoi **** */
+
+ccl_device float voronoi_distance_2d(float2 a,
+ float2 b,
+ NodeVoronoiDistanceMetric metric,
+ float exponent)
+{
+ if (metric == NODE_VORONOI_EUCLIDEAN) {
+ return distance(a, b);
+ }
+ else if (metric == NODE_VORONOI_MANHATTAN) {
+ return fabsf(a.x - b.x) + fabsf(a.y - b.y);
+ }
+ else if (metric == NODE_VORONOI_CHEBYCHEV) {
+ return max(fabsf(a.x - b.x), fabsf(a.y - b.y));
+ }
+ else if (metric == NODE_VORONOI_MINKOWSKI) {
+ return powf(powf(fabsf(a.x - b.x), exponent) + powf(fabsf(a.y - b.y), exponent),
+ 1.0f / exponent);
+ }
+ else {
+ return 0.0f;
+ }
+}
+
+ccl_device void voronoi_f1_2d(float2 coord,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float2 *outPosition)
+{
+ float2 cellPosition = floor(coord);
+ float2 localPosition = coord - cellPosition;
+
+ float minDistance = 8.0f;
+ float2 targetOffset = make_float2(0.0f, 0.0f);
+ float2 targetPosition = make_float2(0.0f, 0.0f);
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float2 cellOffset = make_float2(i, j);
+ float2 pointPosition = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_2d(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
+ }
+ }
+ *outDistance = minDistance;
+ *outColor = hash_float2_to_float3(cellPosition + targetOffset);
+ *outPosition = targetPosition + cellPosition;
+}
+
+ccl_device void voronoi_smooth_f1_2d(float2 coord,
+ float smoothness,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float2 *outPosition)
+{
+ float2 cellPosition = floor(coord);
+ float2 localPosition = coord - cellPosition;
+
+ float smoothDistance = 8.0f;
+ float3 smoothColor = make_float3(0.0f, 0.0f, 0.0f);
+ float2 smoothPosition = make_float2(0.0f, 0.0f);
+ for (int j = -2; j <= 2; j++) {
+ for (int i = -2; i <= 2; i++) {
+ float2 cellOffset = make_float2(i, j);
+ float2 pointPosition = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_2d(pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(
+ 0.0f, 1.0f, 0.5f + 0.5f * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0f - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0f + 3.0f * smoothness;
+ float3 cellColor = hash_float2_to_float3(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ }
+ *outDistance = smoothDistance;
+ *outColor = smoothColor;
+ *outPosition = cellPosition + smoothPosition;
+}
+
+ccl_device void voronoi_f2_2d(float2 coord,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float2 *outPosition)
+{
+ float2 cellPosition = floor(coord);
+ float2 localPosition = coord - cellPosition;
+
+ float distanceF1 = 8.0f;
+ float distanceF2 = 8.0f;
+ float2 offsetF1 = make_float2(0.0f, 0.0f);
+ float2 positionF1 = make_float2(0.0f, 0.0f);
+ float2 offsetF2 = make_float2(0.0f, 0.0f);
+ float2 positionF2 = make_float2(0.0f, 0.0f);
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float2 cellOffset = make_float2(i, j);
+ float2 pointPosition = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_2d(pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
+ }
+ }
+ }
+ *outDistance = distanceF2;
+ *outColor = hash_float2_to_float3(cellPosition + offsetF2);
+ *outPosition = positionF2 + cellPosition;
+}
+
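+/* Distance To Edge (2D/3D/4D): a first pass finds the vector to the closest feature point;
+ * a second pass projects the midpoint between that point and every other candidate onto the
+ * normalized direction between them, giving the distance to the separating cell boundary
+ * (see the Shadertoy reference above). */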
+ccl_device void voronoi_distance_to_edge_2d(float2 coord, float randomness, float *outDistance)
+{
+ float2 cellPosition = floor(coord);
+ float2 localPosition = coord - cellPosition;
+
+ float2 vectorToClosest = make_float2(0.0f, 0.0f);
+ float minDistance = 8.0f;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float2 cellOffset = make_float2(i, j);
+ float2 vectorToPoint = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float distanceToPoint = dot(vectorToPoint, vectorToPoint);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ vectorToClosest = vectorToPoint;
+ }
+ }
+ }
+
+ minDistance = 8.0f;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float2 cellOffset = make_float2(i, j);
+ float2 vectorToPoint = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float2 perpendicularToEdge = vectorToPoint - vectorToClosest;
+ if (dot(perpendicularToEdge, perpendicularToEdge) > 0.0001f) {
+ float distanceToEdge = dot((vectorToClosest + vectorToPoint) / 2.0f,
+ normalize(perpendicularToEdge));
+ minDistance = min(minDistance, distanceToEdge);
+ }
+ }
+ }
+ *outDistance = minDistance;
+}
+
+ccl_device void voronoi_n_sphere_radius_2d(float2 coord, float randomness, float *outRadius)
+{
+ float2 cellPosition = floor(coord);
+ float2 localPosition = coord - cellPosition;
+
+ float2 closestPoint = make_float2(0.0f, 0.0f);
+ float2 closestPointOffset = make_float2(0.0f, 0.0f);
+ float minDistance = 8.0f;
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float2 cellOffset = make_float2(i, j);
+ float2 pointPosition = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
+ }
+ }
+
+ minDistance = 8.0f;
+ float2 closestPointToClosestPoint = make_float2(0.0f, 0.0f);
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0 && j == 0) {
+ continue;
+ }
+ float2 cellOffset = make_float2(i, j) + closestPointOffset;
+ float2 pointPosition = cellOffset +
+ hash_float2_to_float2(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ }
+ *outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
+}
+
+/* **** 3D Voronoi **** */
+
+ccl_device float voronoi_distance_3d(float3 a,
+ float3 b,
+ NodeVoronoiDistanceMetric metric,
+ float exponent)
+{
+ if (metric == NODE_VORONOI_EUCLIDEAN) {
+ return distance(a, b);
+ }
+ else if (metric == NODE_VORONOI_MANHATTAN) {
+ return fabsf(a.x - b.x) + fabsf(a.y - b.y) + fabsf(a.z - b.z);
+ }
+ else if (metric == NODE_VORONOI_CHEBYCHEV) {
+ return max(fabsf(a.x - b.x), max(fabsf(a.y - b.y), fabsf(a.z - b.z)));
+ }
+ else if (metric == NODE_VORONOI_MINKOWSKI) {
+ return powf(powf(fabsf(a.x - b.x), exponent) + powf(fabsf(a.y - b.y), exponent) +
+ powf(fabsf(a.z - b.z), exponent),
+ 1.0f / exponent);
+ }
+ else {
+ return 0.0f;
+ }
+}
+
+ccl_device void voronoi_f1_3d(float3 coord,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float3 *outPosition)
+{
+ float3 cellPosition = floor(coord);
+ float3 localPosition = coord - cellPosition;
+
+ float minDistance = 8.0f;
+ float3 targetOffset = make_float3(0.0f, 0.0f, 0.0f);
+ float3 targetPosition = make_float3(0.0f, 0.0f, 0.0f);
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float3 cellOffset = make_float3(i, j, k);
+ float3 pointPosition = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_3d(
+ pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
+ }
+ }
+ }
+ *outDistance = minDistance;
+ *outColor = hash_float3_to_float3(cellPosition + targetOffset);
+ *outPosition = targetPosition + cellPosition;
+}
+
+ccl_device void voronoi_smooth_f1_3d(float3 coord,
+ float smoothness,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float3 *outPosition)
+{
+ float3 cellPosition = floor(coord);
+ float3 localPosition = coord - cellPosition;
+
+ float smoothDistance = 8.0f;
+ float3 smoothColor = make_float3(0.0f, 0.0f, 0.0f);
+ float3 smoothPosition = make_float3(0.0f, 0.0f, 0.0f);
+ for (int k = -2; k <= 2; k++) {
+ for (int j = -2; j <= 2; j++) {
+ for (int i = -2; i <= 2; i++) {
+ float3 cellOffset = make_float3(i, j, k);
+ float3 pointPosition = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_3d(
+ pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(
+ 0.0f, 1.0f, 0.5f + 0.5f * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0f - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0f + 3.0f * smoothness;
+ float3 cellColor = hash_float3_to_float3(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ }
+ }
+ *outDistance = smoothDistance;
+ *outColor = smoothColor;
+ *outPosition = cellPosition + smoothPosition;
+}
+
+ccl_device void voronoi_f2_3d(float3 coord,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float3 *outPosition)
+{
+ float3 cellPosition = floor(coord);
+ float3 localPosition = coord - cellPosition;
+
+ float distanceF1 = 8.0f;
+ float distanceF2 = 8.0f;
+ float3 offsetF1 = make_float3(0.0f, 0.0f, 0.0f);
+ float3 positionF1 = make_float3(0.0f, 0.0f, 0.0f);
+ float3 offsetF2 = make_float3(0.0f, 0.0f, 0.0f);
+ float3 positionF2 = make_float3(0.0f, 0.0f, 0.0f);
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float3 cellOffset = make_float3(i, j, k);
+ float3 pointPosition = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_3d(
+ pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
+ }
+ }
+ }
+ }
+ *outDistance = distanceF2;
+ *outColor = hash_float3_to_float3(cellPosition + offsetF2);
+ *outPosition = positionF2 + cellPosition;
+}
+
+ccl_device void voronoi_distance_to_edge_3d(float3 coord, float randomness, float *outDistance)
+{
+ float3 cellPosition = floor(coord);
+ float3 localPosition = coord - cellPosition;
+
+ float3 vectorToClosest = make_float3(0.0f, 0.0f, 0.0f);
+ float minDistance = 8.0f;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float3 cellOffset = make_float3(i, j, k);
+ float3 vectorToPoint = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float distanceToPoint = dot(vectorToPoint, vectorToPoint);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ vectorToClosest = vectorToPoint;
+ }
+ }
+ }
+ }
+
+ minDistance = 8.0f;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float3 cellOffset = make_float3(i, j, k);
+ float3 vectorToPoint = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float3 perpendicularToEdge = vectorToPoint - vectorToClosest;
+ if (dot(perpendicularToEdge, perpendicularToEdge) > 0.0001f) {
+ float distanceToEdge = dot((vectorToClosest + vectorToPoint) / 2.0f,
+ normalize(perpendicularToEdge));
+ minDistance = min(minDistance, distanceToEdge);
+ }
+ }
+ }
+ }
+ *outDistance = minDistance;
+}
+
+ccl_device void voronoi_n_sphere_radius_3d(float3 coord, float randomness, float *outRadius)
+{
+ float3 cellPosition = floor(coord);
+ float3 localPosition = coord - cellPosition;
+
+ float3 closestPoint = make_float3(0.0f, 0.0f, 0.0f);
+ float3 closestPointOffset = make_float3(0.0f, 0.0f, 0.0f);
+ float minDistance = 8.0f;
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ float3 cellOffset = make_float3(i, j, k);
+ float3 pointPosition = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
+ }
+ }
+ }
+
+ minDistance = 8.0f;
+ float3 closestPointToClosestPoint = make_float3(0.0f, 0.0f, 0.0f);
+ for (int k = -1; k <= 1; k++) {
+ for (int j = -1; j <= 1; j++) {
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0 && j == 0 && k == 0) {
+ continue;
+ }
+ float3 cellOffset = make_float3(i, j, k) + closestPointOffset;
+ float3 pointPosition = cellOffset +
+ hash_float3_to_float3(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
+ }
+ }
+ }
+ *outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
+}
+
+/* **** 4D Voronoi **** */
+
+ccl_device float voronoi_distance_4d(float4 a,
+ float4 b,
+ NodeVoronoiDistanceMetric metric,
+ float exponent)
+{
+ if (metric == NODE_VORONOI_EUCLIDEAN) {
+ return distance(a, b);
+ }
+ else if (metric == NODE_VORONOI_MANHATTAN) {
+ return fabsf(a.x - b.x) + fabsf(a.y - b.y) + fabsf(a.z - b.z) + fabsf(a.w - b.w);
+ }
+ else if (metric == NODE_VORONOI_CHEBYCHEV) {
+ return max(fabsf(a.x - b.x), max(fabsf(a.y - b.y), max(fabsf(a.z - b.z), fabsf(a.w - b.w))));
+ }
+ else if (metric == NODE_VORONOI_MINKOWSKI) {
+ return powf(powf(fabsf(a.x - b.x), exponent) + powf(fabsf(a.y - b.y), exponent) +
+ powf(fabsf(a.z - b.z), exponent) + powf(fabsf(a.w - b.w), exponent),
+ 1.0f / exponent);
+ }
+ else {
+ return 0.0f;
+ }
+}
+
+ccl_device void voronoi_f1_4d(float4 coord,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float4 *outPosition)
+{
+ float4 cellPosition = floor(coord);
+ float4 localPosition = coord - cellPosition;
+
+ float minDistance = 8.0f;
+ float4 targetOffset = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float4 targetPosition = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ ccl_loop_no_unroll for (int j = -1; j <= 1; j++)
+ {
+ for (int i = -1; i <= 1; i++) {
+ float4 cellOffset = make_float4(i, j, k, u);
+ float4 pointPosition = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_4d(
+ pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < minDistance) {
+ targetOffset = cellOffset;
+ minDistance = distanceToPoint;
+ targetPosition = pointPosition;
+ }
+ }
+ }
+ }
+ }
+ *outDistance = minDistance;
+ *outColor = hash_float4_to_float3(cellPosition + targetOffset);
+ *outPosition = targetPosition + cellPosition;
+}
+
+ccl_device void voronoi_smooth_f1_4d(float4 coord,
+ float smoothness,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float4 *outPosition)
+{
+ float4 cellPosition = floor(coord);
+ float4 localPosition = coord - cellPosition;
+
+ float smoothDistance = 8.0f;
+ float3 smoothColor = make_float3(0.0f, 0.0f, 0.0f);
+ float4 smoothPosition = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ for (int u = -2; u <= 2; u++) {
+ for (int k = -2; k <= 2; k++) {
+ ccl_loop_no_unroll for (int j = -2; j <= 2; j++)
+ {
+ for (int i = -2; i <= 2; i++) {
+ float4 cellOffset = make_float4(i, j, k, u);
+ float4 pointPosition = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_4d(
+ pointPosition, localPosition, metric, exponent);
+ float h = smoothstep(
+ 0.0f, 1.0f, 0.5f + 0.5f * (smoothDistance - distanceToPoint) / smoothness);
+ float correctionFactor = smoothness * h * (1.0f - h);
+ smoothDistance = mix(smoothDistance, distanceToPoint, h) - correctionFactor;
+ correctionFactor /= 1.0f + 3.0f * smoothness;
+ float3 cellColor = hash_float4_to_float3(cellPosition + cellOffset);
+ smoothColor = mix(smoothColor, cellColor, h) - correctionFactor;
+ smoothPosition = mix(smoothPosition, pointPosition, h) - correctionFactor;
+ }
+ }
+ }
+ }
+ *outDistance = smoothDistance;
+ *outColor = smoothColor;
+ *outPosition = cellPosition + smoothPosition;
+}
+
+ccl_device void voronoi_f2_4d(float4 coord,
+ float exponent,
+ float randomness,
+ NodeVoronoiDistanceMetric metric,
+ float *outDistance,
+ float3 *outColor,
+ float4 *outPosition)
+{
+ float4 cellPosition = floor(coord);
+ float4 localPosition = coord - cellPosition;
+
+ float distanceF1 = 8.0f;
+ float distanceF2 = 8.0f;
+ float4 offsetF1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float4 positionF1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float4 offsetF2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float4 positionF2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ ccl_loop_no_unroll for (int j = -1; j <= 1; j++)
+ {
+ for (int i = -1; i <= 1; i++) {
+ float4 cellOffset = make_float4(i, j, k, u);
+ float4 pointPosition = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = voronoi_distance_4d(
+ pointPosition, localPosition, metric, exponent);
+ if (distanceToPoint < distanceF1) {
+ distanceF2 = distanceF1;
+ distanceF1 = distanceToPoint;
+ offsetF2 = offsetF1;
+ offsetF1 = cellOffset;
+ positionF2 = positionF1;
+ positionF1 = pointPosition;
+ }
+ else if (distanceToPoint < distanceF2) {
+ distanceF2 = distanceToPoint;
+ offsetF2 = cellOffset;
+ positionF2 = pointPosition;
}
}
+ }
+ }
+ }
+ *outDistance = distanceF2;
+ *outColor = hash_float4_to_float3(cellPosition + offsetF2);
+ *outPosition = positionF2 + cellPosition;
+}
+
+ccl_device void voronoi_distance_to_edge_4d(float4 coord, float randomness, float *outDistance)
+{
+ float4 cellPosition = floor(coord);
+ float4 localPosition = coord - cellPosition;
- /* To keep the shortest four distances and associated points we have to keep them in sorted order. */
- if (d < da[0]) {
- da[3] = da[2];
- da[2] = da[1];
- da[1] = da[0];
- da[0] = d;
-
- pa[3] = pa[2];
- pa[2] = pa[1];
- pa[1] = pa[0];
- pa[0] = vp;
+ float4 vectorToClosest = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float minDistance = 8.0f;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ ccl_loop_no_unroll for (int j = -1; j <= 1; j++)
+ {
+ for (int i = -1; i <= 1; i++) {
+ float4 cellOffset = make_float4(i, j, k, u);
+ float4 vectorToPoint = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float distanceToPoint = dot(vectorToPoint, vectorToPoint);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ vectorToClosest = vectorToPoint;
+ }
}
- else if (d < da[1]) {
- da[3] = da[2];
- da[2] = da[1];
- da[1] = d;
-
- pa[3] = pa[2];
- pa[2] = pa[1];
- pa[1] = vp;
+ }
+ }
+ }
+
+ minDistance = 8.0f;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ ccl_loop_no_unroll for (int j = -1; j <= 1; j++)
+ {
+ for (int i = -1; i <= 1; i++) {
+ float4 cellOffset = make_float4(i, j, k, u);
+ float4 vectorToPoint = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness -
+ localPosition;
+ float4 perpendicularToEdge = vectorToPoint - vectorToClosest;
+ if (dot(perpendicularToEdge, perpendicularToEdge) > 0.0001f) {
+ float distanceToEdge = dot((vectorToClosest + vectorToPoint) / 2.0f,
+ normalize(perpendicularToEdge));
+ minDistance = min(minDistance, distanceToEdge);
+ }
}
- else if (d < da[2]) {
- da[3] = da[2];
- da[2] = d;
+ }
+ }
+ }
+ *outDistance = minDistance;
+}
- pa[3] = pa[2];
- pa[2] = vp;
+ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float *outRadius)
+{
+ float4 cellPosition = floor(coord);
+ float4 localPosition = coord - cellPosition;
+
+ float4 closestPoint = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float4 closestPointOffset = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ float minDistance = 8.0f;
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ ccl_loop_no_unroll for (int j = -1; j <= 1; j++)
+ {
+ for (int i = -1; i <= 1; i++) {
+ float4 cellOffset = make_float4(i, j, k, u);
+ float4 pointPosition = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(pointPosition, localPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPoint = pointPosition;
+ closestPointOffset = cellOffset;
+ }
}
- else if (d < da[3]) {
- da[3] = d;
- pa[3] = vp;
+ }
+ }
+ }
+
+ minDistance = 8.0f;
+ float4 closestPointToClosestPoint = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ for (int u = -1; u <= 1; u++) {
+ for (int k = -1; k <= 1; k++) {
+ ccl_loop_no_unroll for (int j = -1; j <= 1; j++)
+ {
+ for (int i = -1; i <= 1; i++) {
+ if (i == 0 && j == 0 && k == 0 && u == 0) {
+ continue;
+ }
+ float4 cellOffset = make_float4(i, j, k, u) + closestPointOffset;
+ float4 pointPosition = cellOffset +
+ hash_float4_to_float4(cellPosition + cellOffset) * randomness;
+ float distanceToPoint = distance(closestPoint, pointPosition);
+ if (distanceToPoint < minDistance) {
+ minDistance = distanceToPoint;
+ closestPointToClosestPoint = pointPosition;
+ }
}
}
}
}
+ *outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
}
-ccl_device void svm_node_tex_voronoi(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint feature,
+ uint metric,
+ int *offset)
{
- uint4 node2 = read_node(kg, offset);
+ uint4 stack_offsets = read_node(kg, offset);
+ uint4 defaults = read_node(kg, offset);
+
+ uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset;
+ uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset,
+ color_out_stack_offset;
+ uint position_out_stack_offset, w_out_stack_offset, radius_out_stack_offset;
+
+ svm_unpack_node_uchar4(stack_offsets.x,
+ &coord_stack_offset,
+ &w_stack_offset,
+ &scale_stack_offset,
+ &smoothness_stack_offset);
+ svm_unpack_node_uchar4(stack_offsets.y,
+ &exponent_stack_offset,
+ &randomness_stack_offset,
+ &distance_out_stack_offset,
+ &color_out_stack_offset);
+ svm_unpack_node_uchar3(
+ stack_offsets.z, &position_out_stack_offset, &w_out_stack_offset, &radius_out_stack_offset);
- uint co_offset, coloring, distance, feature;
- uint scale_offset, e_offset, fac_offset, color_offset;
+ float3 coord = stack_load_float3(stack, coord_stack_offset);
+ float w = stack_load_float_default(stack, w_stack_offset, stack_offsets.w);
+ float scale = stack_load_float_default(stack, scale_stack_offset, defaults.x);
+ float smoothness = stack_load_float_default(stack, smoothness_stack_offset, defaults.y);
+ float exponent = stack_load_float_default(stack, exponent_stack_offset, defaults.z);
+ float randomness = stack_load_float_default(stack, randomness_stack_offset, defaults.w);
- decode_node_uchar4(node.y, &co_offset, &coloring, &distance, &feature);
- decode_node_uchar4(node.z, &scale_offset, &e_offset, &fac_offset, &color_offset);
+ NodeVoronoiFeature voronoi_feature = (NodeVoronoiFeature)feature;
+ NodeVoronoiDistanceMetric voronoi_metric = (NodeVoronoiDistanceMetric)metric;
- float3 co = stack_load_float3(stack, co_offset);
- float scale = stack_load_float_default(stack, scale_offset, node2.x);
- float exponent = stack_load_float_default(stack, e_offset, node2.y);
+ float distance_out = 0.0f, w_out = 0.0f, radius_out = 0.0f;
+ float3 color_out = make_float3(0.0f, 0.0f, 0.0f);
+ float3 position_out = make_float3(0.0f, 0.0f, 0.0f);
- float dist[4];
- float3 neighbor[4];
- voronoi_neighbors(co * scale, (NodeVoronoiDistanceMetric)distance, exponent, dist, neighbor);
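+  /* Clamp inputs to their supported ranges: randomness to [0, 1], smoothness (halved) to
+   * [0, 0.5]. */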
+ randomness = clamp(randomness, 0.0f, 1.0f);
+ smoothness = clamp(smoothness / 2.0f, 0.0f, 0.5f);
- float3 color;
- float fac;
- if (coloring == NODE_VORONOI_INTENSITY) {
- switch (feature) {
- case NODE_VORONOI_F1:
- fac = dist[0];
- break;
- case NODE_VORONOI_F2:
- fac = dist[1];
- break;
- case NODE_VORONOI_F3:
- fac = dist[2];
- break;
- case NODE_VORONOI_F4:
- fac = dist[3];
- break;
- case NODE_VORONOI_F2F1:
- fac = dist[1] - dist[0];
- break;
+ w *= scale;
+ coord *= scale;
+
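+  /* Dispatch on dimensionality. Smooth F1 for 2D/3D and the whole 4D path are compiled only
+   * when NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) is enabled; without it those cases end up
+   * in the asserting default branch. */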
+ switch (dimensions) {
+ case 1: {
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_1d(
+ w, exponent, randomness, voronoi_metric, &distance_out, &color_out, &w_out);
+ break;
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_1d(w,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &w_out);
+ break;
+ case NODE_VORONOI_F2:
+ voronoi_f2_1d(
+ w, exponent, randomness, voronoi_metric, &distance_out, &color_out, &w_out);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_1d(w, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_1d(w, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ w_out = safe_divide(w_out, scale);
+ break;
+ }
+ case 2: {
+ float2 coord_2d = make_float2(coord.x, coord.y);
+ float2 position_out_2d;
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_2d(coord_2d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ break;
+#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_2d(coord_2d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ break;
+#endif
+ case NODE_VORONOI_F2:
+ voronoi_f2_2d(coord_2d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_2d(coord_2d, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_2d(coord_2d, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out_2d = safe_divide_float2_float(position_out_2d, scale);
+ position_out = make_float3(position_out_2d.x, position_out_2d.y, 0.0f);
+ break;
+ }
+ case 3: {
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_3d(coord,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ break;
+#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_3d(coord,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ break;
+#endif
+ case NODE_VORONOI_F2:
+ voronoi_f2_3d(coord,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_3d(coord, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_3d(coord, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out = safe_divide_float3_float(position_out, scale);
+ break;
}
- color = make_float3(fac, fac, fac);
+#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
+ case 4: {
+ float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
+ float4 position_out_4d;
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_4d(coord_4d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_F2:
+ voronoi_f2_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out_4d = safe_divide_float4_float(position_out_4d, scale);
+ position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
+ w_out = position_out_4d.w;
+ break;
+ }
+#endif
+ default:
+ kernel_assert(0);
}
- else {
- /* NODE_VORONOI_CELLS */
- switch (feature) {
- case NODE_VORONOI_F1:
- color = neighbor[0];
- break;
- case NODE_VORONOI_F2:
- color = neighbor[1];
- break;
- case NODE_VORONOI_F3:
- color = neighbor[2];
- break;
- case NODE_VORONOI_F4:
- color = neighbor[3];
- break;
- /* Usefulness of this vector is questionable. Note F2 >= F1 but the
- * individual vector components might not be. */
- case NODE_VORONOI_F2F1:
- color = fabs(neighbor[1] - neighbor[0]);
- break;
- }
-
- color = cellnoise3(color);
- fac = average(color);
- }
-
- if (stack_valid(fac_offset))
- stack_store_float(stack, fac_offset, fac);
- if (stack_valid(color_offset))
- stack_store_float3(stack, color_offset, color);
+
+ if (stack_valid(distance_out_stack_offset))
+ stack_store_float(stack, distance_out_stack_offset, distance_out);
+ if (stack_valid(color_out_stack_offset))
+ stack_store_float3(stack, color_out_stack_offset, color_out);
+ if (stack_valid(position_out_stack_offset))
+ stack_store_float3(stack, position_out_stack_offset, position_out);
+ if (stack_valid(w_out_stack_offset))
+ stack_store_float(stack, w_out_stack_offset, w_out);
+ if (stack_valid(radius_out_stack_offset))
+ stack_store_float(stack, radius_out_stack_offset, radius_out);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 26d8cc71d3b..4bc14f82382 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_tex_voxel(
KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
{
uint co_offset, density_out_offset, color_out_offset, space;
- decode_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
+ svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
#ifdef __VOLUME__
int id = node.y;
float3 co = stack_load_float3(stack, co_offset);
@@ -39,7 +39,7 @@ ccl_device void svm_node_tex_voxel(
co = transform_point(&tfm, co);
}
- float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z, INTERPOLATION_NONE);
+ float4 r = kernel_tex_image_interp_3d(kg, id, co, INTERPOLATION_NONE);
#else
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
#endif
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 003ad7dc63a..c4763475b47 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -18,30 +18,67 @@ CCL_NAMESPACE_BEGIN
/* Wave */
-ccl_device_noinline float svm_wave(NodeWaveType type,
- NodeWaveProfile profile,
- float3 p,
- float detail,
- float distortion,
- float dscale)
+ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
+ NodeWaveBandsDirection bands_dir,
+ NodeWaveRingsDirection rings_dir,
+ NodeWaveProfile profile,
+ float3 p,
+ float distortion,
+ float detail,
+ float dscale,
+ float droughness,
+ float phase)
{
+ /* Prevent precision issues on unit coordinates. */
+ p = (p + 0.000001f) * 0.999999f;
+
float n;
- if (type == NODE_WAVE_BANDS)
- n = (p.x + p.y + p.z) * 10.0f;
- else /* NODE_WAVE_RINGS */
- n = len(p) * 20.0f;
+ if (type == NODE_WAVE_BANDS) {
+ if (bands_dir == NODE_WAVE_BANDS_DIRECTION_X) {
+ n = p.x * 20.0f;
+ }
+ else if (bands_dir == NODE_WAVE_BANDS_DIRECTION_Y) {
+ n = p.y * 20.0f;
+ }
+ else if (bands_dir == NODE_WAVE_BANDS_DIRECTION_Z) {
+ n = p.z * 20.0f;
+ }
+ else { /* NODE_WAVE_BANDS_DIRECTION_DIAGONAL */
+ n = (p.x + p.y + p.z) * 10.0f;
+ }
+ }
+ else { /* NODE_WAVE_RINGS */
+ float3 rp = p;
+ if (rings_dir == NODE_WAVE_RINGS_DIRECTION_X) {
+ rp *= make_float3(0.0f, 1.0f, 1.0f);
+ }
+ else if (rings_dir == NODE_WAVE_RINGS_DIRECTION_Y) {
+ rp *= make_float3(1.0f, 0.0f, 1.0f);
+ }
+ else if (rings_dir == NODE_WAVE_RINGS_DIRECTION_Z) {
+ rp *= make_float3(1.0f, 1.0f, 0.0f);
+ }
+ /* else: NODE_WAVE_RINGS_DIRECTION_SPHERICAL */
+
+ n = len(rp) * 20.0f;
+ }
+
+ n += phase;
if (distortion != 0.0f)
- n += distortion * noise_turbulence(p * dscale, detail, 0);
+ n += distortion * (fractal_noise_3d(p * dscale, detail, droughness) * 2.0f - 1.0f);
if (profile == NODE_WAVE_PROFILE_SIN) {
- return 0.5f + 0.5f * sinf(n);
+ return 0.5f + 0.5f * sinf(n - M_PI_2_F);
+ }
+ else if (profile == NODE_WAVE_PROFILE_SAW) {
+ n /= M_2PI_F;
+ return n - floorf(n);
}
- else { /* NODE_WAVE_PROFILE_SAW */
+ else { /* NODE_WAVE_PROFILE_TRI */
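+    /* Triangle wave: after normalizing by 2*pi, take twice the distance to the nearest
+     * integer, giving a ramp in [0, 1]. */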
n /= M_2PI_F;
- n -= (int)n;
- return (n < 0.0f) ? n + 1.0f : n;
+ return fabsf(n - floorf(n + 0.5f)) * 2.0f;
}
}
@@ -49,22 +86,40 @@ ccl_device void svm_node_tex_wave(
KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
{
uint4 node2 = read_node(kg, offset);
+ uint4 node3 = read_node(kg, offset);
- uint type;
- uint co_offset, scale_offset, detail_offset, dscale_offset, distortion_offset, color_offset,
- fac_offset;
+ /* RNA properties */
+ uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset;
+ /* Inputs, Outputs */
+ uint co_offset, scale_offset, distortion_offset, detail_offset, dscale_offset, droughness_offset,
+ phase_offset;
+ uint color_offset, fac_offset;
- decode_node_uchar4(node.y, &type, &color_offset, &fac_offset, &dscale_offset);
- decode_node_uchar4(node.z, &co_offset, &scale_offset, &detail_offset, &distortion_offset);
+ svm_unpack_node_uchar4(
+ node.y, &type_offset, &bands_dir_offset, &rings_dir_offset, &profile_offset);
+ svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset);
+ svm_unpack_node_uchar4(
+ node.w, &detail_offset, &dscale_offset, &droughness_offset, &phase_offset);
+ svm_unpack_node_uchar2(node2.x, &color_offset, &fac_offset);
float3 co = stack_load_float3(stack, co_offset);
- float scale = stack_load_float_default(stack, scale_offset, node2.x);
- float detail = stack_load_float_default(stack, detail_offset, node2.y);
+ float scale = stack_load_float_default(stack, scale_offset, node2.y);
float distortion = stack_load_float_default(stack, distortion_offset, node2.z);
- float dscale = stack_load_float_default(stack, dscale_offset, node2.w);
+ float detail = stack_load_float_default(stack, detail_offset, node2.w);
+ float dscale = stack_load_float_default(stack, dscale_offset, node3.x);
+ float droughness = stack_load_float_default(stack, droughness_offset, node3.y);
+ float phase = stack_load_float_default(stack, phase_offset, node3.z);
- float f = svm_wave(
- (NodeWaveType)type, (NodeWaveProfile)node.w, co * scale, detail, distortion, dscale);
+ float f = svm_wave((NodeWaveType)type_offset,
+ (NodeWaveBandsDirection)bands_dir_offset,
+ (NodeWaveRingsDirection)rings_dir_offset,
+ (NodeWaveProfile)profile_offset,
+ co * scale,
+ distortion,
+ detail,
+ dscale,
+ droughness,
+ phase);
if (stack_valid(fac_offset))
stack_store_float(stack, fac_offset, f);
diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h
new file mode 100644
index 00000000000..b30d85acaec
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_white_noise.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void svm_node_tex_white_noise(KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint inputs_stack_offsets,
+                                         uint outputs_stack_offsets,
+ int *offset)
+{
+ uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset;
+ svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset);
+  svm_unpack_node_uchar2(outputs_stack_offsets, &value_stack_offset, &color_stack_offset);
+
+ float3 vector = stack_load_float3(stack, vector_stack_offset);
+ float w = stack_load_float(stack, w_stack_offset);
+
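+  /* Only evaluate outputs whose stack offsets are valid, i.e. sockets that are actually used. */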
+ if (stack_valid(color_stack_offset)) {
+ float3 color;
+ switch (dimensions) {
+ case 1:
+ color = hash_float_to_float3(w);
+ break;
+ case 2:
+ color = hash_float2_to_float3(make_float2(vector.x, vector.y));
+ break;
+ case 3:
+ color = hash_float3_to_float3(vector);
+ break;
+ case 4:
+ color = hash_float4_to_float3(make_float4(vector.x, vector.y, vector.z, w));
+ break;
+ default:
+ color = make_float3(1.0f, 0.0f, 1.0f);
+ kernel_assert(0);
+ break;
+ }
+ stack_store_float3(stack, color_stack_offset, color);
+ }
+
+ if (stack_valid(value_stack_offset)) {
+ float value;
+ switch (dimensions) {
+ case 1:
+ value = hash_float_to_float(w);
+ break;
+ case 2:
+ value = hash_float2_to_float(make_float2(vector.x, vector.y));
+ break;
+ case 3:
+ value = hash_float3_to_float(vector);
+ break;
+ case 4:
+ value = hash_float4_to_float(make_float4(vector.x, vector.y, vector.z, w));
+ break;
+ default:
+ value = 0.0f;
+ kernel_assert(0);
+ break;
+ }
+ stack_store_float(stack, value_stack_offset, value);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 55e61d0e8c7..49158bd86d5 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -93,7 +93,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
uint in_size = node.y;
uint out_fac = node.z;
uint use_pixel_size, bump_offset;
- decode_node_uchar4(node.w, &use_pixel_size, &bump_offset, NULL, NULL);
+ svm_unpack_node_uchar2(node.w, &use_pixel_size, &bump_offset);
/* Input Data */
float size = stack_load_float(stack, in_size);
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 32bdb077ab7..e4b86d33fe7 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -2,6 +2,7 @@
set(INC
..
../../glew-mx
+ ../../sky/include
)
set(INC_SYS
@@ -14,13 +15,20 @@ set(SRC
bake.cpp
buffers.cpp
camera.cpp
+ colorspace.cpp
constant_fold.cpp
coverage.cpp
denoising.cpp
film.cpp
+ geometry.cpp
graph.cpp
+ hair.cpp
image.cpp
+ image_oiio.cpp
+ image_sky.cpp
+ image_vdb.cpp
integrator.cpp
+ jitter.cpp
light.cpp
light_tree.cpp
merge.cpp
@@ -49,14 +57,21 @@ set(SRC_HEADERS
background.h
buffers.h
camera.h
+ colorspace.h
constant_fold.h
coverage.h
denoising.h
film.h
+ geometry.h
graph.h
+ hair.h
image.h
+ image_oiio.h
+ image_sky.h
+ image_vdb.h
integrator.h
light.h
+ jitter.h
merge.h
mesh.h
nodes.h
@@ -76,15 +91,40 @@ set(SRC_HEADERS
set(LIB
cycles_bvh
+ cycles_device
+ cycles_subd
+ cycles_util
+ bf_intern_sky
)
if(WITH_CYCLES_OSL)
list(APPEND LIB
cycles_kernel_osl
)
+
+ SET_PROPERTY(SOURCE osl.cpp PROPERTY COMPILE_FLAGS ${RTTI_DISABLE_FLAGS})
+endif()
+
+if(WITH_OPENCOLORIO)
+ add_definitions(-DWITH_OCIO)
+ include_directories(
+ SYSTEM
+ ${OPENCOLORIO_INCLUDE_DIRS}
+ )
+ if(WIN32)
+ add_definitions(-DOpenColorIO_STATIC)
+ endif()
endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
+if(WITH_OPENVDB)
+ add_definitions(-DWITH_OPENVDB ${OPENVDB_DEFINITIONS})
+ list(APPEND INC_SYS
+ ${OPENVDB_INCLUDE_DIRS}
+ )
+ list(APPEND LIB
+ ${OPENVDB_LIBRARIES}
+ )
+endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index dad6cb4fe6d..4c26d5e8365 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -14,9 +14,10 @@
* limitations under the License.
*/
+#include "render/attribute.h"
+#include "render/hair.h"
#include "render/image.h"
#include "render/mesh.h"
-#include "render/attribute.h"
#include "util/util_foreach.h"
#include "util/util_transform.h"
@@ -25,45 +26,51 @@ CCL_NAMESPACE_BEGIN
/* Attribute */
-Attribute::~Attribute()
+Attribute::Attribute(
+ ustring name, TypeDesc type, AttributeElement element, Geometry *geom, AttributePrimitive prim)
+ : name(name), std(ATTR_STD_NONE), type(type), element(element), flags(0)
{
- /* for voxel data, we need to remove the image from the image manager */
- if (element == ATTR_ELEMENT_VOXEL) {
- VoxelAttribute *voxel_data = data_voxel();
+ /* string and matrix not supported! */
+ assert(type == TypeDesc::TypeFloat || type == TypeDesc::TypeColor ||
+ type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+ type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix || type == TypeFloat2 ||
+ type == TypeRGBA);
- if (voxel_data && voxel_data->slot != -1) {
- voxel_data->manager->remove_image(voxel_data->slot);
- }
+ if (element == ATTR_ELEMENT_VOXEL) {
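+    /* Voxel attributes hold a single ImageHandle constructed in place inside the buffer;
+     * it is destroyed explicitly in ~Attribute(). */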
+ buffer.resize(sizeof(ImageHandle));
+ new (buffer.data()) ImageHandle();
+ }
+ else {
+ resize(geom, prim, false);
}
}
-void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
+Attribute::~Attribute()
{
- name = name_;
- type = type_;
- element = element_;
- std = ATTR_STD_NONE;
- flags = 0;
-
- /* string and matrix not supported! */
- assert(type == TypeDesc::TypeFloat || type == TypeDesc::TypeColor ||
- type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
- type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix || type == TypeFloat2);
+ /* For voxel data, we need to free the image handle. */
+ if (element == ATTR_ELEMENT_VOXEL && buffer.size()) {
+ ImageHandle &handle = data_voxel();
+ handle.~ImageHandle();
+ }
}
-void Attribute::resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only)
+void Attribute::resize(Geometry *geom, AttributePrimitive prim, bool reserve_only)
{
- if (reserve_only) {
- buffer.reserve(buffer_size(mesh, prim));
- }
- else {
- buffer.resize(buffer_size(mesh, prim), 0);
+ if (element != ATTR_ELEMENT_VOXEL) {
+ if (reserve_only) {
+ buffer.reserve(buffer_size(geom, prim));
+ }
+ else {
+ buffer.resize(buffer_size(geom, prim), 0);
+ }
}
}
void Attribute::resize(size_t num_elements)
{
- buffer.resize(num_elements * data_sizeof(), 0);
+ if (element != ATTR_ELEMENT_VOXEL) {
+ buffer.resize(num_elements * data_sizeof(), 0);
+ }
}
void Attribute::add(const float &f)
@@ -121,17 +128,6 @@ void Attribute::add(const Transform &f)
buffer.push_back(data[i]);
}
-void Attribute::add(const VoxelAttribute &f)
-{
- assert(data_sizeof() == sizeof(VoxelAttribute));
-
- char *data = (char *)&f;
- size_t size = sizeof(f);
-
- for (size_t i = 0; i < size; i++)
- buffer.push_back(data[i]);
-}
-
void Attribute::add(const char *data)
{
size_t size = data_sizeof();
@@ -143,7 +139,7 @@ void Attribute::add(const char *data)
size_t Attribute::data_sizeof() const
{
if (element == ATTR_ELEMENT_VOXEL)
- return sizeof(VoxelAttribute);
+ return sizeof(ImageHandle);
else if (element == ATTR_ELEMENT_CORNER_BYTE)
return sizeof(uchar4);
else if (type == TypeDesc::TypeFloat)
@@ -156,13 +152,13 @@ size_t Attribute::data_sizeof() const
return sizeof(float3);
}
-size_t Attribute::element_size(Mesh *mesh, AttributePrimitive prim) const
+size_t Attribute::element_size(Geometry *geom, AttributePrimitive prim) const
{
if (flags & ATTR_FINAL_SIZE) {
return buffer.size() / data_sizeof();
}
- size_t size;
+ size_t size = 0;
switch (element) {
case ATTR_ELEMENT_OBJECT:
@@ -171,54 +167,74 @@ size_t Attribute::element_size(Mesh *mesh, AttributePrimitive prim) const
size = 1;
break;
case ATTR_ELEMENT_VERTEX:
- size = mesh->verts.size() + mesh->num_ngons;
- if (prim == ATTR_PRIM_SUBD) {
- size -= mesh->num_subd_verts;
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ size = mesh->verts.size() + mesh->num_ngons;
+ if (prim == ATTR_PRIM_SUBD) {
+ size -= mesh->num_subd_verts;
+ }
}
break;
case ATTR_ELEMENT_VERTEX_MOTION:
- size = (mesh->verts.size() + mesh->num_ngons) * (mesh->motion_steps - 1);
- if (prim == ATTR_PRIM_SUBD) {
- size -= mesh->num_subd_verts * (mesh->motion_steps - 1);
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ size = (mesh->verts.size() + mesh->num_ngons) * (mesh->motion_steps - 1);
+ if (prim == ATTR_PRIM_SUBD) {
+ size -= mesh->num_subd_verts * (mesh->motion_steps - 1);
+ }
}
break;
case ATTR_ELEMENT_FACE:
- if (prim == ATTR_PRIM_TRIANGLE) {
- size = mesh->num_triangles();
- }
- else {
- size = mesh->subd_faces.size() + mesh->num_ngons;
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (prim == ATTR_PRIM_GEOMETRY) {
+ size = mesh->num_triangles();
+ }
+ else {
+ size = mesh->subd_faces.size() + mesh->num_ngons;
+ }
}
break;
case ATTR_ELEMENT_CORNER:
case ATTR_ELEMENT_CORNER_BYTE:
- if (prim == ATTR_PRIM_TRIANGLE) {
- size = mesh->num_triangles() * 3;
- }
- else {
- size = mesh->subd_face_corners.size() + mesh->num_ngons;
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (prim == ATTR_PRIM_GEOMETRY) {
+ size = mesh->num_triangles() * 3;
+ }
+ else {
+ size = mesh->subd_face_corners.size() + mesh->num_ngons;
+ }
}
break;
case ATTR_ELEMENT_CURVE:
- size = mesh->num_curves();
+ if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ size = hair->num_curves();
+ }
break;
case ATTR_ELEMENT_CURVE_KEY:
- size = mesh->curve_keys.size();
+ if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ size = hair->curve_keys.size();
+ }
break;
case ATTR_ELEMENT_CURVE_KEY_MOTION:
- size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
+ if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ size = hair->curve_keys.size() * (hair->motion_steps - 1);
+ }
break;
default:
- size = 0;
break;
}
return size;
}
-size_t Attribute::buffer_size(Mesh *mesh, AttributePrimitive prim) const
+size_t Attribute::buffer_size(Geometry *geom, AttributePrimitive prim) const
{
- return element_size(mesh, prim) * data_sizeof();
+ return element_size(geom, prim) * data_sizeof();
}
bool Attribute::same_storage(TypeDesc a, TypeDesc b)
@@ -251,6 +267,9 @@ void Attribute::add_with_weight(void *dst, void *src, float weight)
else if (same_storage(type, TypeDesc::TypeFloat)) {
*((float *)dst) += *((float *)src) * weight;
}
+ else if (same_storage(type, TypeFloat2)) {
+ *((float2 *)dst) += *((float2 *)src) * weight;
+ }
else if (same_storage(type, TypeDesc::TypeVector)) {
*((float4 *)dst) += *((float4 *)src) * weight;
}
@@ -276,6 +295,8 @@ const char *Attribute::standard_name(AttributeStandard std)
return "tangent";
case ATTR_STD_UV_TANGENT_SIGN:
return "tangent_sign";
+ case ATTR_STD_VERTEX_COLOR:
+ return "vertex_color";
case ATTR_STD_POSITION_UNDEFORMED:
return "undeformed";
case ATTR_STD_POSITION_UNDISPLACED:
@@ -308,6 +329,8 @@ const char *Attribute::standard_name(AttributeStandard std)
return "velocity";
case ATTR_STD_POINTINESS:
return "pointiness";
+ case ATTR_STD_RANDOM_PER_ISLAND:
+ return "random_per_island";
case ATTR_STD_NOT_FOUND:
case ATTR_STD_NONE:
case ATTR_STD_NUM:
@@ -330,13 +353,42 @@ AttributeStandard Attribute::name_standard(const char *name)
return ATTR_STD_NONE;
}
+void Attribute::get_uv_tiles(Geometry *geom,
+ AttributePrimitive prim,
+ unordered_set<int> &tiles) const
+{
+ if (type != TypeFloat2) {
+ return;
+ }
+
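+  /* Collect the UDIM tile numbers (1001 + u_tile + 10 * v_tile) referenced by this UV layer. */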
+ const int num = element_size(geom, prim);
+ const float2 *uv = data_float2();
+ for (int i = 0; i < num; i++, uv++) {
+ float u = uv->x, v = uv->y;
+ int x = (int)u, y = (int)v;
+
+ if (x < 0 || y < 0 || x >= 10) {
+ continue;
+ }
+
+ /* Be conservative in corners - precisely touching the right or upper edge of a tile
+ * should not load its right/upper neighbor as well. */
+ if (x > 0 && (u < x + 1e-6f)) {
+ x--;
+ }
+ if (y > 0 && (v < y + 1e-6f)) {
+ y--;
+ }
+
+ tiles.insert(1001 + 10 * y + x);
+ }
+}
+
/* Attribute Set */
-AttributeSet::AttributeSet()
+AttributeSet::AttributeSet(Geometry *geometry, AttributePrimitive prim)
+ : geometry(geometry), prim(prim)
{
- triangle_mesh = NULL;
- curve_mesh = NULL;
- subd_mesh = NULL;
}
AttributeSet::~AttributeSet()
@@ -356,28 +408,9 @@ Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement eleme
remove(name);
}
-#if __cplusplus >= 201103L
- attributes.emplace_back();
- attr = &attributes.back();
- attr->set(name, type, element);
-#else
- {
- Attribute attr_temp;
- attr_temp.set(name, type, element);
- attributes.push_back(attr_temp);
- attr = &attributes.back();
- }
-#endif
-
- /* this is weak .. */
- if (triangle_mesh)
- attr->resize(triangle_mesh, ATTR_PRIM_TRIANGLE, false);
- if (curve_mesh)
- attr->resize(curve_mesh, ATTR_PRIM_CURVE, false);
- if (subd_mesh)
- attr->resize(subd_mesh, ATTR_PRIM_SUBD, false);
-
- return attr;
+ Attribute new_attr(name, type, element, geometry, prim);
+ attributes.emplace_back(std::move(new_attr));
+ return &attributes.back();
}
Attribute *AttributeSet::find(ustring name) const
@@ -412,7 +445,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
if (name == ustring())
name = Attribute::standard_name(std);
- if (triangle_mesh || subd_mesh) {
+ if (geometry->type == Geometry::MESH) {
switch (std) {
case ATTR_STD_VERTEX_NORMAL:
attr = add(name, TypeDesc::TypeNormal, ATTR_ELEMENT_VERTEX);
@@ -429,6 +462,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
case ATTR_STD_UV_TANGENT_SIGN:
attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CORNER);
break;
+ case ATTR_STD_VERTEX_COLOR:
+ attr = add(name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+ break;
case ATTR_STD_GENERATED:
case ATTR_STD_POSITION_UNDEFORMED:
case ATTR_STD_POSITION_UNDISPLACED:
@@ -464,12 +500,15 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
case ATTR_STD_POINTINESS:
attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VERTEX);
break;
+ case ATTR_STD_RANDOM_PER_ISLAND:
+ attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_FACE);
+ break;
default:
assert(0);
break;
}
}
- else if (curve_mesh) {
+ else if (geometry->type == Geometry::HAIR) {
switch (std) {
case ATTR_STD_UV:
attr = add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
@@ -492,6 +531,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
case ATTR_STD_POINTINESS:
attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VERTEX);
break;
+ case ATTR_STD_RANDOM_PER_ISLAND:
+ attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_FACE);
+ break;
default:
assert(0);
break;
@@ -549,12 +591,7 @@ void AttributeSet::remove(Attribute *attribute)
void AttributeSet::resize(bool reserve_only)
{
foreach (Attribute &attr, attributes) {
- if (triangle_mesh)
- attr.resize(triangle_mesh, ATTR_PRIM_TRIANGLE, reserve_only);
- if (curve_mesh)
- attr.resize(curve_mesh, ATTR_PRIM_CURVE, reserve_only);
- if (subd_mesh)
- attr.resize(subd_mesh, ATTR_PRIM_SUBD, reserve_only);
+ attr.resize(geometry, prim, reserve_only);
}
}
@@ -584,15 +621,10 @@ AttributeRequest::AttributeRequest(ustring name_)
name = name_;
std = ATTR_STD_NONE;
- triangle_type = TypeDesc::TypeFloat;
- triangle_desc.element = ATTR_ELEMENT_NONE;
- triangle_desc.offset = 0;
- triangle_desc.type = NODE_ATTR_FLOAT;
-
- curve_type = TypeDesc::TypeFloat;
- curve_desc.element = ATTR_ELEMENT_NONE;
- curve_desc.offset = 0;
- curve_desc.type = NODE_ATTR_FLOAT;
+ type = TypeDesc::TypeFloat;
+ desc.element = ATTR_ELEMENT_NONE;
+ desc.offset = 0;
+ desc.type = NODE_ATTR_FLOAT;
subd_type = TypeDesc::TypeFloat;
subd_desc.element = ATTR_ELEMENT_NONE;
@@ -605,15 +637,10 @@ AttributeRequest::AttributeRequest(AttributeStandard std_)
name = ustring();
std = std_;
- triangle_type = TypeDesc::TypeFloat;
- triangle_desc.element = ATTR_ELEMENT_NONE;
- triangle_desc.offset = 0;
- triangle_desc.type = NODE_ATTR_FLOAT;
-
- curve_type = TypeDesc::TypeFloat;
- curve_desc.element = ATTR_ELEMENT_NONE;
- curve_desc.offset = 0;
- curve_desc.type = NODE_ATTR_FLOAT;
+ type = TypeDesc::TypeFloat;
+ desc.element = ATTR_ELEMENT_NONE;
+ desc.offset = 0;
+ desc.type = NODE_ATTR_FLOAT;
subd_type = TypeDesc::TypeFloat;
subd_desc.element = ATTR_ELEMENT_NONE;
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index ebab0fe7f88..5871fa04a31 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -17,10 +17,13 @@
#ifndef __ATTRIBUTE_H__
#define __ATTRIBUTE_H__
+#include "render/image.h"
+
#include "kernel/kernel_types.h"
#include "util/util_list.h"
#include "util/util_param.h"
+#include "util/util_set.h"
#include "util/util_types.h"
#include "util/util_vector.h"
@@ -30,17 +33,12 @@ class Attribute;
class AttributeRequest;
class AttributeRequestSet;
class AttributeSet;
-class ImageManager;
+class ImageHandle;
+class Geometry;
+class Hair;
class Mesh;
struct Transform;
-/* Attributes for voxels are images */
-
-struct VoxelAttribute {
- ImageManager *manager;
- int slot;
-};
-
/* Attribute
*
* Arbitrary data layers on meshes.
@@ -56,17 +54,23 @@ class Attribute {
AttributeElement element;
uint flags; /* enum AttributeFlag */
- Attribute()
- {
- }
+ Attribute(ustring name,
+ TypeDesc type,
+ AttributeElement element,
+ Geometry *geom,
+ AttributePrimitive prim);
+ Attribute(Attribute &&other) = default;
+ Attribute(const Attribute &other) = delete;
+ Attribute &operator=(const Attribute &other) = delete;
~Attribute();
+
void set(ustring name, TypeDesc type, AttributeElement element);
- void resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only);
+ void resize(Geometry *geom, AttributePrimitive prim, bool reserve_only);
void resize(size_t num_elements);
size_t data_sizeof() const;
- size_t element_size(Mesh *mesh, AttributePrimitive prim) const;
- size_t buffer_size(Mesh *mesh, AttributePrimitive prim) const;
+ size_t element_size(Geometry *geom, AttributePrimitive prim) const;
+ size_t buffer_size(Geometry *geom, AttributePrimitive prim) const;
char *data()
{
@@ -102,10 +106,12 @@ class Attribute {
assert(data_sizeof() == sizeof(Transform));
return (Transform *)data();
}
- VoxelAttribute *data_voxel()
+
+ /* Attributes for voxels are images */
+ ImageHandle &data_voxel()
{
- assert(data_sizeof() == sizeof(VoxelAttribute));
- return (VoxelAttribute *)data();
+ assert(data_sizeof() == sizeof(ImageHandle));
+ return *(ImageHandle *)data();
}
const char *data() const
@@ -137,10 +143,10 @@ class Attribute {
assert(data_sizeof() == sizeof(Transform));
return (const Transform *)data();
}
- const VoxelAttribute *data_voxel() const
+ const ImageHandle &data_voxel() const
{
- assert(data_sizeof() == sizeof(VoxelAttribute));
- return (const VoxelAttribute *)data();
+ assert(data_sizeof() == sizeof(ImageHandle));
+ return *(const ImageHandle *)data();
}
void zero_data(void *dst);
@@ -150,13 +156,14 @@ class Attribute {
void add(const float2 &f);
void add(const float3 &f);
void add(const uchar4 &f);
- void add(const Transform &f);
- void add(const VoxelAttribute &f);
+ void add(const Transform &tfm);
void add(const char *data);
static bool same_storage(TypeDesc a, TypeDesc b);
static const char *standard_name(AttributeStandard std);
static AttributeStandard name_standard(const char *name);
+
+ void get_uv_tiles(Geometry *geom, AttributePrimitive prim, unordered_set<int> &tiles) const;
};
/* Attribute Set
@@ -165,12 +172,11 @@ class Attribute {
class AttributeSet {
public:
- Mesh *triangle_mesh;
- Mesh *curve_mesh;
- Mesh *subd_mesh;
+ Geometry *geometry;
+ AttributePrimitive prim;
list<Attribute> attributes;
- AttributeSet();
+ AttributeSet(Geometry *geometry, AttributePrimitive prim);
~AttributeSet();
Attribute *add(ustring name, TypeDesc type, AttributeElement element);
@@ -200,9 +206,9 @@ class AttributeRequest {
ustring name;
AttributeStandard std;
- /* temporary variables used by MeshManager */
- TypeDesc triangle_type, curve_type, subd_type;
- AttributeDescriptor triangle_desc, curve_desc, subd_desc;
+ /* temporary variables used by GeometryManager */
+ TypeDesc type, subd_type;
+ AttributeDescriptor desc, subd_desc;
explicit AttributeRequest(ustring name_);
explicit AttributeRequest(AttributeStandard std);
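
Aside: with VoxelAttribute gone, data_voxel() reinterprets the attribute's raw byte storage as an ImageHandle, so the handle has to be constructed and destroyed explicitly inside that buffer. A standalone sketch of the underlying pattern, using a made-up Handle type instead of the real ImageHandle:

    #include <cassert>
    #include <new>
    #include <string>
    #include <vector>

    /* Stand-in for ImageHandle. */
    struct Handle {
      std::string filename;
      int slot;
      Handle(std::string filename_, int slot_) : filename(filename_), slot(slot_) {}
    };

    int main()
    {
      /* Byte buffer playing the role of the attribute storage; a heap allocation
       * from std::vector<char> is suitably aligned for ordinary object types. */
      std::vector<char> buffer(sizeof(Handle));

      /* Construct the handle in place. */
      Handle *handle = new (buffer.data()) Handle("volume.vdb", 3);

      /* Typed access through the byte buffer, mirroring data_voxel(). */
      Handle &ref = *reinterpret_cast<Handle *>(buffer.data());
      assert(ref.slot == 3);
      (void)ref;

      /* Destroy explicitly before the bytes are reused or freed. */
      handle->~Handle();
      return 0;
    }
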
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index b32cc55903d..694bb640995 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -16,8 +16,8 @@
#include "render/background.h"
#include "device/device.h"
-#include "render/integrator.h"
#include "render/graph.h"
+#include "render/integrator.h"
#include "render/nodes.h"
#include "render/scene.h"
#include "render/shader.h"
@@ -43,6 +43,8 @@ NODE_DEFINE(Background)
SOCKET_BOOLEAN(transparent_glass, "Transparent Glass", false);
SOCKET_FLOAT(transparent_roughness_threshold, "Transparent Roughness Threshold", 0.0f);
+ SOCKET_FLOAT(volume_step_size, "Volume Step Size", 0.1f);
+
SOCKET_NODE(shader, "Shader", &Shader::node_type);
return type;
@@ -51,6 +53,7 @@ NODE_DEFINE(Background)
Background::Background() : Node(node_type)
{
need_update = true;
+ shader = NULL;
}
Background::~Background()
@@ -64,14 +67,7 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
device_free(device, dscene);
- Shader *bg_shader = shader;
-
- if (use_shader) {
- if (!bg_shader)
- bg_shader = scene->default_background;
- }
- else
- bg_shader = scene->default_empty;
+ Shader *bg_shader = get_shader(scene);
/* set shader index and transparent option */
KernelBackground *kbackground = &dscene->data.background;
@@ -98,6 +94,8 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
else
kbackground->volume_shader = SHADER_NONE;
+ kbackground->volume_step_size = volume_step_size * scene->integrator->volume_step_rate;
+
/* No background node, make world shader invisible to all rays, to skip evaluation in kernel. */
if (bg_shader->graph->nodes.size() <= 1) {
kbackground->surface_shader |= SHADER_EXCLUDE_ANY;
@@ -134,4 +132,9 @@ void Background::tag_update(Scene *scene)
need_update = true;
}
+Shader *Background::get_shader(const Scene *scene)
+{
+ return (use_shader) ? ((shader) ? shader : scene->default_background) : scene->default_empty;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index 020db7bf6aa..c2ca1f75179 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -45,6 +45,8 @@ class Background : public Node {
bool transparent_glass;
float transparent_roughness_threshold;
+ float volume_step_size;
+
bool need_update;
Background();
@@ -55,6 +57,8 @@ class Background : public Node {
bool modified(const Background &background);
void tag_update(Scene *scene);
+
+ Shader *get_shader(const Scene *scene);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 73893921500..6044182a51a 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -15,283 +15,140 @@
*/
#include "render/bake.h"
+#include "render/buffers.h"
+#include "render/integrator.h"
#include "render/mesh.h"
#include "render/object.h"
#include "render/shader.h"
-#include "render/integrator.h"
#include "util/util_foreach.h"
CCL_NAMESPACE_BEGIN
-BakeData::BakeData(const int object, const size_t tri_offset, const size_t num_pixels)
- : m_object(object), m_tri_offset(tri_offset), m_num_pixels(num_pixels)
-{
- m_primitive.resize(num_pixels);
- m_u.resize(num_pixels);
- m_v.resize(num_pixels);
- m_dudx.resize(num_pixels);
- m_dudy.resize(num_pixels);
- m_dvdx.resize(num_pixels);
- m_dvdy.resize(num_pixels);
-}
-
-BakeData::~BakeData()
-{
- m_primitive.clear();
- m_u.clear();
- m_v.clear();
- m_dudx.clear();
- m_dudy.clear();
- m_dvdx.clear();
- m_dvdy.clear();
-}
-
-void BakeData::set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy)
-{
- m_primitive[i] = (prim == -1 ? -1 : m_tri_offset + prim);
- m_u[i] = uv[0];
- m_v[i] = uv[1];
- m_dudx[i] = dudx;
- m_dudy[i] = dudy;
- m_dvdx[i] = dvdx;
- m_dvdy[i] = dvdy;
-}
-
-void BakeData::set_null(int i)
-{
- m_primitive[i] = -1;
-}
-
-int BakeData::object()
-{
- return m_object;
-}
-
-size_t BakeData::size()
+static int aa_samples(Scene *scene, Object *object, ShaderEvalType type)
{
- return m_num_pixels;
-}
+ if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) {
+ return 1;
+ }
+ else if (type == SHADER_EVAL_NORMAL) {
+ /* Only antialias normal if mesh has bump mapping. */
+ if (object->geometry) {
+ foreach (Shader *shader, object->geometry->used_shaders) {
+ if (shader->has_bump) {
+ return scene->integrator->aa_samples;
+ }
+ }
+ }
-bool BakeData::is_valid(int i)
-{
- return m_primitive[i] != -1;
+ return 1;
+ }
+ else {
+ return scene->integrator->aa_samples;
+ }
}
-uint4 BakeData::data(int i)
+/* Keep it synced with kernel_bake.h logic */
+static int shader_type_to_pass_filter(ShaderEvalType type, int pass_filter)
{
- return make_uint4(m_object, m_primitive[i], __float_as_int(m_u[i]), __float_as_int(m_v[i]));
-}
+ const int component_flags = pass_filter &
+ (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR);
-uint4 BakeData::differentials(int i)
-{
- return make_uint4(__float_as_int(m_dudx[i]),
- __float_as_int(m_dudy[i]),
- __float_as_int(m_dvdx[i]),
- __float_as_int(m_dvdy[i]));
+ switch (type) {
+ case SHADER_EVAL_AO:
+ return BAKE_FILTER_AO;
+ case SHADER_EVAL_SHADOW:
+ return BAKE_FILTER_DIRECT;
+ case SHADER_EVAL_DIFFUSE:
+ return BAKE_FILTER_DIFFUSE | component_flags;
+ case SHADER_EVAL_GLOSSY:
+ return BAKE_FILTER_GLOSSY | component_flags;
+ case SHADER_EVAL_TRANSMISSION:
+ return BAKE_FILTER_TRANSMISSION | component_flags;
+ case SHADER_EVAL_COMBINED:
+ return pass_filter;
+ default:
+ return 0;
+ }
}
BakeManager::BakeManager()
{
- m_bake_data = NULL;
- m_is_baking = false;
+ type = SHADER_EVAL_BAKE;
+ pass_filter = 0;
+
need_update = true;
- m_shader_limit = 512 * 512;
}
BakeManager::~BakeManager()
{
- if (m_bake_data)
- delete m_bake_data;
}
bool BakeManager::get_baking()
{
- return m_is_baking;
-}
-
-void BakeManager::set_baking(const bool value)
-{
- m_is_baking = value;
-}
-
-BakeData *BakeManager::init(const int object, const size_t tri_offset, const size_t num_pixels)
-{
- m_bake_data = new BakeData(object, tri_offset, num_pixels);
- return m_bake_data;
+ return !object_name.empty();
}
-void BakeManager::set_shader_limit(const size_t x, const size_t y)
+void BakeManager::set(Scene *scene,
+ const std::string &object_name_,
+ ShaderEvalType type_,
+ int pass_filter_)
{
- m_shader_limit = x * y;
- m_shader_limit = (size_t)pow(2, ceil(log(m_shader_limit) / log(2)));
-}
-
-bool BakeManager::bake(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- Progress &progress,
- ShaderEvalType shader_type,
- const int pass_filter,
- BakeData *bake_data,
- float result[])
-{
- size_t num_pixels = bake_data->size();
+ object_name = object_name_;
+ type = type_;
+ pass_filter = shader_type_to_pass_filter(type_, pass_filter_);
- int num_samples = aa_samples(scene, bake_data, shader_type);
+ Pass::add(PASS_BAKE_PRIMITIVE, scene->film->passes);
+ Pass::add(PASS_BAKE_DIFFERENTIAL, scene->film->passes);
- /* calculate the total pixel samples for the progress bar */
- total_pixel_samples = 0;
- for (size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
- size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
- total_pixel_samples += shader_size * num_samples;
+ if (type == SHADER_EVAL_UV) {
+ /* force UV to be available */
+ Pass::add(PASS_UV, scene->film->passes);
}
- progress.reset_sample();
- progress.set_total_pixel_samples(total_pixel_samples);
-
- /* needs to be up to date for baking specific AA samples */
- dscene->data.integrator.aa_samples = num_samples;
- device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- for (size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
- size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
-
- /* setup input for device task */
- device_vector<uint4> d_input(device, "bake_input", MEM_READ_ONLY);
- uint4 *d_input_data = d_input.alloc(shader_size * 2);
- size_t d_input_size = 0;
-
- for (size_t i = shader_offset; i < (shader_offset + shader_size); i++) {
- d_input_data[d_input_size++] = bake_data->data(i);
- d_input_data[d_input_size++] = bake_data->differentials(i);
- }
-
- if (d_input_size == 0) {
- m_is_baking = false;
- return false;
- }
-
- /* run device task */
- device_vector<float4> d_output(device, "bake_output", MEM_READ_WRITE);
- d_output.alloc(shader_size);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
- DeviceTask task(DeviceTask::SHADER);
- task.shader_input = d_input.device_pointer;
- task.shader_output = d_output.device_pointer;
- task.shader_eval_type = shader_type;
- task.shader_filter = pass_filter;
- task.shader_x = 0;
- task.offset = shader_offset;
- task.shader_w = d_output.size();
- task.num_samples = num_samples;
- task.get_cancel = function_bind(&Progress::get_cancel, &progress);
- task.update_progress_sample = function_bind(&Progress::add_samples_update, &progress, _1, _2);
-
- device->task_add(task);
- device->task_wait();
-
- if (progress.get_cancel()) {
- d_input.free();
- d_output.free();
- m_is_baking = false;
- return false;
- }
-
- d_output.copy_from_device(0, 1, d_output.size());
- d_input.free();
-
- /* read result */
- int k = 0;
-
- float4 *offset = d_output.data();
-
- size_t depth = 4;
- for (size_t i = shader_offset; i < (shader_offset + shader_size); i++) {
- size_t index = i * depth;
- float4 out = offset[k++];
-
- if (bake_data->is_valid(i)) {
- for (size_t j = 0; j < 4; j++) {
- result[index + j] = out[j];
- }
- }
- }
-
- d_output.free();
+ /* force use_light_pass to be true if we bake more than just colors */
+ if (pass_filter & ~BAKE_FILTER_COLOR) {
+ Pass::add(PASS_LIGHT, scene->film->passes);
}
- m_is_baking = false;
- return true;
+ /* create device and update scene */
+ scene->film->tag_update(scene);
+ scene->integrator->tag_update(scene);
+
+ need_update = true;
}
void BakeManager::device_update(Device * /*device*/,
- DeviceScene * /*dscene*/,
- Scene * /*scene*/,
- Progress &progress)
+ DeviceScene *dscene,
+ Scene *scene,
+ Progress & /* progress */)
{
if (!need_update)
return;
- if (progress.get_cancel())
- return;
-
- need_update = false;
-}
-
-void BakeManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
-{
-}
+ KernelIntegrator *kintegrator = &dscene->data.integrator;
+ KernelBake *kbake = &dscene->data.bake;
-int BakeManager::aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType type)
-{
- if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) {
- return 1;
- }
- else if (type == SHADER_EVAL_NORMAL) {
- /* Only antialias normal if mesh has bump mapping. */
- Object *object = scene->objects[bake_data->object()];
+ kbake->type = type;
+ kbake->pass_filter = pass_filter;
- if (object->mesh) {
- foreach (Shader *shader, object->mesh->used_shaders) {
- if (shader->has_bump) {
- return scene->integrator->aa_samples;
- }
- }
+ int object_index = 0;
+ foreach (Object *object, scene->objects) {
+ const Geometry *geom = object->geometry;
+ if (object->name == object_name && geom->type == Geometry::MESH) {
+ kbake->object_index = object_index;
+ kbake->tri_offset = geom->prim_offset;
+ kintegrator->aa_samples = aa_samples(scene, object, type);
+ break;
}
- return 1;
- }
- else {
- return scene->integrator->aa_samples;
+ object_index++;
}
+
+ need_update = false;
}
-/* Keep it synced with kernel_bake.h logic */
-int BakeManager::shader_type_to_pass_filter(ShaderEvalType type, const int pass_filter)
+void BakeManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
{
- const int component_flags = pass_filter &
- (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR);
-
- switch (type) {
- case SHADER_EVAL_AO:
- return BAKE_FILTER_AO;
- case SHADER_EVAL_SHADOW:
- return BAKE_FILTER_DIRECT;
- case SHADER_EVAL_DIFFUSE:
- return BAKE_FILTER_DIFFUSE | component_flags;
- case SHADER_EVAL_GLOSSY:
- return BAKE_FILTER_GLOSSY | component_flags;
- case SHADER_EVAL_TRANSMISSION:
- return BAKE_FILTER_TRANSMISSION | component_flags;
- case SHADER_EVAL_SUBSURFACE:
- return BAKE_FILTER_SUBSURFACE | component_flags;
- case SHADER_EVAL_COMBINED:
- return pass_filter;
- default:
- return 0;
- }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 88537623efb..93e664c2ab1 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -25,67 +25,23 @@
CCL_NAMESPACE_BEGIN
-class BakeData {
- public:
- BakeData(const int object, const size_t tri_offset, const size_t num_pixels);
- ~BakeData();
-
- void set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy);
- void set_null(int i);
- int object();
- size_t size();
- uint4 data(int i);
- uint4 differentials(int i);
- bool is_valid(int i);
-
- private:
- int m_object;
- size_t m_tri_offset;
- size_t m_num_pixels;
- vector<int> m_primitive;
- vector<float> m_u;
- vector<float> m_v;
- vector<float> m_dudx;
- vector<float> m_dudy;
- vector<float> m_dvdx;
- vector<float> m_dvdy;
-};
-
class BakeManager {
public:
BakeManager();
~BakeManager();
+ void set(Scene *scene, const std::string &object_name, ShaderEvalType type, int pass_filter);
bool get_baking();
- void set_baking(const bool value);
-
- BakeData *init(const int object, const size_t tri_offset, const size_t num_pixels);
-
- void set_shader_limit(const size_t x, const size_t y);
-
- bool bake(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- Progress &progress,
- ShaderEvalType shader_type,
- const int pass_filter,
- BakeData *bake_data,
- float result[]);
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
void device_free(Device *device, DeviceScene *dscene);
- static int shader_type_to_pass_filter(ShaderEvalType type, const int pass_filter);
- static int aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType type);
-
bool need_update;
- size_t total_pixel_samples;
-
private:
- BakeData *m_bake_data;
- bool m_is_baking;
- size_t m_shader_limit;
+ ShaderEvalType type;
+ int pass_filter;
+ std::string object_name;
};
CCL_NAMESPACE_END
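
Aside: shader_type_to_pass_filter() above composes a per-bake-type base filter with the user-requested direct/indirect/color component flags by bitwise OR. A small standalone sketch of that composition, with simplified enum values that are not the real BAKE_FILTER_* constants:

    #include <cstdio>

    /* Simplified stand-ins for the bake filter flags and ShaderEvalType. */
    enum PassFilter : int {
      FILTER_DIRECT = 1 << 0,
      FILTER_INDIRECT = 1 << 1,
      FILTER_COLOR = 1 << 2,
      FILTER_DIFFUSE = 1 << 3,
      FILTER_GLOSSY = 1 << 4,
    };

    enum EvalType { EVAL_DIFFUSE, EVAL_GLOSSY };

    static int type_to_pass_filter(EvalType type, int requested)
    {
      /* Keep only the shared component flags from the user request... */
      const int components = requested & (FILTER_DIRECT | FILTER_INDIRECT | FILTER_COLOR);
      /* ...and OR them onto the per-type base filter. */
      switch (type) {
        case EVAL_DIFFUSE:
          return FILTER_DIFFUSE | components;
        case EVAL_GLOSSY:
          return FILTER_GLOSSY | components;
      }
      return 0;
    }

    int main()
    {
      const int filter = type_to_pass_filter(EVAL_DIFFUSE, FILTER_DIRECT | FILTER_COLOR);
      std::printf("filter = 0x%x\n", filter); /* diffuse | direct | color */
      return 0;
    }
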
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 5405aaefc1d..b26366af852 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -16,8 +16,8 @@
#include <stdlib.h>
-#include "render/buffers.h"
#include "device/device.h"
+#include "render/buffers.h"
#include "util/util_foreach.h"
#include "util/util_hash.h"
@@ -57,7 +57,10 @@ bool BufferParams::modified(const BufferParams &params)
{
return !(full_x == params.full_x && full_y == params.full_y && width == params.width &&
height == params.height && full_width == params.full_width &&
- full_height == params.full_height && Pass::equals(passes, params.passes));
+ full_height == params.full_height && Pass::equals(passes, params.passes) &&
+ denoising_data_pass == params.denoising_data_pass &&
+ denoising_clean_pass == params.denoising_clean_pass &&
+ denoising_prefiltered_pass == params.denoising_prefiltered_pass);
}
int BufferParams::get_passes_size()
@@ -143,7 +146,7 @@ void RenderBuffers::reset(BufferParams &params_)
params = params_;
/* re-allocate buffer */
- buffer.alloc(params.width * params.height * params.get_passes_size());
+ buffer.alloc(params.width * params.get_passes_size(), params.height);
buffer.zero_to_device();
}
@@ -185,13 +188,28 @@ bool RenderBuffers::get_denoising_pass_rect(
offset = type + params.get_denoising_offset();
scale /= sample;
}
- else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) {
- /* If we're not saving the prefiltering result, return the original noisy pass. */
- offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
- scale /= sample;
+ else if (params.denoising_prefiltered_pass) {
+ offset = type + params.get_denoising_prefiltered_offset();
}
else {
- offset = type + params.get_denoising_prefiltered_offset();
+ switch (type) {
+ case DENOISING_PASS_PREFILTERED_DEPTH:
+ offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH;
+ break;
+ case DENOISING_PASS_PREFILTERED_NORMAL:
+ offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL;
+ break;
+ case DENOISING_PASS_PREFILTERED_ALBEDO:
+ offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO;
+ break;
+ case DENOISING_PASS_PREFILTERED_COLOR:
+ /* If we're not saving the prefiltering result, return the original noisy pass. */
+ offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
+ break;
+ default:
+ return false;
+ }
+ scale /= sample;
}
int pass_stride = params.get_passes_size();
@@ -217,9 +235,14 @@ bool RenderBuffers::get_denoising_pass_rect(
float *in_combined = buffer.data();
for (int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
- pixels[0] = in[0] * scale;
- pixels[1] = in[1] * scale;
- pixels[2] = in[2] * scale;
+ float3 val = make_float3(in[0], in[1], in[2]);
+ if (type == DENOISING_PASS_PREFILTERED_COLOR && params.denoising_prefiltered_pass) {
+ /* Remove highlight compression from the image. */
+ val = color_highlight_uncompress(val);
+ }
+ pixels[0] = val.x * scale;
+ pixels[1] = val.y * scale;
+ pixels[2] = val.z * scale;
pixels[3] = saturate(in_combined[3] * alpha_scale);
}
}
@@ -231,29 +254,41 @@ bool RenderBuffers::get_denoising_pass_rect(
}
bool RenderBuffers::get_pass_rect(
- PassType type, float exposure, int sample, int components, float *pixels, const string &name)
+ const string &name, float exposure, int sample, int components, float *pixels)
{
if (buffer.data() == NULL) {
return false;
}
+ float *sample_count = NULL;
+ if (name == "Combined") {
+ int sample_offset = 0;
+ for (size_t j = 0; j < params.passes.size(); j++) {
+ Pass &pass = params.passes[j];
+ if (pass.type != PASS_SAMPLE_COUNT) {
+ sample_offset += pass.components;
+ continue;
+ }
+ else {
+ sample_count = buffer.data() + sample_offset;
+ break;
+ }
+ }
+ }
+
int pass_offset = 0;
for (size_t j = 0; j < params.passes.size(); j++) {
Pass &pass = params.passes[j];
- if (pass.type != type) {
+ /* Pass is identified by both type and name, multiple of the same type
+ * may exist with a different name. */
+ if (pass.name != name) {
pass_offset += pass.components;
continue;
}
- /* Tell Cryptomatte passes apart by their name. */
- if (pass.type == PASS_CRYPTOMATTE) {
- if (pass.name != name) {
- pass_offset += pass.components;
- continue;
- }
- }
+ PassType type = pass.type;
float *in = buffer.data() + pass_offset;
int pass_stride = params.get_passes_size();
@@ -401,6 +436,11 @@ bool RenderBuffers::get_pass_rect(
}
else {
for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+        if (sample_count && sample_count[i * pass_stride] < 0.0f) {
+          /* Negative sample count marks adaptive-sampling pixels; use its magnitude as the per-pixel sample count. */
+          scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
+          scale_exposure = (pass.exposure) ? scale * exposure : scale;
+        }
+
+
float4 f = make_float4(in[0], in[1], in[2], in[3]);
pixels[0] = f.x * scale_exposure;
@@ -419,6 +459,40 @@ bool RenderBuffers::get_pass_rect(
return false;
}
+bool RenderBuffers::set_pass_rect(PassType type, int components, float *pixels)
+{
+ if (buffer.data() == NULL) {
+ return false;
+ }
+
+ int pass_offset = 0;
+
+ for (size_t j = 0; j < params.passes.size(); j++) {
+ Pass &pass = params.passes[j];
+
+ if (pass.type != type) {
+ pass_offset += pass.components;
+ continue;
+ }
+
+ float *out = buffer.data() + pass_offset;
+ int pass_stride = params.get_passes_size();
+ int size = params.width * params.height;
+
+ assert(pass.components == components);
+
+ for (int i = 0; i < size; i++, out += pass_stride, pixels += components) {
+ for (int j = 0; j < components; j++) {
+ out[j] = pixels[j];
+ }
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
/* Display Buffer */
DisplayBuffer::DisplayBuffer(Device *device, bool linear)
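
Aside: both get_pass_rect() and the new set_pass_rect() walk the same interleaved buffer layout, where each pixel stores all pass components contiguously and a single pass is addressed with a component offset plus a stride equal to the total components per pixel. A standalone sketch of that layout with a made-up two-pass configuration:

    #include <cstdio>
    #include <vector>

    int main()
    {
      const int width = 4, height = 2;
      const int combined_components = 4, depth_components = 1;
      const int pass_stride = combined_components + depth_components; /* floats per pixel */

      std::vector<float> buffer(width * height * pass_stride, 0.0f);

      /* Write the depth pass, which starts right after the combined pass. */
      const int depth_offset = combined_components;
      float *out = buffer.data() + depth_offset;
      for (int i = 0; i < width * height; i++, out += pass_stride) {
        *out = 1.0f + i;
      }

      /* Read it back the same way: offset plus stride. */
      const float *in = buffer.data() + depth_offset;
      for (int i = 0; i < width * height; i++, in += pass_stride) {
        std::printf("%.1f ", *in);
      }
      std::printf("\n"); /* prints 1.0 ... 8.0 */
      return 0;
    }
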
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 1c49038cd4b..06b6094e6c9 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -52,7 +52,7 @@ class BufferParams {
/* passes */
vector<Pass> passes;
bool denoising_data_pass;
- /* If only some light path types should be denoised, an additional pass is needed. */
+  /* If only some light path types should be denoised, an additional pass is needed. */
bool denoising_clean_pass;
/* When we're prefiltering the passes during rendering, we need to keep both the
* original and the prefiltered data around because neighboring tiles might still
@@ -64,7 +64,6 @@ class BufferParams {
void get_offset_stride(int &offset, int &stride);
bool modified(const BufferParams &params);
- void add_pass(PassType type);
int get_passes_size();
int get_denoising_offset();
int get_denoising_prefiltered_offset();
@@ -89,14 +88,11 @@ class RenderBuffers {
void zero();
bool copy_from_device();
- bool get_pass_rect(PassType type,
- float exposure,
- int sample,
- int components,
- float *pixels,
- const string &name);
+ bool get_pass_rect(
+ const string &name, float exposure, int sample, int components, float *pixels);
bool get_denoising_pass_rect(
int offset, float exposure, int sample, int components, float *pixels);
+ bool set_pass_rect(PassType type, int components, float *pixels);
};
/* Display Buffer
@@ -135,7 +131,7 @@ class DisplayBuffer {
class RenderTile {
public:
- typedef enum { PATH_TRACE, DENOISE } Task;
+ typedef enum { PATH_TRACE = (1 << 0), BAKE = (1 << 1), DENOISE = (1 << 2) } Task;
Task task;
int x, y, w, h;
@@ -153,6 +149,50 @@ class RenderTile {
RenderBuffers *buffers;
RenderTile();
+
+ int4 bounds() const
+ {
+ return make_int4(x, /* xmin */
+ y, /* ymin */
+ x + w, /* xmax */
+ y + h); /* ymax */
+ }
+};
+
+/* Render Tile Neighbors
+ * Set of neighboring tiles used for denoising. Tile order:
+ * 0 1 2
+ * 3 4 5
+ * 6 7 8 */
+
+class RenderTileNeighbors {
+ public:
+ static const int SIZE = 9;
+ static const int CENTER = 4;
+
+ RenderTile tiles[SIZE];
+ RenderTile target;
+
+ RenderTileNeighbors(const RenderTile &center)
+ {
+ tiles[CENTER] = center;
+ }
+
+ int4 bounds() const
+ {
+ return make_int4(tiles[3].x, /* xmin */
+ tiles[1].y, /* ymin */
+ tiles[5].x + tiles[5].w, /* xmax */
+ tiles[7].y + tiles[7].h); /* ymax */
+ }
+
+ void set_bounds_from_center()
+ {
+ tiles[3].x = tiles[CENTER].x;
+ tiles[1].y = tiles[CENTER].y;
+ tiles[5].x = tiles[CENTER].x + tiles[CENTER].w;
+ tiles[7].y = tiles[CENTER].y + tiles[CENTER].h;
+ }
};
CCL_NAMESPACE_END
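
Aside: RenderTileNeighbors::bounds() relies on the fixed 3x3 ordering of the neighbor array, taking the left edge from tile 3, the top from tile 1, the right from tile 5 and the bottom from tile 7. A standalone sketch of that computation with a simplified tile type (not the real RenderTile):

    #include <cstdio>

    struct Tile {
      int x, y, w, h;
    };

    /* Stand-in for RenderTileNeighbors: 3x3 grid, center at index 4. */
    struct Neighbors {
      static const int SIZE = 9;
      Tile tiles[SIZE] = {};

      void bounds(int &xmin, int &ymin, int &xmax, int &ymax) const
      {
        xmin = tiles[3].x;              /* left neighbor */
        ymin = tiles[1].y;              /* top neighbor */
        xmax = tiles[5].x + tiles[5].w; /* right neighbor */
        ymax = tiles[7].y + tiles[7].h; /* bottom neighbor */
      }
    };

    int main()
    {
      Neighbors n;
      n.tiles[1] = {64, 0, 64, 64};   /* top */
      n.tiles[3] = {0, 64, 64, 64};   /* left */
      n.tiles[4] = {64, 64, 64, 64};  /* center */
      n.tiles[5] = {128, 64, 64, 64}; /* right */
      n.tiles[7] = {64, 128, 64, 64}; /* bottom */

      int xmin, ymin, xmax, ymax;
      n.bounds(xmin, ymin, xmax, ymax);
      std::printf("x: [%d, %d]  y: [%d, %d]\n", xmin, xmax, ymin, ymax); /* [0, 192] for both */
      return 0;
    }
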
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 9c9070c8a90..bbc111cb798 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -26,9 +26,11 @@
#include "util/util_function.h"
#include "util/util_logging.h"
#include "util/util_math_cdf.h"
+#include "util/util_task.h"
#include "util/util_vector.h"
/* needed for calculating differentials */
+// clang-format off
#include "kernel/kernel_compat_cpu.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
@@ -36,6 +38,7 @@
#include "kernel/kernel_differential.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_camera.h"
+// clang-format on
CCL_NAMESPACE_BEGIN
@@ -118,6 +121,8 @@ NODE_DEFINE(Camera)
stereo_eye_enum.insert("right", STEREO_RIGHT);
SOCKET_ENUM(stereo_eye, "Stereo Eye", stereo_eye_enum, STEREO_NONE);
+ SOCKET_BOOLEAN(use_spherical_stereo, "Use Spherical Stereo", false);
+
SOCKET_FLOAT(interocular_distance, "Interocular Distance", 0.065f);
SOCKET_FLOAT(convergence_distance, "Convergence Distance", 30.0f * 0.065f);
@@ -492,20 +497,35 @@ void Camera::device_update_volume(Device * /*device*/, DeviceScene *dscene, Scen
if (!need_device_update && !need_flags_update) {
return;
}
- KernelCamera *kcam = &dscene->data.cam;
- BoundBox viewplane_boundbox = viewplane_bounds_get();
- for (size_t i = 0; i < scene->objects.size(); ++i) {
- Object *object = scene->objects[i];
- if (object->mesh->has_volume && viewplane_boundbox.intersects(object->bounds)) {
- /* TODO(sergey): Consider adding more grained check. */
- VLOG(1) << "Detected camera inside volume.";
- kcam->is_inside_volume = 1;
- break;
+
+ KernelIntegrator *kintegrator = &dscene->data.integrator;
+ if (kintegrator->use_volumes) {
+ KernelCamera *kcam = &dscene->data.cam;
+ BoundBox viewplane_boundbox = viewplane_bounds_get();
+
+ /* Parallel object update, with grain size to avoid too much threading overhead
+ * for individual objects. */
+ static const int OBJECTS_PER_TASK = 32;
+ parallel_for(blocked_range<size_t>(0, scene->objects.size(), OBJECTS_PER_TASK),
+ [&](const blocked_range<size_t> &r) {
+ for (size_t i = r.begin(); i != r.end(); i++) {
+ Object *object = scene->objects[i];
+ if (object->geometry->has_volume &&
+ viewplane_boundbox.intersects(object->bounds)) {
+                       /* TODO(sergey): Consider adding a more fine-grained check. */
+ VLOG(1) << "Detected camera inside volume.";
+ kcam->is_inside_volume = 1;
+ parallel_for_cancel();
+ break;
+ }
+ }
+ });
+
+ if (!kcam->is_inside_volume) {
+ VLOG(1) << "Camera is outside of the volume.";
}
}
- if (!kcam->is_inside_volume) {
- VLOG(1) << "Camera is outside of the volume.";
- }
+
need_device_update = false;
need_flags_update = false;
}
@@ -563,8 +583,7 @@ float3 Camera::transform_raster_to_world(float raster_x, float raster_y)
BoundBox Camera::viewplane_bounds_get()
{
/* TODO(sergey): This is all rather stupid, but is there a way to perform
- * checks we need in a more clear and smart fasion?
- */
+ * checks we need in a more clear and smart fashion? */
BoundBox bounds = BoundBox::empty;
if (type == CAMERA_PANORAMA) {
@@ -642,7 +661,8 @@ float Camera::world_to_raster_size(float3 P)
float3 D = normalize(Ddiff);
res = len(dist * dDdx - dot(dist * dDdx, D) * D);
- /* Decent approx distance to frustum (doesn't handle corners correctly, but not that big of a deal) */
+ /* Decent approx distance to frustum
+ * (doesn't handle corners correctly, but not that big of a deal) */
float f_dist = 0.0f;
if (offscreen_dicing_scale > 1.0f) {
@@ -686,7 +706,8 @@ float Camera::world_to_raster_size(float3 P)
f_dist = max(f_dist, *d);
}
else {
- /* Possibly far enough behind the frustum to use distance to origin instead of edge */
+ /* Possibly far enough behind the frustum to use distance to origin instead of edge
+ */
test_o = true;
}
}
@@ -716,10 +737,17 @@ float Camera::world_to_raster_size(float3 P)
float3 raster = transform_perspective(&full_cameratoraster, make_float3(dir.x, dir.y, 0.0f));
ray.t = 1.0f;
- camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), raster.x, raster.y, 0.0f, 0.0f, &ray);
- if(ray.t == 0.0f) {
+ camera_sample_panorama(
+ &kernel_camera, kernel_camera_motion.data(), raster.x, raster.y, 0.0f, 0.0f, &ray);
+ if (ray.t == 0.0f) {
/* No differentials, just use from directly ahead. */
- camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*full_width, 0.5f*full_height, 0.0f, 0.0f, &ray);
+ camera_sample_panorama(&kernel_camera,
+ kernel_camera_motion.data(),
+ 0.5f * full_width,
+ 0.5f * full_height,
+ 0.0f,
+ 0.0f,
+ &ray);
}
#else
camera_sample_panorama(&kernel_camera,
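
Aside: the camera volume check above is now a parallel scan over the object list with a grain size of 32 objects per task. The sketch below shows the same shape written directly against TBB, using an std::atomic<bool> for the early-out instead of the cancellation helper from Cycles' util_task layer; the object type here is made up:

    #include <atomic>
    #include <cstdio>
    #include <vector>

    #include <tbb/blocked_range.h>
    #include <tbb/parallel_for.h>

    struct FakeObject {
      bool has_volume;
      bool intersects_viewplane;
    };

    int main()
    {
      std::vector<FakeObject> objects(10000); /* value-initialized: all false */
      objects[7777] = {true, true};

      std::atomic<bool> inside_volume(false);
      const size_t OBJECTS_PER_TASK = 32; /* grain size to limit per-task overhead */

      tbb::parallel_for(tbb::blocked_range<size_t>(0, objects.size(), OBJECTS_PER_TASK),
                        [&](const tbb::blocked_range<size_t> &r) {
                          if (inside_volume.load(std::memory_order_relaxed)) {
                            return; /* another task already found a hit */
                          }
                          for (size_t i = r.begin(); i != r.end(); i++) {
                            const FakeObject &object = objects[i];
                            if (object.has_volume && object.intersects_viewplane) {
                              inside_volume.store(true, std::memory_order_relaxed);
                              break;
                            }
                          }
                        });

      std::printf("camera inside volume: %s\n", inside_volume ? "yes" : "no");
      return 0;
    }
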
diff --git a/intern/cycles/render/colorspace.cpp b/intern/cycles/render/colorspace.cpp
new file mode 100644
index 00000000000..57979d5f225
--- /dev/null
+++ b/intern/cycles/render/colorspace.cpp
@@ -0,0 +1,395 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/colorspace.h"
+
+#include "util/util_color.h"
+#include "util/util_half.h"
+#include "util/util_image.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
+
+#ifdef WITH_OCIO
+# include <OpenColorIO/OpenColorIO.h>
+namespace OCIO = OCIO_NAMESPACE;
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Builtin colorspaces. */
+ustring u_colorspace_auto;
+ustring u_colorspace_raw("__builtin_raw");
+ustring u_colorspace_srgb("__builtin_srgb");
+
+/* Cached data. */
+#ifdef WITH_OCIO
+static thread_mutex cache_colorspaces_mutex;
+static thread_mutex cache_processors_mutex;
+static unordered_map<ustring, ustring, ustringHash> cached_colorspaces;
+static unordered_map<ustring, OCIO::ConstProcessorRcPtr, ustringHash> cached_processors;
+#endif
+
+ColorSpaceProcessor *ColorSpaceManager::get_processor(ustring colorspace)
+{
+#ifdef WITH_OCIO
+ /* Only use this for OpenColorIO color spaces, not the builtin ones. */
+ assert(colorspace != u_colorspace_srgb && colorspace != u_colorspace_auto);
+
+ if (colorspace == u_colorspace_raw) {
+ return NULL;
+ }
+
+ OCIO::ConstConfigRcPtr config = OCIO::GetCurrentConfig();
+ if (!config) {
+ return NULL;
+ }
+
+ /* Cache processor until free_memory(), memory overhead is expected to be
+ * small and the processor is likely to be reused. */
+ thread_scoped_lock cache_processors_lock(cache_processors_mutex);
+ if (cached_processors.find(colorspace) == cached_processors.end()) {
+ try {
+ cached_processors[colorspace] = config->getProcessor(colorspace.c_str(), "scene_linear");
+ }
+ catch (OCIO::Exception &exception) {
+ cached_processors[colorspace] = OCIO::ConstProcessorRcPtr();
+ VLOG(1) << "Colorspace " << colorspace.c_str()
+ << " can't be converted to scene_linear: " << exception.what();
+ }
+ }
+
+ const OCIO::Processor *processor = cached_processors[colorspace].get();
+ return (ColorSpaceProcessor *)processor;
+#else
+ /* No OpenColorIO. */
+ (void)colorspace;
+ return NULL;
+#endif
+}
+
+bool ColorSpaceManager::colorspace_is_data(ustring colorspace)
+{
+ if (colorspace == u_colorspace_auto || colorspace == u_colorspace_raw ||
+ colorspace == u_colorspace_srgb) {
+ return false;
+ }
+
+#ifdef WITH_OCIO
+ OCIO::ConstConfigRcPtr config = OCIO::GetCurrentConfig();
+ if (!config) {
+ return false;
+ }
+
+ try {
+ OCIO::ConstColorSpaceRcPtr space = config->getColorSpace(colorspace.c_str());
+ return space && space->isData();
+ }
+ catch (OCIO::Exception &) {
+ return false;
+ }
+#else
+ return false;
+#endif
+}
+
+ustring ColorSpaceManager::detect_known_colorspace(ustring colorspace,
+ const char *file_format,
+ bool is_float)
+{
+ if (colorspace == u_colorspace_auto) {
+ /* Auto detect sRGB or raw if none specified. */
+ if (is_float) {
+ bool srgb = (colorspace == "sRGB" || colorspace == "GammaCorrected" ||
+ (colorspace.empty() &&
+ (strcmp(file_format, "png") == 0 || strcmp(file_format, "tiff") == 0 ||
+ strcmp(file_format, "dpx") == 0 || strcmp(file_format, "jpeg2000") == 0)));
+ return srgb ? u_colorspace_srgb : u_colorspace_raw;
+ }
+ else {
+ return u_colorspace_srgb;
+ }
+ }
+ else if (colorspace == u_colorspace_srgb || colorspace == u_colorspace_raw) {
+ /* Builtin colorspaces. */
+ return colorspace;
+ }
+ else {
+ /* Use OpenColorIO. */
+#ifdef WITH_OCIO
+ {
+ thread_scoped_lock cache_lock(cache_colorspaces_mutex);
+ /* Cached lookup. */
+ if (cached_colorspaces.find(colorspace) != cached_colorspaces.end()) {
+ return cached_colorspaces[colorspace];
+ }
+ }
+
+ /* Detect if it matches a simple builtin colorspace. */
+ bool is_scene_linear, is_srgb;
+ is_builtin_colorspace(colorspace, is_scene_linear, is_srgb);
+
+ thread_scoped_lock cache_lock(cache_colorspaces_mutex);
+ if (is_scene_linear) {
+ VLOG(1) << "Colorspace " << colorspace.string() << " is no-op";
+ cached_colorspaces[colorspace] = u_colorspace_raw;
+ return u_colorspace_raw;
+ }
+ else if (is_srgb) {
+ VLOG(1) << "Colorspace " << colorspace.string() << " is sRGB";
+ cached_colorspaces[colorspace] = u_colorspace_srgb;
+ return u_colorspace_srgb;
+ }
+
+ /* Verify if we can convert from the requested color space. */
+ if (!get_processor(colorspace)) {
+ OCIO::ConstConfigRcPtr config = OCIO::GetCurrentConfig();
+ if (!config || !config->getColorSpace(colorspace.c_str())) {
+ VLOG(1) << "Colorspace " << colorspace.c_str() << " not found, using raw instead";
+ }
+ else {
+ VLOG(1) << "Colorspace " << colorspace.c_str()
+ << " can't be converted to scene_linear, using raw instead";
+ }
+ cached_colorspaces[colorspace] = u_colorspace_raw;
+ return u_colorspace_raw;
+ }
+
+ /* Convert to/from colorspace with OpenColorIO. */
+ VLOG(1) << "Colorspace " << colorspace.string() << " handled through OpenColorIO";
+ cached_colorspaces[colorspace] = colorspace;
+ return colorspace;
+#else
+ VLOG(1) << "Colorspace " << colorspace.c_str() << " not available, built without OpenColorIO";
+ return u_colorspace_raw;
+#endif
+ }
+}
+
+void ColorSpaceManager::is_builtin_colorspace(ustring colorspace,
+ bool &is_scene_linear,
+ bool &is_srgb)
+{
+#ifdef WITH_OCIO
+ const OCIO::Processor *processor = (const OCIO::Processor *)get_processor(colorspace);
+ if (!processor) {
+ is_scene_linear = false;
+ is_srgb = false;
+ return;
+ }
+
+ is_scene_linear = true;
+ is_srgb = true;
+ for (int i = 0; i < 256; i++) {
+ float v = i / 255.0f;
+
+ float cR[3] = {v, 0, 0};
+ float cG[3] = {0, v, 0};
+ float cB[3] = {0, 0, v};
+ float cW[3] = {v, v, v};
+ processor->applyRGB(cR);
+ processor->applyRGB(cG);
+ processor->applyRGB(cB);
+ processor->applyRGB(cW);
+
+ /* Make sure that there is no channel crosstalk. */
+ if (fabsf(cR[1]) > 1e-5f || fabsf(cR[2]) > 1e-5f || fabsf(cG[0]) > 1e-5f ||
+ fabsf(cG[2]) > 1e-5f || fabsf(cB[0]) > 1e-5f || fabsf(cB[1]) > 1e-5f) {
+ is_scene_linear = false;
+ is_srgb = false;
+ break;
+ }
+ /* Make sure that the three primaries combine linearly. */
+ if (!compare_floats(cR[0], cW[0], 1e-6f, 64) || !compare_floats(cG[1], cW[1], 1e-6f, 64) ||
+ !compare_floats(cB[2], cW[2], 1e-6f, 64)) {
+ is_scene_linear = false;
+ is_srgb = false;
+ break;
+ }
+ /* Make sure that the three channels behave identically. */
+ if (!compare_floats(cW[0], cW[1], 1e-6f, 64) || !compare_floats(cW[1], cW[2], 1e-6f, 64)) {
+ is_scene_linear = false;
+ is_srgb = false;
+ break;
+ }
+
+ float out_v = average(make_float3(cW[0], cW[1], cW[2]));
+ if (!compare_floats(v, out_v, 1e-6f, 64)) {
+ is_scene_linear = false;
+ }
+ if (!compare_floats(color_srgb_to_linear(v), out_v, 1e-6f, 64)) {
+ is_srgb = false;
+ }
+ }
+#else
+ (void)colorspace;
+ is_scene_linear = false;
+ is_srgb = false;
+#endif
+}
+
+#ifdef WITH_OCIO
+
+template<typename T> inline float4 cast_to_float4(T *data)
+{
+ return make_float4(util_image_cast_to_float(data[0]),
+ util_image_cast_to_float(data[1]),
+ util_image_cast_to_float(data[2]),
+ util_image_cast_to_float(data[3]));
+}
+
+template<typename T> inline void cast_from_float4(T *data, float4 value)
+{
+ data[0] = util_image_cast_from_float<T>(value.x);
+ data[1] = util_image_cast_from_float<T>(value.y);
+ data[2] = util_image_cast_from_float<T>(value.z);
+ data[3] = util_image_cast_from_float<T>(value.w);
+}
+
+/* Slower versions for all other data types, which need to convert to float and back. */
+template<typename T, bool compress_as_srgb = false>
+inline void processor_apply_pixels(const OCIO::Processor *processor, T *pixels, size_t num_pixels)
+{
+ /* TODO: implement faster version for when we know the conversion
+ * is a simple matrix transform between linear spaces. In that case
+ * un-premultiply is not needed. */
+
+ /* Process large images in chunks to keep temporary memory requirement down. */
+ const size_t chunk_size = std::min((size_t)(16 * 1024 * 1024), num_pixels);
+ vector<float4> float_pixels(chunk_size);
+
+ for (size_t j = 0; j < num_pixels; j += chunk_size) {
+ size_t width = std::min(chunk_size, num_pixels - j);
+
+ for (size_t i = 0; i < width; i++) {
+ float4 value = cast_to_float4(pixels + 4 * (j + i));
+
+ if (!(value.w <= 0.0f || value.w == 1.0f)) {
+ float inv_alpha = 1.0f / value.w;
+ value.x *= inv_alpha;
+ value.y *= inv_alpha;
+ value.z *= inv_alpha;
+ }
+
+ float_pixels[i] = value;
+ }
+
+ OCIO::PackedImageDesc desc((float *)float_pixels.data(), width, 1, 4);
+ processor->apply(desc);
+
+ for (size_t i = 0; i < width; i++) {
+ float4 value = float_pixels[i];
+
+ if (compress_as_srgb) {
+ value = color_linear_to_srgb_v4(value);
+ }
+
+ if (!(value.w <= 0.0f || value.w == 1.0f)) {
+ value.x *= value.w;
+ value.y *= value.w;
+ value.z *= value.w;
+ }
+
+ cast_from_float4(pixels + 4 * (j + i), value);
+ }
+ }
+}
+#endif
+
+template<typename T>
+void ColorSpaceManager::to_scene_linear(ustring colorspace,
+ T *pixels,
+ size_t num_pixels,
+ bool compress_as_srgb)
+{
+#ifdef WITH_OCIO
+ const OCIO::Processor *processor = (const OCIO::Processor *)get_processor(colorspace);
+
+ if (processor) {
+ if (compress_as_srgb) {
+ /* Compress output as sRGB. */
+ processor_apply_pixels<T, true>(processor, pixels, num_pixels);
+ }
+ else {
+ /* Write output as scene linear directly. */
+ processor_apply_pixels<T>(processor, pixels, num_pixels);
+ }
+ }
+#else
+ (void)colorspace;
+ (void)pixels;
+ (void)num_pixels;
+ (void)compress_as_srgb;
+#endif
+}
+
+void ColorSpaceManager::to_scene_linear(ColorSpaceProcessor *processor_,
+ float *pixel,
+ int channels)
+{
+#ifdef WITH_OCIO
+ const OCIO::Processor *processor = (const OCIO::Processor *)processor_;
+
+ if (processor) {
+ if (channels == 3) {
+ processor->applyRGB(pixel);
+ }
+ else if (channels == 4) {
+ if (pixel[3] == 1.0f || pixel[3] == 0.0f) {
+ /* Fast path for RGBA. */
+ processor->applyRGB(pixel);
+ }
+ else {
+ /* Un-associate and associate alpha since color management should not
+ * be affected by transparency. */
+ float alpha = pixel[3];
+ float inv_alpha = 1.0f / alpha;
+
+ pixel[0] *= inv_alpha;
+ pixel[1] *= inv_alpha;
+ pixel[2] *= inv_alpha;
+
+ processor->applyRGB(pixel);
+
+ pixel[0] *= alpha;
+ pixel[1] *= alpha;
+ pixel[2] *= alpha;
+ }
+ }
+ }
+#else
+ (void)processor_;
+ (void)pixel;
+ (void)channels;
+#endif
+}
+
+void ColorSpaceManager::free_memory()
+{
+#ifdef WITH_OCIO
+ map_free_memory(cached_colorspaces);
+  map_free_memory(cached_processors);
+#endif
+}
+
+/* Template instantiations so we don't have to inline functions. */
+template void ColorSpaceManager::to_scene_linear(ustring, uchar *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, ushort *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, half *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, float *, size_t, bool);
+
+CCL_NAMESPACE_END
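
Aside: get_processor() follows a cache-under-mutex pattern: look the colorspace up while holding the lock, build the expensive processor at most once on a miss, and keep it until free_memory(). A standalone sketch of that pattern with a stand-in Processor type instead of OCIO::ConstProcessorRcPtr:

    #include <cstdio>
    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    struct Processor { /* stand-in for the OCIO processor handle */
      std::string from;
    };

    static std::mutex cache_mutex;
    static std::unordered_map<std::string, std::shared_ptr<Processor>> cached_processors;

    static Processor *get_processor(const std::string &colorspace)
    {
      std::lock_guard<std::mutex> lock(cache_mutex);
      auto it = cached_processors.find(colorspace);
      if (it == cached_processors.end()) {
        /* Expensive construction happens at most once per colorspace. */
        it = cached_processors.emplace(colorspace, std::make_shared<Processor>(Processor{colorspace}))
                 .first;
      }
      return it->second.get();
    }

    static void free_memory()
    {
      std::lock_guard<std::mutex> lock(cache_mutex);
      cached_processors.clear();
    }

    int main()
    {
      Processor *a = get_processor("ACEScg");
      Processor *b = get_processor("ACEScg");
      std::printf("same cached processor: %s\n", (a == b) ? "yes" : "no");
      free_memory();
      return 0;
    }
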
diff --git a/intern/cycles/render/colorspace.h b/intern/cycles/render/colorspace.h
new file mode 100644
index 00000000000..51d0b121cc0
--- /dev/null
+++ b/intern/cycles/render/colorspace.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __COLORSPACE_H__
+#define __COLORSPACE_H__
+
+#include "util/util_map.h"
+#include "util/util_param.h"
+
+CCL_NAMESPACE_BEGIN
+
+extern ustring u_colorspace_auto;
+extern ustring u_colorspace_raw;
+extern ustring u_colorspace_srgb;
+
+class ColorSpaceProcessor;
+
+class ColorSpaceManager {
+ public:
+  /* Convert user-specified colorspace to a colorspace that we are able to
+   * convert to and from. If the colorspace is u_colorspace_auto, we auto
+   * detect a colorspace. */
+ static ustring detect_known_colorspace(ustring colorspace,
+ const char *file_format,
+ bool is_float);
+
+ /* Test if colorspace is for non-color data. */
+ static bool colorspace_is_data(ustring colorspace);
+
+ /* Convert pixels in the specified colorspace to scene linear color for
+ * rendering. Must be a colorspace returned from detect_known_colorspace. */
+ template<typename T>
+ static void to_scene_linear(ustring colorspace,
+ T *pixels,
+ size_t num_pixels,
+ bool compress_as_srgb);
+
+ /* Efficiently convert pixels to scene linear colorspace at render time,
+ * for OSL where the image texture cache contains original pixels. The
+ * handle is valid for the lifetime of the application. */
+ static ColorSpaceProcessor *get_processor(ustring colorspace);
+ static void to_scene_linear(ColorSpaceProcessor *processor, float *pixel, int channels);
+
+ /* Clear memory when the application exits. Invalidates all processors. */
+ static void free_memory();
+
+ private:
+ static void is_builtin_colorspace(ustring colorspace, bool &is_no_op, bool &is_srgb);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __COLORSPACE_H__ */
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index e475ff60eef..f3809ee8d80 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -301,7 +301,7 @@ void ConstantFolder::fold_mix(NodeMix type, bool clamp) const
}
}
-void ConstantFolder::fold_math(NodeMath type, bool clamp) const
+void ConstantFolder::fold_math(NodeMathType type) const
{
ShaderInput *value1_in = node->input("Value1");
ShaderInput *value2_in = node->input("Value2");
@@ -310,25 +310,25 @@ void ConstantFolder::fold_math(NodeMath type, bool clamp) const
case NODE_MATH_ADD:
/* X + 0 == 0 + X == X */
if (is_zero(value1_in)) {
- try_bypass_or_make_constant(value2_in, clamp);
+ try_bypass_or_make_constant(value2_in);
}
else if (is_zero(value2_in)) {
- try_bypass_or_make_constant(value1_in, clamp);
+ try_bypass_or_make_constant(value1_in);
}
break;
case NODE_MATH_SUBTRACT:
/* X - 0 == X */
if (is_zero(value2_in)) {
- try_bypass_or_make_constant(value1_in, clamp);
+ try_bypass_or_make_constant(value1_in);
}
break;
case NODE_MATH_MULTIPLY:
/* X * 1 == 1 * X == X */
if (is_one(value1_in)) {
- try_bypass_or_make_constant(value2_in, clamp);
+ try_bypass_or_make_constant(value2_in);
}
else if (is_one(value2_in)) {
- try_bypass_or_make_constant(value1_in, clamp);
+ try_bypass_or_make_constant(value1_in);
}
/* X * 0 == 0 * X == 0 */
else if (is_zero(value1_in) || is_zero(value2_in)) {
@@ -338,7 +338,7 @@ void ConstantFolder::fold_math(NodeMath type, bool clamp) const
case NODE_MATH_DIVIDE:
/* X / 1 == X */
if (is_one(value2_in)) {
- try_bypass_or_make_constant(value1_in, clamp);
+ try_bypass_or_make_constant(value1_in);
}
/* 0 / X == 0 */
else if (is_zero(value1_in)) {
@@ -352,17 +352,18 @@ void ConstantFolder::fold_math(NodeMath type, bool clamp) const
}
/* X ^ 1 == X */
else if (is_one(value2_in)) {
- try_bypass_or_make_constant(value1_in, clamp);
+ try_bypass_or_make_constant(value1_in);
}
default:
break;
}
}
-void ConstantFolder::fold_vector_math(NodeVectorMath type) const
+void ConstantFolder::fold_vector_math(NodeVectorMathType type) const
{
ShaderInput *vector1_in = node->input("Vector1");
ShaderInput *vector2_in = node->input("Vector2");
+ ShaderInput *scale_in = node->input("Scale");
switch (type) {
case NODE_VECTOR_MATH_ADD:
@@ -380,6 +381,27 @@ void ConstantFolder::fold_vector_math(NodeVectorMath type) const
try_bypass_or_make_constant(vector1_in);
}
break;
+ case NODE_VECTOR_MATH_MULTIPLY:
+ /* X * 0 == 0 * X == 0 */
+ if (is_zero(vector1_in) || is_zero(vector2_in)) {
+ make_zero();
+ } /* X * 1 == 1 * X == X */
+ else if (is_one(vector1_in)) {
+ try_bypass_or_make_constant(vector2_in);
+ }
+ else if (is_one(vector2_in)) {
+ try_bypass_or_make_constant(vector1_in);
+ }
+ break;
+ case NODE_VECTOR_MATH_DIVIDE:
+ /* X / 0 == 0 / X == 0 */
+ if (is_zero(vector1_in) || is_zero(vector2_in)) {
+ make_zero();
+ } /* X / 1 == X */
+ else if (is_one(vector2_in)) {
+ try_bypass_or_make_constant(vector1_in);
+ }
+ break;
case NODE_VECTOR_MATH_DOT_PRODUCT:
case NODE_VECTOR_MATH_CROSS_PRODUCT:
/* X * 0 == 0 * X == 0 */
@@ -387,9 +409,41 @@ void ConstantFolder::fold_vector_math(NodeVectorMath type) const
make_zero();
}
break;
+ case NODE_VECTOR_MATH_LENGTH:
+ case NODE_VECTOR_MATH_ABSOLUTE:
+ if (is_zero(vector1_in)) {
+ make_zero();
+ }
+ break;
+ case NODE_VECTOR_MATH_SCALE:
+ /* X * 0 == 0 * X == 0 */
+ if (is_zero(vector1_in) || is_zero(scale_in)) {
+ make_zero();
+ } /* X * 1 == X */
+ else if (is_one(scale_in)) {
+ try_bypass_or_make_constant(vector1_in);
+ }
+ break;
default:
break;
}
}
+void ConstantFolder::fold_mapping(NodeMappingType type) const
+{
+ ShaderInput *vector_in = node->input("Vector");
+ ShaderInput *location_in = node->input("Location");
+ ShaderInput *rotation_in = node->input("Rotation");
+ ShaderInput *scale_in = node->input("Scale");
+
+ if (is_zero(scale_in)) {
+ make_zero();
+ }
+ else if ((is_zero(location_in) || type == NODE_MAPPING_TYPE_VECTOR ||
+ type == NODE_MAPPING_TYPE_NORMAL) &&
+ is_zero(rotation_in) && is_one(scale_in)) {
+ try_bypass_or_make_constant(vector_in);
+ }
+}
+
CCL_NAMESPACE_END
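
Aside: fold_vector_math() and fold_mapping() above apply simple algebraic identities: an input of zero collapses the node to a constant zero, and a neutral input (scale of one, zero rotation and, where allowed, zero location) lets the folder bypass the node entirely. A standalone sketch of that decision logic, with made-up enum names rather than the real folder API:

    #include <cstdio>

    enum FoldResult { FOLD_NONE, FOLD_ZERO, FOLD_BYPASS_INPUT };

    /* Decide what a scale-like node folds to, given what is known about its inputs. */
    static FoldResult fold_scale(bool vector_is_zero, bool scale_is_zero, bool scale_is_one)
    {
      if (vector_is_zero || scale_is_zero) {
        return FOLD_ZERO; /* X * 0 == 0 * X == 0 */
      }
      if (scale_is_one) {
        return FOLD_BYPASS_INPUT; /* X * 1 == X */
      }
      return FOLD_NONE;
    }

    int main()
    {
      std::printf("%d %d %d\n",
                  fold_scale(false, true, false),   /* FOLD_ZERO */
                  fold_scale(false, false, true),   /* FOLD_BYPASS_INPUT */
                  fold_scale(false, false, false)); /* FOLD_NONE */
      return 0;
    }
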
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index c14b94868dc..fec4123c361 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -17,8 +17,8 @@
#ifndef __CONSTANT_FOLD_H__
#define __CONSTANT_FOLD_H__
-#include "util/util_types.h"
#include "kernel/svm/svm_types.h"
+#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
@@ -64,8 +64,9 @@ class ConstantFolder {
/* Specific nodes. */
void fold_mix(NodeMix type, bool clamp) const;
- void fold_math(NodeMath type, bool clamp) const;
- void fold_vector_math(NodeVectorMath type) const;
+ void fold_math(NodeMathType type) const;
+ void fold_vector_math(NodeVectorMathType type) const;
+ void fold_mapping(NodeMappingType type) const;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp
index 0a29903728a..99d4daa6961 100644
--- a/intern/cycles/render/coverage.cpp
+++ b/intern/cycles/render/coverage.cpp
@@ -15,13 +15,16 @@
*/
#include "render/coverage.h"
+#include "render/buffers.h"
+
#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
+
#include "kernel/kernel_globals.h"
#include "kernel/kernel_id_passes.h"
-#include "kernel/kernel_types.h"
+
#include "util/util_map.h"
-#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h
index 3d1f6a2b040..12182c614da 100644
--- a/intern/cycles/render/coverage.h
+++ b/intern/cycles/render/coverage.h
@@ -14,18 +14,19 @@
* limitations under the License.
*/
-#include "render/buffers.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
+#ifndef __COVERAGE_H__
+#define __COVERAGE_H__
+
#include "util/util_map.h"
#include "util/util_vector.h"
-#ifndef __COVERAGE_H__
-# define __COVERAGE_H__
-
CCL_NAMESPACE_BEGIN
+struct KernelGlobals;
+class RenderTile;
+
+typedef unordered_map<float, float> CoverageMap;
+
class Coverage {
public:
Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_)
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index 49ab70541c2..db48d8b6430 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "device/device.h"
#include "render/curves.h"
+#include "device/device.h"
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
@@ -36,13 +36,12 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim)
float *p2 = &p[2].x;
float *p3 = &p[3].x;
- float fc = 0.71f;
+ /* Catmull-Rom weights. */
float curve_coef[4];
curve_coef[0] = p1[dim];
- curve_coef[1] = -fc * p0[dim] + fc * p2[dim];
- curve_coef[2] = 2.0f * fc * p0[dim] + (fc - 3.0f) * p1[dim] + (3.0f - 2.0f * fc) * p2[dim] -
- fc * p3[dim];
- curve_coef[3] = -fc * p0[dim] + (2.0f - fc) * p1[dim] + (fc - 2.0f) * p2[dim] + fc * p3[dim];
+ curve_coef[1] = 0.5f * (-p0[dim] + p2[dim]);
+ curve_coef[2] = 0.5f * (2 * p0[dim] - 5 * p1[dim] + 4 * p2[dim] - p3[dim]);
+ curve_coef[3] = 0.5f * (-p0[dim] + 3 * p1[dim] - 3 * p2[dim] + p3[dim]);
float discroot = curve_coef[2] * curve_coef[2] - 3 * curve_coef[3] * curve_coef[1];
float ta = -1.0f;
@@ -77,112 +76,4 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim)
*lower = min(*lower, min(exa, exb));
}
-/* Hair System Manager */
-
-CurveSystemManager::CurveSystemManager()
-{
- primitive = CURVE_LINE_SEGMENTS;
- curve_shape = CURVE_THICK;
- line_method = CURVE_CORRECTED;
- triangle_method = CURVE_CAMERA_TRIANGLES;
- resolution = 3;
- subdivisions = 3;
-
- minimum_width = 0.0f;
- maximum_width = 0.0f;
-
- use_curves = true;
- use_encasing = true;
- use_backfacing = false;
- use_tangent_normal_geometry = false;
-
- need_update = true;
- need_mesh_update = false;
-}
-
-CurveSystemManager::~CurveSystemManager()
-{
-}
-
-void CurveSystemManager::device_update(Device *device,
- DeviceScene *dscene,
- Scene * /*scene*/,
- Progress &progress)
-{
- if (!need_update)
- return;
-
- device_free(device, dscene);
-
- progress.set_status("Updating Hair settings", "Copying Hair settings to device");
-
- KernelCurves *kcurve = &dscene->data.curve;
-
- kcurve->curveflags = 0;
-
- if (use_curves) {
- if (primitive == CURVE_SEGMENTS || primitive == CURVE_RIBBONS)
- kcurve->curveflags |= CURVE_KN_INTERPOLATE;
- if (primitive == CURVE_RIBBONS)
- kcurve->curveflags |= CURVE_KN_RIBBONS;
-
- if (line_method == CURVE_ACCURATE)
- kcurve->curveflags |= CURVE_KN_ACCURATE;
- else if (line_method == CURVE_CORRECTED)
- kcurve->curveflags |= CURVE_KN_INTERSECTCORRECTION;
-
- if (use_tangent_normal_geometry)
- kcurve->curveflags |= CURVE_KN_TRUETANGENTGNORMAL;
- if (use_backfacing)
- kcurve->curveflags |= CURVE_KN_BACKFACING;
- if (use_encasing)
- kcurve->curveflags |= CURVE_KN_ENCLOSEFILTER;
-
- kcurve->minimum_width = minimum_width;
- kcurve->maximum_width = maximum_width;
- kcurve->subdivisions = subdivisions;
- }
-
- if (progress.get_cancel())
- return;
-
- need_update = false;
-}
-
-void CurveSystemManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/)
-{
-}
-
-bool CurveSystemManager::modified(const CurveSystemManager &CurveSystemManager)
-{
- return !(
- curve_shape == CurveSystemManager.curve_shape &&
- line_method == CurveSystemManager.line_method && primitive == CurveSystemManager.primitive &&
- use_encasing == CurveSystemManager.use_encasing &&
- use_tangent_normal_geometry == CurveSystemManager.use_tangent_normal_geometry &&
- minimum_width == CurveSystemManager.minimum_width &&
- maximum_width == CurveSystemManager.maximum_width &&
- use_backfacing == CurveSystemManager.use_backfacing &&
- triangle_method == CurveSystemManager.triangle_method &&
- resolution == CurveSystemManager.resolution && use_curves == CurveSystemManager.use_curves &&
- subdivisions == CurveSystemManager.subdivisions);
-}
-
-bool CurveSystemManager::modified_mesh(const CurveSystemManager &CurveSystemManager)
-{
- return !(
- primitive == CurveSystemManager.primitive && curve_shape == CurveSystemManager.curve_shape &&
- triangle_method == CurveSystemManager.triangle_method &&
- resolution == CurveSystemManager.resolution && use_curves == CurveSystemManager.use_curves);
-}
-
-void CurveSystemManager::tag_update(Scene * /*scene*/)
-{
- need_update = true;
-}
-
-void CurveSystemManager::tag_update_mesh()
-{
- need_mesh_update = true;
-}
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index 81e7b4ac88d..c52fcb9c882 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -20,6 +20,8 @@
#include "util/util_array.h"
#include "util/util_types.h"
+#include "render/hair.h"
+
CCL_NAMESPACE_BEGIN
class Device;
@@ -29,33 +31,6 @@ class Scene;
void curvebounds(float *lower, float *upper, float3 *p, int dim);
-typedef enum CurvePrimitiveType {
- CURVE_TRIANGLES = 0,
- CURVE_LINE_SEGMENTS = 1,
- CURVE_SEGMENTS = 2,
- CURVE_RIBBONS = 3,
-
- CURVE_NUM_PRIMITIVE_TYPES,
-} CurvePrimitiveType;
-
-typedef enum CurveShapeType {
- CURVE_RIBBON = 0,
- CURVE_THICK = 1,
-
- CURVE_NUM_SHAPE_TYPES,
-} CurveShapeType;
-
-typedef enum CurveTriangleMethod {
- CURVE_CAMERA_TRIANGLES,
- CURVE_TESSELATED_TRIANGLES
-} CurveTriangleMethod;
-
-typedef enum CurveLineMethod {
- CURVE_ACCURATE,
- CURVE_CORRECTED,
- CURVE_UNCORRECTED
-} CurveLineMethod;
-
class ParticleCurveData {
public:
@@ -75,46 +50,12 @@ class ParticleCurveData {
array<int> curve_keynum;
array<float> curve_length;
array<float2> curve_uv;
- array<float3> curve_vcol;
+ array<float4> curve_vcol;
array<float3> curvekey_co;
array<float> curvekey_time;
};
-/* HairSystem Manager */
-
-class CurveSystemManager {
- public:
- CurvePrimitiveType primitive;
- CurveShapeType curve_shape;
- CurveLineMethod line_method;
- CurveTriangleMethod triangle_method;
- int resolution;
- int subdivisions;
-
- float minimum_width;
- float maximum_width;
-
- bool use_curves;
- bool use_encasing;
- bool use_backfacing;
- bool use_tangent_normal_geometry;
-
- bool need_update;
- bool need_mesh_update;
-
- CurveSystemManager();
- ~CurveSystemManager();
-
- void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
- void device_free(Device *device, DeviceScene *dscene);
- bool modified(const CurveSystemManager &CurveSystemManager);
- bool modified_mesh(const CurveSystemManager &CurveSystemManager);
-
- void tag_update(Scene *scene);
- void tag_update_mesh();
-};
-
CCL_NAMESPACE_END
#endif /* __CURVES_H__ */
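Two things in the curves.h change above are easy to miss: the per-scene CurveSystemManager and the curve enums are removed (the surviving shape/primitive types now live in render/hair.h, and the shape becomes a per-Hair setting filled in from the scene parameters), and ParticleCurveData::curve_vcol widens from float3 to float4, so exported hair vertex colors now carry an alpha component. The sketch below only illustrates the RGB-to-RGBA widening; Col3/Col4 are simplified stand-ins for the Cycles float3/float4 types, not the actual API.

    #include <vector>

    /* Simplified stand-ins, for illustration only. */
    struct Col3 { float r, g, b; };
    struct Col4 { float r, g, b, a; };

    /* Widen RGB vertex colors to RGBA with an opaque alpha, the way an
     * exporter has to once curve_vcol stores float4 instead of float3. */
    static std::vector<Col4> widen_vertex_colors(const std::vector<Col3> &rgb)
    {
      std::vector<Col4> rgba;
      rgba.reserve(rgb.size());
      for (const Col3 &c : rgb) {
        rgba.push_back({c.r, c.g, c.b, 1.0f});
      }
      return rgba;
    }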
diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp
index c4f21d9c771..76408ca4849 100644
--- a/intern/cycles/render/denoising.cpp
+++ b/intern/cycles/render/denoising.cpp
@@ -21,6 +21,7 @@
#include "util/util_foreach.h"
#include "util/util_map.h"
#include "util/util_system.h"
+#include "util/util_task.h"
#include "util/util_time.h"
#include <OpenImageIO/filesystem.h>
@@ -69,8 +70,8 @@ static void print_progress(int num, int total, int frame, int num_frames)
fflush(stdout);
}
-/* Splits in at its last dot, setting suffix to the part after the dot and in to the part before it.
- * Returns whether a dot was found. */
+/* Splits in at its last dot, setting suffix to the part after the dot and in to the part before
+ * it. Returns whether a dot was found. */
static bool split_last_dot(string &in, string &suffix)
{
size_t pos = in.rfind(".");
@@ -84,9 +85,8 @@ static bool split_last_dot(string &in, string &suffix)
/* Separate channel names as generated by Blender.
* If views is true:
- * Inputs are expected in the form RenderLayer.Pass.View.Channel, sets renderlayer to "RenderLayer.View"
- * Otherwise:
- * Inputs are expected in the form RenderLayer.Pass.Channel */
+ *   Inputs are expected in the form RenderLayer.Pass.View.Channel, and renderlayer is set to
+ *   "RenderLayer.View". Otherwise: Inputs are expected in the form RenderLayer.Pass.Channel */
static bool parse_channel_name(
string name, string &renderlayer, string &pass, string &channel, bool multiview_channels)
{
@@ -271,42 +271,45 @@ bool DenoiseTask::acquire_tile(Device *device, Device *tile_device, RenderTile &
*
* However, since there is only one large memory, the denoised result has to be written to
* a different buffer to avoid having to copy an entire horizontal slice of the image. */
-void DenoiseTask::map_neighboring_tiles(RenderTile *tiles, Device *tile_device)
+void DenoiseTask::map_neighboring_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
{
+ RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+ RenderTile &target_tile = neighbors.target;
+
/* Fill tile information. */
- for (int i = 0; i < 9; i++) {
- if (i == 4) {
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ if (i == RenderTileNeighbors::CENTER) {
continue;
}
+ RenderTile &tile = neighbors.tiles[i];
int dx = (i % 3) - 1;
int dy = (i / 3) - 1;
- tiles[i].x = clamp(tiles[4].x + dx * denoiser->tile_size.x, 0, image.width);
- tiles[i].w = clamp(tiles[4].x + (dx + 1) * denoiser->tile_size.x, 0, image.width) - tiles[i].x;
- tiles[i].y = clamp(tiles[4].y + dy * denoiser->tile_size.y, 0, image.height);
- tiles[i].h = clamp(tiles[4].y + (dy + 1) * denoiser->tile_size.y, 0, image.height) -
- tiles[i].y;
+ tile.x = clamp(center_tile.x + dx * denoiser->tile_size.x, 0, image.width);
+ tile.w = clamp(center_tile.x + (dx + 1) * denoiser->tile_size.x, 0, image.width) - tile.x;
+ tile.y = clamp(center_tile.y + dy * denoiser->tile_size.y, 0, image.height);
+ tile.h = clamp(center_tile.y + (dy + 1) * denoiser->tile_size.y, 0, image.height) - tile.y;
- tiles[i].buffer = tiles[4].buffer;
- tiles[i].offset = tiles[4].offset;
- tiles[i].stride = image.width;
+ tile.buffer = center_tile.buffer;
+ tile.offset = center_tile.offset;
+ tile.stride = image.width;
}
/* Allocate output buffer. */
device_vector<float> *output_mem = new device_vector<float>(
tile_device, "denoising_output", MEM_READ_WRITE);
- output_mem->alloc(OUTPUT_NUM_CHANNELS * tiles[4].w * tiles[4].h);
+ output_mem->alloc(OUTPUT_NUM_CHANNELS * center_tile.w * center_tile.h);
/* Fill output buffer with noisy image, assumed by kernel_filter_finalize
* when skipping denoising of some pixels. */
float *result = output_mem->data();
- float *in = &image.pixels[image.num_channels * (tiles[4].y * image.width + tiles[4].x)];
+ float *in = &image.pixels[image.num_channels * (center_tile.y * image.width + center_tile.x)];
const DenoiseImageLayer &layer = image.layers[current_layer];
const int *input_to_image_channel = layer.input_to_image_channel.data();
- for (int y = 0; y < tiles[4].h; y++) {
- for (int x = 0; x < tiles[4].w; x++, result += OUTPUT_NUM_CHANNELS) {
+ for (int y = 0; y < center_tile.h; y++) {
+ for (int x = 0; x < center_tile.w; x++, result += OUTPUT_NUM_CHANNELS) {
for (int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
result[i] = in[image.num_channels * x + input_to_image_channel[INPUT_NOISY_IMAGE + i]];
}
@@ -317,35 +320,38 @@ void DenoiseTask::map_neighboring_tiles(RenderTile *tiles, Device *tile_device)
output_mem->copy_to_device();
/* Fill output tile info. */
- tiles[9] = tiles[4];
- tiles[9].buffer = output_mem->device_pointer;
- tiles[9].stride = tiles[9].w;
- tiles[9].offset -= tiles[9].x + tiles[9].y * tiles[9].stride;
+ target_tile = center_tile;
+ target_tile.buffer = output_mem->device_pointer;
+ target_tile.stride = target_tile.w;
+ target_tile.offset -= target_tile.x + target_tile.y * target_tile.stride;
thread_scoped_lock output_lock(output_mutex);
- assert(output_pixels.count(tiles[4].tile_index) == 0);
- output_pixels[tiles[9].tile_index] = output_mem;
+ assert(output_pixels.count(center_tile.tile_index) == 0);
+ output_pixels[target_tile.tile_index] = output_mem;
}
-void DenoiseTask::unmap_neighboring_tiles(RenderTile *tiles)
+void DenoiseTask::unmap_neighboring_tiles(RenderTileNeighbors &neighbors)
{
+ RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+ RenderTile &target_tile = neighbors.target;
+
thread_scoped_lock output_lock(output_mutex);
- assert(output_pixels.count(tiles[4].tile_index) == 1);
- device_vector<float> *output_mem = output_pixels[tiles[9].tile_index];
- output_pixels.erase(tiles[4].tile_index);
+ assert(output_pixels.count(center_tile.tile_index) == 1);
+ device_vector<float> *output_mem = output_pixels[target_tile.tile_index];
+ output_pixels.erase(center_tile.tile_index);
output_lock.unlock();
/* Copy denoised pixels from device. */
- output_mem->copy_from_device(0, OUTPUT_NUM_CHANNELS * tiles[9].w, tiles[9].h);
+ output_mem->copy_from_device(0, OUTPUT_NUM_CHANNELS * target_tile.w, target_tile.h);
float *result = output_mem->data();
- float *out = &image.pixels[image.num_channels * (tiles[9].y * image.width + tiles[9].x)];
+ float *out = &image.pixels[image.num_channels * (target_tile.y * image.width + target_tile.x)];
const DenoiseImageLayer &layer = image.layers[current_layer];
const int *output_to_image_channel = layer.output_to_image_channel.data();
- for (int y = 0; y < tiles[9].h; y++) {
- for (int x = 0; x < tiles[9].w; x++, result += OUTPUT_NUM_CHANNELS) {
+ for (int y = 0; y < target_tile.h; y++) {
+ for (int x = 0; x < target_tile.w; x++, result += OUTPUT_NUM_CHANNELS) {
for (int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
out[image.num_channels * x + output_to_image_channel[i]] = result[i];
}
@@ -378,8 +384,9 @@ void DenoiseTask::create_task(DeviceTask &task)
/* Denoising parameters. */
task.denoising = denoiser->params;
- task.denoising_do_filter = true;
- task.denoising_write_passes = false;
+ task.denoising.type = DENOISER_NLM;
+ task.denoising.use = true;
+ task.denoising.store_passes = false;
task.denoising_from_render = false;
task.denoising_frames.resize(neighbor_frames.size());
@@ -492,6 +499,19 @@ bool DenoiseTask::load_input_pixels(int layer)
}
}
+ /* Highlight compression */
+ data = buffer_data + 8;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ int idx = INPUT_NUM_CHANNELS * (y * w + x);
+ float3 color = make_float3(data[idx], data[idx + 1], data[idx + 2]);
+ color = color_highlight_compress(color, NULL);
+ data[idx] = color.x;
+ data[idx + 1] = color.y;
+ data[idx + 2] = color.z;
+ }
+ }
+
buffer_data += frame_stride;
}
@@ -631,7 +651,8 @@ bool DenoiseImage::parse_channels(const ImageSpec &in_spec, string &error)
layer.name = name;
layer.samples = samples;
- /* If the sample value isn't set yet, check if there is a layer-specific one in the input file. */
+ /* If the sample value isn't set yet, check if there is a layer-specific one in the input file.
+ */
if (layer.samples < 1) {
string sample_string = in_spec.get_string_attribute("cycles." + name + ".samples", "");
if (sample_string != "") {
@@ -853,8 +874,10 @@ Denoiser::Denoiser(DeviceInfo &device_info)
TaskScheduler::init();
/* Initialize device. */
- DeviceRequestedFeatures req;
device = Device::create(device_info, stats, profiler, true);
+
+ DeviceRequestedFeatures req;
+ req.use_denoising = true;
device->load_kernels(req);
}
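The map_neighboring_tiles() rework above keeps the same 3x3 neighborhood arithmetic, only addressed through RenderTileNeighbors instead of a raw tile array: neighbor i maps to the offset (i % 3 - 1, i / 3 - 1) around the center tile, and its rectangle is clamped to the image bounds. A minimal standalone sketch of that indexing, using hypothetical Rect/neighbor_rect names rather than the Cycles types:

    #include <algorithm>
    #include <cstdio>

    struct Rect { int x, y, w, h; };

    /* Clamped rectangle of neighbor i (0..8, 4 = center) around a center tile,
     * mirroring the (i % 3 - 1, i / 3 - 1) indexing used above. */
    static Rect neighbor_rect(const Rect &center, int i, int tile_w, int tile_h, int image_w, int image_h)
    {
      const int dx = (i % 3) - 1;
      const int dy = (i / 3) - 1;
      Rect r;
      r.x = std::min(std::max(center.x + dx * tile_w, 0), image_w);
      r.w = std::min(std::max(center.x + (dx + 1) * tile_w, 0), image_w) - r.x;
      r.y = std::min(std::max(center.y + dy * tile_h, 0), image_h);
      r.h = std::min(std::max(center.y + (dy + 1) * tile_h, 0), image_h) - r.y;
      return r;
    }

    int main()
    {
      /* A 64x64 center tile in the top-left corner of a 100x100 image: the
       * neighbors to the left and above clamp to zero width/height. */
      Rect center = {0, 0, 64, 64};
      for (int i = 0; i < 9; i++) {
        Rect r = neighbor_rect(center, i, 64, 64, 100, 100);
        std::printf("tile %d: x=%d y=%d w=%d h=%d\n", i, r.x, r.y, r.w, r.h);
      }
      return 0;
    }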
diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h
index dcb842a4603..c1b4d0a5596 100644
--- a/intern/cycles/render/denoising.h
+++ b/intern/cycles/render/denoising.h
@@ -23,8 +23,8 @@
#include "render/buffers.h"
#include "util/util_string.h"
-#include "util/util_vector.h"
#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
#include <OpenImageIO/imageio.h>
@@ -87,14 +87,17 @@ struct DenoiseImageLayer {
/* input_to_image_channel of the secondary frames, if any are used. */
vector<vector<int>> neighbor_input_to_image_channel;
- /* Write i-th channel of the processing output to output_to_image_channel[i]-th channel of the file. */
+ /* Write i-th channel of the processing output to output_to_image_channel[i]-th channel of the
+ * file. */
vector<int> output_to_image_channel;
- /* Detect whether this layer contains a full set of channels and set up the offsets accordingly. */
+ /* Detect whether this layer contains a full set of channels and set up the offsets accordingly.
+ */
bool detect_denoising_channels();
/* Map the channels of a secondary frame to the channels that are required for processing,
- * fill neighbor_input_to_image_channel if all are present or return false if a channel are missing. */
+ * fill neighbor_input_to_image_channel if all are present, or return false if a channel is
+ * missing. */
bool match_channels(int neighbor,
const std::vector<string> &channelnames,
const std::vector<string> &neighbor_channelnames);
@@ -125,7 +128,8 @@ class DenoiseImage {
void free();
- /* Open the input image, parse its channels, open the output image and allocate the output buffer. */
+ /* Open the input image, parse its channels, open the output image and allocate the output
+ * buffer. */
bool load(const string &in_filepath, string &error);
/* Load neighboring frames. */
@@ -139,7 +143,8 @@ class DenoiseImage {
bool save_output(const string &out_filepath, string &error);
protected:
- /* Parse input file channels, separate them into DenoiseImageLayers, detect DenoiseImageLayers with full channel sets,
+ /* Parse input file channels, separate them into DenoiseImageLayers,
+ * detect DenoiseImageLayers with full channel sets,
* fill layers and set up the output channels and passthrough map. */
bool parse_channels(const ImageSpec &in_spec, string &error);
@@ -191,8 +196,8 @@ class DenoiseTask {
/* Device task callbacks */
bool acquire_tile(Device *device, Device *tile_device, RenderTile &tile);
- void map_neighboring_tiles(RenderTile *tiles, Device *tile_device);
- void unmap_neighboring_tiles(RenderTile *tiles);
+ void map_neighboring_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
+ void unmap_neighboring_tiles(RenderTileNeighbors &neighbors);
void release_tile();
bool get_cancel();
};
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index d6c44b66117..d7cbf4a3581 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "render/camera.h"
-#include "device/device.h"
#include "render/film.h"
+#include "device/device.h"
+#include "render/camera.h"
#include "render/integrator.h"
#include "render/mesh.h"
#include "render/scene.h"
@@ -41,7 +41,34 @@ static bool compare_pass_order(const Pass &a, const Pass &b)
void Pass::add(PassType type, vector<Pass> &passes, const char *name)
{
for (size_t i = 0; i < passes.size(); i++) {
- if (passes[i].type == type && (name ? (passes[i].name == name) : passes[i].name.empty())) {
+ if (passes[i].type != type) {
+ continue;
+ }
+
+ /* An empty name is used as a placeholder to signal that any pass of
+ * that type is fine (because the content always is the same).
+ * This is important to support divide_type: If the pass that has a
+ * divide_type is added first, a pass for divide_type with an empty
+ * name will be added. Then, if a matching pass with a name is later
+ * requested, the existing placeholder will be renamed to that.
+ * If the divide_type is explicitly allocated with a name first and
+ * then again as part of another pass, the second one will just be
+ * skipped because that type already exists. */
+
+ /* If no name is specified, any pass of the correct type will match. */
+ if (name == NULL) {
+ return;
+ }
+
+ /* If we already have a placeholder pass, rename that one. */
+ if (passes[i].name.empty()) {
+ passes[i].name = name;
+ return;
+ }
+
+ /* If neither existing nor requested pass have placeholder name, they
+ * must match. */
+ if (name == passes[i].name) {
return;
}
}
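The block above gives Pass::add() three outcomes when a pass of the requested type already exists: with no name, any existing pass of that type matches; with a name, an unnamed placeholder is renamed in place; otherwise only an exact name match is reused, and anything else falls through to appending a new pass. A small self-contained sketch of that decision, using a simplified pass struct rather than the real Pass type:

    #include <string>
    #include <vector>

    struct SimplePass { int type; std::string name; };

    /* Returns true if an existing pass matched (possibly after renaming a
     * placeholder), false if the caller should append a new pass. */
    static bool match_or_rename(std::vector<SimplePass> &passes, int type, const char *name)
    {
      for (SimplePass &p : passes) {
        if (p.type != type)
          continue;
        if (name == nullptr)
          return true;   /* any pass of this type is fine */
        if (p.name.empty()) {
          p.name = name; /* rename the placeholder */
          return true;
        }
        if (p.name == name)
          return true;   /* exact match already present */
      }
      return false;
    }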
@@ -128,7 +155,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
case PASS_DIFFUSE_COLOR:
case PASS_GLOSSY_COLOR:
case PASS_TRANSMISSION_COLOR:
- case PASS_SUBSURFACE_COLOR:
pass.components = 4;
break;
case PASS_DIFFUSE_DIRECT:
@@ -149,12 +175,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
pass.exposure = true;
pass.divide_type = PASS_TRANSMISSION_COLOR;
break;
- case PASS_SUBSURFACE_DIRECT:
- case PASS_SUBSURFACE_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_SUBSURFACE_COLOR;
- break;
case PASS_VOLUME_DIRECT:
case PASS_VOLUME_INDIRECT:
pass.components = 4;
@@ -163,6 +183,23 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
case PASS_CRYPTOMATTE:
pass.components = 4;
break;
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ pass.components = 4;
+ break;
+ case PASS_SAMPLE_COUNT:
+ pass.components = 1;
+ pass.exposure = false;
+ break;
+ case PASS_AOV_COLOR:
+ pass.components = 4;
+ break;
+ case PASS_AOV_VALUE:
+ pass.components = 1;
+ break;
+ case PASS_BAKE_PRIMITIVE:
+ case PASS_BAKE_DIFFERENTIAL:
+ pass.components = 4;
+ break;
default:
assert(false);
break;
@@ -170,9 +207,10 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
passes.push_back(pass);
- /* order from by components, to ensure alignment so passes with size 4
- * come first and then passes with size 1 */
- sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
+ /* Order by components, to ensure alignment so passes with size 4
+ * come first and then passes with size 1. Note this must use stable sort
+ * so cryptomatte passes remain in the right order. */
+ stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
if (pass.divide_type != PASS_NONE)
Pass::add(pass.divide_type, passes);
@@ -267,7 +305,7 @@ NODE_DEFINE(Film)
NodeType *type = NodeType::add("film", create);
SOCKET_FLOAT(exposure, "Exposure", 0.8f);
- SOCKET_FLOAT(pass_alpha_threshold, "Pass Alpha Threshold", 0.5f);
+ SOCKET_FLOAT(pass_alpha_threshold, "Pass Alpha Threshold", 0.0f);
static NodeEnum filter_enum;
filter_enum.insert("box", FILTER_BOX);
@@ -281,12 +319,11 @@ NODE_DEFINE(Film)
SOCKET_FLOAT(mist_depth, "Mist Depth", 100.0f);
SOCKET_FLOAT(mist_falloff, "Mist Falloff", 1.0f);
- SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
-
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+ SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
return type;
}
@@ -298,6 +335,7 @@ Film::Film() : Node(node_type)
use_light_visibility = false;
filter_table_offset = TABLE_OFFSET_INVALID;
cryptomatte_passes = CRYPT_NONE;
+ display_pass = PASS_COMBINED;
need_update = true;
}
@@ -318,9 +356,18 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
/* update __data */
kfilm->exposure = exposure;
kfilm->pass_flag = 0;
+
+ kfilm->display_pass_stride = -1;
+ kfilm->display_pass_components = 0;
+ kfilm->display_divide_pass_stride = -1;
+ kfilm->use_display_exposure = false;
+ kfilm->use_display_pass_alpha = (display_pass == PASS_COMBINED);
+
kfilm->light_pass_flag = 0;
kfilm->pass_stride = 0;
- kfilm->use_light_pass = use_light_visibility || use_sample_clamp;
+ kfilm->use_light_pass = use_light_visibility;
+ kfilm->pass_aov_value_num = 0;
+ kfilm->pass_aov_color_num = 0;
bool have_cryptomatte = false;
@@ -343,11 +390,13 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
if (pass.type <= PASS_CATEGORY_MAIN_END) {
kfilm->pass_flag |= pass_flag;
}
- else {
- assert(pass.type <= PASS_CATEGORY_LIGHT_END);
+ else if (pass.type <= PASS_CATEGORY_LIGHT_END) {
kfilm->use_light_pass = 1;
kfilm->light_pass_flag |= pass_flag;
}
+ else {
+ assert(pass.type <= PASS_CATEGORY_BAKE_END);
+ }
switch (pass.type) {
case PASS_COMBINED:
@@ -403,9 +452,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_TRANSMISSION_COLOR:
kfilm->pass_transmission_color = kfilm->pass_stride;
break;
- case PASS_SUBSURFACE_COLOR:
- kfilm->pass_subsurface_color = kfilm->pass_stride;
- break;
case PASS_DIFFUSE_INDIRECT:
kfilm->pass_diffuse_indirect = kfilm->pass_stride;
break;
@@ -415,9 +461,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_TRANSMISSION_INDIRECT:
kfilm->pass_transmission_indirect = kfilm->pass_stride;
break;
- case PASS_SUBSURFACE_INDIRECT:
- kfilm->pass_subsurface_indirect = kfilm->pass_stride;
- break;
case PASS_VOLUME_INDIRECT:
kfilm->pass_volume_indirect = kfilm->pass_stride;
break;
@@ -430,13 +473,17 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_TRANSMISSION_DIRECT:
kfilm->pass_transmission_direct = kfilm->pass_stride;
break;
- case PASS_SUBSURFACE_DIRECT:
- kfilm->pass_subsurface_direct = kfilm->pass_stride;
- break;
case PASS_VOLUME_DIRECT:
kfilm->pass_volume_direct = kfilm->pass_stride;
break;
+ case PASS_BAKE_PRIMITIVE:
+ kfilm->pass_bake_primitive = kfilm->pass_stride;
+ break;
+ case PASS_BAKE_DIFFERENTIAL:
+ kfilm->pass_bake_differential = kfilm->pass_stride;
+ break;
+
#ifdef WITH_CYCLES_DEBUG
case PASS_BVH_TRAVERSED_NODES:
kfilm->pass_bvh_traversed_nodes = kfilm->pass_stride;
@@ -459,11 +506,39 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride;
have_cryptomatte = true;
break;
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
+ break;
+ case PASS_SAMPLE_COUNT:
+ kfilm->pass_sample_count = kfilm->pass_stride;
+ break;
+ case PASS_AOV_COLOR:
+ if (kfilm->pass_aov_color_num == 0) {
+ kfilm->pass_aov_color = kfilm->pass_stride;
+ }
+ kfilm->pass_aov_color_num++;
+ break;
+ case PASS_AOV_VALUE:
+ if (kfilm->pass_aov_value_num == 0) {
+ kfilm->pass_aov_value = kfilm->pass_stride;
+ }
+ kfilm->pass_aov_value_num++;
+ break;
default:
assert(false);
break;
}
+ if (pass.type == display_pass) {
+ kfilm->display_pass_stride = kfilm->pass_stride;
+ kfilm->display_pass_components = pass.components;
+ kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f);
+ }
+ else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR ||
+ pass.type == PASS_GLOSSY_COLOR) {
+ kfilm->display_divide_pass_stride = kfilm->pass_stride;
+ }
+
kfilm->pass_stride += pass.components;
}
@@ -485,7 +560,18 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
}
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
- kfilm->pass_alpha_threshold = pass_alpha_threshold;
+
+ /* When displaying the normal/uv pass in the viewport we need to disable
+ * transparency.
+ *
+ * We also don't need to perform light accumulations. Later we want to optimize this to suppress
+ * light calculations. */
+ if (display_pass == PASS_NORMAL || display_pass == PASS_UV) {
+ kfilm->use_light_pass = 0;
+ }
+ else {
+ kfilm->pass_alpha_threshold = pass_alpha_threshold;
+ }
/* update filter table */
vector<float> table = filter_table(filter_type, filter_width);
@@ -518,18 +604,24 @@ bool Film::modified(const Film &film)
return !Node::equals(film) || !Pass::equals(passes, film.passes);
}
-void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_)
+void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes)
{
if (Pass::contains(passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) {
- scene->mesh_manager->tag_update(scene);
+ scene->geometry_manager->tag_update(scene);
foreach (Shader *shader, scene->shaders)
- shader->need_update_mesh = true;
+ shader->need_update_geometry = true;
+ }
+ else if (Pass::contains(passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) {
+ scene->geometry_manager->tag_update(scene);
+ }
+ else if (Pass::contains(passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) {
+ scene->integrator->tag_update(scene);
}
- else if (Pass::contains(passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION))
- scene->mesh_manager->tag_update(scene);
- passes = passes_;
+ if (update_passes) {
+ passes = passes_;
+ }
}
void Film::tag_update(Scene * /*scene*/)
@@ -537,4 +629,27 @@ void Film::tag_update(Scene * /*scene*/)
need_update = true;
}
+int Film::get_aov_offset(string name, bool &is_color)
+{
+ int num_color = 0, num_value = 0;
+ foreach (const Pass &pass, passes) {
+ if (pass.type == PASS_AOV_COLOR) {
+ num_color++;
+ }
+ else if (pass.type == PASS_AOV_VALUE) {
+ num_value++;
+ }
+ else {
+ continue;
+ }
+
+ if (pass.name == name) {
+ is_color = (pass.type == PASS_AOV_COLOR);
+ return (is_color ? num_color : num_value) - 1;
+ }
+ }
+
+ return -1;
+}
+
CCL_NAMESPACE_END
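Film::get_aov_offset() above returns the position of a named AOV among the AOVs of its own kind, counting color and value passes separately. For example, with AOV passes ordered (color "Albedo", value "Mask", color "Heat"), "Heat" yields offset 1 with is_color set, "Mask" yields 0 with is_color cleared, and an unknown name returns -1. A stripped-down sketch of the same counting, with simplified types standing in for Pass:

    #include <string>
    #include <vector>

    enum SimpleAOVType { AOV_COLOR, AOV_VALUE };
    struct SimpleAOV { SimpleAOVType type; std::string name; };

    /* Offset of `name` among AOVs of its own kind, or -1 if absent. Assumes
     * the vector only holds AOV passes; the real function skips the other
     * pass types while iterating. */
    static int aov_offset(const std::vector<SimpleAOV> &aovs, const std::string &name, bool &is_color)
    {
      int num_color = 0, num_value = 0;
      for (const SimpleAOV &aov : aovs) {
        if (aov.type == AOV_COLOR)
          num_color++;
        else
          num_value++;
        if (aov.name == name) {
          is_color = (aov.type == AOV_COLOR);
          return (is_color ? num_color : num_value) - 1;
        }
      }
      return -1;
    }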
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 1cfa7c3b77d..aae8fb404b0 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -64,6 +64,7 @@ class Film : public Node {
int denoising_flags;
float pass_alpha_threshold;
+ PassType display_pass;
int pass_stride;
int denoising_data_offset;
int denoising_clean_offset;
@@ -77,10 +78,11 @@ class Film : public Node {
float mist_falloff;
bool use_light_visibility;
- bool use_sample_clamp;
CryptomatteType cryptomatte_passes;
int cryptomatte_depth;
+ bool use_adaptive_sampling;
+
bool need_update;
Film();
@@ -90,8 +92,10 @@ class Film : public Node {
void device_free(Device *device, DeviceScene *dscene, Scene *scene);
bool modified(const Film &film);
- void tag_passes_update(Scene *scene, const vector<Pass> &passes_);
+ void tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes = true);
void tag_update(Scene *scene);
+
+ int get_aov_offset(string name, bool &is_color);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp
new file mode 100644
index 00000000000..3d1b6e1d865
--- /dev/null
+++ b/intern/cycles/render/geometry.cpp
@@ -0,0 +1,1473 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_embree.h"
+
+#include "device/device.h"
+
+#include "render/attribute.h"
+#include "render/camera.h"
+#include "render/geometry.h"
+#include "render/hair.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/stats.h"
+
+#include "subd/subd_patch_table.h"
+#include "subd/subd_split.h"
+
+#include "kernel/osl/osl_globals.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Geometry */
+
+NODE_ABSTRACT_DEFINE(Geometry)
+{
+ NodeType *type = NodeType::add("geometry_base", NULL);
+
+ SOCKET_UINT(motion_steps, "Motion Steps", 3);
+ SOCKET_BOOLEAN(use_motion_blur, "Use Motion Blur", false);
+
+ return type;
+}
+
+Geometry::Geometry(const NodeType *node_type, const Type type)
+ : Node(node_type), type(type), attributes(this, ATTR_PRIM_GEOMETRY)
+{
+ need_update = true;
+ need_update_rebuild = false;
+
+ transform_applied = false;
+ transform_negative_scaled = false;
+ transform_normal = transform_identity();
+ bounds = BoundBox::empty;
+
+ has_volume = false;
+ has_surface_bssrdf = false;
+
+ bvh = NULL;
+ attr_map_offset = 0;
+ optix_prim_offset = 0;
+ prim_offset = 0;
+}
+
+Geometry::~Geometry()
+{
+ delete bvh;
+}
+
+void Geometry::clear()
+{
+ used_shaders.clear();
+ transform_applied = false;
+ transform_negative_scaled = false;
+ transform_normal = transform_identity();
+}
+
+bool Geometry::need_attribute(Scene *scene, AttributeStandard std)
+{
+ if (std == ATTR_STD_NONE)
+ return false;
+
+ if (scene->need_global_attribute(std))
+ return true;
+
+ foreach (Shader *shader, used_shaders)
+ if (shader->attributes.find(std))
+ return true;
+
+ return false;
+}
+
+bool Geometry::need_attribute(Scene * /*scene*/, ustring name)
+{
+ if (name == ustring())
+ return false;
+
+ foreach (Shader *shader, used_shaders)
+ if (shader->attributes.find(name))
+ return true;
+
+ return false;
+}
+
+float Geometry::motion_time(int step) const
+{
+ return (motion_steps > 1) ? 2.0f * step / (motion_steps - 1) - 1.0f : 0.0f;
+}
+
+int Geometry::motion_step(float time) const
+{
+ if (motion_steps > 1) {
+ int attr_step = 0;
+
+ for (int step = 0; step < motion_steps; step++) {
+ float step_time = motion_time(step);
+ if (step_time == time) {
+ return attr_step;
+ }
+
+ /* Center step is stored in a separate attribute. */
+ if (step != motion_steps / 2) {
+ attr_step++;
+ }
+ }
+ }
+
+ return -1;
+}
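The two helpers above define the mapping between motion step indices, shutter-relative times in [-1, 1], and indices into the motion attribute; the center step is excluded from the attribute index because it is the regular (undeformed) position. A tiny sketch that prints the mapping for motion_steps = 5 (an arbitrary example value):

    #include <cstdio>

    /* Reproduces the step <-> time mapping above for motion_steps = 5. */
    int main()
    {
      const int motion_steps = 5;
      int attr_step = 0;
      for (int step = 0; step < motion_steps; step++) {
        const float time = 2.0f * step / (motion_steps - 1) - 1.0f;
        if (step == motion_steps / 2)
          std::printf("step %d: time %+.2f (center, stored separately)\n", step, time);
        else
          std::printf("step %d: time %+.2f -> motion attribute index %d\n", step, time, attr_step++);
      }
      return 0;
    }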
+
+bool Geometry::need_build_bvh(BVHLayout layout) const
+{
+ return !transform_applied || has_surface_bssrdf || layout == BVH_LAYOUT_OPTIX;
+}
+
+bool Geometry::is_instanced() const
+{
+ /* Currently we treat subsurface objects as instanced.
+ *
+ * While it might be not very optimal for ray traversal, it avoids having
+ * duplicated BVH in the memory, saving quite some space.
+ */
+ return !transform_applied || has_surface_bssrdf;
+}
+
+bool Geometry::has_true_displacement() const
+{
+ foreach (Shader *shader, used_shaders) {
+ if (shader->has_displacement && shader->displacement_method != DISPLACE_BUMP) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void Geometry::compute_bvh(
+ Device *device, DeviceScene *dscene, SceneParams *params, Progress *progress, int n, int total)
+{
+ if (progress->get_cancel())
+ return;
+
+ compute_bounds();
+
+ const BVHLayout bvh_layout = BVHParams::best_bvh_layout(params->bvh_layout,
+ device->get_bvh_layout_mask());
+ if (need_build_bvh(bvh_layout)) {
+ string msg = "Updating Geometry BVH ";
+ if (name.empty())
+ msg += string_printf("%u/%u", (uint)(n + 1), (uint)total);
+ else
+ msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
+
+ Object object;
+ object.geometry = this;
+
+ vector<Geometry *> geometry;
+ geometry.push_back(this);
+ vector<Object *> objects;
+ objects.push_back(&object);
+
+ if (bvh && !need_update_rebuild) {
+ progress->set_status(msg, "Refitting BVH");
+
+ bvh->geometry = geometry;
+ bvh->objects = objects;
+
+ bvh->refit(*progress);
+ }
+ else {
+ progress->set_status(msg, "Building BVH");
+
+ BVHParams bparams;
+ bparams.use_spatial_split = params->use_bvh_spatial_split;
+ bparams.bvh_layout = bvh_layout;
+ bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
+ params->use_bvh_unaligned_nodes;
+ bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
+ bparams.num_motion_curve_steps = params->num_bvh_time_steps;
+ bparams.bvh_type = params->bvh_type;
+ bparams.curve_subdivisions = params->curve_subdivisions();
+
+ delete bvh;
+ bvh = BVH::create(bparams, geometry, objects);
+ MEM_GUARDED_CALL(progress, bvh->build, *progress);
+ }
+ }
+
+ need_update = false;
+ need_update_rebuild = false;
+}
+
+bool Geometry::has_motion_blur() const
+{
+ return (use_motion_blur && attributes.find(ATTR_STD_MOTION_VERTEX_POSITION));
+}
+
+bool Geometry::has_voxel_attributes() const
+{
+ foreach (const Attribute &attr, attributes.attributes) {
+ if (attr.element == ATTR_ELEMENT_VOXEL) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void Geometry::tag_update(Scene *scene, bool rebuild)
+{
+ need_update = true;
+
+ if (rebuild) {
+ need_update_rebuild = true;
+ scene->light_manager->need_update = true;
+ }
+ else {
+ foreach (Shader *shader, used_shaders)
+ if (shader->has_surface_emission)
+ scene->light_manager->need_update = true;
+ }
+
+ scene->geometry_manager->need_update = true;
+ scene->object_manager->need_update = true;
+}
+
+/* Geometry Manager */
+
+GeometryManager::GeometryManager()
+{
+ need_update = true;
+ need_flags_update = true;
+}
+
+GeometryManager::~GeometryManager()
+{
+}
+
+void GeometryManager::update_osl_attributes(Device *device,
+ Scene *scene,
+ vector<AttributeRequestSet> &geom_attributes)
+{
+#ifdef WITH_OSL
+ /* For OSL, a hash map is used to look up the attribute by name. */
+ OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+
+ og->object_name_map.clear();
+ og->attribute_map.clear();
+ og->object_names.clear();
+
+ og->attribute_map.resize(scene->objects.size() * ATTR_PRIM_TYPES);
+
+ for (size_t i = 0; i < scene->objects.size(); i++) {
+ /* set object name to object index map */
+ Object *object = scene->objects[i];
+ og->object_name_map[object->name] = i;
+ og->object_names.push_back(object->name);
+
+ /* set object attributes */
+ foreach (ParamValue &attr, object->attributes) {
+ OSLGlobals::Attribute osl_attr;
+
+ osl_attr.type = attr.type();
+ osl_attr.desc.element = ATTR_ELEMENT_OBJECT;
+ osl_attr.value = attr;
+ osl_attr.desc.offset = 0;
+ osl_attr.desc.flags = 0;
+
+ og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_GEOMETRY][attr.name()] = osl_attr;
+ og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][attr.name()] = osl_attr;
+ }
+
+ /* find geometry attributes */
+ size_t j;
+
+ for (j = 0; j < scene->geometry.size(); j++)
+ if (scene->geometry[j] == object->geometry)
+ break;
+
+ AttributeRequestSet &attributes = geom_attributes[j];
+
+ /* set object attributes */
+ foreach (AttributeRequest &req, attributes.requests) {
+ OSLGlobals::Attribute osl_attr;
+
+ if (req.desc.element != ATTR_ELEMENT_NONE) {
+ osl_attr.desc = req.desc;
+
+ if (req.type == TypeDesc::TypeFloat)
+ osl_attr.type = TypeDesc::TypeFloat;
+ else if (req.type == TypeDesc::TypeMatrix)
+ osl_attr.type = TypeDesc::TypeMatrix;
+ else if (req.type == TypeFloat2)
+ osl_attr.type = TypeFloat2;
+ else if (req.type == TypeRGBA)
+ osl_attr.type = TypeRGBA;
+ else
+ osl_attr.type = TypeDesc::TypeColor;
+
+ if (req.std != ATTR_STD_NONE) {
+ /* if standard attribute, add lookup by geom: name convention */
+ ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
+ og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_GEOMETRY][stdname] = osl_attr;
+ }
+ else if (req.name != ustring()) {
+ /* add lookup by geometry attribute name */
+ og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_GEOMETRY][req.name] = osl_attr;
+ }
+ }
+
+ if (req.subd_desc.element != ATTR_ELEMENT_NONE) {
+ osl_attr.desc = req.subd_desc;
+
+ if (req.subd_type == TypeDesc::TypeFloat)
+ osl_attr.type = TypeDesc::TypeFloat;
+ else if (req.subd_type == TypeDesc::TypeMatrix)
+ osl_attr.type = TypeDesc::TypeMatrix;
+ else if (req.subd_type == TypeFloat2)
+ osl_attr.type = TypeFloat2;
+ else if (req.subd_type == TypeRGBA)
+ osl_attr.type = TypeRGBA;
+ else
+ osl_attr.type = TypeDesc::TypeColor;
+
+ if (req.std != ATTR_STD_NONE) {
+ /* if standard attribute, add lookup by geom: name convention */
+ ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
+ og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][stdname] = osl_attr;
+ }
+ else if (req.name != ustring()) {
+ /* add lookup by geometry attribute name */
+ og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][req.name] = osl_attr;
+ }
+ }
+ }
+ }
+#else
+ (void)device;
+ (void)scene;
+ (void)geom_attributes;
+#endif
+}
+
+void GeometryManager::update_svm_attributes(Device *,
+ DeviceScene *dscene,
+ Scene *scene,
+ vector<AttributeRequestSet> &geom_attributes)
+{
+ /* For SVM, the attributes_map table is used to look up the offset of an
+ * attribute, based on a unique shader attribute id. */
+
+ /* compute array stride */
+ int attr_map_size = 0;
+
+ for (size_t i = 0; i < scene->geometry.size(); i++) {
+ Geometry *geom = scene->geometry[i];
+ geom->attr_map_offset = attr_map_size;
+ attr_map_size += (geom_attributes[i].size() + 1) * ATTR_PRIM_TYPES;
+ }
+
+ if (attr_map_size == 0)
+ return;
+
+ /* create attribute map */
+ uint4 *attr_map = dscene->attributes_map.alloc(attr_map_size);
+ memset(attr_map, 0, dscene->attributes_map.size() * sizeof(uint));
+
+ for (size_t i = 0; i < scene->geometry.size(); i++) {
+ Geometry *geom = scene->geometry[i];
+ AttributeRequestSet &attributes = geom_attributes[i];
+
+ /* set object attributes */
+ int index = geom->attr_map_offset;
+
+ foreach (AttributeRequest &req, attributes.requests) {
+ uint id;
+
+ if (req.std == ATTR_STD_NONE)
+ id = scene->shader_manager->get_attribute_id(req.name);
+ else
+ id = scene->shader_manager->get_attribute_id(req.std);
+
+ attr_map[index].x = id;
+ attr_map[index].y = req.desc.element;
+ attr_map[index].z = as_uint(req.desc.offset);
+
+ if (req.type == TypeDesc::TypeFloat)
+ attr_map[index].w = NODE_ATTR_FLOAT;
+ else if (req.type == TypeDesc::TypeMatrix)
+ attr_map[index].w = NODE_ATTR_MATRIX;
+ else if (req.type == TypeFloat2)
+ attr_map[index].w = NODE_ATTR_FLOAT2;
+ else if (req.type == TypeRGBA)
+ attr_map[index].w = NODE_ATTR_RGBA;
+ else
+ attr_map[index].w = NODE_ATTR_FLOAT3;
+
+ attr_map[index].w |= req.desc.flags << 8;
+
+ index++;
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->subd_faces.size()) {
+ attr_map[index].x = id;
+ attr_map[index].y = req.subd_desc.element;
+ attr_map[index].z = as_uint(req.subd_desc.offset);
+
+ if (req.subd_type == TypeDesc::TypeFloat)
+ attr_map[index].w = NODE_ATTR_FLOAT;
+ else if (req.subd_type == TypeDesc::TypeMatrix)
+ attr_map[index].w = NODE_ATTR_MATRIX;
+ else if (req.subd_type == TypeFloat2)
+ attr_map[index].w = NODE_ATTR_FLOAT2;
+ else if (req.subd_type == TypeRGBA)
+ attr_map[index].w = NODE_ATTR_RGBA;
+ else
+ attr_map[index].w = NODE_ATTR_FLOAT3;
+
+ attr_map[index].w |= req.subd_desc.flags << 8;
+ }
+ }
+
+ index++;
+ }
+
+ /* terminator */
+ for (int j = 0; j < ATTR_PRIM_TYPES; j++) {
+ attr_map[index].x = ATTR_STD_NONE;
+ attr_map[index].y = 0;
+ attr_map[index].z = 0;
+ attr_map[index].w = 0;
+
+ index++;
+ }
+ }
+
+ /* copy to device */
+ dscene->attributes_map.copy_to_device();
+}
+
+static void update_attribute_element_size(Geometry *geom,
+ Attribute *mattr,
+ AttributePrimitive prim,
+ size_t *attr_float_size,
+ size_t *attr_float2_size,
+ size_t *attr_float3_size,
+ size_t *attr_uchar4_size)
+{
+ if (mattr) {
+ size_t size = mattr->element_size(geom, prim);
+
+ if (mattr->element == ATTR_ELEMENT_VOXEL) {
+ /* pass */
+ }
+ else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+ *attr_uchar4_size += size;
+ }
+ else if (mattr->type == TypeDesc::TypeFloat) {
+ *attr_float_size += size;
+ }
+ else if (mattr->type == TypeFloat2) {
+ *attr_float2_size += size;
+ }
+ else if (mattr->type == TypeDesc::TypeMatrix) {
+ *attr_float3_size += size * 4;
+ }
+ else {
+ *attr_float3_size += size;
+ }
+ }
+}
+
+static void update_attribute_element_offset(Geometry *geom,
+ device_vector<float> &attr_float,
+ size_t &attr_float_offset,
+ device_vector<float2> &attr_float2,
+ size_t &attr_float2_offset,
+ device_vector<float4> &attr_float3,
+ size_t &attr_float3_offset,
+ device_vector<uchar4> &attr_uchar4,
+ size_t &attr_uchar4_offset,
+ Attribute *mattr,
+ AttributePrimitive prim,
+ TypeDesc &type,
+ AttributeDescriptor &desc)
+{
+ if (mattr) {
+ /* store element and type */
+ desc.element = mattr->element;
+ desc.flags = mattr->flags;
+ type = mattr->type;
+
+ /* store attribute data in arrays */
+ size_t size = mattr->element_size(geom, prim);
+
+ AttributeElement &element = desc.element;
+ int &offset = desc.offset;
+
+ if (mattr->element == ATTR_ELEMENT_VOXEL) {
+ /* store slot in offset value */
+ ImageHandle &handle = mattr->data_voxel();
+ offset = handle.svm_slot();
+ }
+ else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+ uchar4 *data = mattr->data_uchar4();
+ offset = attr_uchar4_offset;
+
+ assert(attr_uchar4.size() >= offset + size);
+ for (size_t k = 0; k < size; k++) {
+ attr_uchar4[offset + k] = data[k];
+ }
+ attr_uchar4_offset += size;
+ }
+ else if (mattr->type == TypeDesc::TypeFloat) {
+ float *data = mattr->data_float();
+ offset = attr_float_offset;
+
+ assert(attr_float.size() >= offset + size);
+ for (size_t k = 0; k < size; k++) {
+ attr_float[offset + k] = data[k];
+ }
+ attr_float_offset += size;
+ }
+ else if (mattr->type == TypeFloat2) {
+ float2 *data = mattr->data_float2();
+ offset = attr_float2_offset;
+
+ assert(attr_float2.size() >= offset + size);
+ for (size_t k = 0; k < size; k++) {
+ attr_float2[offset + k] = data[k];
+ }
+ attr_float2_offset += size;
+ }
+ else if (mattr->type == TypeDesc::TypeMatrix) {
+ Transform *tfm = mattr->data_transform();
+ offset = attr_float3_offset;
+
+ assert(attr_float3.size() >= offset + size * 3);
+ for (size_t k = 0; k < size * 3; k++) {
+ attr_float3[offset + k] = (&tfm->x)[k];
+ }
+ attr_float3_offset += size * 3;
+ }
+ else {
+ float4 *data = mattr->data_float4();
+ offset = attr_float3_offset;
+
+ assert(attr_float3.size() >= offset + size);
+ for (size_t k = 0; k < size; k++) {
+ attr_float3[offset + k] = data[k];
+ }
+ attr_float3_offset += size;
+ }
+
+ /* mesh vertex/curve index is global, not per object, so we sneak
+ * a correction for that in here */
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK &&
+ desc.flags & ATTR_SUBDIVIDED) {
+ /* indices for subdivided attributes are retrieved
+ * from the patch table, so no correction is needed here. */
+ }
+ else if (element == ATTR_ELEMENT_VERTEX)
+ offset -= mesh->vert_offset;
+ else if (element == ATTR_ELEMENT_VERTEX_MOTION)
+ offset -= mesh->vert_offset;
+ else if (element == ATTR_ELEMENT_FACE) {
+ if (prim == ATTR_PRIM_GEOMETRY)
+ offset -= mesh->prim_offset;
+ else
+ offset -= mesh->face_offset;
+ }
+ else if (element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) {
+ if (prim == ATTR_PRIM_GEOMETRY)
+ offset -= 3 * mesh->prim_offset;
+ else
+ offset -= mesh->corner_offset;
+ }
+ }
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ if (element == ATTR_ELEMENT_CURVE)
+ offset -= hair->prim_offset;
+ else if (element == ATTR_ELEMENT_CURVE_KEY)
+ offset -= hair->curvekey_offset;
+ else if (element == ATTR_ELEMENT_CURVE_KEY_MOTION)
+ offset -= hair->curvekey_offset;
+ }
+ }
+ else {
+ /* attribute not found */
+ desc.element = ATTR_ELEMENT_NONE;
+ desc.offset = 0;
+ }
+}
+
+void GeometryManager::device_update_attributes(Device *device,
+ DeviceScene *dscene,
+ Scene *scene,
+ Progress &progress)
+{
+ progress.set_status("Updating Mesh", "Computing attributes");
+
+ /* gather per mesh requested attributes. as meshes may have multiple
+ * shaders assigned, this merges the requested attributes that have
+ * been set per shader by the shader manager */
+ vector<AttributeRequestSet> geom_attributes(scene->geometry.size());
+
+ for (size_t i = 0; i < scene->geometry.size(); i++) {
+ Geometry *geom = scene->geometry[i];
+
+ scene->need_global_attributes(geom_attributes[i]);
+
+ foreach (Shader *shader, geom->used_shaders) {
+ geom_attributes[i].add(shader->attributes);
+ }
+ }
+
+ /* mesh attributes are stored in a single array per data type. Here we fill
+ * those arrays, and set the offset and element type to create attribute
+ * maps next */
+
+ /* Pre-allocate attributes to avoid arrays re-allocation which would
+ * take 2x of overall attribute memory usage.
+ */
+ size_t attr_float_size = 0;
+ size_t attr_float2_size = 0;
+ size_t attr_float3_size = 0;
+ size_t attr_uchar4_size = 0;
+ for (size_t i = 0; i < scene->geometry.size(); i++) {
+ Geometry *geom = scene->geometry[i];
+ AttributeRequestSet &attributes = geom_attributes[i];
+ foreach (AttributeRequest &req, attributes.requests) {
+ Attribute *attr = geom->attributes.find(req);
+
+ update_attribute_element_size(geom,
+ attr,
+ ATTR_PRIM_GEOMETRY,
+ &attr_float_size,
+ &attr_float2_size,
+ &attr_float3_size,
+ &attr_uchar4_size);
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ Attribute *subd_attr = mesh->subd_attributes.find(req);
+
+ update_attribute_element_size(mesh,
+ subd_attr,
+ ATTR_PRIM_SUBD,
+ &attr_float_size,
+ &attr_float2_size,
+ &attr_float3_size,
+ &attr_uchar4_size);
+ }
+ }
+ }
+
+ dscene->attributes_float.alloc(attr_float_size);
+ dscene->attributes_float2.alloc(attr_float2_size);
+ dscene->attributes_float3.alloc(attr_float3_size);
+ dscene->attributes_uchar4.alloc(attr_uchar4_size);
+
+ size_t attr_float_offset = 0;
+ size_t attr_float2_offset = 0;
+ size_t attr_float3_offset = 0;
+ size_t attr_uchar4_offset = 0;
+
+ /* Fill in attributes. */
+ for (size_t i = 0; i < scene->geometry.size(); i++) {
+ Geometry *geom = scene->geometry[i];
+ AttributeRequestSet &attributes = geom_attributes[i];
+
+ /* todo: we now store std and name attributes from requests even if
+ * they actually refer to the same mesh attributes, optimize */
+ foreach (AttributeRequest &req, attributes.requests) {
+ Attribute *attr = geom->attributes.find(req);
+ update_attribute_element_offset(geom,
+ dscene->attributes_float,
+ attr_float_offset,
+ dscene->attributes_float2,
+ attr_float2_offset,
+ dscene->attributes_float3,
+ attr_float3_offset,
+ dscene->attributes_uchar4,
+ attr_uchar4_offset,
+ attr,
+ ATTR_PRIM_GEOMETRY,
+ req.type,
+ req.desc);
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ Attribute *subd_attr = mesh->subd_attributes.find(req);
+
+ update_attribute_element_offset(mesh,
+ dscene->attributes_float,
+ attr_float_offset,
+ dscene->attributes_float2,
+ attr_float2_offset,
+ dscene->attributes_float3,
+ attr_float3_offset,
+ dscene->attributes_uchar4,
+ attr_uchar4_offset,
+ subd_attr,
+ ATTR_PRIM_SUBD,
+ req.subd_type,
+ req.subd_desc);
+ }
+
+ if (progress.get_cancel())
+ return;
+ }
+ }
+
+ /* create attribute lookup maps */
+ if (scene->shader_manager->use_osl())
+ update_osl_attributes(device, scene, geom_attributes);
+
+ update_svm_attributes(device, dscene, scene, geom_attributes);
+
+ if (progress.get_cancel())
+ return;
+
+ /* copy to device */
+ progress.set_status("Updating Mesh", "Copying Attributes to device");
+
+ if (dscene->attributes_float.size()) {
+ dscene->attributes_float.copy_to_device();
+ }
+ if (dscene->attributes_float2.size()) {
+ dscene->attributes_float2.copy_to_device();
+ }
+ if (dscene->attributes_float3.size()) {
+ dscene->attributes_float3.copy_to_device();
+ }
+ if (dscene->attributes_uchar4.size()) {
+ dscene->attributes_uchar4.copy_to_device();
+ }
+
+ if (progress.get_cancel())
+ return;
+
+ /* After mesh attributes and patch tables have been copied to device memory,
+ * we need to update offsets in the objects. */
+ scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
+}
+
+void GeometryManager::mesh_calc_offset(Scene *scene)
+{
+ size_t vert_size = 0;
+ size_t tri_size = 0;
+
+ size_t curve_key_size = 0;
+ size_t curve_size = 0;
+
+ size_t patch_size = 0;
+ size_t face_size = 0;
+ size_t corner_size = 0;
+
+ size_t optix_prim_size = 0;
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+
+ mesh->vert_offset = vert_size;
+ mesh->prim_offset = tri_size;
+
+ mesh->patch_offset = patch_size;
+ mesh->face_offset = face_size;
+ mesh->corner_offset = corner_size;
+
+ vert_size += mesh->verts.size();
+ tri_size += mesh->num_triangles();
+
+ if (mesh->subd_faces.size()) {
+ Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
+ patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+ /* patch tables are stored in same array so include them in patch_size */
+ if (mesh->patch_table) {
+ mesh->patch_table_offset = patch_size;
+ patch_size += mesh->patch_table->total_size();
+ }
+ }
+
+ face_size += mesh->subd_faces.size();
+ corner_size += mesh->subd_face_corners.size();
+
+ mesh->optix_prim_offset = optix_prim_size;
+ optix_prim_size += mesh->num_triangles();
+ }
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+
+ hair->curvekey_offset = curve_key_size;
+ hair->prim_offset = curve_size;
+
+ curve_key_size += hair->curve_keys.size();
+ curve_size += hair->num_curves();
+
+ hair->optix_prim_offset = optix_prim_size;
+ optix_prim_size += hair->num_segments();
+ }
+ }
+}
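mesh_calc_offset() above only accumulates running totals so that per-geometry indices can be turned into indices into the scene-global packed arrays (verts, triangles, curve keys, patches, and so on). A toy example with made-up sizes, showing how the offsets come out and how a local triangle index maps into the global arrays:

    #include <cstdio>
    #include <vector>

    /* Hypothetical sizes, only to illustrate how the running offsets are assigned. */
    struct GeomSizes { const char *name; int verts; int tris; };

    int main()
    {
      std::vector<GeomSizes> meshes = {{"MeshA", 8, 12}, {"MeshB", 4, 2}, {"MeshC", 100, 196}};

      size_t vert_size = 0, tri_size = 0;
      for (const GeomSizes &m : meshes) {
        /* vert_offset/prim_offset are the totals accumulated so far, exactly
         * as in GeometryManager::mesh_calc_offset() above. */
        std::printf("%s: vert_offset=%zu prim_offset=%zu\n", m.name, vert_size, tri_size);
        vert_size += m.verts;
        tri_size += m.tris;
      }
      /* The global triangle index of local triangle i of MeshB is prim_offset + i,
       * and its packed vertices start at 3 * (prim_offset + i) in prim_tri_verts. */
      return 0;
    }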
+
+void GeometryManager::device_update_mesh(
+ Device *, DeviceScene *dscene, Scene *scene, bool for_displacement, Progress &progress)
+{
+ /* Count. */
+ size_t vert_size = 0;
+ size_t tri_size = 0;
+
+ size_t curve_key_size = 0;
+ size_t curve_size = 0;
+
+ size_t patch_size = 0;
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+
+ vert_size += mesh->verts.size();
+ tri_size += mesh->num_triangles();
+
+ if (mesh->subd_faces.size()) {
+ Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
+ patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+ /* patch tables are stored in same array so include them in patch_size */
+ if (mesh->patch_table) {
+ mesh->patch_table_offset = patch_size;
+ patch_size += mesh->patch_table->total_size();
+ }
+ }
+ }
+ else if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+
+ curve_key_size += hair->curve_keys.size();
+ curve_size += hair->num_curves();
+ }
+ }
+
+ /* Create mapping from triangle to primitive triangle array. */
+ vector<uint> tri_prim_index(tri_size);
+ if (for_displacement) {
+ /* For displacement kernels we do some trickery to make them believe
+ * we've got all required data ready. However, that data is different
+ * from final render kernels since we don't have BVH yet, so can't
+ * really use same semantic of arrays.
+ */
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ for (size_t i = 0; i < mesh->num_triangles(); ++i) {
+ tri_prim_index[i + mesh->prim_offset] = 3 * (i + mesh->prim_offset);
+ }
+ }
+ }
+ }
+ else {
+ for (size_t i = 0; i < dscene->prim_index.size(); ++i) {
+ if ((dscene->prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
+ tri_prim_index[dscene->prim_index[i]] = dscene->prim_tri_index[i];
+ }
+ }
+ }
+
+ /* Fill in all the arrays. */
+ if (tri_size != 0) {
+ /* normals */
+ progress.set_status("Updating Mesh", "Computing normals");
+
+ uint *tri_shader = dscene->tri_shader.alloc(tri_size);
+ float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
+ uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
+ uint *tri_patch = dscene->tri_patch.alloc(tri_size);
+ float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ mesh->pack_shaders(scene, &tri_shader[mesh->prim_offset]);
+ mesh->pack_normals(&vnormal[mesh->vert_offset]);
+ mesh->pack_verts(tri_prim_index,
+ &tri_vindex[mesh->prim_offset],
+ &tri_patch[mesh->prim_offset],
+ &tri_patch_uv[mesh->vert_offset],
+ mesh->vert_offset,
+ mesh->prim_offset);
+ if (progress.get_cancel())
+ return;
+ }
+ }
+
+ /* vertex coordinates */
+ progress.set_status("Updating Mesh", "Copying Mesh to device");
+
+ dscene->tri_shader.copy_to_device();
+ dscene->tri_vnormal.copy_to_device();
+ dscene->tri_vindex.copy_to_device();
+ dscene->tri_patch.copy_to_device();
+ dscene->tri_patch_uv.copy_to_device();
+ }
+
+ if (curve_size != 0) {
+ progress.set_status("Updating Mesh", "Copying Strands to device");
+
+ float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size);
+ float4 *curves = dscene->curves.alloc(curve_size);
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::HAIR) {
+ Hair *hair = static_cast<Hair *>(geom);
+ hair->pack_curves(scene,
+ &curve_keys[hair->curvekey_offset],
+ &curves[hair->prim_offset],
+ hair->curvekey_offset);
+ if (progress.get_cancel())
+ return;
+ }
+ }
+
+ dscene->curve_keys.copy_to_device();
+ dscene->curves.copy_to_device();
+ }
+
+ if (patch_size != 0) {
+ progress.set_status("Updating Mesh", "Copying Patches to device");
+
+ uint *patch_data = dscene->patches.alloc(patch_size);
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ mesh->pack_patches(&patch_data[mesh->patch_offset],
+ mesh->vert_offset,
+ mesh->face_offset,
+ mesh->corner_offset);
+
+ if (mesh->patch_table) {
+ mesh->patch_table->copy_adjusting_offsets(&patch_data[mesh->patch_table_offset],
+ mesh->patch_table_offset);
+ }
+
+ if (progress.get_cancel())
+ return;
+ }
+ }
+
+ dscene->patches.copy_to_device();
+ }
+
+ if (for_displacement) {
+ float4 *prim_tri_verts = dscene->prim_tri_verts.alloc(tri_size * 3);
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ for (size_t i = 0; i < mesh->num_triangles(); ++i) {
+ Mesh::Triangle t = mesh->get_triangle(i);
+ size_t offset = 3 * (i + mesh->prim_offset);
+ prim_tri_verts[offset + 0] = float3_to_float4(mesh->verts[t.v[0]]);
+ prim_tri_verts[offset + 1] = float3_to_float4(mesh->verts[t.v[1]]);
+ prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]);
+ }
+ }
+ }
+ dscene->prim_tri_verts.copy_to_device();
+ }
+}
+
+void GeometryManager::device_update_bvh(Device *device,
+ DeviceScene *dscene,
+ Scene *scene,
+ Progress &progress)
+{
+ /* bvh build */
+ progress.set_status("Updating Scene BVH", "Building");
+
+ BVHParams bparams;
+ bparams.top_level = true;
+ bparams.bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
+ device->get_bvh_layout_mask());
+ bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
+ bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
+ scene->params.use_bvh_unaligned_nodes;
+ bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
+ bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
+ bparams.bvh_type = scene->params.bvh_type;
+ bparams.curve_subdivisions = scene->params.curve_subdivisions();
+
+ VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout.";
+
+ BVH *bvh = BVH::create(bparams, scene->geometry, scene->objects);
+ bvh->build(progress, &device->stats);
+
+ if (progress.get_cancel()) {
+#ifdef WITH_EMBREE
+ if (dscene->data.bvh.scene) {
+ BVHEmbree::destroy(dscene->data.bvh.scene);
+ dscene->data.bvh.scene = NULL;
+ }
+#endif
+ delete bvh;
+ return;
+ }
+
+ /* copy to device */
+ progress.set_status("Updating Scene BVH", "Copying BVH to device");
+
+ PackedBVH &pack = bvh->pack;
+
+ if (pack.nodes.size()) {
+ dscene->bvh_nodes.steal_data(pack.nodes);
+ dscene->bvh_nodes.copy_to_device();
+ }
+ if (pack.leaf_nodes.size()) {
+ dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes);
+ dscene->bvh_leaf_nodes.copy_to_device();
+ }
+ if (pack.object_node.size()) {
+ dscene->object_node.steal_data(pack.object_node);
+ dscene->object_node.copy_to_device();
+ }
+ if (pack.prim_tri_index.size()) {
+ dscene->prim_tri_index.steal_data(pack.prim_tri_index);
+ dscene->prim_tri_index.copy_to_device();
+ }
+ if (pack.prim_tri_verts.size()) {
+ dscene->prim_tri_verts.steal_data(pack.prim_tri_verts);
+ dscene->prim_tri_verts.copy_to_device();
+ }
+ if (pack.prim_type.size()) {
+ dscene->prim_type.steal_data(pack.prim_type);
+ dscene->prim_type.copy_to_device();
+ }
+ if (pack.prim_visibility.size()) {
+ dscene->prim_visibility.steal_data(pack.prim_visibility);
+ dscene->prim_visibility.copy_to_device();
+ }
+ if (pack.prim_index.size()) {
+ dscene->prim_index.steal_data(pack.prim_index);
+ dscene->prim_index.copy_to_device();
+ }
+ if (pack.prim_object.size()) {
+ dscene->prim_object.steal_data(pack.prim_object);
+ dscene->prim_object.copy_to_device();
+ }
+ if (pack.prim_time.size()) {
+ dscene->prim_time.steal_data(pack.prim_time);
+ dscene->prim_time.copy_to_device();
+ }
+
+ dscene->data.bvh.root = pack.root_index;
+ dscene->data.bvh.bvh_layout = bparams.bvh_layout;
+ dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
+ dscene->data.bvh.curve_subdivisions = scene->params.curve_subdivisions();
+
+ bvh->copy_to_device(progress, dscene);
+
+ delete bvh;
+}
+
+void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Progress &progress)
+{
+ if (!need_update && !need_flags_update) {
+ return;
+ }
+
+ progress.set_status("Updating Meshes Flags");
+
+ /* Update flags. */
+ bool volume_images_updated = false;
+
+ foreach (Geometry *geom, scene->geometry) {
+ geom->has_volume = false;
+
+ foreach (const Shader *shader, geom->used_shaders) {
+ if (shader->has_volume) {
+ geom->has_volume = true;
+ }
+ if (shader->has_surface_bssrdf) {
+ geom->has_surface_bssrdf = true;
+ }
+ }
+
+ if (need_update && geom->has_volume && geom->type == Geometry::MESH) {
+ /* Create volume meshes if there is voxel data. */
+ if (geom->has_voxel_attributes()) {
+ if (!volume_images_updated) {
+ progress.set_status("Updating Meshes Volume Bounds");
+ device_update_volume_images(device, scene, progress);
+ volume_images_updated = true;
+ }
+
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ create_volume_mesh(mesh, progress);
+ }
+ }
+
+ if (geom->type == Geometry::HAIR) {
+ /* Set curve shape, still a global scene setting for now. */
+ Hair *hair = static_cast<Hair *>(geom);
+ hair->curve_shape = scene->params.hair_shape;
+ }
+ }
+
+ need_flags_update = false;
+}
+
+void GeometryManager::device_update_displacement_images(Device *device,
+ Scene *scene,
+ Progress &progress)
+{
+ progress.set_status("Updating Displacement Images");
+ TaskPool pool;
+ ImageManager *image_manager = scene->image_manager;
+ set<int> bump_images;
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->need_update) {
+ foreach (Shader *shader, geom->used_shaders) {
+ if (!shader->has_displacement || shader->displacement_method == DISPLACE_BUMP) {
+ continue;
+ }
+ foreach (ShaderNode *node, shader->graph->nodes) {
+ if (node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
+ continue;
+ }
+
+ ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode *>(node);
+ for (int i = 0; i < image_node->handle.num_tiles(); i++) {
+ const int slot = image_node->handle.svm_slot(i);
+ if (slot != -1) {
+ bump_images.insert(slot);
+ }
+ }
+ }
+ }
+ }
+ }
+ foreach (int slot, bump_images) {
+ pool.push(function_bind(
+ &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
+ }
+ pool.wait_work();
+}
+
+void GeometryManager::device_update_volume_images(Device *device, Scene *scene, Progress &progress)
+{
+ progress.set_status("Updating Volume Images");
+ TaskPool pool;
+ ImageManager *image_manager = scene->image_manager;
+ set<int> volume_images;
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (!geom->need_update) {
+ continue;
+ }
+
+ foreach (Attribute &attr, geom->attributes.attributes) {
+ if (attr.element != ATTR_ELEMENT_VOXEL) {
+ continue;
+ }
+
+ ImageHandle &handle = attr.data_voxel();
+ const int slot = handle.svm_slot();
+ if (slot != -1) {
+ volume_images.insert(slot);
+ }
+ }
+ }
+
+ foreach (int slot, volume_images) {
+ pool.push(function_bind(
+ &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
+ }
+ pool.wait_work();
+}
+
+void GeometryManager::device_update(Device *device,
+ DeviceScene *dscene,
+ Scene *scene,
+ Progress &progress)
+{
+ if (!need_update)
+ return;
+
+ VLOG(1) << "Total " << scene->geometry.size() << " meshes.";
+
+ bool true_displacement_used = false;
+ size_t total_tess_needed = 0;
+
+ foreach (Geometry *geom, scene->geometry) {
+ foreach (Shader *shader, geom->used_shaders) {
+ if (shader->need_update_geometry)
+ geom->need_update = true;
+ }
+
+ if (geom->need_update && geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+
+ /* Update normals. */
+ mesh->add_face_normals();
+ mesh->add_vertex_normals();
+
+ if (mesh->need_attribute(scene, ATTR_STD_POSITION_UNDISPLACED)) {
+ mesh->add_undisplaced();
+ }
+
+ /* Test if we need tessellation. */
+ if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE && mesh->num_subd_verts == 0 &&
+ mesh->subd_params) {
+ total_tess_needed++;
+ }
+
+ /* Test if we need displacement. */
+ if (mesh->has_true_displacement()) {
+ true_displacement_used = true;
+ }
+
+ if (progress.get_cancel())
+ return;
+ }
+ }
+
+ /* Tessellate meshes that are using subdivision */
+ if (total_tess_needed) {
+ Camera *dicing_camera = scene->dicing_camera;
+ dicing_camera->update(scene);
+
+ size_t i = 0;
+ foreach (Geometry *geom, scene->geometry) {
+ if (!(geom->need_update && geom->type == Geometry::MESH)) {
+ continue;
+ }
+
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE && mesh->num_subd_verts == 0 &&
+ mesh->subd_params) {
+ string msg = "Tessellating ";
+ if (mesh->name == "")
+ msg += string_printf("%u/%u", (uint)(i + 1), (uint)total_tess_needed);
+ else
+ msg += string_printf(
+ "%s %u/%u", mesh->name.c_str(), (uint)(i + 1), (uint)total_tess_needed);
+
+ progress.set_status("Updating Mesh", msg);
+
+ mesh->subd_params->camera = dicing_camera;
+ DiagSplit dsplit(*mesh->subd_params);
+ mesh->tessellate(&dsplit);
+
+ i++;
+
+ if (progress.get_cancel())
+ return;
+ }
+ }
+ }
+
+ /* Update images needed for true displacement. */
+ bool old_need_object_flags_update = false;
+ if (true_displacement_used) {
+ VLOG(1) << "Updating images used for true displacement.";
+ device_update_displacement_images(device, scene, progress);
+ old_need_object_flags_update = scene->object_manager->need_flags_update;
+ scene->object_manager->device_update_flags(device, dscene, scene, progress, false);
+ }
+
+ /* Device update. */
+ device_free(device, dscene);
+
+ mesh_calc_offset(scene);
+ if (true_displacement_used) {
+ device_update_mesh(device, dscene, scene, true, progress);
+ }
+ if (progress.get_cancel())
+ return;
+
+ device_update_attributes(device, dscene, scene, progress);
+ if (progress.get_cancel())
+ return;
+
+ /* Update displacement. */
+ bool displacement_done = false;
+ size_t num_bvh = 0;
+ BVHLayout bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
+ device->get_bvh_layout_mask());
+
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->need_update) {
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (displace(device, dscene, scene, mesh, progress)) {
+ displacement_done = true;
+ }
+ }
+
+ if (geom->need_build_bvh(bvh_layout)) {
+ num_bvh++;
+ }
+ }
+
+ if (progress.get_cancel())
+ return;
+ }
+
+ /* Device re-update after displacement. */
+ if (displacement_done) {
+ device_free(device, dscene);
+
+ device_update_attributes(device, dscene, scene, progress);
+ if (progress.get_cancel())
+ return;
+ }
+
+ TaskPool pool;
+
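+  /* Build per-geometry BVHs in parallel. `i` only advances for geometries that
+   * get their own BVH, presumably so the (n, total) progress passed to
+   * compute_bvh stays consistent with num_bvh. */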
+ size_t i = 0;
+ foreach (Geometry *geom, scene->geometry) {
+ if (geom->need_update) {
+ pool.push(function_bind(
+ &Geometry::compute_bvh, geom, device, dscene, &scene->params, &progress, i, num_bvh));
+ if (geom->need_build_bvh(bvh_layout)) {
+ i++;
+ }
+ }
+ }
+
+ TaskPool::Summary summary;
+ pool.wait_work(&summary);
+ VLOG(2) << "Objects BVH build pool statistics:\n" << summary.full_report();
+
+ foreach (Shader *shader, scene->shaders) {
+ shader->need_update_geometry = false;
+ }
+
+ Scene::MotionType need_motion = scene->need_motion();
+ bool motion_blur = need_motion == Scene::MOTION_BLUR;
+
+ /* Update objects. */
+ vector<Object *> volume_objects;
+ foreach (Object *object, scene->objects) {
+ object->compute_bounds(motion_blur);
+ }
+
+ if (progress.get_cancel())
+ return;
+
+ device_update_bvh(device, dscene, scene, progress);
+ if (progress.get_cancel())
+ return;
+
+ device_update_mesh(device, dscene, scene, false, progress);
+ if (progress.get_cancel())
+ return;
+
+ need_update = false;
+
+ if (true_displacement_used) {
+ /* Re-tag flags for update, so they're re-evaluated
+ * for meshes with correct bounding boxes.
+ *
+ * This wouldn't cause wrong results, just true
+     * displacement might be less optimal to calculate.
+ */
+ scene->object_manager->need_flags_update = old_need_object_flags_update;
+ }
+}
+
+void GeometryManager::device_free(Device *device, DeviceScene *dscene)
+{
+#ifdef WITH_EMBREE
+ if (dscene->data.bvh.scene) {
+ if (dscene->data.bvh.bvh_layout == BVH_LAYOUT_EMBREE)
+ BVHEmbree::destroy(dscene->data.bvh.scene);
+ dscene->data.bvh.scene = NULL;
+ }
+#endif
+
+ dscene->bvh_nodes.free();
+ dscene->bvh_leaf_nodes.free();
+ dscene->object_node.free();
+ dscene->prim_tri_verts.free();
+ dscene->prim_tri_index.free();
+ dscene->prim_type.free();
+ dscene->prim_visibility.free();
+ dscene->prim_index.free();
+ dscene->prim_object.free();
+ dscene->prim_time.free();
+ dscene->tri_shader.free();
+ dscene->tri_vnormal.free();
+ dscene->tri_vindex.free();
+ dscene->tri_patch.free();
+ dscene->tri_patch_uv.free();
+ dscene->curves.free();
+ dscene->curve_keys.free();
+ dscene->patches.free();
+ dscene->attributes_map.free();
+ dscene->attributes_float.free();
+ dscene->attributes_float2.free();
+ dscene->attributes_float3.free();
+ dscene->attributes_uchar4.free();
+
+ /* Signal for shaders like displacement not to do ray tracing. */
+ dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE;
+
+#ifdef WITH_OSL
+ OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+
+ if (og) {
+ og->object_name_map.clear();
+ og->attribute_map.clear();
+ og->object_names.clear();
+ }
+#else
+ (void)device;
+#endif
+}
+
+void GeometryManager::tag_update(Scene *scene)
+{
+ need_update = true;
+ scene->object_manager->need_update = true;
+}
+
+void GeometryManager::collect_statistics(const Scene *scene, RenderStats *stats)
+{
+ foreach (Geometry *geometry, scene->geometry) {
+ stats->mesh.geometry.add_entry(
+ NamedSizeEntry(string(geometry->name.c_str()), geometry->get_total_size_in_bytes()));
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/geometry.h b/intern/cycles/render/geometry.h
new file mode 100644
index 00000000000..b0284304843
--- /dev/null
+++ b/intern/cycles/render/geometry.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+#include "graph/node.h"
+
+#include "bvh/bvh_params.h"
+
+#include "render/attribute.h"
+
+#include "util/util_boundbox.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVH;
+class Device;
+class DeviceScene;
+class Mesh;
+class Progress;
+class RenderStats;
+class Scene;
+class SceneParams;
+class Shader;
+
+/* Geometry
+ *
+ * Base class for geometric types like Mesh and Hair. */
+
+class Geometry : public Node {
+ public:
+ NODE_ABSTRACT_DECLARE
+
+ enum Type {
+ MESH,
+ HAIR,
+ };
+
+ Type type;
+
+ /* Attributes */
+ AttributeSet attributes;
+
+ /* Shaders */
+ vector<Shader *> used_shaders;
+
+ /* Transform */
+ BoundBox bounds;
+ bool transform_applied;
+ bool transform_negative_scaled;
+ Transform transform_normal;
+
+ /* Motion Blur */
+ uint motion_steps;
+ bool use_motion_blur;
+
+ /* Maximum number of motion steps supported (due to Embree). */
+ static const uint MAX_MOTION_STEPS = 129;
+
+ /* BVH */
+ BVH *bvh;
+ size_t attr_map_offset;
+ size_t prim_offset;
+ size_t optix_prim_offset;
+
+ /* Shader Properties */
+ bool has_volume; /* Set in the device_update_flags(). */
+ bool has_surface_bssrdf; /* Set in the device_update_flags(). */
+
+ /* Update Flags */
+ bool need_update;
+ bool need_update_rebuild;
+
+ /* Constructor/Destructor */
+ explicit Geometry(const NodeType *node_type, const Type type);
+ virtual ~Geometry();
+
+ /* Geometry */
+ virtual void clear();
+ virtual void compute_bounds() = 0;
+ virtual void apply_transform(const Transform &tfm, const bool apply_to_motion) = 0;
+
+ /* Attribute Requests */
+ bool need_attribute(Scene *scene, AttributeStandard std);
+ bool need_attribute(Scene *scene, ustring name);
+
+ /* UDIM */
+ virtual void get_uv_tiles(ustring map, unordered_set<int> &tiles) = 0;
+
+ /* Convert between normalized -1..1 motion time and index in the
+ * VERTEX_MOTION attribute. */
+ float motion_time(int step) const;
+ int motion_step(float time) const;
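+  /* For illustration, assuming the usual linear mapping (defined in the .cpp):
+   * with motion_steps = 3, steps 0, 1, 2 map to times -1, 0 and +1. */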
+
+ /* BVH */
+ void compute_bvh(Device *device,
+ DeviceScene *dscene,
+ SceneParams *params,
+ Progress *progress,
+ int n,
+ int total);
+
+  /* Check whether the geometry should have its own BVH built separately.
+   * Briefly, a geometry needs its own BVH if:
+ *
+ * - It is instanced multiple times, so each instance object should share the
+ * same BVH tree.
+ * - Special ray intersection is needed, for example to limit subsurface rays
+ * to only the geometry itself.
+ * - The BVH layout requires the top level to only contain instances.
+ */
+ bool need_build_bvh(BVHLayout layout) const;
+
+ /* Test if the geometry should be treated as instanced. */
+ bool is_instanced() const;
+
+ bool has_true_displacement() const;
+ bool has_motion_blur() const;
+ bool has_voxel_attributes() const;
+
+ /* Updates */
+ void tag_update(Scene *scene, bool rebuild);
+};
+
+/* Geometry Manager */
+
+class GeometryManager {
+ public:
+ /* Update Flags */
+ bool need_update;
+ bool need_flags_update;
+
+ /* Constructor/Destructor */
+ GeometryManager();
+ ~GeometryManager();
+
+ /* Device Updates */
+ void device_update_preprocess(Device *device, Scene *scene, Progress &progress);
+ void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
+ void device_free(Device *device, DeviceScene *dscene);
+
+ /* Updates */
+ void tag_update(Scene *scene);
+
+ /* Statistics */
+ void collect_statistics(const Scene *scene, RenderStats *stats);
+
+ protected:
+ bool displace(Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress);
+
+ void create_volume_mesh(Mesh *mesh, Progress &progress);
+
+ /* Attributes */
+ void update_osl_attributes(Device *device,
+ Scene *scene,
+ vector<AttributeRequestSet> &geom_attributes);
+ void update_svm_attributes(Device *device,
+ DeviceScene *dscene,
+ Scene *scene,
+ vector<AttributeRequestSet> &geom_attributes);
+
+ /* Compute verts/triangles/curves offsets in global arrays. */
+ void mesh_calc_offset(Scene *scene);
+
+ void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
+
+ void device_update_mesh(Device *device,
+ DeviceScene *dscene,
+ Scene *scene,
+ bool for_displacement,
+ Progress &progress);
+
+ void device_update_attributes(Device *device,
+ DeviceScene *dscene,
+ Scene *scene,
+ Progress &progress);
+
+ void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
+
+ void device_update_displacement_images(Device *device, Scene *scene, Progress &progress);
+
+ void device_update_volume_images(Device *device, Scene *scene, Progress &progress);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __GEOMETRY_H__ */
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index c284c64b5bf..436324b00ba 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -14,12 +14,12 @@
* limitations under the License.
*/
-#include "render/attribute.h"
#include "render/graph.h"
+#include "render/attribute.h"
+#include "render/constant_fold.h"
#include "render/nodes.h"
#include "render/scene.h"
#include "render/shader.h"
-#include "render/constant_fold.h"
#include "util/util_algorithm.h"
#include "util/util_foreach.h"
@@ -55,6 +55,25 @@ bool check_node_inputs_traversed(const ShaderNode *node, const ShaderNodeSet &do
} /* namespace */
+/* Sockets */
+
+void ShaderInput::disconnect()
+{
+ if (link) {
+ link->links.erase(remove(link->links.begin(), link->links.end(), this), link->links.end());
+ }
+ link = NULL;
+}
+
+void ShaderOutput::disconnect()
+{
+ foreach (ShaderInput *sock, links) {
+ sock->link = NULL;
+ }
+
+ links.clear();
+}
+
/* Node */
ShaderNode::ShaderNode(const NodeType *type) : Node(type)
@@ -127,6 +146,13 @@ ShaderOutput *ShaderNode::output(ustring name)
return NULL;
}
+void ShaderNode::remove_input(ShaderInput *input)
+{
+ assert(input->link == NULL);
+ delete input;
+ inputs.erase(remove(inputs.begin(), inputs.end(), input), inputs.end());
+}
+
void ShaderNode::attributes(Shader *shader, AttributeRequestSet *attributes)
{
foreach (ShaderInput *input, inputs) {
@@ -278,11 +304,7 @@ void ShaderGraph::disconnect(ShaderOutput *from)
assert(!finalized);
simplified = false;
- foreach (ShaderInput *sock, from->links) {
- sock->link = NULL;
- }
-
- from->links.clear();
+ from->disconnect();
}
void ShaderGraph::disconnect(ShaderInput *to)
@@ -291,10 +313,29 @@ void ShaderGraph::disconnect(ShaderInput *to)
assert(to->link);
simplified = false;
- ShaderOutput *from = to->link;
+ to->disconnect();
+}
+
+void ShaderGraph::relink(ShaderInput *from, ShaderInput *to)
+{
+ ShaderOutput *out = from->link;
+ if (out) {
+ disconnect(from);
+ connect(out, to);
+ }
+ to->parent->copy_value(to->socket_type, *(from->parent), from->socket_type);
+}
- to->link = NULL;
- from->links.erase(remove(from->links.begin(), from->links.end(), to), from->links.end());
+void ShaderGraph::relink(ShaderOutput *from, ShaderOutput *to)
+{
+ /* Copy because disconnect modifies this list. */
+ vector<ShaderInput *> outputs = from->links;
+
+ foreach (ShaderInput *sock, outputs) {
+ disconnect(sock);
+ if (to)
+ connect(to, sock);
+ }
}
void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
@@ -320,6 +361,7 @@ void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
void ShaderGraph::simplify(Scene *scene)
{
if (!simplified) {
+ expand();
default_inputs(scene->shader_manager->use_osl());
clean(scene);
refine_bump_nodes();
@@ -721,6 +763,13 @@ void ShaderGraph::compute_displacement_hash()
int link_id = (input->link) ? input->link->parent->id : 0;
md5.append((uint8_t *)&link_id, sizeof(link_id));
}
+
+ if (node->special_type == SHADER_SPECIAL_TYPE_OSL) {
+      /* The hash takes socket values into account; to detect changes
+       * in the code of the node we need an exception here. */
+ OSLNode *oslnode = static_cast<OSLNode *>(node);
+ md5.append(oslnode->bytecode_hash);
+ }
}
displacement_hash = md5.get_hex();
@@ -745,6 +794,11 @@ void ShaderGraph::clean(Scene *scene)
/* break cycles */
break_cycles(output(), visited, on_stack);
+ foreach (ShaderNode *node, nodes) {
+ if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) {
+ break_cycles(node, visited, on_stack);
+ }
+ }
/* disconnect unused nodes */
foreach (ShaderNode *node, nodes) {
@@ -773,6 +827,14 @@ void ShaderGraph::clean(Scene *scene)
nodes = newnodes;
}
+void ShaderGraph::expand()
+{
+ /* Call expand on all nodes, to generate additional nodes. */
+ foreach (ShaderNode *node, nodes) {
+ node->expand(this);
+ }
+}
+
void ShaderGraph::default_inputs(bool do_osl)
{
/* nodes can specify default texture coordinates, for now we give
@@ -880,12 +942,12 @@ void ShaderGraph::refine_bump_nodes()
foreach (NodePair &pair, nodes_dy)
add(pair.second);
- /* connect what is connected is bump to samplecenter input*/
+    /* Connect whatever was connected to bump to the sample-center input. */
connect(out, node->input("SampleCenter"));
- /* bump input is just for connectivity purpose for the graph input,
- * we re-connected this input to samplecenter, so lets disconnect it
- * from bump input */
+    /* The bump input was only used for graph connectivity; its source is now
+     * re-connected to sample-center, so let's disconnect it from the
+     * bump input. */
disconnect(bump_input);
}
}
@@ -933,7 +995,7 @@ void ShaderGraph::bump_from_displacement(bool use_object_space)
foreach (NodePair &pair, nodes_dy)
pair.second->bump = SHADER_BUMP_DY;
- /* add set normal node and connect the bump normal ouput to the set normal
+ /* add set normal node and connect the bump normal output to the set normal
* output, so it can finally set the shader normal, note we are only doing
* this for bump from displacement, this will be the only bump allowed to
* overwrite the shader normal */
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index a2c030fd226..febd7a76f03 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -61,12 +61,13 @@ enum ShaderNodeSpecialType {
SHADER_SPECIAL_TYPE_PROXY,
SHADER_SPECIAL_TYPE_AUTOCONVERT,
SHADER_SPECIAL_TYPE_GEOMETRY,
- SHADER_SPECIAL_TYPE_SCRIPT,
+ SHADER_SPECIAL_TYPE_OSL,
SHADER_SPECIAL_TYPE_IMAGE_SLOT,
SHADER_SPECIAL_TYPE_CLOSURE,
SHADER_SPECIAL_TYPE_COMBINE_CLOSURE,
SHADER_SPECIAL_TYPE_OUTPUT,
SHADER_SPECIAL_TYPE_BUMP,
+ SHADER_SPECIAL_TYPE_OUTPUT_AOV,
};
/* Input
@@ -104,6 +105,8 @@ class ShaderInput {
((Node *)parent)->set(socket_type, f);
}
+ void disconnect();
+
const SocketType &socket_type;
ShaderNode *parent;
ShaderOutput *link;
@@ -130,6 +133,8 @@ class ShaderOutput {
return socket_type.type;
}
+ void disconnect();
+
const SocketType &socket_type;
ShaderNode *parent;
vector<ShaderInput *> links;
@@ -147,6 +152,7 @@ class ShaderNode : public Node {
virtual ~ShaderNode();
void create_inputs_outputs(const NodeType *type);
+ void remove_input(ShaderInput *input);
ShaderInput *input(const char *name);
ShaderOutput *output(const char *name);
@@ -158,6 +164,11 @@ class ShaderNode : public Node {
virtual void compile(SVMCompiler &compiler) = 0;
virtual void compile(OSLCompiler &compiler) = 0;
+ /* Expand node into additional nodes. */
+ virtual void expand(ShaderGraph * /* graph */)
+ {
+ }
+
/* ** Node optimization ** */
/* Check whether the node can be replaced with single constant. */
virtual void constant_fold(const ConstantFolder & /*folder*/)
@@ -193,10 +204,6 @@ class ShaderNode : public Node {
{
return false;
}
- virtual bool has_object_dependency()
- {
- return false;
- }
virtual bool has_attribute_dependency()
{
return false;
@@ -322,6 +329,8 @@ class ShaderGraph {
void connect(ShaderOutput *from, ShaderInput *to);
void disconnect(ShaderOutput *from);
void disconnect(ShaderInput *to);
+ void relink(ShaderInput *from, ShaderInput *to);
+ void relink(ShaderOutput *from, ShaderOutput *to);
void relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to);
void remove_proxy_nodes();
@@ -346,6 +355,7 @@ class ShaderGraph {
void break_cycles(ShaderNode *node, vector<bool> &visited, vector<bool> &on_stack);
void bump_from_displacement(bool use_object_space);
void refine_bump_nodes();
+ void expand();
void default_inputs(bool do_osl);
void transform_multi_closure(ShaderNode *node, ShaderOutput *weight_out, bool volume);
diff --git a/intern/cycles/render/hair.cpp b/intern/cycles/render/hair.cpp
new file mode 100644
index 00000000000..816c15cf4ef
--- /dev/null
+++ b/intern/cycles/render/hair.cpp
@@ -0,0 +1,488 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/hair.h"
+#include "render/curves.h"
+#include "render/scene.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Hair Curve */
+
+void Hair::Curve::bounds_grow(const int k,
+ const float3 *curve_keys,
+ const float *curve_radius,
+ BoundBox &bounds) const
+{
+ float3 P[4];
+
+ P[0] = curve_keys[max(first_key + k - 1, first_key)];
+ P[1] = curve_keys[first_key + k];
+ P[2] = curve_keys[first_key + k + 1];
+ P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
+
+ float3 lower;
+ float3 upper;
+
+ curvebounds(&lower.x, &upper.x, P, 0);
+ curvebounds(&lower.y, &upper.y, P, 1);
+ curvebounds(&lower.z, &upper.z, P, 2);
+
+ float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
+
+ bounds.grow(lower, mr);
+ bounds.grow(upper, mr);
+}
+
+void Hair::Curve::bounds_grow(const int k,
+ const float3 *curve_keys,
+ const float *curve_radius,
+ const Transform &aligned_space,
+ BoundBox &bounds) const
+{
+ float3 P[4];
+
+ P[0] = curve_keys[max(first_key + k - 1, first_key)];
+ P[1] = curve_keys[first_key + k];
+ P[2] = curve_keys[first_key + k + 1];
+ P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
+
+ P[0] = transform_point(&aligned_space, P[0]);
+ P[1] = transform_point(&aligned_space, P[1]);
+ P[2] = transform_point(&aligned_space, P[2]);
+ P[3] = transform_point(&aligned_space, P[3]);
+
+ float3 lower;
+ float3 upper;
+
+ curvebounds(&lower.x, &upper.x, P, 0);
+ curvebounds(&lower.y, &upper.y, P, 1);
+ curvebounds(&lower.z, &upper.z, P, 2);
+
+ float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
+
+ bounds.grow(lower, mr);
+ bounds.grow(upper, mr);
+}
+
+void Hair::Curve::bounds_grow(float4 keys[4], BoundBox &bounds) const
+{
+ float3 P[4] = {
+ float4_to_float3(keys[0]),
+ float4_to_float3(keys[1]),
+ float4_to_float3(keys[2]),
+ float4_to_float3(keys[3]),
+ };
+
+ float3 lower;
+ float3 upper;
+
+ curvebounds(&lower.x, &upper.x, P, 0);
+ curvebounds(&lower.y, &upper.y, P, 1);
+ curvebounds(&lower.z, &upper.z, P, 2);
+
+ float mr = max(keys[1].w, keys[2].w);
+
+ bounds.grow(lower, mr);
+ bounds.grow(upper, mr);
+}
+
+void Hair::Curve::motion_keys(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ float time,
+ size_t k0,
+ size_t k1,
+ float4 r_keys[2]) const
+{
+ /* Figure out which steps we need to fetch and their interpolation factor. */
+ const size_t max_step = num_steps - 1;
+ const size_t step = min((int)(time * max_step), max_step - 1);
+ const float t = time * max_step - step;
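+  /* e.g. with num_steps = 3 (max_step = 2) and time = 0.75: step = 1, t = 0.5,
+   * blending the center and last motion steps equally. */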
+ /* Fetch vertex coordinates. */
+ float4 curr_keys[2];
+ float4 next_keys[2];
+ keys_for_step(
+ curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step, k0, k1, curr_keys);
+ keys_for_step(
+ curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step + 1, k0, k1, next_keys);
+ /* Interpolate between steps. */
+ r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
+ r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
+}
+
+void Hair::Curve::cardinal_motion_keys(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ float time,
+ size_t k0,
+ size_t k1,
+ size_t k2,
+ size_t k3,
+ float4 r_keys[4]) const
+{
+ /* Figure out which steps we need to fetch and their interpolation factor. */
+ const size_t max_step = num_steps - 1;
+ const size_t step = min((int)(time * max_step), max_step - 1);
+ const float t = time * max_step - step;
+ /* Fetch vertex coordinates. */
+ float4 curr_keys[4];
+ float4 next_keys[4];
+ cardinal_keys_for_step(curve_keys,
+ curve_radius,
+ key_steps,
+ num_curve_keys,
+ num_steps,
+ step,
+ k0,
+ k1,
+ k2,
+ k3,
+ curr_keys);
+ cardinal_keys_for_step(curve_keys,
+ curve_radius,
+ key_steps,
+ num_curve_keys,
+ num_steps,
+ step + 1,
+ k0,
+ k1,
+ k2,
+ k3,
+ next_keys);
+ /* Interpolate between steps. */
+ r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
+ r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
+ r_keys[2] = (1.0f - t) * curr_keys[2] + t * next_keys[2];
+ r_keys[3] = (1.0f - t) * curr_keys[3] + t * next_keys[3];
+}
+
+void Hair::Curve::keys_for_step(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ size_t step,
+ size_t k0,
+ size_t k1,
+ float4 r_keys[2]) const
+{
+ k0 = max(k0, 0);
+ k1 = min(k1, num_keys - 1);
+ const size_t center_step = ((num_steps - 1) / 2);
+ if (step == center_step) {
+ /* Center step: regular key location. */
+ /* TODO(sergey): Consider adding make_float4(float3, float)
+ * function.
+ */
+ r_keys[0] = make_float4(curve_keys[first_key + k0].x,
+ curve_keys[first_key + k0].y,
+ curve_keys[first_key + k0].z,
+ curve_radius[first_key + k0]);
+ r_keys[1] = make_float4(curve_keys[first_key + k1].x,
+ curve_keys[first_key + k1].y,
+ curve_keys[first_key + k1].z,
+ curve_radius[first_key + k1]);
+ }
+ else {
+ /* Center step is not stored in this array. */
+ if (step > center_step) {
+ step--;
+ }
+ const size_t offset = first_key + step * num_curve_keys;
+ r_keys[0] = make_float4(key_steps[offset + k0].x,
+ key_steps[offset + k0].y,
+ key_steps[offset + k0].z,
+ curve_radius[first_key + k0]);
+ r_keys[1] = make_float4(key_steps[offset + k1].x,
+ key_steps[offset + k1].y,
+ key_steps[offset + k1].z,
+ curve_radius[first_key + k1]);
+ }
+}
+
+void Hair::Curve::cardinal_keys_for_step(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ size_t step,
+ size_t k0,
+ size_t k1,
+ size_t k2,
+ size_t k3,
+ float4 r_keys[4]) const
+{
+ k0 = max(k0, 0);
+ k3 = min(k3, num_keys - 1);
+ const size_t center_step = ((num_steps - 1) / 2);
+ if (step == center_step) {
+ /* Center step: regular key location. */
+ r_keys[0] = make_float4(curve_keys[first_key + k0].x,
+ curve_keys[first_key + k0].y,
+ curve_keys[first_key + k0].z,
+ curve_radius[first_key + k0]);
+ r_keys[1] = make_float4(curve_keys[first_key + k1].x,
+ curve_keys[first_key + k1].y,
+ curve_keys[first_key + k1].z,
+ curve_radius[first_key + k1]);
+ r_keys[2] = make_float4(curve_keys[first_key + k2].x,
+ curve_keys[first_key + k2].y,
+ curve_keys[first_key + k2].z,
+ curve_radius[first_key + k2]);
+ r_keys[3] = make_float4(curve_keys[first_key + k3].x,
+ curve_keys[first_key + k3].y,
+ curve_keys[first_key + k3].z,
+ curve_radius[first_key + k3]);
+ }
+ else {
+ /* Center step is not stored in this array. */
+ if (step > center_step) {
+ step--;
+ }
+ const size_t offset = first_key + step * num_curve_keys;
+ r_keys[0] = make_float4(key_steps[offset + k0].x,
+ key_steps[offset + k0].y,
+ key_steps[offset + k0].z,
+ curve_radius[first_key + k0]);
+ r_keys[1] = make_float4(key_steps[offset + k1].x,
+ key_steps[offset + k1].y,
+ key_steps[offset + k1].z,
+ curve_radius[first_key + k1]);
+ r_keys[2] = make_float4(key_steps[offset + k2].x,
+ key_steps[offset + k2].y,
+ key_steps[offset + k2].z,
+ curve_radius[first_key + k2]);
+ r_keys[3] = make_float4(key_steps[offset + k3].x,
+ key_steps[offset + k3].y,
+ key_steps[offset + k3].z,
+ curve_radius[first_key + k3]);
+ }
+}
+
+/* Hair */
+
+NODE_DEFINE(Hair)
+{
+ NodeType *type = NodeType::add("hair", create, NodeType::NONE, Geometry::node_base_type);
+
+ SOCKET_POINT_ARRAY(curve_keys, "Curve Keys", array<float3>());
+ SOCKET_FLOAT_ARRAY(curve_radius, "Curve Radius", array<float>());
+ SOCKET_INT_ARRAY(curve_first_key, "Curve First Key", array<int>());
+ SOCKET_INT_ARRAY(curve_shader, "Curve Shader", array<int>());
+
+ return type;
+}
+
+Hair::Hair() : Geometry(node_type, Geometry::HAIR)
+{
+ curvekey_offset = 0;
+ curve_shape = CURVE_RIBBON;
+}
+
+Hair::~Hair()
+{
+}
+
+void Hair::resize_curves(int numcurves, int numkeys)
+{
+ curve_keys.resize(numkeys);
+ curve_radius.resize(numkeys);
+ curve_first_key.resize(numcurves);
+ curve_shader.resize(numcurves);
+
+ attributes.resize();
+}
+
+void Hair::reserve_curves(int numcurves, int numkeys)
+{
+ curve_keys.reserve(numkeys);
+ curve_radius.reserve(numkeys);
+ curve_first_key.reserve(numcurves);
+ curve_shader.reserve(numcurves);
+
+ attributes.resize(true);
+}
+
+void Hair::clear()
+{
+ Geometry::clear();
+
+ curve_keys.clear();
+ curve_radius.clear();
+ curve_first_key.clear();
+ curve_shader.clear();
+
+ attributes.clear();
+}
+
+void Hair::add_curve_key(float3 co, float radius)
+{
+ curve_keys.push_back_reserved(co);
+ curve_radius.push_back_reserved(radius);
+}
+
+void Hair::add_curve(int first_key, int shader)
+{
+ curve_first_key.push_back_reserved(first_key);
+ curve_shader.push_back_reserved(shader);
+}
+
+void Hair::copy_center_to_motion_step(const int motion_step)
+{
+ Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (attr_mP) {
+ float3 *keys = &curve_keys[0];
+ size_t numkeys = curve_keys.size();
+ memcpy(attr_mP->data_float3() + motion_step * numkeys, keys, sizeof(float3) * numkeys);
+ }
+}
+
+void Hair::get_uv_tiles(ustring map, unordered_set<int> &tiles)
+{
+ Attribute *attr;
+
+ if (map.empty()) {
+ attr = attributes.find(ATTR_STD_UV);
+ }
+ else {
+ attr = attributes.find(map);
+ }
+
+ if (attr) {
+ attr->get_uv_tiles(this, ATTR_PRIM_GEOMETRY, tiles);
+ }
+}
+
+void Hair::compute_bounds()
+{
+ BoundBox bnds = BoundBox::empty;
+ size_t curve_keys_size = curve_keys.size();
+
+ if (curve_keys_size > 0) {
+ for (size_t i = 0; i < curve_keys_size; i++)
+ bnds.grow(curve_keys[i], curve_radius[i]);
+
+ Attribute *curve_attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (use_motion_blur && curve_attr) {
+ size_t steps_size = curve_keys.size() * (motion_steps - 1);
+ float3 *key_steps = curve_attr->data_float3();
+
+ for (size_t i = 0; i < steps_size; i++)
+ bnds.grow(key_steps[i]);
+ }
+
+ if (!bnds.valid()) {
+ bnds = BoundBox::empty;
+
+ /* skip nan or inf coordinates */
+ for (size_t i = 0; i < curve_keys_size; i++)
+ bnds.grow_safe(curve_keys[i], curve_radius[i]);
+
+ if (use_motion_blur && curve_attr) {
+ size_t steps_size = curve_keys.size() * (motion_steps - 1);
+ float3 *key_steps = curve_attr->data_float3();
+
+ for (size_t i = 0; i < steps_size; i++)
+ bnds.grow_safe(key_steps[i]);
+ }
+ }
+ }
+
+ if (!bnds.valid()) {
+    /* empty hair */
+ bnds.grow(make_float3(0.0f, 0.0f, 0.0f));
+ }
+
+ bounds = bnds;
+}
+
+void Hair::apply_transform(const Transform &tfm, const bool apply_to_motion)
+{
+ /* compute uniform scale */
+ float3 c0 = transform_get_column(&tfm, 0);
+ float3 c1 = transform_get_column(&tfm, 1);
+ float3 c2 = transform_get_column(&tfm, 2);
+ float scalar = powf(fabsf(dot(cross(c0, c1), c2)), 1.0f / 3.0f);
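+  /* |dot(cross(c0, c1), c2)| is the determinant of the 3x3 part, i.e. the
+   * volume scale; its cube root serves as an average uniform scale for radii. */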
+
+ /* apply transform to curve keys */
+ for (size_t i = 0; i < curve_keys.size(); i++) {
+ float3 co = transform_point(&tfm, curve_keys[i]);
+ float radius = curve_radius[i] * scalar;
+
+ /* scale for curve radius is only correct for uniform scale */
+ curve_keys[i] = co;
+ curve_radius[i] = radius;
+ }
+
+ if (apply_to_motion) {
+ Attribute *curve_attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if (curve_attr) {
+ /* apply transform to motion curve keys */
+ size_t steps_size = curve_keys.size() * (motion_steps - 1);
+ float4 *key_steps = curve_attr->data_float4();
+
+ for (size_t i = 0; i < steps_size; i++) {
+ float3 co = transform_point(&tfm, float4_to_float3(key_steps[i]));
+ float radius = key_steps[i].w * scalar;
+
+ /* scale for curve radius is only correct for uniform scale */
+ key_steps[i] = float3_to_float4(co);
+ key_steps[i].w = radius;
+ }
+ }
+ }
+}
+
+void Hair::pack_curves(Scene *scene,
+ float4 *curve_key_co,
+ float4 *curve_data,
+ size_t curvekey_offset)
+{
+ size_t curve_keys_size = curve_keys.size();
+
+ /* pack curve keys */
+ if (curve_keys_size) {
+ float3 *keys_ptr = curve_keys.data();
+ float *radius_ptr = curve_radius.data();
+
+ for (size_t i = 0; i < curve_keys_size; i++)
+ curve_key_co[i] = make_float4(keys_ptr[i].x, keys_ptr[i].y, keys_ptr[i].z, radius_ptr[i]);
+ }
+
+ /* pack curve segments */
+ size_t curve_num = num_curves();
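+  /* Each float4 stores integers bit-cast to float: the global first-key index,
+   * the key count and the shader id; the w component is unused padding. */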
+
+ for (size_t i = 0; i < curve_num; i++) {
+ Curve curve = get_curve(i);
+ int shader_id = curve_shader[i];
+ Shader *shader = (shader_id < used_shaders.size()) ? used_shaders[shader_id] :
+ scene->default_surface;
+ shader_id = scene->shader_manager->get_shader_id(shader, false);
+
+ curve_data[i] = make_float4(__int_as_float(curve.first_key + curvekey_offset),
+ __int_as_float(curve.num_keys),
+ __int_as_float(shader_id),
+ 0.0f);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/hair.h b/intern/cycles/render/hair.h
new file mode 100644
index 00000000000..39d6a34d799
--- /dev/null
+++ b/intern/cycles/render/hair.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __HAIR_H__
+#define __HAIR_H__
+
+#include "render/geometry.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Hair : public Geometry {
+ public:
+ NODE_DECLARE
+
+ /* Hair Curve */
+ struct Curve {
+ int first_key;
+ int num_keys;
+
+ int num_segments() const
+ {
+ return num_keys - 1;
+ }
+
+ void bounds_grow(const int k,
+ const float3 *curve_keys,
+ const float *curve_radius,
+ BoundBox &bounds) const;
+ void bounds_grow(float4 keys[4], BoundBox &bounds) const;
+ void bounds_grow(const int k,
+ const float3 *curve_keys,
+ const float *curve_radius,
+ const Transform &aligned_space,
+ BoundBox &bounds) const;
+
+ void motion_keys(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ float time,
+ size_t k0,
+ size_t k1,
+ float4 r_keys[2]) const;
+ void cardinal_motion_keys(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ float time,
+ size_t k0,
+ size_t k1,
+ size_t k2,
+ size_t k3,
+ float4 r_keys[4]) const;
+
+ void keys_for_step(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ size_t step,
+ size_t k0,
+ size_t k1,
+ float4 r_keys[2]) const;
+ void cardinal_keys_for_step(const float3 *curve_keys,
+ const float *curve_radius,
+ const float3 *key_steps,
+ size_t num_curve_keys,
+ size_t num_steps,
+ size_t step,
+ size_t k0,
+ size_t k1,
+ size_t k2,
+ size_t k3,
+ float4 r_keys[4]) const;
+ };
+
+ array<float3> curve_keys;
+ array<float> curve_radius;
+ array<int> curve_first_key;
+ array<int> curve_shader;
+
+ /* BVH */
+ size_t curvekey_offset;
+ CurveShapeType curve_shape;
+
+ /* Constructor/Destructor */
+ Hair();
+ ~Hair();
+
+ /* Geometry */
+ void clear() override;
+
+ void resize_curves(int numcurves, int numkeys);
+ void reserve_curves(int numcurves, int numkeys);
+ void add_curve_key(float3 loc, float radius);
+ void add_curve(int first_key, int shader);
+
+ void copy_center_to_motion_step(const int motion_step);
+
+ void compute_bounds() override;
+ void apply_transform(const Transform &tfm, const bool apply_to_motion) override;
+
+ /* Curves */
+ Curve get_curve(size_t i) const
+ {
+ int first = curve_first_key[i];
+ int next_first = (i + 1 < curve_first_key.size()) ? curve_first_key[i + 1] : curve_keys.size();
+
+ Curve curve = {first, next_first - first};
+ return curve;
+ }
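+  /* Example: with curve_first_key = {0, 4} and 7 keys in total, get_curve(0)
+   * spans keys [0, 4) and get_curve(1) spans keys [4, 7). */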
+
+ size_t num_keys() const
+ {
+ return curve_keys.size();
+ }
+
+ size_t num_curves() const
+ {
+ return curve_first_key.size();
+ }
+
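+  /* A curve with k keys has k - 1 segments, so the total segment count is the
+   * key count minus the curve count. */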
+ size_t num_segments() const
+ {
+ return curve_keys.size() - curve_first_key.size();
+ }
+
+ /* UDIM */
+ void get_uv_tiles(ustring map, unordered_set<int> &tiles) override;
+
+ /* BVH */
+ void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __HAIR_H__ */
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index ae219e912e0..8d187814d64 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -14,15 +14,20 @@
* limitations under the License.
*/
-#include "device/device.h"
#include "render/image.h"
+#include "device/device.h"
+#include "render/colorspace.h"
+#include "render/image_oiio.h"
#include "render/scene.h"
#include "render/stats.h"
#include "util/util_foreach.h"
+#include "util/util_image.h"
+#include "util/util_image_impl.h"
#include "util/util_logging.h"
#include "util/util_path.h"
#include "util/util_progress.h"
+#include "util/util_task.h"
#include "util/util_texture.h"
#include "util/util_unique_ptr.h"
@@ -48,21 +53,6 @@ bool isfinite(uint16_t /*value*/)
return true;
}
-/* The lower three bits of a device texture slot number indicate its type.
- * These functions convert the slot ids from ImageManager "images" ones
- * to device ones and vice verse.
- */
-int type_index_to_flattened_slot(int slot, ImageDataType type)
-{
- return (slot << IMAGE_DATA_TYPE_SHIFT) | (type);
-}
-
-int flattened_slot_to_type_index(int flat_slot, ImageDataType *type)
-{
- *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK);
- return flat_slot >> IMAGE_DATA_TYPE_SHIFT;
-}
-
const char *name_from_type(ImageDataType type)
{
switch (type) {
@@ -92,301 +82,355 @@ const char *name_from_type(ImageDataType type)
} // namespace
-ImageManager::ImageManager(const DeviceInfo &info)
+/* Image Handle */
+
+ImageHandle::ImageHandle() : manager(NULL)
{
- need_update = true;
- osl_texture_system = NULL;
- animation_frame = 0;
+}
- /* Set image limits */
- max_num_images = TEX_NUM_MAX;
- has_half_images = info.has_half_images;
+ImageHandle::ImageHandle(const ImageHandle &other)
+ : tile_slots(other.tile_slots), manager(other.manager)
+{
+ /* Increase image user count. */
+ foreach (const int slot, tile_slots) {
+ manager->add_image_user(slot);
+ }
+}
+
+ImageHandle &ImageHandle::operator=(const ImageHandle &other)
+{
+ clear();
+ manager = other.manager;
+ tile_slots = other.tile_slots;
- for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- tex_num_images[type] = 0;
+ foreach (const int slot, tile_slots) {
+ manager->add_image_user(slot);
}
+
+ return *this;
}
-ImageManager::~ImageManager()
+ImageHandle::~ImageHandle()
{
- for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++)
- assert(!images[type][slot]);
+ clear();
+}
+
+void ImageHandle::clear()
+{
+ foreach (const int slot, tile_slots) {
+ manager->remove_image_user(slot);
}
+
+ tile_slots.clear();
+ manager = NULL;
}
-void ImageManager::set_osl_texture_system(void *texture_system)
+bool ImageHandle::empty()
{
- osl_texture_system = texture_system;
+ return tile_slots.empty();
}
-bool ImageManager::set_animation_frame_update(int frame)
+int ImageHandle::num_tiles()
{
- if (frame != animation_frame) {
- animation_frame = frame;
+ return tile_slots.size();
+}
- for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++) {
- if (images[type][slot] && images[type][slot]->animated)
- return true;
- }
- }
+ImageMetaData ImageHandle::metadata()
+{
+ if (tile_slots.empty()) {
+ return ImageMetaData();
}
- return false;
+ ImageManager::Image *img = manager->images[tile_slots.front()];
+ manager->load_image_metadata(img);
+ return img->metadata;
}
-device_memory *ImageManager::image_memory(int flat_slot)
+int ImageHandle::svm_slot(const int tile_index) const
{
- ImageDataType type;
- int slot = flattened_slot_to_type_index(flat_slot, &type);
+ if (tile_index >= tile_slots.size()) {
+ return -1;
+ }
- Image *img = images[type][slot];
+ if (manager->osl_texture_system) {
+ ImageManager::Image *img = manager->images[tile_slots[tile_index]];
+ if (!img->loader->osl_filepath().empty()) {
+ return -1;
+ }
+ }
- return img->mem;
+ return tile_slots[tile_index];
}
-bool ImageManager::get_image_metadata(int flat_slot, ImageMetaData &metadata)
+device_texture *ImageHandle::image_memory(const int tile_index) const
{
- if (flat_slot == -1) {
- return false;
+ if (tile_index >= tile_slots.size()) {
+ return NULL;
}
- ImageDataType type;
- int slot = flattened_slot_to_type_index(flat_slot, &type);
+ ImageManager::Image *img = manager->images[tile_slots[tile_index]];
+ return img ? img->mem : NULL;
+}
- Image *img = images[type][slot];
- if (img) {
- metadata = img->metadata;
- return true;
- }
+bool ImageHandle::operator==(const ImageHandle &other) const
+{
+ return manager == other.manager && tile_slots == other.tile_slots;
+}
- return false;
+/* Image MetaData */
+
+ImageMetaData::ImageMetaData()
+ : channels(0),
+ width(0),
+ height(0),
+ depth(0),
+ type(IMAGE_DATA_NUM_TYPES),
+ colorspace(u_colorspace_raw),
+ colorspace_file_format(""),
+ use_transform_3d(false),
+ compress_as_srgb(false)
+{
}
-bool ImageManager::get_image_metadata(const string &filename,
- void *builtin_data,
- ImageMetaData &metadata)
+bool ImageMetaData::operator==(const ImageMetaData &other) const
{
- memset(&metadata, 0, sizeof(metadata));
+ return channels == other.channels && width == other.width && height == other.height &&
+ depth == other.depth && use_transform_3d == other.use_transform_3d &&
+ (!use_transform_3d || transform_3d == other.transform_3d) && type == other.type &&
+ colorspace == other.colorspace && compress_as_srgb == other.compress_as_srgb;
+}
- if (builtin_data) {
- if (builtin_image_info_cb) {
- builtin_image_info_cb(filename, builtin_data, metadata);
- }
- else {
- return false;
- }
+bool ImageMetaData::is_float() const
+{
+ return (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4 ||
+ type == IMAGE_DATA_TYPE_HALF || type == IMAGE_DATA_TYPE_HALF4);
+}
+
+void ImageMetaData::detect_colorspace()
+{
+  /* Convert user-specified color spaces to one we know how to handle. */
+ colorspace = ColorSpaceManager::detect_known_colorspace(
+ colorspace, colorspace_file_format, is_float());
+
+ if (colorspace == u_colorspace_raw) {
+ /* Nothing to do. */
+ }
+ else if (colorspace == u_colorspace_srgb) {
+ /* Keep sRGB colorspace stored as sRGB, to save memory and/or loading time
+ * for the common case of 8bit sRGB images like PNG. */
+ compress_as_srgb = true;
+ }
+ else {
+ /* Always compress non-raw 8bit images as scene linear + sRGB, as a
+ * heuristic to keep memory usage the same without too much data loss
+ * due to quantization in common cases. */
+ compress_as_srgb = (type == IMAGE_DATA_TYPE_BYTE || type == IMAGE_DATA_TYPE_BYTE4);
- if (metadata.is_float) {
- metadata.is_linear = true;
- metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
+    /* If colorspace conversion is needed, use half instead of short so we can
+ * represent HDR values that might result from conversion. */
+ if (type == IMAGE_DATA_TYPE_USHORT) {
+ type = IMAGE_DATA_TYPE_HALF;
}
- else {
- metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+ else if (type == IMAGE_DATA_TYPE_USHORT4) {
+ type = IMAGE_DATA_TYPE_HALF4;
}
-
- return true;
}
+}
- /* Perform preliminary checks, with meaningful logging. */
- if (!path_exists(filename)) {
- VLOG(1) << "File '" << filename << "' does not exist.";
- return false;
+/* Image Loader */
+
+ImageLoader::ImageLoader()
+{
+}
+
+ustring ImageLoader::osl_filepath() const
+{
+ return ustring();
+}
+
+bool ImageLoader::equals(const ImageLoader *a, const ImageLoader *b)
+{
+ if (a == NULL && b == NULL) {
+ return true;
}
- if (path_is_directory(filename)) {
- VLOG(1) << "File '" << filename << "' is a directory, can't use as image.";
- return false;
+ else {
+ return (a && b && typeid(*a) == typeid(*b) && a->equals(*b));
}
+}
- unique_ptr<ImageInput> in(ImageInput::create(filename));
+/* Image Manager */
- if (!in) {
- return false;
- }
+ImageManager::ImageManager(const DeviceInfo &info)
+{
+ need_update = true;
+ osl_texture_system = NULL;
+ animation_frame = 0;
- ImageSpec spec;
- if (!in->open(filename, spec)) {
- return false;
- }
+ /* Set image limits */
+ has_half_images = info.has_half_images;
+}
- metadata.width = spec.width;
- metadata.height = spec.height;
- metadata.depth = spec.depth;
+ImageManager::~ImageManager()
+{
+ for (size_t slot = 0; slot < images.size(); slot++)
+ assert(!images[slot]);
+}
- /* Check the main format, and channel formats. */
- size_t channel_size = spec.format.basesize();
+void ImageManager::set_osl_texture_system(void *texture_system)
+{
+ osl_texture_system = texture_system;
+}
- if (spec.format.is_floating_point()) {
- metadata.is_float = true;
- metadata.is_linear = true;
- }
+bool ImageManager::set_animation_frame_update(int frame)
+{
+ if (frame != animation_frame) {
+ thread_scoped_lock device_lock(images_mutex);
+ animation_frame = frame;
- for (size_t channel = 0; channel < spec.channelformats.size(); channel++) {
- channel_size = max(channel_size, spec.channelformats[channel].basesize());
- if (spec.channelformats[channel].is_floating_point()) {
- metadata.is_float = true;
- metadata.is_linear = true;
+ for (size_t slot = 0; slot < images.size(); slot++) {
+ if (images[slot] && images[slot]->params.animated)
+ return true;
}
}
- /* check if it's half float */
- if (spec.format == TypeDesc::HALF) {
- metadata.is_half = true;
+ return false;
+}
+
+void ImageManager::load_image_metadata(Image *img)
+{
+ if (!img->need_metadata) {
+ return;
}
- /* basic color space detection, not great but better than nothing
- * before we do OpenColorIO integration */
- if (metadata.is_float) {
- string colorspace = spec.get_string_attribute("oiio:ColorSpace");
+ thread_scoped_lock image_lock(img->mutex);
+ if (!img->need_metadata) {
+ return;
+ }
- metadata.is_linear = !(
- colorspace == "sRGB" || colorspace == "GammaCorrected" ||
- (colorspace == "" &&
- (strcmp(in->format_name(), "png") == 0 || strcmp(in->format_name(), "tiff") == 0 ||
- strcmp(in->format_name(), "dpx") == 0 || strcmp(in->format_name(), "jpeg2000") == 0)));
+ ImageMetaData &metadata = img->metadata;
+ metadata = ImageMetaData();
+ metadata.colorspace = img->params.colorspace;
+
+ if (img->loader->load_metadata(metadata)) {
+ assert(metadata.type != IMAGE_DATA_NUM_TYPES);
}
else {
- metadata.is_linear = false;
+ metadata.type = IMAGE_DATA_TYPE_BYTE4;
}
- /* set type and channels */
- metadata.channels = spec.nchannels;
+ metadata.detect_colorspace();
- if (metadata.is_half) {
- metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF;
- }
- else if (metadata.is_float) {
- metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
- }
- else if (spec.format == TypeDesc::USHORT) {
- metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_USHORT4 : IMAGE_DATA_TYPE_USHORT;
- }
- else {
- metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+ /* No half textures on OpenCL, use full float instead. */
+ if (!has_half_images) {
+ if (metadata.type == IMAGE_DATA_TYPE_HALF4) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ }
+ else if (metadata.type == IMAGE_DATA_TYPE_HALF) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT;
+ }
}
- in->close();
+ img->need_metadata = false;
+}
- return true;
+ImageHandle ImageManager::add_image(const string &filename, const ImageParams &params)
+{
+ const int slot = add_image_slot(new OIIOImageLoader(filename), params, false);
+
+ ImageHandle handle;
+ handle.tile_slots.push_back(slot);
+ handle.manager = this;
+ return handle;
+}
+
+ImageHandle ImageManager::add_image(const string &filename,
+ const ImageParams &params,
+ const vector<int> &tiles)
+{
+ ImageHandle handle;
+ handle.manager = this;
+
+ foreach (int tile, tiles) {
+ string tile_filename = filename;
+ if (tile != 0) {
+ string_replace(tile_filename, "<UDIM>", string_printf("%04d", tile));
+ }
+ const int slot = add_image_slot(new OIIOImageLoader(tile_filename), params, false);
+ handle.tile_slots.push_back(slot);
+ }
+
+ return handle;
}
-static bool image_equals(ImageManager::Image *image,
- const string &filename,
- void *builtin_data,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha)
+ImageHandle ImageManager::add_image(ImageLoader *loader, const ImageParams &params)
{
- return image->filename == filename && image->builtin_data == builtin_data &&
- image->interpolation == interpolation && image->extension == extension &&
- image->use_alpha == use_alpha;
+ const int slot = add_image_slot(loader, params, true);
+
+ ImageHandle handle;
+ handle.tile_slots.push_back(slot);
+ handle.manager = this;
+ return handle;
}
-int ImageManager::add_image(const string &filename,
- void *builtin_data,
- bool animated,
- float frame,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha,
- ImageMetaData &metadata)
+int ImageManager::add_image_slot(ImageLoader *loader,
+ const ImageParams &params,
+ const bool builtin)
{
Image *img;
size_t slot;
- get_image_metadata(filename, builtin_data, metadata);
- ImageDataType type = metadata.type;
-
- thread_scoped_lock device_lock(device_mutex);
-
- /* No half textures on OpenCL, use full float instead. */
- if (!has_half_images) {
- if (type == IMAGE_DATA_TYPE_HALF4) {
- type = IMAGE_DATA_TYPE_FLOAT4;
- }
- else if (type == IMAGE_DATA_TYPE_HALF) {
- type = IMAGE_DATA_TYPE_FLOAT;
- }
- }
+ thread_scoped_lock device_lock(images_mutex);
   /* Find existing image. */
- for (slot = 0; slot < images[type].size(); slot++) {
- img = images[type][slot];
- if (img && image_equals(img, filename, builtin_data, interpolation, extension, use_alpha)) {
- if (img->frame != frame) {
- img->frame = frame;
- img->need_load = true;
- }
- if (img->use_alpha != use_alpha) {
- img->use_alpha = use_alpha;
- img->need_load = true;
- }
- if (!(img->metadata == metadata)) {
- img->metadata = metadata;
- img->need_load = true;
- }
+ for (slot = 0; slot < images.size(); slot++) {
+ img = images[slot];
+ if (img && ImageLoader::equals(img->loader, loader) && img->params == params) {
img->users++;
- return type_index_to_flattened_slot(slot, type);
+ delete loader;
+ return slot;
}
}
/* Find free slot. */
- for (slot = 0; slot < images[type].size(); slot++) {
- if (!images[type][slot])
+ for (slot = 0; slot < images.size(); slot++) {
+ if (!images[slot])
break;
}
- /* Count if we're over the limit.
- * Very unlikely, since max_num_images is insanely big. But better safe
- * than sorry.
- */
- int tex_count = 0;
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- tex_count += tex_num_images[type];
- }
- if (tex_count > max_num_images) {
- printf(
- "ImageManager::add_image: Reached image limit (%d), "
- "skipping '%s'\n",
- max_num_images,
- filename.c_str());
- return -1;
- }
-
- if (slot == images[type].size()) {
- images[type].resize(images[type].size() + 1);
+ if (slot == images.size()) {
+ images.resize(images.size() + 1);
}
/* Add new image. */
img = new Image();
- img->filename = filename;
- img->builtin_data = builtin_data;
- img->metadata = metadata;
- img->need_load = true;
- img->animated = animated;
- img->frame = frame;
- img->interpolation = interpolation;
- img->extension = extension;
+ img->params = params;
+ img->loader = loader;
+ img->need_metadata = true;
+ img->need_load = !(osl_texture_system && !img->loader->osl_filepath().empty());
+ img->builtin = builtin;
img->users = 1;
- img->use_alpha = use_alpha;
img->mem = NULL;
- images[type][slot] = img;
-
- ++tex_num_images[type];
+ images[slot] = img;
need_update = true;
- return type_index_to_flattened_slot(slot, type);
+ return slot;
}
-void ImageManager::remove_image(int flat_slot)
+void ImageManager::add_image_user(int slot)
{
- ImageDataType type;
- int slot = flattened_slot_to_type_index(flat_slot, &type);
+ thread_scoped_lock device_lock(images_mutex);
+ Image *image = images[slot];
+ assert(image && image->users >= 1);
+
+ image->users++;
+}
- Image *image = images[type][slot];
+void ImageManager::remove_image_user(int slot)
+{
+ thread_scoped_lock device_lock(images_mutex);
+ Image *image = images[slot];
assert(image && image->users >= 1);
/* decrement user count */
@@ -399,100 +443,20 @@ void ImageManager::remove_image(int flat_slot)
need_update = true;
}
-void ImageManager::remove_image(const string &filename,
- void *builtin_data,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha)
+static bool image_associate_alpha(ImageManager::Image *img)
{
- size_t slot;
-
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (slot = 0; slot < images[type].size(); slot++) {
- if (images[type][slot] &&
- image_equals(
- images[type][slot], filename, builtin_data, interpolation, extension, use_alpha)) {
- remove_image(type_index_to_flattened_slot(slot, (ImageDataType)type));
- return;
- }
- }
- }
-}
-
-/* TODO(sergey): Deduplicate with the iteration above, but make it pretty,
- * without bunch of arguments passing around making code readability even
- * more cluttered.
- */
-void ImageManager::tag_reload_image(const string &filename,
- void *builtin_data,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha)
-{
- for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++) {
- if (images[type][slot] &&
- image_equals(
- images[type][slot], filename, builtin_data, interpolation, extension, use_alpha)) {
- images[type][slot]->need_load = true;
- break;
- }
- }
- }
+ /* For typical RGBA images we let OIIO convert to associated alpha,
+   * but for some types we want to leave the RGB channels untouched. */
+ return !(ColorSpaceManager::colorspace_is_data(img->params.colorspace) ||
+ img->params.alpha_type == IMAGE_ALPHA_IGNORE ||
+ img->params.alpha_type == IMAGE_ALPHA_CHANNEL_PACKED);
}
-bool ImageManager::file_load_image_generic(Image *img, unique_ptr<ImageInput> *in)
+template<TypeDesc::BASETYPE FileFormat, typename StorageType>
+bool ImageManager::file_load_image(Image *img, int texture_limit)
{
- if (img->filename == "")
- return false;
-
- if (!img->builtin_data) {
- /* NOTE: Error logging is done in meta data acquisition. */
- if (!path_exists(img->filename) || path_is_directory(img->filename)) {
- return false;
- }
-
- /* load image from file through OIIO */
- *in = unique_ptr<ImageInput>(ImageInput::create(img->filename));
-
- if (!*in)
- return false;
-
- ImageSpec spec = ImageSpec();
- ImageSpec config = ImageSpec();
-
- if (img->use_alpha == false)
- config.attribute("oiio:UnassociatedAlpha", 1);
-
- if (!(*in)->open(img->filename, spec, config)) {
- return false;
- }
- }
- else {
- /* load image using builtin images callbacks */
- if (!builtin_image_info_cb || !builtin_image_pixels_cb)
- return false;
- }
-
/* we only handle certain number of components */
if (!(img->metadata.channels >= 1 && img->metadata.channels <= 4)) {
- if (*in) {
- (*in)->close();
- }
- return false;
- }
-
- return true;
-}
-
-template<TypeDesc::BASETYPE FileFormat, typename StorageType, typename DeviceType>
-bool ImageManager::file_load_image(Image *img,
- ImageDataType type,
- int texture_limit,
- device_vector<DeviceType> &tex_img)
-{
- unique_ptr<ImageInput> in = NULL;
- if (!file_load_image_generic(img, &in)) {
return false;
}
@@ -502,101 +466,46 @@ bool ImageManager::file_load_image(Image *img,
int depth = img->metadata.depth;
int components = img->metadata.channels;
- /* Read RGBA pixels. */
+ /* Read pixels. */
vector<StorageType> pixels_storage;
StorageType *pixels;
const size_t max_size = max(max(width, height), depth);
if (max_size == 0) {
- /* Don't bother with invalid images. */
+ /* Don't bother with empty images. */
return false;
}
+
+  /* Allocate pixel storage. If the image exceeds the texture size limit, read into a
+   * temporary buffer first so it can be scaled down before the device allocation. */
if (texture_limit > 0 && max_size > texture_limit) {
pixels_storage.resize(((size_t)width) * height * depth * 4);
pixels = &pixels_storage[0];
}
else {
thread_scoped_lock device_lock(device_mutex);
- pixels = (StorageType *)tex_img.alloc(width, height, depth);
+ pixels = (StorageType *)img->mem->alloc(width, height, depth);
}
+
if (pixels == NULL) {
/* Could be that we've run out of memory. */
return false;
}
- bool cmyk = false;
+
const size_t num_pixels = ((size_t)width) * height * depth;
- if (in) {
- StorageType *readpixels = pixels;
- vector<StorageType> tmppixels;
- if (components > 4) {
- tmppixels.resize(((size_t)width) * height * components);
- readpixels = &tmppixels[0];
- }
- if (depth <= 1) {
- size_t scanlinesize = ((size_t)width) * components * sizeof(StorageType);
- in->read_image(FileFormat,
- (uchar *)readpixels + (height - 1) * scanlinesize,
- AutoStride,
- -scanlinesize,
- AutoStride);
- }
- else {
- in->read_image(FileFormat, (uchar *)readpixels);
- }
- if (components > 4) {
- size_t dimensions = ((size_t)width) * height;
- for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) {
- pixels[i * 4 + 3] = tmppixels[i * components + 3];
- pixels[i * 4 + 2] = tmppixels[i * components + 2];
- pixels[i * 4 + 1] = tmppixels[i * components + 1];
- pixels[i * 4 + 0] = tmppixels[i * components + 0];
- }
- tmppixels.clear();
- }
- cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
- in->close();
- }
- else {
- if (FileFormat == TypeDesc::FLOAT) {
- builtin_image_float_pixels_cb(img->filename,
- img->builtin_data,
- (float *)&pixels[0],
- num_pixels * components,
- img->metadata.builtin_free_cache);
- }
- else if (FileFormat == TypeDesc::UINT8) {
- builtin_image_pixels_cb(img->filename,
- img->builtin_data,
- (uchar *)&pixels[0],
- num_pixels * components,
- img->metadata.builtin_free_cache);
- }
- else {
- /* TODO(dingto): Support half for ImBuf. */
- }
- }
- /* Check if we actually have a float4 slot, in case components == 1,
- * but device doesn't support single channel textures.
- */
- bool is_rgba = (type == IMAGE_DATA_TYPE_FLOAT4 || type == IMAGE_DATA_TYPE_HALF4 ||
- type == IMAGE_DATA_TYPE_BYTE4 || type == IMAGE_DATA_TYPE_USHORT4);
+ img->loader->load_pixels(
+ img->metadata, pixels, num_pixels * components, image_associate_alpha(img));
+
+ /* The kernel can handle 1 and 4 channel images. Anything that is not a single
+ * channel image is converted to RGBA format. */
+ bool is_rgba = (img->metadata.type == IMAGE_DATA_TYPE_FLOAT4 ||
+ img->metadata.type == IMAGE_DATA_TYPE_HALF4 ||
+ img->metadata.type == IMAGE_DATA_TYPE_BYTE4 ||
+ img->metadata.type == IMAGE_DATA_TYPE_USHORT4);
+
if (is_rgba) {
const StorageType one = util_image_cast_from_float<StorageType>(1.0f);
- if (cmyk) {
- /* CMYK */
- for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
- float c = util_image_cast_to_float(pixels[i * 4 + 0]);
- float m = util_image_cast_to_float(pixels[i * 4 + 1]);
- float y = util_image_cast_to_float(pixels[i * 4 + 2]);
- float k = util_image_cast_to_float(pixels[i * 4 + 3]);
- pixels[i * 4 + 0] = util_image_cast_from_float<StorageType>((1.0f - c) * (1.0f - k));
- pixels[i * 4 + 1] = util_image_cast_from_float<StorageType>((1.0f - m) * (1.0f - k));
- pixels[i * 4 + 2] = util_image_cast_from_float<StorageType>((1.0f - y) * (1.0f - k));
- pixels[i * 4 + 3] = one;
- }
- }
- else if (components == 2) {
- /* grayscale + alpha */
+ if (components == 2) {
+ /* Grayscale + alpha to RGBA. */
for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
pixels[i * 4 + 3] = pixels[i * 2 + 1];
pixels[i * 4 + 2] = pixels[i * 2 + 0];
@@ -605,7 +514,7 @@ bool ImageManager::file_load_image(Image *img,
}
}
else if (components == 3) {
- /* RGB */
+ /* RGB to RGBA. */
for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
pixels[i * 4 + 3] = one;
pixels[i * 4 + 2] = pixels[i * 3 + 2];
@@ -614,7 +523,7 @@ bool ImageManager::file_load_image(Image *img,
}
}
else if (components == 1) {
- /* grayscale */
+ /* Grayscale to RGBA. */
for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
pixels[i * 4 + 3] = one;
pixels[i * 4 + 2] = pixels[i];
@@ -622,18 +531,27 @@ bool ImageManager::file_load_image(Image *img,
pixels[i * 4 + 0] = pixels[i];
}
}
- if (img->use_alpha == false) {
+
+ /* Disable alpha if requested by the user. */
+ if (img->params.alpha_type == IMAGE_ALPHA_IGNORE) {
for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
pixels[i * 4 + 3] = one;
}
}
+
+ if (img->metadata.colorspace != u_colorspace_raw &&
+ img->metadata.colorspace != u_colorspace_srgb) {
+ /* Convert to scene linear. */
+ ColorSpaceManager::to_scene_linear(
+ img->metadata.colorspace, pixels, num_pixels, img->metadata.compress_as_srgb);
+ }
}
+
/* Make sure we don't have buggy values. */
if (FileFormat == TypeDesc::FLOAT) {
/* For RGBA buffers we put all channels to 0 if either of them is not
* finite. This way we avoid possible artifacts caused by fully changed
- * hue.
- */
+ * hue. */
if (is_rgba) {
for (size_t i = 0; i < num_pixels; i += 4) {
StorageType *pixel = &pixels[i * 4];
@@ -655,13 +573,15 @@ bool ImageManager::file_load_image(Image *img,
}
}
}
+
/* Scale image down if needed. */
if (pixels_storage.size() > 0) {
float scale_factor = 1.0f;
while (max_size * scale_factor > texture_limit) {
scale_factor *= 0.5f;
}
- VLOG(1) << "Scaling image " << img->filename << " by a factor of " << scale_factor << ".";
+ VLOG(1) << "Scaling image " << img->loader->name() << " by a factor of " << scale_factor
+ << ".";
vector<StorageType> scaled_pixels;
size_t scaled_width, scaled_height, scaled_depth;
util_image_resize_pixels(pixels_storage,
@@ -679,33 +599,32 @@ bool ImageManager::file_load_image(Image *img,
{
thread_scoped_lock device_lock(device_mutex);
- texture_pixels = (StorageType *)tex_img.alloc(scaled_width, scaled_height, scaled_depth);
+ texture_pixels = (StorageType *)img->mem->alloc(scaled_width, scaled_height, scaled_depth);
}
memcpy(texture_pixels, &scaled_pixels[0], scaled_pixels.size() * sizeof(StorageType));
}
+
return true;
}
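A note on the channel-expansion loops above: they walk the buffer from the last pixel backwards because the expansion happens in place. For pixel i the writes land at index 4*i and above, while every pixel still to be processed (j < i) is read from index j*components, which stays below 4*i; within a single pixel the channels are written from alpha downwards, so each source value is read before its slot can be reused. The same idea as a standalone sketch for the grayscale case (hypothetical helper, not part of Cycles):

  /* Expand num_pixels grayscale values stored at the front of `data` (which has
   * room for num_pixels * 4 values) into RGBA, in place, back to front. */
  static void expand_gray_to_rgba(float *data, size_t num_pixels)
  {
    for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
      data[i * 4 + 3] = 1.0f;
      data[i * 4 + 2] = data[i];
      data[i * 4 + 1] = data[i];
      data[i * 4 + 0] = data[i];
    }
  }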
-void ImageManager::device_load_image(
- Device *device, Scene *scene, ImageDataType type, int slot, Progress *progress)
+void ImageManager::device_load_image(Device *device, Scene *scene, int slot, Progress *progress)
{
- if (progress->get_cancel())
+ if (progress->get_cancel()) {
return;
+ }
- Image *img = images[type][slot];
-
- if (osl_texture_system && !img->builtin_data)
- return;
+ Image *img = images[slot];
- string filename = path_filename(images[type][slot]->filename);
- progress->set_status("Updating Images", "Loading " + filename);
+ progress->set_status("Updating Images", "Loading " + img->loader->name());
const int texture_limit = scene->params.texture_limit;
- /* Slot assignment */
- int flat_slot = type_index_to_flattened_slot(slot, type);
- img->mem_name = string_printf("__tex_image_%s_%03d", name_from_type(type), flat_slot);
+ load_image_metadata(img);
+ ImageDataType type = img->metadata.type;
+
+ /* Name for debugging. */
+ img->mem_name = string_printf("__tex_image_%s_%03d", name_from_type(type), slot);
/* Free previous texture in slot. */
if (img->mem) {
@@ -714,195 +633,131 @@ void ImageManager::device_load_image(
img->mem = NULL;
}
+ img->mem = new device_texture(
+ device, img->mem_name.c_str(), slot, type, img->params.interpolation, img->params.extension);
+ img->mem->info.use_transform_3d = img->metadata.use_transform_3d;
+ img->mem->info.transform_3d = img->metadata.transform_3d;
+
/* Create new texture. */
if (type == IMAGE_DATA_TYPE_FLOAT4) {
- device_vector<float4> *tex_img = new device_vector<float4>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::FLOAT, float>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::FLOAT, float>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- float *pixels = (float *)tex_img->alloc(1, 1);
+ float *pixels = (float *)img->mem->alloc(1, 1);
pixels[0] = TEX_IMAGE_MISSING_R;
pixels[1] = TEX_IMAGE_MISSING_G;
pixels[2] = TEX_IMAGE_MISSING_B;
pixels[3] = TEX_IMAGE_MISSING_A;
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_FLOAT) {
- device_vector<float> *tex_img = new device_vector<float>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::FLOAT, float>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::FLOAT, float>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- float *pixels = (float *)tex_img->alloc(1, 1);
+ float *pixels = (float *)img->mem->alloc(1, 1);
pixels[0] = TEX_IMAGE_MISSING_R;
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_BYTE4) {
- device_vector<uchar4> *tex_img = new device_vector<uchar4>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::UINT8, uchar>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::UINT8, uchar>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- uchar *pixels = (uchar *)tex_img->alloc(1, 1);
+ uchar *pixels = (uchar *)img->mem->alloc(1, 1);
pixels[0] = (TEX_IMAGE_MISSING_R * 255);
pixels[1] = (TEX_IMAGE_MISSING_G * 255);
pixels[2] = (TEX_IMAGE_MISSING_B * 255);
pixels[3] = (TEX_IMAGE_MISSING_A * 255);
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_BYTE) {
- device_vector<uchar> *tex_img = new device_vector<uchar>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::UINT8, uchar>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::UINT8, uchar>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- uchar *pixels = (uchar *)tex_img->alloc(1, 1);
+ uchar *pixels = (uchar *)img->mem->alloc(1, 1);
pixels[0] = (TEX_IMAGE_MISSING_R * 255);
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_HALF4) {
- device_vector<half4> *tex_img = new device_vector<half4>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::HALF, half>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::HALF, half>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- half *pixels = (half *)tex_img->alloc(1, 1);
+ half *pixels = (half *)img->mem->alloc(1, 1);
pixels[0] = TEX_IMAGE_MISSING_R;
pixels[1] = TEX_IMAGE_MISSING_G;
pixels[2] = TEX_IMAGE_MISSING_B;
pixels[3] = TEX_IMAGE_MISSING_A;
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_USHORT) {
- device_vector<uint16_t> *tex_img = new device_vector<uint16_t>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- uint16_t *pixels = (uint16_t *)tex_img->alloc(1, 1);
+ uint16_t *pixels = (uint16_t *)img->mem->alloc(1, 1);
pixels[0] = (TEX_IMAGE_MISSING_R * 65535);
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_USHORT4) {
- device_vector<ushort4> *tex_img = new device_vector<ushort4>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- uint16_t *pixels = (uint16_t *)tex_img->alloc(1, 1);
+ uint16_t *pixels = (uint16_t *)img->mem->alloc(1, 1);
pixels[0] = (TEX_IMAGE_MISSING_R * 65535);
pixels[1] = (TEX_IMAGE_MISSING_G * 65535);
pixels[2] = (TEX_IMAGE_MISSING_B * 65535);
pixels[3] = (TEX_IMAGE_MISSING_A * 65535);
}
-
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
- thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
}
else if (type == IMAGE_DATA_TYPE_HALF) {
- device_vector<half> *tex_img = new device_vector<half>(
- device, img->mem_name.c_str(), MEM_TEXTURE);
-
- if (!file_load_image<TypeDesc::HALF, half>(img, type, texture_limit, *tex_img)) {
+ if (!file_load_image<TypeDesc::HALF, half>(img, texture_limit)) {
/* on failure to load, we set a 1x1 pixels pink image */
thread_scoped_lock device_lock(device_mutex);
- half *pixels = (half *)tex_img->alloc(1, 1);
+ half *pixels = (half *)img->mem->alloc(1, 1);
pixels[0] = TEX_IMAGE_MISSING_R;
}
+ }
- img->mem = tex_img;
- img->mem->interpolation = img->interpolation;
- img->mem->extension = img->extension;
-
+ {
thread_scoped_lock device_lock(device_mutex);
- tex_img->copy_to_device();
+ img->mem->copy_to_device();
}
+
+  /* Clean up memory held by the image loader. */
+ img->loader->cleanup();
img->need_load = false;
}
-void ImageManager::device_free_image(Device *, ImageDataType type, int slot)
+void ImageManager::device_free_image(Device *, int slot)
{
- Image *img = images[type][slot];
+ Image *img = images[slot];
+ if (img == NULL) {
+ return;
+ }
- if (img) {
- if (osl_texture_system && !img->builtin_data) {
+ if (osl_texture_system) {
#ifdef WITH_OSL
- ustring filename(images[type][slot]->filename);
- ((OSL::TextureSystem *)osl_texture_system)->invalidate(filename);
-#endif
- }
-
- if (img->mem) {
- thread_scoped_lock device_lock(device_mutex);
- delete img->mem;
+ ustring filepath = img->loader->osl_filepath();
+ if (!filepath.empty()) {
+ ((OSL::TextureSystem *)osl_texture_system)->invalidate(filepath);
}
+#endif
+ }
- delete img;
- images[type][slot] = NULL;
- --tex_num_images[type];
+ if (img->mem) {
+ thread_scoped_lock device_lock(device_mutex);
+ delete img->mem;
}
+
+ delete img->loader;
+ delete img;
+ images[slot] = NULL;
}
void ImageManager::device_update(Device *device, Scene *scene, Progress &progress)
@@ -912,24 +767,14 @@ void ImageManager::device_update(Device *device, Scene *scene, Progress &progres
}
TaskPool pool;
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++) {
- if (!images[type][slot])
- continue;
-
- if (images[type][slot]->users == 0) {
- device_free_image(device, (ImageDataType)type, slot);
- }
- else if (images[type][slot]->need_load) {
- if (!osl_texture_system || images[type][slot]->builtin_data)
- pool.push(function_bind(&ImageManager::device_load_image,
- this,
- device,
- scene,
- (ImageDataType)type,
- slot,
- &progress));
- }
+ for (size_t slot = 0; slot < images.size(); slot++) {
+ Image *img = images[slot];
+ if (img && img->users == 0) {
+ device_free_image(device, slot);
+ }
+ else if (img && img->need_load) {
+ pool.push(
+ function_bind(&ImageManager::device_load_image, this, device, scene, slot, &progress));
}
}
@@ -938,23 +783,16 @@ void ImageManager::device_update(Device *device, Scene *scene, Progress &progres
need_update = false;
}
-void ImageManager::device_update_slot(Device *device,
- Scene *scene,
- int flat_slot,
- Progress *progress)
+void ImageManager::device_update_slot(Device *device, Scene *scene, int slot, Progress *progress)
{
- ImageDataType type;
- int slot = flattened_slot_to_type_index(flat_slot, &type);
-
- Image *image = images[type][slot];
- assert(image != NULL);
+ Image *img = images[slot];
+ assert(img != NULL);
- if (image->users == 0) {
- device_free_image(device, type, slot);
+ if (img->users == 0) {
+ device_free_image(device, slot);
}
- else if (image->need_load) {
- if (!osl_texture_system || image->builtin_data)
- device_load_image(device, scene, type, slot, progress);
+ else if (img->need_load) {
+ device_load_image(device, scene, slot, progress);
}
}
@@ -967,22 +805,11 @@ void ImageManager::device_load_builtin(Device *device, Scene *scene, Progress &p
}
TaskPool pool;
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++) {
- if (!images[type][slot])
- continue;
-
- if (images[type][slot]->need_load) {
- if (images[type][slot]->builtin_data) {
- pool.push(function_bind(&ImageManager::device_load_image,
- this,
- device,
- scene,
- (ImageDataType)type,
- slot,
- &progress));
- }
- }
+ for (size_t slot = 0; slot < images.size(); slot++) {
+ Image *img = images[slot];
+ if (img && img->need_load && img->builtin) {
+ pool.push(
+ function_bind(&ImageManager::device_load_image, this, device, scene, slot, &progress));
}
}
@@ -991,31 +818,27 @@ void ImageManager::device_load_builtin(Device *device, Scene *scene, Progress &p
void ImageManager::device_free_builtin(Device *device)
{
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++) {
- if (images[type][slot] && images[type][slot]->builtin_data)
- device_free_image(device, (ImageDataType)type, slot);
+ for (size_t slot = 0; slot < images.size(); slot++) {
+ Image *img = images[slot];
+ if (img && img->builtin) {
+ device_free_image(device, slot);
}
}
}
void ImageManager::device_free(Device *device)
{
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- for (size_t slot = 0; slot < images[type].size(); slot++) {
- device_free_image(device, (ImageDataType)type, slot);
- }
- images[type].clear();
+ for (size_t slot = 0; slot < images.size(); slot++) {
+ device_free_image(device, slot);
}
+ images.clear();
}
void ImageManager::collect_statistics(RenderStats *stats)
{
- for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
- foreach (const Image *image, images[type]) {
- stats->image.textures.add_entry(
- NamedSizeEntry(path_filename(image->filename), image->mem->memory_size()));
- }
+ foreach (const Image *image, images) {
+ stats->image.textures.add_entry(
+ NamedSizeEntry(image->loader->name(), image->mem->memory_size()));
}
}
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 34f046692f6..fffe7c5152a 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -17,71 +17,162 @@
#ifndef __IMAGE_H__
#define __IMAGE_H__
-#include "device/device.h"
#include "device/device_memory.h"
-#include "util/util_image.h"
+#include "render/colorspace.h"
+
#include "util/util_string.h"
#include "util/util_thread.h"
+#include "util/util_transform.h"
#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class Device;
+class DeviceInfo;
+class ImageHandle;
+class ImageKey;
+class ImageMetaData;
+class ImageManager;
class Progress;
class RenderStats;
class Scene;
+class ColorSpaceProcessor;
+
+/* Image Parameters */
+class ImageParams {
+ public:
+ bool animated;
+ InterpolationType interpolation;
+ ExtensionType extension;
+ ImageAlphaType alpha_type;
+ ustring colorspace;
+ float frame;
+
+ ImageParams()
+ : animated(false),
+ interpolation(INTERPOLATION_LINEAR),
+ extension(EXTENSION_CLIP),
+ alpha_type(IMAGE_ALPHA_AUTO),
+ colorspace(u_colorspace_raw),
+ frame(0.0f)
+ {
+ }
+ bool operator==(const ImageParams &other) const
+ {
+ return (animated == other.animated && interpolation == other.interpolation &&
+ extension == other.extension && alpha_type == other.alpha_type &&
+ colorspace == other.colorspace && frame == other.frame);
+ }
+};
+
+/* Image MetaData
+ *
+ * Information about the image that is available before the image pixels are loaded. */
class ImageMetaData {
public:
- /* Must be set by image file or builtin callback. */
- bool is_float, is_half;
+ /* Set by ImageLoader.load_metadata(). */
int channels;
size_t width, height, depth;
- bool builtin_free_cache;
+ ImageDataType type;
+
+ /* Optional color space, defaults to raw. */
+ ustring colorspace;
+ const char *colorspace_file_format;
+
+ /* Optional transform for 3D images. */
+ bool use_transform_3d;
+ Transform transform_3d;
/* Automatically set. */
- ImageDataType type;
- bool is_linear;
+ bool compress_as_srgb;
- bool operator==(const ImageMetaData &other) const
- {
- return is_float == other.is_float && is_half == other.is_half && channels == other.channels &&
- width == other.width && height == other.height && depth == other.depth &&
- type == other.type && is_linear == other.is_linear;
- }
+ ImageMetaData();
+ bool operator==(const ImageMetaData &other) const;
+ bool is_float() const;
+ void detect_colorspace();
};
+/* Image loader base class that can be subclassed to load image data
+ * from custom sources (file, memory, procedurally generated, etc.). */
+class ImageLoader {
+ public:
+ ImageLoader();
+ virtual ~ImageLoader(){};
+
+  /* Load metadata without the actual image data; this should be fast. */
+ virtual bool load_metadata(ImageMetaData &metadata) = 0;
+
+ /* Load actual image contents. */
+ virtual bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixels_size,
+ const bool associate_alpha) = 0;
+
+ /* Name for logs and stats. */
+ virtual string name() const = 0;
+
+ /* Optional for OSL texture cache. */
+ virtual ustring osl_filepath() const;
+
+ /* Free any memory used for loading metadata and pixels. */
+ virtual void cleanup(){};
+
+  /* Compare loaders to avoid loading the same image multiple times. */
+ virtual bool equals(const ImageLoader &other) const = 0;
+ static bool equals(const ImageLoader *a, const ImageLoader *b);
+
+  /* Workaround for the lack of RTTI. */
+};
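As a concrete illustration of the interface, a minimal hypothetical loader producing a constant-color 1x1 float image could look like the sketch below (the class name and values are illustrative only; see OIIOImageLoader, SkyLoader and VDBImageLoader further down for the real implementations):

  class ConstantColorLoader : public ImageLoader {
   public:
    ConstantColorLoader(float r, float g, float b) : r(r), g(g), b(b)
    {
    }

    bool load_metadata(ImageMetaData &metadata) override
    {
      metadata.width = metadata.height = 1;
      metadata.depth = 1;
      metadata.channels = 4;
      metadata.type = IMAGE_DATA_TYPE_FLOAT4;
      return true;
    }

    bool load_pixels(const ImageMetaData &, void *pixels, const size_t, const bool) override
    {
      float *p = (float *)pixels;
      p[0] = r;
      p[1] = g;
      p[2] = b;
      p[3] = 1.0f;
      return true;
    }

    string name() const override
    {
      return "constant_color";
    }

    bool equals(const ImageLoader &other) const override
    {
      const ConstantColorLoader &o = (const ConstantColorLoader &)other;
      return r == o.r && g == o.g && b == o.b;
    }

   protected:
    float r, g, b;
  };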
+
+/* Image Handle
+ *
+ * Access handle for an image in the image manager. Multiple shader nodes may
+ * share the same image, and this class handles reference counting for that. */
+class ImageHandle {
+ public:
+ ImageHandle();
+ ImageHandle(const ImageHandle &other);
+ ImageHandle &operator=(const ImageHandle &other);
+ ~ImageHandle();
+
+ bool operator==(const ImageHandle &other) const;
+
+ void clear();
+
+ bool empty();
+ int num_tiles();
+
+ ImageMetaData metadata();
+ int svm_slot(const int tile_index = 0) const;
+ device_texture *image_memory(const int tile_index = 0) const;
+
+ protected:
+ vector<int> tile_slots;
+ ImageManager *manager;
+
+ friend class ImageManager;
+};
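The reference counting itself lives in image.cpp and is not part of this diff; conceptually the copy operations map onto the new add_image_user()/remove_image_user() slot helpers roughly as in this sketch (illustrative, not the literal implementation), with assignment similarly releasing the old slots before taking references on the new ones:

  ImageHandle::ImageHandle(const ImageHandle &other)
      : tile_slots(other.tile_slots), manager(other.manager)
  {
    /* Each copy of the handle counts as one extra user per slot. */
    foreach (int slot, tile_slots) {
      manager->add_image_user(slot);
    }
  }

  ImageHandle::~ImageHandle()
  {
    clear(); /* Drops one user per slot via remove_image_user(). */
  }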
+
+/* Image Manager
+ *
+ * Handles loading and storage of all images in the scene. This includes 2D
+ * texture images and 3D volume images. */
class ImageManager {
public:
explicit ImageManager(const DeviceInfo &info);
~ImageManager();
- int add_image(const string &filename,
- void *builtin_data,
- bool animated,
- float frame,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha,
- ImageMetaData &metadata);
- void remove_image(int flat_slot);
- void remove_image(const string &filename,
- void *builtin_data,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha);
- void tag_reload_image(const string &filename,
- void *builtin_data,
- InterpolationType interpolation,
- ExtensionType extension,
- bool use_alpha);
- bool get_image_metadata(const string &filename, void *builtin_data, ImageMetaData &metadata);
- bool get_image_metadata(int flat_slot, ImageMetaData &metadata);
+ ImageHandle add_image(const string &filename, const ImageParams &params);
+ ImageHandle add_image(const string &filename,
+ const ImageParams &params,
+ const vector<int> &tiles);
+ ImageHandle add_image(ImageLoader *loader, const ImageParams &params);
void device_update(Device *device, Scene *scene, Progress &progress);
- void device_update_slot(Device *device, Scene *scene, int flat_slot, Progress *progress);
+ void device_update_slot(Device *device, Scene *scene, int slot, Progress *progress);
void device_free(Device *device);
void device_load_builtin(Device *device, Scene *scene, Progress &progress);
@@ -90,71 +181,50 @@ class ImageManager {
void set_osl_texture_system(void *texture_system);
bool set_animation_frame_update(int frame);
- device_memory *image_memory(int flat_slot);
-
void collect_statistics(RenderStats *stats);
bool need_update;
- /* NOTE: Here pixels_size is a size of storage, which equals to
- * width * height * depth.
- * Use this to avoid some nasty memory corruptions.
- */
- function<void(const string &filename, void *data, ImageMetaData &metadata)>
- builtin_image_info_cb;
- function<bool(const string &filename,
- void *data,
- unsigned char *pixels,
- const size_t pixels_size,
- const bool free_cache)>
- builtin_image_pixels_cb;
- function<bool(const string &filename,
- void *data,
- float *pixels,
- const size_t pixels_size,
- const bool free_cache)>
- builtin_image_float_pixels_cb;
-
struct Image {
- string filename;
- void *builtin_data;
+ ImageParams params;
ImageMetaData metadata;
+ ImageLoader *loader;
- bool use_alpha;
- bool need_load;
- bool animated;
float frame;
- InterpolationType interpolation;
- ExtensionType extension;
+ bool need_metadata;
+ bool need_load;
+ bool builtin;
string mem_name;
- device_memory *mem;
+ device_texture *mem;
int users;
+ thread_mutex mutex;
};
private:
- int tex_num_images[IMAGE_DATA_NUM_TYPES];
- int max_num_images;
bool has_half_images;
thread_mutex device_mutex;
+ thread_mutex images_mutex;
int animation_frame;
- vector<Image *> images[IMAGE_DATA_NUM_TYPES];
+ vector<Image *> images;
void *osl_texture_system;
- bool file_load_image_generic(Image *img, unique_ptr<ImageInput> *in);
+ int add_image_slot(ImageLoader *loader, const ImageParams &params, const bool builtin);
+ void add_image_user(int slot);
+ void remove_image_user(int slot);
+
+ void load_image_metadata(Image *img);
+
+ template<TypeDesc::BASETYPE FileFormat, typename StorageType>
+ bool file_load_image(Image *img, int texture_limit);
- template<TypeDesc::BASETYPE FileFormat, typename StorageType, typename DeviceType>
- bool file_load_image(Image *img,
- ImageDataType type,
- int texture_limit,
- device_vector<DeviceType> &tex_img);
+ void device_load_image(Device *device, Scene *scene, int slot, Progress *progress);
+ void device_free_image(Device *device, int slot);
- void device_load_image(
- Device *device, Scene *scene, ImageDataType type, int slot, Progress *progress);
- void device_free_image(Device *device, ImageDataType type, int slot);
+ friend class ImageHandle;
};
CCL_NAMESPACE_END
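With the new API, callers no longer juggle flattened slot indices and manual remove_image() calls; a shader node simply keeps an ImageHandle alive for as long as it needs the texture. A hedged usage sketch (the scene variable and the file path are assumptions for illustration):

  ImageParams params;
  params.interpolation = INTERPOLATION_LINEAR;
  params.extension = EXTENSION_REPEAT;
  params.alpha_type = IMAGE_ALPHA_AUTO;
  params.colorspace = u_colorspace_srgb;

  /* Deduplicated against existing images with an equal loader and equal params. */
  ImageHandle handle = scene->image_manager->add_image("/tmp/checker.png", params);

  int slot = handle.svm_slot();           /* slot index to store in the SVM node */
  ImageMetaData metadata = handle.metadata();

  /* When `handle` goes out of scope the user count drops, and the image is
   * freed on the next device_update() if nothing else references it. */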
diff --git a/intern/cycles/render/image_oiio.cpp b/intern/cycles/render/image_oiio.cpp
new file mode 100644
index 00000000000..c4f95c6b4bc
--- /dev/null
+++ b/intern/cycles/render/image_oiio.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image_oiio.h"
+
+#include "util/util_image.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+
+CCL_NAMESPACE_BEGIN
+
+OIIOImageLoader::OIIOImageLoader(const string &filepath) : filepath(filepath)
+{
+}
+
+OIIOImageLoader::~OIIOImageLoader()
+{
+}
+
+bool OIIOImageLoader::load_metadata(ImageMetaData &metadata)
+{
+ /* Perform preliminary checks, with meaningful logging. */
+ if (!path_exists(filepath.string())) {
+ VLOG(1) << "File '" << filepath.string() << "' does not exist.";
+ return false;
+ }
+ if (path_is_directory(filepath.string())) {
+ VLOG(1) << "File '" << filepath.string() << "' is a directory, can't use as image.";
+ return false;
+ }
+
+ unique_ptr<ImageInput> in(ImageInput::create(filepath.string()));
+
+ if (!in) {
+ return false;
+ }
+
+ ImageSpec spec;
+ if (!in->open(filepath.string(), spec)) {
+ return false;
+ }
+
+ metadata.width = spec.width;
+ metadata.height = spec.height;
+ metadata.depth = spec.depth;
+ metadata.compress_as_srgb = false;
+
+ /* Check the main format, and channel formats. */
+ size_t channel_size = spec.format.basesize();
+
+ bool is_float = false;
+ bool is_half = false;
+
+ if (spec.format.is_floating_point()) {
+ is_float = true;
+ }
+
+ for (size_t channel = 0; channel < spec.channelformats.size(); channel++) {
+ channel_size = max(channel_size, spec.channelformats[channel].basesize());
+ if (spec.channelformats[channel].is_floating_point()) {
+ is_float = true;
+ }
+ }
+
+ /* check if it's half float */
+ if (spec.format == TypeDesc::HALF) {
+ is_half = true;
+ }
+
+ /* set type and channels */
+ metadata.channels = spec.nchannels;
+
+ if (is_half) {
+ metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF;
+ }
+ else if (is_float) {
+ metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
+ }
+ else if (spec.format == TypeDesc::USHORT) {
+ metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_USHORT4 : IMAGE_DATA_TYPE_USHORT;
+ }
+ else {
+ metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+ }
+
+ metadata.colorspace_file_format = in->format_name();
+
+ in->close();
+
+ return true;
+}
+
+template<TypeDesc::BASETYPE FileFormat, typename StorageType>
+static void oiio_load_pixels(const ImageMetaData &metadata,
+ const unique_ptr<ImageInput> &in,
+ StorageType *pixels)
+{
+ const int width = metadata.width;
+ const int height = metadata.height;
+ const int depth = metadata.depth;
+ const int components = metadata.channels;
+
+ /* Read pixels through OpenImageIO. */
+ StorageType *readpixels = pixels;
+ vector<StorageType> tmppixels;
+ if (components > 4) {
+ tmppixels.resize(((size_t)width) * height * components);
+ readpixels = &tmppixels[0];
+ }
+
+ if (depth <= 1) {
+ size_t scanlinesize = ((size_t)width) * components * sizeof(StorageType);
+ in->read_image(FileFormat,
+ (uchar *)readpixels + (height - 1) * scanlinesize,
+ AutoStride,
+ -scanlinesize,
+ AutoStride);
+ }
+ else {
+ in->read_image(FileFormat, (uchar *)readpixels);
+ }
+
+ if (components > 4) {
+ size_t dimensions = ((size_t)width) * height;
+ for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) {
+ pixels[i * 4 + 3] = tmppixels[i * components + 3];
+ pixels[i * 4 + 2] = tmppixels[i * components + 2];
+ pixels[i * 4 + 1] = tmppixels[i * components + 1];
+ pixels[i * 4 + 0] = tmppixels[i * components + 0];
+ }
+ tmppixels.clear();
+ }
+
+ /* CMYK to RGBA. */
+ const bool cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
+ if (cmyk) {
+ const StorageType one = util_image_cast_from_float<StorageType>(1.0f);
+
+ const size_t num_pixels = ((size_t)width) * height * depth;
+ for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
+ float c = util_image_cast_to_float(pixels[i * 4 + 0]);
+ float m = util_image_cast_to_float(pixels[i * 4 + 1]);
+ float y = util_image_cast_to_float(pixels[i * 4 + 2]);
+ float k = util_image_cast_to_float(pixels[i * 4 + 3]);
+ pixels[i * 4 + 0] = util_image_cast_from_float<StorageType>((1.0f - c) * (1.0f - k));
+ pixels[i * 4 + 1] = util_image_cast_from_float<StorageType>((1.0f - m) * (1.0f - k));
+ pixels[i * 4 + 2] = util_image_cast_from_float<StorageType>((1.0f - y) * (1.0f - k));
+ pixels[i * 4 + 3] = one;
+ }
+ }
+}
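The read_image() call above is worth a note: for 2D images (depth <= 1) it passes a pointer to the last scanline of the destination together with a negative y-stride, so while OIIO reads the file top-to-bottom the rows land bottom-to-top in memory, matching Cycles' texture orientation. Roughly the same effect written out by hand, using the scanlinesize defined above (illustrative only; the single read_image() call lets OIIO handle the whole image at once):

  /* Hypothetical explicit flip, shown only to illustrate what the negative stride achieves. */
  for (int y = 0; y < height; y++) {
    in->read_scanline(y, 0, FileFormat, (uchar *)pixels + ((size_t)height - 1 - y) * scanlinesize);
  }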
+
+bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t,
+ const bool associate_alpha)
+{
+ unique_ptr<ImageInput> in = NULL;
+
+ /* NOTE: Error logging is done in meta data acquisition. */
+ if (!path_exists(filepath.string()) || path_is_directory(filepath.string())) {
+ return false;
+ }
+
+ /* load image from file through OIIO */
+ in = unique_ptr<ImageInput>(ImageInput::create(filepath.string()));
+ if (!in) {
+ return false;
+ }
+
+ ImageSpec spec = ImageSpec();
+ ImageSpec config = ImageSpec();
+
+ if (!associate_alpha) {
+ config.attribute("oiio:UnassociatedAlpha", 1);
+ }
+
+ if (!in->open(filepath.string(), spec, config)) {
+ return false;
+ }
+
+ switch (metadata.type) {
+ case IMAGE_DATA_TYPE_BYTE:
+ case IMAGE_DATA_TYPE_BYTE4:
+ oiio_load_pixels<TypeDesc::UINT8, uchar>(metadata, in, (uchar *)pixels);
+ break;
+ case IMAGE_DATA_TYPE_USHORT:
+ case IMAGE_DATA_TYPE_USHORT4:
+ oiio_load_pixels<TypeDesc::USHORT, uint16_t>(metadata, in, (uint16_t *)pixels);
+ break;
+ case IMAGE_DATA_TYPE_HALF:
+ case IMAGE_DATA_TYPE_HALF4:
+ oiio_load_pixels<TypeDesc::HALF, half>(metadata, in, (half *)pixels);
+ break;
+ case IMAGE_DATA_TYPE_FLOAT:
+ case IMAGE_DATA_TYPE_FLOAT4:
+ oiio_load_pixels<TypeDesc::FLOAT, float>(metadata, in, (float *)pixels);
+ break;
+ case IMAGE_DATA_NUM_TYPES:
+ break;
+ }
+
+ in->close();
+ return true;
+}
+
+string OIIOImageLoader::name() const
+{
+ return path_filename(filepath.string());
+}
+
+ustring OIIOImageLoader::osl_filepath() const
+{
+ return filepath;
+}
+
+bool OIIOImageLoader::equals(const ImageLoader &other) const
+{
+ const OIIOImageLoader &other_loader = (const OIIOImageLoader &)other;
+ return filepath == other_loader.filepath;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_oiio.h b/intern/cycles/render/image_oiio.h
new file mode 100644
index 00000000000..a234b968557
--- /dev/null
+++ b/intern/cycles/render/image_oiio.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __IMAGE_OIIO__
+#define __IMAGE_OIIO__
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OIIOImageLoader : public ImageLoader {
+ public:
+ OIIOImageLoader(const string &filepath);
+ ~OIIOImageLoader();
+
+ bool load_metadata(ImageMetaData &metadata) override;
+
+ bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixels_size,
+ const bool associate_alpha) override;
+
+ string name() const override;
+
+ ustring osl_filepath() const override;
+
+ bool equals(const ImageLoader &other) const override;
+
+ protected:
+ ustring filepath;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __IMAGE_OIIO__ */
diff --git a/intern/cycles/render/image_sky.cpp b/intern/cycles/render/image_sky.cpp
new file mode 100644
index 00000000000..0560907c63e
--- /dev/null
+++ b/intern/cycles/render/image_sky.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image_sky.h"
+
+#include "sky_model.h"
+
+#include "util/util_image.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_task.h"
+
+CCL_NAMESPACE_BEGIN
+
+SkyLoader::SkyLoader(float sun_elevation,
+ float altitude,
+ float air_density,
+ float dust_density,
+ float ozone_density)
+ : sun_elevation(sun_elevation),
+ altitude(altitude),
+ air_density(air_density),
+ dust_density(dust_density),
+ ozone_density(ozone_density)
+{
+}
+
+SkyLoader::~SkyLoader(){};
+
+bool SkyLoader::load_metadata(ImageMetaData &metadata)
+{
+ metadata.width = 512;
+ metadata.height = 128;
+ metadata.channels = 3;
+ metadata.depth = 1;
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ metadata.compress_as_srgb = false;
+ return true;
+}
+
+bool SkyLoader::load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t /*pixels_size*/,
+ const bool /*associate_alpha*/)
+{
+ /* definitions */
+ int width = metadata.width;
+ int height = metadata.height;
+ float *pixel_data = (float *)pixels;
+
+ /* precompute sky texture */
+ const int rows_per_task = divide_up(1024, width);
+ parallel_for(blocked_range<size_t>(0, height, rows_per_task),
+ [&](const blocked_range<size_t> &r) {
+ SKY_nishita_skymodel_precompute_texture(pixel_data,
+ metadata.channels,
+ r.begin(),
+ r.end(),
+ width,
+ height,
+ sun_elevation,
+ altitude,
+ air_density,
+ dust_density,
+ ozone_density);
+ });
+
+ return true;
+}
+
+string SkyLoader::name() const
+{
+ return "sky_nishita";
+}
+
+bool SkyLoader::equals(const ImageLoader & /*other*/) const
+{
+ return false;
+}
+
+CCL_NAMESPACE_END
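Note that SkyLoader::equals() always returns false, so a procedurally generated sky texture is never deduplicated against another image; every sky node gets its own slot. A hedged sketch of how a sky texture node could hook this up (parameter values are placeholders):

  ImageParams params;
  params.interpolation = INTERPOLATION_LINEAR;
  params.extension = EXTENSION_EXTEND;

  ImageLoader *loader = new SkyLoader(/*sun_elevation*/ 0.5f,
                                      /*altitude*/ 0.0f,
                                      /*air_density*/ 1.0f,
                                      /*dust_density*/ 1.0f,
                                      /*ozone_density*/ 1.0f);

  /* The manager takes ownership of the loader and frees it in device_free_image(). */
  ImageHandle handle = scene->image_manager->add_image(loader, params);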
diff --git a/intern/cycles/render/image_sky.h b/intern/cycles/render/image_sky.h
new file mode 100644
index 00000000000..686f4e5b885
--- /dev/null
+++ b/intern/cycles/render/image_sky.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SkyLoader : public ImageLoader {
+ private:
+ float sun_elevation;
+ float altitude;
+ float air_density;
+ float dust_density;
+ float ozone_density;
+
+ public:
+ SkyLoader(float sun_elevation,
+ float altitude,
+ float air_density,
+ float dust_density,
+ float ozone_density);
+ ~SkyLoader();
+
+ bool load_metadata(ImageMetaData &metadata) override;
+
+ bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t /*pixels_size*/,
+ const bool /*associate_alpha*/) override;
+
+ string name() const override;
+
+ bool equals(const ImageLoader & /*other*/) const override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_vdb.cpp b/intern/cycles/render/image_vdb.cpp
new file mode 100644
index 00000000000..500131c2d84
--- /dev/null
+++ b/intern/cycles/render/image_vdb.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image_vdb.h"
+
+#ifdef WITH_OPENVDB
+# include <openvdb/openvdb.h>
+# include <openvdb/tools/Dense.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+VDBImageLoader::VDBImageLoader(const string &grid_name) : grid_name(grid_name)
+{
+}
+
+VDBImageLoader::~VDBImageLoader()
+{
+}
+
+bool VDBImageLoader::load_metadata(ImageMetaData &metadata)
+{
+#ifdef WITH_OPENVDB
+ if (!grid) {
+ return false;
+ }
+
+ bbox = grid->evalActiveVoxelBoundingBox();
+ if (bbox.empty()) {
+ return false;
+ }
+
+ /* Set dimensions. */
+ openvdb::Coord dim = bbox.dim();
+ openvdb::Coord min = bbox.min();
+ metadata.width = dim.x();
+ metadata.height = dim.y();
+ metadata.depth = dim.z();
+
+ /* Set data type. */
+ if (grid->isType<openvdb::FloatGrid>()) {
+ metadata.channels = 1;
+ }
+ else if (grid->isType<openvdb::Vec3fGrid>()) {
+ metadata.channels = 3;
+ }
+ else if (grid->isType<openvdb::BoolGrid>()) {
+ metadata.channels = 1;
+ }
+ else if (grid->isType<openvdb::DoubleGrid>()) {
+ metadata.channels = 1;
+ }
+ else if (grid->isType<openvdb::Int32Grid>()) {
+ metadata.channels = 1;
+ }
+ else if (grid->isType<openvdb::Int64Grid>()) {
+ metadata.channels = 1;
+ }
+ else if (grid->isType<openvdb::Vec3IGrid>()) {
+ metadata.channels = 3;
+ }
+ else if (grid->isType<openvdb::Vec3dGrid>()) {
+ metadata.channels = 3;
+ }
+ else if (grid->isType<openvdb::MaskGrid>()) {
+ metadata.channels = 1;
+ }
+ else {
+ return false;
+ }
+
+ if (metadata.channels == 1) {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT;
+ }
+ else {
+ metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+ }
+
+ /* Set transform from object space to voxel index. */
+ openvdb::math::Mat4f grid_matrix = grid->transform().baseMap()->getAffineMap()->getMat4();
+ Transform index_to_object;
+ for (int col = 0; col < 4; col++) {
+ for (int row = 0; row < 3; row++) {
+ index_to_object[row][col] = (float)grid_matrix[col][row];
+ }
+ }
+
+ Transform texture_to_index = transform_translate(min.x(), min.y(), min.z()) *
+ transform_scale(dim.x(), dim.y(), dim.z());
+
+ metadata.transform_3d = transform_inverse(index_to_object * texture_to_index);
+ metadata.use_transform_3d = true;
+
+ return true;
+#else
+ (void)metadata;
+ return false;
+#endif
+}
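The transform composition above can be read as follows: texture_to_index maps normalized [0,1]^3 texture coordinates to voxel indices inside the active bounding box, index_to_object maps voxel indices to object space, and the stored transform_3d is the inverse of their product, i.e. the mapping from object space back to texture coordinates that the kernel needs when sampling the volume. As a small sketch (hypothetical helper, mirroring the math above):

  /* Object-space position -> normalized texture coordinate used for the lookup. */
  static float3 object_to_texture(const ImageMetaData &metadata, const float3 P_object)
  {
    return transform_point(&metadata.transform_3d, P_object);
  }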
+
+bool VDBImageLoader::load_pixels(const ImageMetaData &, void *pixels, const size_t, const bool)
+{
+#ifdef WITH_OPENVDB
+ if (grid->isType<openvdb::FloatGrid>()) {
+ openvdb::tools::Dense<float, openvdb::tools::LayoutXYZ> dense(bbox, (float *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::FloatGrid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::Vec3fGrid>()) {
+ openvdb::tools::Dense<openvdb::Vec3f, openvdb::tools::LayoutXYZ> dense(
+ bbox, (openvdb::Vec3f *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::Vec3fGrid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::BoolGrid>()) {
+ openvdb::tools::Dense<float, openvdb::tools::LayoutXYZ> dense(bbox, (float *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::BoolGrid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::DoubleGrid>()) {
+ openvdb::tools::Dense<float, openvdb::tools::LayoutXYZ> dense(bbox, (float *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::DoubleGrid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::Int32Grid>()) {
+ openvdb::tools::Dense<float, openvdb::tools::LayoutXYZ> dense(bbox, (float *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::Int32Grid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::Int64Grid>()) {
+ openvdb::tools::Dense<float, openvdb::tools::LayoutXYZ> dense(bbox, (float *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::Int64Grid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::Vec3IGrid>()) {
+ openvdb::tools::Dense<openvdb::Vec3f, openvdb::tools::LayoutXYZ> dense(
+ bbox, (openvdb::Vec3f *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::Vec3IGrid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::Vec3dGrid>()) {
+ openvdb::tools::Dense<openvdb::Vec3f, openvdb::tools::LayoutXYZ> dense(
+ bbox, (openvdb::Vec3f *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::Vec3dGrid>(grid), dense);
+ }
+ else if (grid->isType<openvdb::MaskGrid>()) {
+ openvdb::tools::Dense<float, openvdb::tools::LayoutXYZ> dense(bbox, (float *)pixels);
+ openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<openvdb::MaskGrid>(grid), dense);
+ }
+
+ return true;
+#else
+ (void)pixels;
+ return false;
+#endif
+}
+
+string VDBImageLoader::name() const
+{
+ return grid_name;
+}
+
+bool VDBImageLoader::equals(const ImageLoader &other) const
+{
+#ifdef WITH_OPENVDB
+ const VDBImageLoader &other_loader = (const VDBImageLoader &)other;
+ return grid == other_loader.grid;
+#else
+ (void)other;
+ return true;
+#endif
+}
+
+void VDBImageLoader::cleanup()
+{
+#ifdef WITH_OPENVDB
+ /* Free OpenVDB grid memory as soon as we can. */
+ grid.reset();
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_vdb.h b/intern/cycles/render/image_vdb.h
new file mode 100644
index 00000000000..7dec63b11e6
--- /dev/null
+++ b/intern/cycles/render/image_vdb.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __IMAGE_VDB__
+#define __IMAGE_VDB__
+
+#ifdef WITH_OPENVDB
+# include <openvdb/openvdb.h>
+#endif
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class VDBImageLoader : public ImageLoader {
+ public:
+ VDBImageLoader(const string &grid_name);
+ ~VDBImageLoader();
+
+ virtual bool load_metadata(ImageMetaData &metadata) override;
+
+ virtual bool load_pixels(const ImageMetaData &metadata,
+ void *pixels,
+ const size_t pixels_size,
+ const bool associate_alpha) override;
+
+ virtual string name() const override;
+
+ virtual bool equals(const ImageLoader &other) const override;
+
+ virtual void cleanup() override;
+
+ protected:
+ string grid_name;
+#ifdef WITH_OPENVDB
+ openvdb::GridBase::ConstPtr grid;
+ openvdb::CoordBBox bbox;
+#endif
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __IMAGE_VDB__ */
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index d21fc3f757c..93e50fd170c 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -14,17 +14,22 @@
* limitations under the License.
*/
+#include "render/integrator.h"
#include "device/device.h"
#include "render/background.h"
-#include "render/integrator.h"
#include "render/film.h"
+#include "render/jitter.h"
#include "render/light.h"
#include "render/scene.h"
#include "render/shader.h"
#include "render/sobol.h"
+#include "kernel/kernel_types.h"
+
#include "util/util_foreach.h"
#include "util/util_hash.h"
+#include "util/util_logging.h"
+#include "util/util_task.h"
CCL_NAMESPACE_BEGIN
@@ -32,6 +37,7 @@ NODE_DEFINE(Integrator)
{
NodeType *type = NodeType::add("integrator", create);
+ SOCKET_INT(min_bounce, "Min Bounce", 0);
SOCKET_INT(max_bounce, "Max Bounce", 7);
SOCKET_INT(max_diffuse_bounce, "Max Diffuse Bounce", 7);
@@ -39,12 +45,13 @@ NODE_DEFINE(Integrator)
SOCKET_INT(max_transmission_bounce, "Max Transmission Bounce", 7);
SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7);
+ SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 0);
SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
SOCKET_INT(ao_bounces, "AO Bounces", 0);
SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024);
- SOCKET_FLOAT(volume_step_size, "Volume Step Size", 0.1f);
+ SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
SOCKET_BOOLEAN(caustics_reflective, "Reflective Caustics", true);
SOCKET_BOOLEAN(caustics_refractive, "Refractive Caustics", true);
@@ -64,6 +71,9 @@ NODE_DEFINE(Integrator)
SOCKET_INT(volume_samples, "Volume Samples", 1);
SOCKET_INT(start_sample, "Start Sample", 0);
+ SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
+ SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
+
SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
@@ -81,6 +91,7 @@ NODE_DEFINE(Integrator)
static NodeEnum sampling_pattern_enum;
sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL);
sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ);
+ sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ);
SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL);
return type;
@@ -105,6 +116,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
KernelIntegrator *kintegrator = &dscene->data.integrator;
/* integrator parameters */
+ kintegrator->min_bounce = min_bounce + 1;
kintegrator->max_bounce = max_bounce + 1;
kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1;
@@ -112,6 +124,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->max_transmission_bounce = max_transmission_bounce + 1;
kintegrator->max_volume_bounce = max_volume_bounce + 1;
+ kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
if (ao_bounces == 0) {
@@ -136,13 +149,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
kintegrator->volume_max_steps = volume_max_steps;
- kintegrator->volume_step_size = volume_step_size;
+ kintegrator->volume_step_rate = volume_step_rate;
kintegrator->caustics_reflective = caustics_reflective;
kintegrator->caustics_refractive = caustics_refractive;
kintegrator->filter_glossy = (filter_glossy == 0.0f) ? FLT_MAX : 1.0f / filter_glossy;
- kintegrator->seed = hash_int(seed);
+ kintegrator->seed = hash_uint2(seed, 0);
kintegrator->use_ambient_occlusion = ((Pass::contains(scene->film->passes, PASS_AO)) ||
dscene->data.background.ao_factor != 0.0f);
@@ -175,6 +188,29 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->sampling_pattern = sampling_pattern;
kintegrator->aa_samples = aa_samples;
+ if (aa_samples > 0 && adaptive_min_samples == 0) {
+ kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
+ VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+ << kintegrator->adaptive_min_samples;
+ }
+ else {
+ kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
+ }
+
+ kintegrator->adaptive_step = 4;
+ kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample;
+
+ /* Adaptive step must be a power of two for bitwise operations to work. */
+ assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0);
+
+ if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+ kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
+ << kintegrator->adaptive_threshold;
+ }
+ else {
+ kintegrator->adaptive_threshold = adaptive_threshold;
+ }
if (light_sampling_threshold > 0.0f) {
kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
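A worked example of the automatic adaptive-sampling defaults above, for a hypothetical render with aa_samples = 256 and both adaptive settings left at their 0 defaults:

  kintegrator->adaptive_min_samples = max(4, (int)sqrtf(256.0f)); /* = 16 */
  kintegrator->adaptive_threshold = max(0.001f, 1.0f / 256.0f);   /* ~= 0.0039 */

so the adaptive stopping criterion is only checked after 16 samples per pixel, against a noise threshold of roughly 0.004.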
@@ -204,17 +240,26 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM;
dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS);
- uint *directions = dscene->sobol_directions.alloc(SOBOL_BITS * dimensions);
+ if (sampling_pattern == SAMPLING_PATTERN_SOBOL) {
+ uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
- sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
+ sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
- dscene->sobol_directions.copy_to_device();
-
- /* Clamping. */
- bool use_sample_clamp = (sample_clamp_direct != 0.0f || sample_clamp_indirect != 0.0f);
- if (use_sample_clamp != scene->film->use_sample_clamp) {
- scene->film->use_sample_clamp = use_sample_clamp;
- scene->film->tag_update(scene);
+ dscene->sample_pattern_lut.copy_to_device();
+ }
+ else {
+ constexpr int sequence_size = NUM_PMJ_SAMPLES;
+ constexpr int num_sequences = NUM_PMJ_PATTERNS;
+ float2 *directions = (float2 *)dscene->sample_pattern_lut.alloc(sequence_size * num_sequences *
+ 2);
+ TaskPool pool;
+ for (int j = 0; j < num_sequences; ++j) {
+ float2 *sequence = directions + j * sequence_size;
+ pool.push(
+ function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j));
+ }
+ pool.wait_work();
+ dscene->sample_pattern_lut.copy_to_device();
}
need_update = false;
@@ -222,7 +267,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
void Integrator::device_free(Device *, DeviceScene *dscene)
{
- dscene->sobol_directions.free();
+ dscene->sample_pattern_lut.free();
}
bool Integrator::modified(const Integrator &integrator)
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 8270c78b94e..847cb6b3538 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -31,6 +31,7 @@ class Integrator : public Node {
public:
NODE_DECLARE
+ int min_bounce;
int max_bounce;
int max_diffuse_bounce;
@@ -38,12 +39,13 @@ class Integrator : public Node {
int max_transmission_bounce;
int max_volume_bounce;
+ int transparent_min_bounce;
int transparent_max_bounce;
int ao_bounces;
int volume_max_steps;
- float volume_step_size;
+ float volume_step_rate;
bool caustics_reflective;
bool caustics_refractive;
@@ -75,6 +77,9 @@ class Integrator : public Node {
bool use_light_tree;
float splitting_threshold;
+ int adaptive_min_samples;
+ float adaptive_threshold;
+
enum Method {
BRANCHED_PATH = 0,
PATH = 1,
diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp
new file mode 100644
index 00000000000..fc47b0e8f0a
--- /dev/null
+++ b/intern/cycles/render/jitter.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This file is based on "Progressive Multi-Jittered Sample Sequences"
+ * by Per Christensen, Andrew Kensler and Charlie Kilpatrick.
+ * http://graphics.pixar.com/library/ProgressiveMultiJitteredSampling/paper.pdf
+ *
+ * Performance can be improved in the future by implementing the new
+ * algorithm from Matt Pharr in http://jcgt.org/published/0008/01/04/
+ * "Efficient Generation of Points that Satisfy Two-Dimensional Elementary Intervals"
+ */
+
+#include "render/jitter.h"
+
+#include <math.h>
+#include <vector>
+
+CCL_NAMESPACE_BEGIN
+
+static uint cmj_hash(uint i, uint p)
+{
+ i ^= p;
+ i ^= i >> 17;
+ i ^= i >> 10;
+ i *= 0xb36534e5;
+ i ^= i >> 12;
+ i ^= i >> 21;
+ i *= 0x93fc4795;
+ i ^= 0xdf6e307f;
+ i ^= i >> 17;
+ i *= 1 | p >> 18;
+
+ return i;
+}
+
+static float cmj_randfloat(uint i, uint p)
+{
+ return cmj_hash(i, p) * (1.0f / 4294967808.0f);
+}
+
+class PMJ_Generator {
+ public:
+ static void generate_2D(float2 points[], int size, int rng_seed_in)
+ {
+ PMJ_Generator g(rng_seed_in);
+ points[0].x = g.rnd();
+ points[0].y = g.rnd();
+ int N = 1;
+ while (N < size) {
+ g.extend_sequence_even(points, N);
+ g.extend_sequence_odd(points, 2 * N);
+ N = 4 * N;
+ }
+ }
+
+ protected:
+ PMJ_Generator(int rnd_seed_in) : num_samples(1), rnd_index(2), rnd_seed(rnd_seed_in)
+ {
+ }
+
+ float rnd()
+ {
+ return cmj_randfloat(++rnd_index, rnd_seed);
+ }
+
+ virtual void mark_occupied_strata(float2 points[], int N)
+ {
+ int NN = 2 * N;
+ for (int s = 0; s < NN; ++s) {
+ occupied1Dx[s] = occupied1Dy[s] = false;
+ }
+ for (int s = 0; s < N; ++s) {
+ int xstratum = (int)(NN * points[s].x);
+ int ystratum = (int)(NN * points[s].y);
+ occupied1Dx[xstratum] = true;
+ occupied1Dy[ystratum] = true;
+ }
+ }
+
+ virtual void generate_sample_point(
+ float2 points[], float i, float j, float xhalf, float yhalf, int n, int N)
+ {
+ int NN = 2 * N;
+ float2 pt;
+ int xstratum, ystratum;
+ do {
+ pt.x = (i + 0.5f * (xhalf + rnd())) / n;
+ xstratum = (int)(NN * pt.x);
+ } while (occupied1Dx[xstratum]);
+ do {
+ pt.y = (j + 0.5f * (yhalf + rnd())) / n;
+ ystratum = (int)(NN * pt.y);
+ } while (occupied1Dy[ystratum]);
+ occupied1Dx[xstratum] = true;
+ occupied1Dy[ystratum] = true;
+ points[num_samples] = pt;
+ ++num_samples;
+ }
+
+ void extend_sequence_even(float2 points[], int N)
+ {
+ int n = (int)sqrtf(N);
+ occupied1Dx.resize(2 * N);
+ occupied1Dy.resize(2 * N);
+ mark_occupied_strata(points, N);
+ for (int s = 0; s < N; ++s) {
+ float2 oldpt = points[s];
+ float i = floorf(n * oldpt.x);
+ float j = floorf(n * oldpt.y);
+ float xhalf = floorf(2.0f * (n * oldpt.x - i));
+ float yhalf = floorf(2.0f * (n * oldpt.y - j));
+ xhalf = 1.0f - xhalf;
+ yhalf = 1.0f - yhalf;
+ generate_sample_point(points, i, j, xhalf, yhalf, n, N);
+ }
+ }
+
+ void extend_sequence_odd(float2 points[], int N)
+ {
+ int n = (int)sqrtf(N / 2);
+ occupied1Dx.resize(2 * N);
+ occupied1Dy.resize(2 * N);
+ mark_occupied_strata(points, N);
+ std::vector<float> xhalves(N / 2);
+ std::vector<float> yhalves(N / 2);
+ for (int s = 0; s < N / 2; ++s) {
+ float2 oldpt = points[s];
+ float i = floorf(n * oldpt.x);
+ float j = floorf(n * oldpt.y);
+ float xhalf = floorf(2.0f * (n * oldpt.x - i));
+ float yhalf = floorf(2.0f * (n * oldpt.y - j));
+ if (rnd() > 0.5f) {
+ xhalf = 1.0f - xhalf;
+ }
+ else {
+ yhalf = 1.0f - yhalf;
+ }
+ xhalves[s] = xhalf;
+ yhalves[s] = yhalf;
+ generate_sample_point(points, i, j, xhalf, yhalf, n, N);
+ }
+ for (int s = 0; s < N / 2; ++s) {
+ float2 oldpt = points[s];
+ float i = floorf(n * oldpt.x);
+ float j = floorf(n * oldpt.y);
+ float xhalf = 1.0f - xhalves[s];
+ float yhalf = 1.0f - yhalves[s];
+ generate_sample_point(points, i, j, xhalf, yhalf, n, N);
+ }
+ }
+
+ std::vector<bool> occupied1Dx, occupied1Dy;
+ int num_samples;
+ int rnd_index, rnd_seed;
+};
+
+class PMJ02_Generator : public PMJ_Generator {
+ protected:
+ void generate_sample_point(
+ float2 points[], float i, float j, float xhalf, float yhalf, int n, int N) override
+ {
+ int NN = 2 * N;
+ float2 pt;
+ do {
+ pt.x = (i + 0.5f * (xhalf + rnd())) / n;
+ pt.y = (j + 0.5f * (yhalf + rnd())) / n;
+ } while (is_occupied(pt, NN));
+ mark_occupied_strata1(pt, NN);
+ points[num_samples] = pt;
+ ++num_samples;
+ }
+
+ void mark_occupied_strata(float2 points[], int N) override
+ {
+ int NN = 2 * N;
+ int num_shapes = (int)log2f(NN) + 1;
+ occupiedStrata.resize(num_shapes);
+ for (int shape = 0; shape < num_shapes; ++shape) {
+ occupiedStrata[shape].resize(NN);
+ for (int n = 0; n < NN; ++n) {
+ occupiedStrata[shape][n] = false;
+ }
+ }
+ for (int s = 0; s < N; ++s) {
+ mark_occupied_strata1(points[s], NN);
+ }
+ }
+
+ void mark_occupied_strata1(float2 pt, int NN)
+ {
+ int shape = 0;
+ int xdivs = NN;
+ int ydivs = 1;
+ do {
+ int xstratum = (int)(xdivs * pt.x);
+ int ystratum = (int)(ydivs * pt.y);
+ size_t index = ystratum * xdivs + xstratum;
+ assert(index < NN);
+ occupiedStrata[shape][index] = true;
+ shape = shape + 1;
+ xdivs = xdivs / 2;
+ ydivs = ydivs * 2;
+ } while (xdivs > 0);
+ }
+
+ bool is_occupied(float2 pt, int NN)
+ {
+ int shape = 0;
+ int xdivs = NN;
+ int ydivs = 1;
+ do {
+ int xstratum = (int)(xdivs * pt.x);
+ int ystratum = (int)(ydivs * pt.y);
+ size_t index = ystratum * xdivs + xstratum;
+ assert(index < NN);
+ if (occupiedStrata[shape][index]) {
+ return true;
+ }
+ shape = shape + 1;
+ xdivs = xdivs / 2;
+ ydivs = ydivs * 2;
+ } while (xdivs > 0);
+ return false;
+ }
+
+ private:
+ std::vector<std::vector<bool>> occupiedStrata;
+};
+
+static void shuffle(float2 points[], int size, int rng_seed)
+{
+ /* Offset samples by 1.0 for faster scrambling in kernel_random.h */
+ for (int i = 0; i < size; ++i) {
+ points[i].x += 1.0f;
+ points[i].y += 1.0f;
+ }
+
+ if (rng_seed == 0) {
+ return;
+ }
+
+ constexpr int odd[8] = {0, 1, 4, 5, 10, 11, 14, 15};
+ constexpr int even[8] = {2, 3, 6, 7, 8, 9, 12, 13};
+
+ int rng_index = 0;
+ for (int yy = 0; yy < size / 16; ++yy) {
+ for (int xx = 0; xx < 8; ++xx) {
+ int other = (int)(cmj_randfloat(++rng_index, rng_seed) * (8.0f - xx) + xx);
+ float2 tmp = points[odd[other] + yy * 16];
+ points[odd[other] + yy * 16] = points[odd[xx] + yy * 16];
+ points[odd[xx] + yy * 16] = tmp;
+ }
+ for (int xx = 0; xx < 8; ++xx) {
+ int other = (int)(cmj_randfloat(++rng_index, rng_seed) * (8.0f - xx) + xx);
+ float2 tmp = points[even[other] + yy * 16];
+ points[even[other] + yy * 16] = points[even[xx] + yy * 16];
+ points[even[xx] + yy * 16] = tmp;
+ }
+ }
+}
+
+void progressive_multi_jitter_generate_2D(float2 points[], int size, int rng_seed)
+{
+ PMJ_Generator::generate_2D(points, size, rng_seed);
+ shuffle(points, size, rng_seed);
+}
+
+void progressive_multi_jitter_02_generate_2D(float2 points[], int size, int rng_seed)
+{
+ PMJ02_Generator::generate_2D(points, size, rng_seed);
+ shuffle(points, size, rng_seed);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/jitter.h b/intern/cycles/render/jitter.h
new file mode 100644
index 00000000000..ed34c7a4f4d
--- /dev/null
+++ b/intern/cycles/render/jitter.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __JITTER_H__
+#define __JITTER_H__
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+void progressive_multi_jitter_generate_2D(float2 points[], int size, int rng_seed);
+void progressive_multi_jitter_02_generate_2D(float2 points[], int size, int rng_seed);
+
+CCL_NAMESPACE_END
+
+#endif /* __JITTER_H__ */
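A minimal usage sketch for the new generators, assuming it is compiled inside the Cycles source tree; fill_example_pattern and the sample count of 1024 are illustrative, not part of the API:

  #include "render/jitter.h"

  #include "util/util_types.h"
  #include "util/util_vector.h"

  CCL_NAMESPACE_BEGIN

  /* Fill one PMJ02 pattern; the rng_seed argument doubles as the pattern index,
   * exactly as Integrator::device_update() does above. */
  static vector<float2> fill_example_pattern(int pattern_index)
  {
    /* The generator quadruples the sample count each round, so the size
     * should be a power of four; 1024 is an arbitrary choice for this sketch. */
    const int num_samples = 1024;
    vector<float2> points(num_samples);
    progressive_multi_jitter_02_generate_2D(points.data(), num_samples, pattern_index);
    /* Note: shuffle() offsets the samples by +1.0 on both axes, so the points
     * lie in [1, 2) x [1, 2) rather than in the unit square. */
    return points;
  }

  CCL_NAMESPACE_END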
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 1ed7e6967ac..7afa28b29fb 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,12 +14,12 @@
* limitations under the License.
*/
-#include "render/background.h"
+#include "render/light.h"
#include "device/device.h"
-#include "render/integrator.h"
+#include "render/background.h"
#include "render/film.h"
#include "render/graph.h"
-#include "render/light.h"
+#include "render/integrator.h"
#include "render/light_tree.h"
#include "render/mesh.h"
#include "render/nodes.h"
@@ -29,9 +29,10 @@
#include "util/util_foreach.h"
#include "util/util_hash.h"
+#include "util/util_logging.h"
#include "util/util_path.h"
#include "util/util_progress.h"
-#include "util/util_logging.h"
+#include "util/util_task.h"
CCL_NAMESPACE_BEGIN
@@ -115,10 +116,13 @@ NODE_DEFINE(Light)
type_enum.insert("spot", LIGHT_SPOT);
SOCKET_ENUM(type, "Type", type_enum, LIGHT_POINT);
+ SOCKET_COLOR(strength, "Strength", make_float3(1.0f, 1.0f, 1.0f));
+
SOCKET_POINT(co, "Co", make_float3(0.0f, 0.0f, 0.0f));
SOCKET_VECTOR(dir, "Dir", make_float3(0.0f, 0.0f, 0.0f));
SOCKET_FLOAT(size, "Size", 0.0f);
+ SOCKET_FLOAT(angle, "Angle", 0.0f);
SOCKET_VECTOR(axisu, "Axis U", make_float3(0.0f, 0.0f, 0.0f));
SOCKET_FLOAT(sizeu, "Size U", 1.0f);
@@ -163,6 +167,9 @@ void Light::tag_update(Scene *scene)
bool Light::has_contribution(Scene *scene)
{
+ if (strength == make_float3(0.0f, 0.0f, 0.0f)) {
+ return false;
+ }
if (is_portal) {
return false;
}
@@ -177,7 +184,10 @@ bool Light::has_contribution(Scene *scene)
LightManager::LightManager()
{
need_update = true;
+ need_update_background = true;
use_light_visibility = false;
+ last_background_enabled = false;
+ last_background_resolution = 0;
}
LightManager::~LightManager()
@@ -197,7 +207,7 @@ bool LightManager::has_background_light(Scene *scene)
return false;
}
-void LightManager::disable_ineffective_light(Scene *scene)
+void LightManager::test_enabled_lights(Scene *scene)
{
/* Make all lights enabled by default, and perform some preliminary checks
* needed for finer-tuning of settings (for example, check whether we've
@@ -210,28 +220,40 @@ void LightManager::disable_ineffective_light(Scene *scene)
has_background |= light->type == LIGHT_BACKGROUND;
}
+ bool background_enabled = false;
+ int background_resolution = 0;
+
if (has_background) {
/* Ignore background light if:
* - If unsupported on a device
* - If we don't need it (no HDRs etc.)
*/
- Shader *shader = (scene->background->shader) ? scene->background->shader :
- scene->default_background;
- bool disable_mis = !(has_portal || shader->has_surface_spatial_varying);
- if (disable_mis) {
- VLOG(1) << "Background MIS has been disabled.\n";
- foreach (Light *light, scene->lights) {
- if (light->type == LIGHT_BACKGROUND) {
- light->is_enabled = false;
- }
+ Shader *shader = scene->background->get_shader(scene);
+ const bool disable_mis = !(has_portal || shader->has_surface_spatial_varying);
+ VLOG_IF(1, disable_mis) << "Background MIS has been disabled.\n";
+ foreach (Light *light, scene->lights) {
+ if (light->type == LIGHT_BACKGROUND) {
+ light->is_enabled = !disable_mis;
+ background_enabled = !disable_mis;
+ background_resolution = light->map_resolution;
}
}
}
+
+ if (last_background_enabled != background_enabled ||
+ last_background_resolution != background_resolution) {
+ last_background_enabled = background_enabled;
+ last_background_resolution = background_resolution;
+ need_update_background = true;
+ }
}
bool LightManager::object_usable_as_light(Object *object)
{
- Mesh *mesh = object->mesh;
+ Geometry *geom = object->geometry;
+ if (geom->type != Geometry::MESH) {
+ return false;
+ }
/* Skip objects with NaNs */
if (!object->bounds.valid()) {
return false;
@@ -242,10 +264,10 @@ bool LightManager::object_usable_as_light(Object *object)
}
/* Skip if we have no emission shaders. */
/* TODO(sergey): Ideally we want to avoid such duplicated loop, since it'll
- * iterate all mesh shaders twice (when counting and when calculating
+ * iterate all geometry shaders twice (when counting and when calculating
* triangle area.
*/
- foreach (const Shader *shader, mesh->used_shaders) {
+ foreach (const Shader *shader, geom->used_shaders) {
if (shader->use_mis && shader->has_surface_emission) {
return true;
}
@@ -375,7 +397,7 @@ void LightManager::device_update_distribution(Device *device,
continue;
}
/* Count emissive triangles. */
- Mesh *mesh = object->mesh;
+ Mesh *mesh = static_cast<Mesh *>(object->geometry);
size_t mesh_num_triangles = mesh->num_triangles();
for (size_t i = 0; i < mesh_num_triangles; i++) {
int shader_index = mesh->shader[i];
@@ -649,7 +671,7 @@ void LightManager::device_update_distribution(Device *device,
const Object *object = scene->objects[prim.object_id];
/* Sum area. */
- const Mesh *mesh = object->mesh;
+ Mesh *mesh = static_cast<Mesh *>(object->geometry);
int shader_flag = 0;
if (!(object->visibility & PATH_RAY_DIFFUSE)) {
@@ -707,14 +729,19 @@ void LightManager::device_update_distribution(Device *device,
distribution[offset].lamp.size = light->size;
totarea += lightarea;
- if (light->size > 0.0f && light->use_mis)
- use_lamp_mis = true;
- if (light->type == LIGHT_BACKGROUND) {
- num_background_lights++;
- background_mis = light->use_mis;
+ if (light->type == LIGHT_DISTANT) {
+ num_distant_lights++;
+ use_lamp_mis |= (light->angle > 0.0f && light->use_mis);
}
- else if (light->type == LIGHT_DISTANT) {
- num_distant_lights++;
+ else if (light->type == LIGHT_POINT || light->type == LIGHT_SPOT) {
+ use_lamp_mis |= (light->size > 0.0f && light->use_mis);
+ }
+ else if (light->type == LIGHT_AREA) {
+ use_lamp_mis |= light->use_mis;
+ }
+ else if (light->type == LIGHT_BACKGROUND) {
+ num_background_lights++;
+ background_mis |= light->use_mis;
}
offset++;
@@ -738,6 +765,7 @@ void LightManager::device_update_distribution(Device *device,
/* update device */
KernelIntegrator *kintegrator = &dscene->data.integrator;
+ KernelBackground *kbackground = &dscene->data.background;
KernelFilm *kfilm = &dscene->data.film;
kintegrator->use_direct_light = (totarea > 0.0f);
@@ -803,15 +831,18 @@ void LightManager::device_update_distribution(Device *device,
/* Portals */
if (num_portals > 0) {
- kintegrator->portal_offset = light_index;
- kintegrator->num_portals = num_portals;
- kintegrator->portal_pdf = background_mis ? 0.5f : 1.0f;
+ kbackground->portal_offset = light_index;
+ kbackground->num_portals = num_portals;
+ kbackground->portal_weight = 1.0f;
}
else {
- kintegrator->num_portals = 0;
- kintegrator->portal_offset = 0;
- kintegrator->portal_pdf = 0.0f;
+ kbackground->num_portals = 0;
+ kbackground->portal_offset = 0;
+ kbackground->portal_weight = 0.0f;
}
+
+ /* Map */
+ kbackground->map_weight = background_mis ? 1.0f : 0.0f;
}
else {
dscene->light_group_sample_cdf.free();
@@ -835,11 +866,14 @@ void LightManager::device_update_distribution(Device *device,
kintegrator->pdf_triangles = 0.0f;
kintegrator->pdf_lights = 0.0f;
kintegrator->use_lamp_mis = false;
- kintegrator->num_portals = 0;
- kintegrator->portal_offset = 0;
- kintegrator->portal_pdf = 0.0f;
+
+ kbackground->num_portals = 0;
+ kbackground->portal_offset = 0;
+ kbackground->portal_weight = 0.0f;
+ kbackground->sun_weight = 0.0f;
kintegrator->distant_lights_offset = 0;
kintegrator->background_light_index = 0;
+ kbackground->map_weight = 0.0f;
kfilm->pass_shadow_scale = 1.0f;
}
}
@@ -887,7 +921,7 @@ void LightManager::device_update_background(Device *device,
Scene *scene,
Progress &progress)
{
- KernelIntegrator *kintegrator = &dscene->data.integrator;
+ KernelBackground *kbackground = &dscene->data.background;
Light *background_light = NULL;
/* find background light */
@@ -900,20 +934,87 @@ void LightManager::device_update_background(Device *device,
/* no background light found, signal renderer to skip sampling */
if (!background_light || !background_light->is_enabled) {
- kintegrator->pdf_background_res_x = 0;
- kintegrator->pdf_background_res_y = 0;
+ kbackground->map_res_x = 0;
+ kbackground->map_res_y = 0;
+ kbackground->map_weight = 0.0f;
+ kbackground->sun_weight = 0.0f;
+ kbackground->use_mis = (kbackground->portal_weight > 0.0f);
return;
}
progress.set_status("Updating Lights", "Importance map");
- assert(kintegrator->use_direct_light);
+ assert(dscene->data.integrator.use_direct_light);
+
+ int2 environment_res = make_int2(0, 0);
+ Shader *shader = scene->background->get_shader(scene);
+ int num_suns = 0;
+ foreach (ShaderNode *node, shader->graph->nodes) {
+ if (node->type == EnvironmentTextureNode::node_type) {
+ EnvironmentTextureNode *env = (EnvironmentTextureNode *)node;
+ ImageMetaData metadata;
+ if (!env->handle.empty()) {
+ metadata = env->handle.metadata();
+ environment_res.x = max(environment_res.x, metadata.width);
+ environment_res.y = max(environment_res.y, metadata.height);
+ }
+ }
+ if (node->type == SkyTextureNode::node_type) {
+ SkyTextureNode *sky = (SkyTextureNode *)node;
+ if (sky->type == NODE_SKY_NISHITA && sky->sun_disc) {
+ /* Ensure that the input coordinates aren't transformed before they reach the node.
+ * If that is the case, the logic used for sampling the sun's location does not work
+ * and we have to fall back to map-based sampling. */
+ const ShaderInput *vec_in = sky->input("Vector");
+ if (vec_in && vec_in->link && vec_in->link->parent) {
+ ShaderNode *vec_src = vec_in->link->parent;
+ if ((vec_src->type != TextureCoordinateNode::node_type) ||
+ (vec_in->link != vec_src->output("Generated"))) {
+ environment_res.x = max(environment_res.x, 4096);
+ environment_res.y = max(environment_res.y, 2048);
+ continue;
+ }
+ }
- /* get the resolution from the light's size (we stuff it in there) */
- int2 res = get_background_map_resolution(background_light, scene);
+ /* Determine sun direction from lat/long and texture mapping. */
+ float latitude = sky->sun_elevation;
+ float longitude = M_2PI_F - sky->sun_rotation + M_PI_2_F;
+ float3 sun_direction = make_float3(
+ cosf(latitude) * cosf(longitude), cosf(latitude) * sinf(longitude), sinf(latitude));
+ Transform sky_transform = transform_inverse(sky->tex_mapping.compute_transform());
+ sun_direction = transform_direction(&sky_transform, sun_direction);
+
+ /* Pack sun direction and size. */
+ float half_angle = sky->sun_size * 0.5f;
+ kbackground->sun = make_float4(
+ sun_direction.x, sun_direction.y, sun_direction.z, half_angle);
+
+ kbackground->sun_weight = 4.0f;
+ environment_res.x = max(environment_res.x, 512);
+ environment_res.y = max(environment_res.y, 256);
+ num_suns++;
+ }
+ }
+ }
- kintegrator->pdf_background_res_x = res.x;
- kintegrator->pdf_background_res_y = res.y;
+ /* If there's more than one sun, fall back to map sampling instead. */
+ if (num_suns != 1) {
+ kbackground->sun_weight = 0.0f;
+ environment_res.x = max(environment_res.x, 4096);
+ environment_res.y = max(environment_res.y, 2048);
+ }
+
+ /* Enable MIS for background sampling if any strategy is active. */
+ kbackground->use_mis = (kbackground->portal_weight + kbackground->map_weight +
+ kbackground->sun_weight) > 0.0f;
+
+ /* get the resolution from the light's size (we stuff it in there) */
+ int2 res = make_int2(background_light->map_resolution, background_light->map_resolution / 2);
+ /* If the resolution isn't set manually, try to find an environment texture. */
+ if (res.x == 0) {
+ res = environment_res;
+ }
+
+ kbackground->map_res_x = res.x;
+ kbackground->map_res_y = res.y;
vector<float3> pixels;
shade_background_pixels(device, dscene, res.x, res.y, pixels, progress);
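The sun direction above is a plain spherical-to-Cartesian conversion of the Nishita sky node's elevation and rotation; a standalone sketch of the same math, with hypothetical angles and without the tex_mapping transform (which needs Cycles' Transform type):

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const float kPi = 3.14159265358979f;

    /* Hypothetical sun placement: 15 degrees above the horizon, rotated 30 degrees. */
    float sun_elevation = 15.0f * kPi / 180.0f;
    float sun_rotation = 30.0f * kPi / 180.0f;

    /* Same convention as device_update_background() above. */
    float latitude = sun_elevation;
    float longitude = 2.0f * kPi - sun_rotation + kPi / 2.0f;

    float dir_x = cosf(latitude) * cosf(longitude);
    float dir_y = cosf(latitude) * sinf(longitude);
    float dir_z = sinf(latitude);

    printf("sun direction: (%f, %f, %f)\n", dir_x, dir_y, dir_z);
    return 0;
  }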
@@ -927,29 +1028,13 @@ void LightManager::device_update_background(Device *device,
float2 *cond_cdf = dscene->light_background_conditional_cdf.alloc(cdf_width * res.y);
double time_start = time_dt();
- if (max(res.x, res.y) < 512) {
- /* Small enough resolution, faster to do single-threaded. */
- background_cdf(0, res.y, res.x, res.y, &pixels, cond_cdf);
- }
- else {
- /* Threaded evaluation for large resolution. */
- const int num_blocks = TaskScheduler::num_threads();
- const int chunk_size = res.y / num_blocks;
- int start_row = 0;
- TaskPool pool;
- for (int i = 0; i < num_blocks; ++i) {
- const int current_chunk_size = (i != num_blocks - 1) ? chunk_size : (res.y - i * chunk_size);
- pool.push(function_bind(&background_cdf,
- start_row,
- start_row + current_chunk_size,
- res.x,
- res.y,
- &pixels,
- cond_cdf));
- start_row += current_chunk_size;
- }
- pool.wait_work();
- }
+
+ /* Create CDF in parallel. */
+ const int rows_per_task = divide_up(10240, res.x);
+ parallel_for(blocked_range<size_t>(0, res.y, rows_per_task),
+ [&](const blocked_range<size_t> &r) {
+ background_cdf(r.begin(), r.end(), res.x, res.y, &pixels, cond_cdf);
+ });
/* marginal CDFs (column, V direction, sum of rows) */
marg_cdf[0].x = cond_cdf[res.x].x;
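The grain size above targets roughly 10240 pixels per task. Assuming divide_up() is the usual round-up integer division from Cycles' math utilities, the rows handled per task work out as in this small sketch:

  #include <cstdio>

  /* Assumed to match Cycles' helper: round-up integer division. */
  static int divide_up(int x, int y)
  {
    return (x + y - 1) / y;
  }

  int main()
  {
    /* For a few map widths, how many CDF rows each parallel_for task processes. */
    const int widths[] = {512, 1024, 4096, 8192};
    for (int w : widths) {
      printf("res.x = %4d -> rows_per_task = %d\n", w, divide_up(10240, w));
    }
    return 0;
  }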
@@ -1004,7 +1089,6 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
float3 co = light->co;
Shader *shader = (light->shader) ? light->shader : scene->default_light;
int shader_id = scene->shader_manager->get_shader_id(shader);
- int samples = light->samples;
int max_bounces = light->max_bounces;
float random = (float)light->random_id * (1.0f / (float)0xFFFFFFFF);
@@ -1029,7 +1113,10 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
}
klights[light_index].type = light->type;
- klights[light_index].samples = samples;
+ klights[light_index].samples = light->samples;
+ klights[light_index].strength[0] = light->strength.x;
+ klights[light_index].strength[1] = light->strength.y;
+ klights[light_index].strength[2] = light->strength.z;
if (light->type == LIGHT_POINT) {
shader_id &= ~SHADER_AREA_LIGHT;
@@ -1050,8 +1137,8 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
else if (light->type == LIGHT_DISTANT) {
shader_id &= ~SHADER_AREA_LIGHT;
- float radius = light->size;
- float angle = atanf(radius);
+ float angle = light->angle / 2.0f;
+ float radius = tanf(angle);
float cosangle = cosf(angle);
float area = M_PI_F * radius * radius;
float invarea = (area > 0.0f) ? 1.0f / area : 1.0f;
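For a sense of scale, with a hypothetical distant light using the real sun's angular diameter (about 0.526 degrees), the new angle-based code yields a half-angle of roughly 0.0046 rad and a tangent-space radius of about the same value; a small standalone sketch of the derivation above:

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const float kPi = 3.14159265f;

    /* Hypothetical distant light with the sun's angular diameter (~0.526 degrees). */
    float light_angle = 0.526f * kPi / 180.0f;

    /* Same derivation as LightManager::device_update_points() above. */
    float angle = light_angle / 2.0f;
    float radius = tanf(angle);
    float cosangle = cosf(angle);
    float area = kPi * radius * radius;
    float invarea = (area > 0.0f) ? 1.0f / area : 1.0f;

    printf("half angle %g, radius %g, cos %g, area %g, 1/area %g\n",
           angle, radius, cosangle, area, invarea);
    return 0;
  }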
@@ -1218,11 +1305,12 @@ void LightManager::device_update(Device *device,
VLOG(1) << "Total " << scene->lights.size() << " lights.";
- device_free(device, dscene);
+ /* Detect which lights are enabled; this also determines if we need to update the background. */
+ test_enabled_lights(scene);
- use_light_visibility = false;
+ device_free(device, dscene, need_update_background);
- disable_ineffective_light(scene);
+ use_light_visibility = false;
device_update_points(device, dscene, scene);
if (progress.get_cancel())
@@ -1232,9 +1320,11 @@ void LightManager::device_update(Device *device,
if (progress.get_cancel())
return;
- device_update_background(device, dscene, scene, progress);
- if (progress.get_cancel())
- return;
+ if (need_update_background) {
+ device_update_background(device, dscene, scene, progress);
+ if (progress.get_cancel())
+ return;
+ }
device_update_ies(dscene);
if (progress.get_cancel())
@@ -1246,9 +1336,10 @@ void LightManager::device_update(Device *device,
}
need_update = false;
+ need_update_background = false;
}
-void LightManager::device_free(Device *, DeviceScene *dscene)
+void LightManager::device_free(Device *, DeviceScene *dscene, const bool free_background)
{
dscene->light_distribution.free();
dscene->light_tree_nodes.free();
@@ -1256,8 +1347,10 @@ void LightManager::device_free(Device *, DeviceScene *dscene)
dscene->lamp_to_distribution.free();
dscene->triangle_to_distribution.free();
dscene->lights.free();
- dscene->light_background_marginal_cdf.free();
- dscene->light_background_conditional_cdf.free();
+ if (free_background) {
+ dscene->light_background_marginal_cdf.free();
+ dscene->light_background_conditional_cdf.free();
+ }
dscene->light_group_sample_prob.free();
dscene->light_group_sample_cdf.free();
dscene->leaf_to_first_emitter.free();
@@ -1270,7 +1363,7 @@ void LightManager::tag_update(Scene * /*scene*/)
need_update = true;
}
-int LightManager::add_ies_from_file(ustring filename)
+int LightManager::add_ies_from_file(const string &filename)
{
string content;
@@ -1279,10 +1372,10 @@ int LightManager::add_ies_from_file(ustring filename)
content = "\n";
}
- return add_ies(ustring(content));
+ return add_ies(content);
}
-int LightManager::add_ies(ustring content)
+int LightManager::add_ies(const string &content)
{
uint hash = hash_string(content.c_str());
@@ -1314,6 +1407,7 @@ int LightManager::add_ies(ustring content)
ies_slots[slot]->hash = hash;
need_update = true;
+ need_update_background = true;
return slot;
}
@@ -1332,6 +1426,7 @@ void LightManager::remove_ies(int slot)
/* If the slot has no more users, update the device to remove it. */
need_update |= (ies_slots[slot]->users == 0);
+ need_update_background |= need_update;
}
void LightManager::device_update_ies(DeviceScene *dscene)
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index de84f6a5457..41a627882bd 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -43,10 +43,12 @@ class Light : public Node {
Light();
LightType type;
+ float3 strength;
float3 co;
float3 dir;
float size;
+ float angle;
float3 axisu;
float sizeu;
@@ -78,7 +80,7 @@ class Light : public Node {
void tag_update(Scene *scene);
- /* Check whether the light has contribution the the scene. */
+ /* Check whether the light has contribution to the scene. */
bool has_contribution(Scene *scene);
};
@@ -87,16 +89,19 @@ class LightManager {
bool use_light_visibility;
bool need_update;
+ /* Need to update background (including the multiple importance sampling map). */
+ bool need_update_background;
+
LightManager();
~LightManager();
/* IES texture management */
- int add_ies(ustring ies);
- int add_ies_from_file(ustring filename);
+ int add_ies(const string &ies);
+ int add_ies_from_file(const string &filename);
void remove_ies(int slot);
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
- void device_free(Device *device, DeviceScene *dscene);
+ void device_free(Device *device, DeviceScene *dscene, const bool free_background = true);
void tag_update(Scene *scene);
@@ -108,7 +113,7 @@ class LightManager {
* which doesn't contribute to the scene or which is only used for MIS
* and scene doesn't need MIS.
*/
- void disable_ineffective_light(Scene *scene);
+ void test_enabled_lights(Scene *scene);
void device_update_points(Device *device, DeviceScene *dscene, Scene *scene);
void device_update_distribution(Device *device,
@@ -144,6 +149,9 @@ class LightManager {
vector<IESSlot *> ies_slots;
thread_mutex ies_mutex;
+
+ bool last_background_enabled;
+ int last_background_resolution;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/merge.cpp b/intern/cycles/render/merge.cpp
index cac07e59fe3..3ea3952b96c 100644
--- a/intern/cycles/render/merge.cpp
+++ b/intern/cycles/render/merge.cpp
@@ -22,8 +22,8 @@
#include "util/util_time.h"
#include "util/util_unique_ptr.h"
-#include <OpenImageIO/imageio.h>
#include <OpenImageIO/filesystem.h>
+#include <OpenImageIO/imageio.h>
OIIO_NAMESPACE_USING
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 54dacf5d1f4..c262d770331 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -17,32 +17,22 @@
#include "bvh/bvh.h"
#include "bvh/bvh_build.h"
-#include "render/camera.h"
-#include "render/curves.h"
#include "device/device.h"
+
#include "render/graph.h"
-#include "render/shader.h"
-#include "render/light.h"
+#include "render/hair.h"
#include "render/mesh.h"
-#include "render/nodes.h"
#include "render/object.h"
#include "render/scene.h"
-#include "render/stats.h"
-
-#include "kernel/osl/osl_globals.h"
-#include "subd/subd_split.h"
#include "subd/subd_patch_table.h"
+#include "subd/subd_split.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_progress.h"
#include "util/util_set.h"
-#ifdef WITH_EMBREE
-# include "bvh/bvh_embree.h"
-#endif
-
CCL_NAMESPACE_BEGIN
/* Triangle */
@@ -120,263 +110,6 @@ bool Mesh::Triangle::valid(const float3 *verts) const
return isfinite3_safe(verts[v[0]]) && isfinite3_safe(verts[v[1]]) && isfinite3_safe(verts[v[2]]);
}
-/* Curve */
-
-void Mesh::Curve::bounds_grow(const int k,
- const float3 *curve_keys,
- const float *curve_radius,
- BoundBox &bounds) const
-{
- float3 P[4];
-
- P[0] = curve_keys[max(first_key + k - 1, first_key)];
- P[1] = curve_keys[first_key + k];
- P[2] = curve_keys[first_key + k + 1];
- P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
-
- float3 lower;
- float3 upper;
-
- curvebounds(&lower.x, &upper.x, P, 0);
- curvebounds(&lower.y, &upper.y, P, 1);
- curvebounds(&lower.z, &upper.z, P, 2);
-
- float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
-
- bounds.grow(lower, mr);
- bounds.grow(upper, mr);
-}
-
-void Mesh::Curve::bounds_grow(const int k,
- const float3 *curve_keys,
- const float *curve_radius,
- const Transform &aligned_space,
- BoundBox &bounds) const
-{
- float3 P[4];
-
- P[0] = curve_keys[max(first_key + k - 1, first_key)];
- P[1] = curve_keys[first_key + k];
- P[2] = curve_keys[first_key + k + 1];
- P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
-
- P[0] = transform_point(&aligned_space, P[0]);
- P[1] = transform_point(&aligned_space, P[1]);
- P[2] = transform_point(&aligned_space, P[2]);
- P[3] = transform_point(&aligned_space, P[3]);
-
- float3 lower;
- float3 upper;
-
- curvebounds(&lower.x, &upper.x, P, 0);
- curvebounds(&lower.y, &upper.y, P, 1);
- curvebounds(&lower.z, &upper.z, P, 2);
-
- float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
-
- bounds.grow(lower, mr);
- bounds.grow(upper, mr);
-}
-
-void Mesh::Curve::bounds_grow(float4 keys[4], BoundBox &bounds) const
-{
- float3 P[4] = {
- float4_to_float3(keys[0]),
- float4_to_float3(keys[1]),
- float4_to_float3(keys[2]),
- float4_to_float3(keys[3]),
- };
-
- float3 lower;
- float3 upper;
-
- curvebounds(&lower.x, &upper.x, P, 0);
- curvebounds(&lower.y, &upper.y, P, 1);
- curvebounds(&lower.z, &upper.z, P, 2);
-
- float mr = max(keys[1].w, keys[2].w);
-
- bounds.grow(lower, mr);
- bounds.grow(upper, mr);
-}
-
-void Mesh::Curve::motion_keys(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- float time,
- size_t k0,
- size_t k1,
- float4 r_keys[2]) const
-{
- /* Figure out which steps we need to fetch and their interpolation factor. */
- const size_t max_step = num_steps - 1;
- const size_t step = min((int)(time * max_step), max_step - 1);
- const float t = time * max_step - step;
- /* Fetch vertex coordinates. */
- float4 curr_keys[2];
- float4 next_keys[2];
- keys_for_step(
- curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step, k0, k1, curr_keys);
- keys_for_step(
- curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step + 1, k0, k1, next_keys);
- /* Interpolate between steps. */
- r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
- r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
-}
-
-void Mesh::Curve::cardinal_motion_keys(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- float time,
- size_t k0,
- size_t k1,
- size_t k2,
- size_t k3,
- float4 r_keys[4]) const
-{
- /* Figure out which steps we need to fetch and their interpolation factor. */
- const size_t max_step = num_steps - 1;
- const size_t step = min((int)(time * max_step), max_step - 1);
- const float t = time * max_step - step;
- /* Fetch vertex coordinates. */
- float4 curr_keys[4];
- float4 next_keys[4];
- cardinal_keys_for_step(curve_keys,
- curve_radius,
- key_steps,
- num_curve_keys,
- num_steps,
- step,
- k0,
- k1,
- k2,
- k3,
- curr_keys);
- cardinal_keys_for_step(curve_keys,
- curve_radius,
- key_steps,
- num_curve_keys,
- num_steps,
- step + 1,
- k0,
- k1,
- k2,
- k3,
- next_keys);
- /* Interpolate between steps. */
- r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
- r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
- r_keys[2] = (1.0f - t) * curr_keys[2] + t * next_keys[2];
- r_keys[3] = (1.0f - t) * curr_keys[3] + t * next_keys[3];
-}
-
-void Mesh::Curve::keys_for_step(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- size_t step,
- size_t k0,
- size_t k1,
- float4 r_keys[2]) const
-{
- k0 = max(k0, 0);
- k1 = min(k1, num_keys - 1);
- const size_t center_step = ((num_steps - 1) / 2);
- if (step == center_step) {
- /* Center step: regular key location. */
- /* TODO(sergey): Consider adding make_float4(float3, float)
- * function.
- */
- r_keys[0] = make_float4(curve_keys[first_key + k0].x,
- curve_keys[first_key + k0].y,
- curve_keys[first_key + k0].z,
- curve_radius[first_key + k0]);
- r_keys[1] = make_float4(curve_keys[first_key + k1].x,
- curve_keys[first_key + k1].y,
- curve_keys[first_key + k1].z,
- curve_radius[first_key + k1]);
- }
- else {
- /* Center step is not stored in this array. */
- if (step > center_step) {
- step--;
- }
- const size_t offset = first_key + step * num_curve_keys;
- r_keys[0] = make_float4(key_steps[offset + k0].x,
- key_steps[offset + k0].y,
- key_steps[offset + k0].z,
- curve_radius[first_key + k0]);
- r_keys[1] = make_float4(key_steps[offset + k1].x,
- key_steps[offset + k1].y,
- key_steps[offset + k1].z,
- curve_radius[first_key + k1]);
- }
-}
-
-void Mesh::Curve::cardinal_keys_for_step(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- size_t step,
- size_t k0,
- size_t k1,
- size_t k2,
- size_t k3,
- float4 r_keys[4]) const
-{
- k0 = max(k0, 0);
- k3 = min(k3, num_keys - 1);
- const size_t center_step = ((num_steps - 1) / 2);
- if (step == center_step) {
- /* Center step: regular key location. */
- r_keys[0] = make_float4(curve_keys[first_key + k0].x,
- curve_keys[first_key + k0].y,
- curve_keys[first_key + k0].z,
- curve_radius[first_key + k0]);
- r_keys[1] = make_float4(curve_keys[first_key + k1].x,
- curve_keys[first_key + k1].y,
- curve_keys[first_key + k1].z,
- curve_radius[first_key + k1]);
- r_keys[2] = make_float4(curve_keys[first_key + k2].x,
- curve_keys[first_key + k2].y,
- curve_keys[first_key + k2].z,
- curve_radius[first_key + k2]);
- r_keys[3] = make_float4(curve_keys[first_key + k3].x,
- curve_keys[first_key + k3].y,
- curve_keys[first_key + k3].z,
- curve_radius[first_key + k3]);
- }
- else {
- /* Center step is not stored in this array. */
- if (step > center_step) {
- step--;
- }
- const size_t offset = first_key + step * num_curve_keys;
- r_keys[0] = make_float4(key_steps[offset + k0].x,
- key_steps[offset + k0].y,
- key_steps[offset + k0].z,
- curve_radius[first_key + k0]);
- r_keys[1] = make_float4(key_steps[offset + k1].x,
- key_steps[offset + k1].y,
- key_steps[offset + k1].z,
- curve_radius[first_key + k1]);
- r_keys[2] = make_float4(key_steps[offset + k2].x,
- key_steps[offset + k2].y,
- key_steps[offset + k2].z,
- curve_radius[first_key + k2]);
- r_keys[3] = make_float4(key_steps[offset + k3].x,
- key_steps[offset + k3].y,
- key_steps[offset + k3].z,
- curve_radius[first_key + k3]);
- }
-}
-
/* SubdFace */
float3 Mesh::SubdFace::normal(const Mesh *mesh) const
@@ -392,58 +125,29 @@ float3 Mesh::SubdFace::normal(const Mesh *mesh) const
NODE_DEFINE(Mesh)
{
- NodeType *type = NodeType::add("mesh", create);
-
- SOCKET_UINT(motion_steps, "Motion Steps", 3);
- SOCKET_BOOLEAN(use_motion_blur, "Use Motion Blur", false);
+ NodeType *type = NodeType::add("mesh", create, NodeType::NONE, Geometry::node_base_type);
SOCKET_INT_ARRAY(triangles, "Triangles", array<int>());
SOCKET_POINT_ARRAY(verts, "Vertices", array<float3>());
SOCKET_INT_ARRAY(shader, "Shader", array<int>());
SOCKET_BOOLEAN_ARRAY(smooth, "Smooth", array<bool>());
- SOCKET_POINT_ARRAY(curve_keys, "Curve Keys", array<float3>());
- SOCKET_FLOAT_ARRAY(curve_radius, "Curve Radius", array<float>());
- SOCKET_INT_ARRAY(curve_first_key, "Curve First Key", array<int>());
- SOCKET_INT_ARRAY(curve_shader, "Curve Shader", array<int>());
-
return type;
}
-Mesh::Mesh() : Node(node_type)
+Mesh::Mesh() : Geometry(node_type, Geometry::MESH), subd_attributes(this, ATTR_PRIM_SUBD)
{
- need_update = true;
- need_update_rebuild = false;
- transform_applied = false;
- transform_negative_scaled = false;
- transform_normal = transform_identity();
- bounds = BoundBox::empty;
-
- bvh = NULL;
-
- tri_offset = 0;
vert_offset = 0;
- curve_offset = 0;
- curvekey_offset = 0;
-
patch_offset = 0;
face_offset = 0;
corner_offset = 0;
- attr_map_offset = 0;
-
num_subd_verts = 0;
- attributes.triangle_mesh = this;
- curve_attributes.curve_mesh = this;
- subd_attributes.subd_mesh = this;
-
- geometry_flags = GEOMETRY_NONE;
-
- volume_isovalue = 0.001f;
- has_volume = false;
- has_surface_bssrdf = false;
+ volume_clipping = 0.001f;
+ volume_step_size = 0.0f;
+ volume_object_space = false;
num_ngons = 0;
@@ -455,7 +159,6 @@ Mesh::Mesh() : Node(node_type)
Mesh::~Mesh()
{
- delete bvh;
delete patch_table;
delete subd_params;
}
@@ -491,26 +194,6 @@ void Mesh::reserve_mesh(int numverts, int numtris)
attributes.resize(true);
}
-void Mesh::resize_curves(int numcurves, int numkeys)
-{
- curve_keys.resize(numkeys);
- curve_radius.resize(numkeys);
- curve_first_key.resize(numcurves);
- curve_shader.resize(numcurves);
-
- curve_attributes.resize();
-}
-
-void Mesh::reserve_curves(int numcurves, int numkeys)
-{
- curve_keys.reserve(numkeys);
- curve_radius.reserve(numkeys);
- curve_first_key.reserve(numcurves);
- curve_shader.reserve(numcurves);
-
- curve_attributes.resize(true);
-}
-
void Mesh::resize_subd_faces(int numfaces, int num_ngons_, int numcorners)
{
subd_faces.resize(numfaces);
@@ -531,6 +214,8 @@ void Mesh::reserve_subd_faces(int numfaces, int num_ngons_, int numcorners)
void Mesh::clear(bool preserve_voxel_data)
{
+ Geometry::clear();
+
/* clear all verts and triangles */
verts.clear();
triangles.clear();
@@ -540,11 +225,6 @@ void Mesh::clear(bool preserve_voxel_data)
triangle_patch.clear();
vert_patch_uv.clear();
- curve_keys.clear();
- curve_radius.clear();
- curve_first_key.clear();
- curve_shader.clear();
-
subd_faces.clear();
subd_face_corners.clear();
@@ -552,24 +232,21 @@ void Mesh::clear(bool preserve_voxel_data)
subd_creases.clear();
- curve_attributes.clear();
subd_attributes.clear();
attributes.clear(preserve_voxel_data);
- used_shaders.clear();
-
- if (!preserve_voxel_data) {
- geometry_flags = GEOMETRY_NONE;
- }
-
- transform_applied = false;
- transform_negative_scaled = false;
- transform_normal = transform_identity();
+ vert_to_stitching_key_map.clear();
+ vert_stitching_map.clear();
delete patch_table;
patch_table = NULL;
}
+void Mesh::clear()
+{
+ clear(false);
+}
+
void Mesh::add_vertex(float3 P)
{
verts.push_back_reserved(P);
@@ -601,18 +278,6 @@ void Mesh::add_triangle(int v0, int v1, int v2, int shader_, bool smooth_)
}
}
-void Mesh::add_curve_key(float3 co, float radius)
-{
- curve_keys.push_back_reserved(co);
- curve_radius.push_back_reserved(radius);
-}
-
-void Mesh::add_curve(int first_key, int shader)
-{
- curve_first_key.push_back_reserved(first_key);
- curve_shader.push_back_reserved(shader);
-}
-
void Mesh::add_subd_face(int *corners, int num_corners, int shader_, bool smooth_)
{
int start_corner = subd_face_corners.size();
@@ -632,19 +297,53 @@ void Mesh::add_subd_face(int *corners, int num_corners, int shader_, bool smooth
subd_faces.push_back_reserved(face);
}
+void Mesh::copy_center_to_motion_step(const int motion_step)
+{
+ Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if (attr_mP) {
+ Attribute *attr_mN = attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+ Attribute *attr_N = attributes.find(ATTR_STD_VERTEX_NORMAL);
+ float3 *P = &verts[0];
+ float3 *N = (attr_N) ? attr_N->data_float3() : NULL;
+ size_t numverts = verts.size();
+
+ memcpy(attr_mP->data_float3() + motion_step * numverts, P, sizeof(float3) * numverts);
+ if (attr_mN)
+ memcpy(attr_mN->data_float3() + motion_step * numverts, N, sizeof(float3) * numverts);
+ }
+}
+
+void Mesh::get_uv_tiles(ustring map, unordered_set<int> &tiles)
+{
+ Attribute *attr, *subd_attr;
+
+ if (map.empty()) {
+ attr = attributes.find(ATTR_STD_UV);
+ subd_attr = subd_attributes.find(ATTR_STD_UV);
+ }
+ else {
+ attr = attributes.find(map);
+ subd_attr = subd_attributes.find(map);
+ }
+
+ if (attr) {
+ attr->get_uv_tiles(this, ATTR_PRIM_GEOMETRY, tiles);
+ }
+ if (subd_attr) {
+ subd_attr->get_uv_tiles(this, ATTR_PRIM_SUBD, tiles);
+ }
+}
+
void Mesh::compute_bounds()
{
BoundBox bnds = BoundBox::empty;
size_t verts_size = verts.size();
- size_t curve_keys_size = curve_keys.size();
- if (verts_size + curve_keys_size > 0) {
+ if (verts_size > 0) {
for (size_t i = 0; i < verts_size; i++)
bnds.grow(verts[i]);
- for (size_t i = 0; i < curve_keys_size; i++)
- bnds.grow(curve_keys[i], curve_radius[i]);
-
Attribute *attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (use_motion_blur && attr) {
size_t steps_size = verts.size() * (motion_steps - 1);
@@ -654,15 +353,6 @@ void Mesh::compute_bounds()
bnds.grow(vert_steps[i]);
}
- Attribute *curve_attr = curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (use_motion_blur && curve_attr) {
- size_t steps_size = curve_keys.size() * (motion_steps - 1);
- float3 *key_steps = curve_attr->data_float3();
-
- for (size_t i = 0; i < steps_size; i++)
- bnds.grow(key_steps[i]);
- }
-
if (!bnds.valid()) {
bnds = BoundBox::empty;
@@ -670,9 +360,6 @@ void Mesh::compute_bounds()
for (size_t i = 0; i < verts_size; i++)
bnds.grow_safe(verts[i]);
- for (size_t i = 0; i < curve_keys_size; i++)
- bnds.grow_safe(curve_keys[i], curve_radius[i]);
-
if (use_motion_blur && attr) {
size_t steps_size = verts.size() * (motion_steps - 1);
float3 *vert_steps = attr->data_float3();
@@ -680,14 +367,6 @@ void Mesh::compute_bounds()
for (size_t i = 0; i < steps_size; i++)
bnds.grow_safe(vert_steps[i]);
}
-
- if (use_motion_blur && curve_attr) {
- size_t steps_size = curve_keys.size() * (motion_steps - 1);
- float3 *key_steps = curve_attr->data_float3();
-
- for (size_t i = 0; i < steps_size; i++)
- bnds.grow_safe(key_steps[i]);
- }
}
}
@@ -699,6 +378,38 @@ void Mesh::compute_bounds()
bounds = bnds;
}
+void Mesh::apply_transform(const Transform &tfm, const bool apply_to_motion)
+{
+ transform_normal = transform_transposed_inverse(tfm);
+
+ /* apply to mesh vertices */
+ for (size_t i = 0; i < verts.size(); i++)
+ verts[i] = transform_point(&tfm, verts[i]);
+
+ if (apply_to_motion) {
+ Attribute *attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+ if (attr) {
+ size_t steps_size = verts.size() * (motion_steps - 1);
+ float3 *vert_steps = attr->data_float3();
+
+ for (size_t i = 0; i < steps_size; i++)
+ vert_steps[i] = transform_point(&tfm, vert_steps[i]);
+ }
+
+ Attribute *attr_N = attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+
+ if (attr_N) {
+ Transform ntfm = transform_normal;
+ size_t steps_size = verts.size() * (motion_steps - 1);
+ float3 *normal_steps = attr_N->data_float3();
+
+ for (size_t i = 0; i < steps_size; i++)
+ normal_steps[i] = normalize(transform_direction(&ntfm, normal_steps[i]));
+ }
+ }
+}
+
void Mesh::add_face_normals()
{
/* don't compute if already there */
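The transform_transposed_inverse() used in apply_transform() above reflects the standard rule that normals transform by the inverse transpose of the point transform, which keeps them perpendicular to the surface under non-uniform scaling. A tiny self-contained illustration with hypothetical 2D values and plain arrays instead of Cycles' Transform:

  #include <cstdio>

  int main()
  {
    /* Non-uniform scale M = diag(2, 1); its inverse-transpose is diag(0.5, 1). */
    const float M[2] = {2.0f, 1.0f};
    const float Mit[2] = {0.5f, 1.0f};

    /* A tangent along the surface and the matching normal. */
    const float t[2] = {1.0f, 1.0f};
    const float n[2] = {1.0f, -1.0f};

    float t2[2] = {M[0] * t[0], M[1] * t[1]};          /* transformed tangent */
    float n_wrong[2] = {M[0] * n[0], M[1] * n[1]};     /* normal transformed like a point */
    float n_right[2] = {Mit[0] * n[0], Mit[1] * n[1]}; /* inverse-transpose transform */

    printf("dot with plain transform: %g (no longer perpendicular)\n",
           t2[0] * n_wrong[0] + t2[1] * n_wrong[1]);
    printf("dot with inverse-transpose: %g (still perpendicular)\n",
           t2[0] * n_right[0] + t2[1] * n_right[1]);
    return 0;
  }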
@@ -836,11 +547,11 @@ void Mesh::add_undisplaced()
float3 *data = attr->data_float3();
/* copy verts */
- size_t size = attr->buffer_size(
- this, (subdivision_type == SUBDIVISION_NONE) ? ATTR_PRIM_TRIANGLE : ATTR_PRIM_SUBD);
+ size_t size = attr->buffer_size(this, attrs.prim);
- /* Center points for ngons aren't stored in Mesh::verts but are included in size since they will be
- * calculated later, we subtract them from size here so we don't have an overflow while copying.
+ /* Center points for ngons aren't stored in Mesh::verts but are included in size since they will
+ * be calculated later, we subtract them from size here so we don't have an overflow while
+ * copying.
*/
size -= num_ngons * attr->data_sizeof();
@@ -925,39 +636,6 @@ void Mesh::pack_verts(const vector<uint> &tri_prim_index,
}
}
-void Mesh::pack_curves(Scene *scene,
- float4 *curve_key_co,
- float4 *curve_data,
- size_t curvekey_offset)
-{
- size_t curve_keys_size = curve_keys.size();
-
- /* pack curve keys */
- if (curve_keys_size) {
- float3 *keys_ptr = curve_keys.data();
- float *radius_ptr = curve_radius.data();
-
- for (size_t i = 0; i < curve_keys_size; i++)
- curve_key_co[i] = make_float4(keys_ptr[i].x, keys_ptr[i].y, keys_ptr[i].z, radius_ptr[i]);
- }
-
- /* pack curve segments */
- size_t curve_num = num_curves();
-
- for (size_t i = 0; i < curve_num; i++) {
- Curve curve = get_curve(i);
- int shader_id = curve_shader[i];
- Shader *shader = (shader_id < used_shaders.size()) ? used_shaders[shader_id] :
- scene->default_surface;
- shader_id = scene->shader_manager->get_shader_id(shader, false);
-
- curve_data[i] = make_float4(__int_as_float(curve.first_key + curvekey_offset),
- __int_as_float(curve.num_keys),
- __int_as_float(shader_id),
- 0.0f);
- }
-}
-
void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset)
{
size_t num_faces = subd_faces.size();
@@ -1004,1364 +682,4 @@ void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, ui
}
}
-void Mesh::compute_bvh(
- Device *device, DeviceScene *dscene, SceneParams *params, Progress *progress, int n, int total)
-{
- if (progress->get_cancel())
- return;
-
- compute_bounds();
-
- if (need_build_bvh()) {
- string msg = "Updating Mesh BVH ";
- if (name == "")
- msg += string_printf("%u/%u", (uint)(n + 1), (uint)total);
- else
- msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
-
- Object object;
- object.mesh = this;
-
- vector<Object *> objects;
- objects.push_back(&object);
-
- if (bvh && !need_update_rebuild) {
- progress->set_status(msg, "Refitting BVH");
- bvh->objects = objects;
- bvh->refit(*progress);
- }
- else {
- progress->set_status(msg, "Building BVH");
-
- BVHParams bparams;
- bparams.use_spatial_split = params->use_bvh_spatial_split;
- bparams.bvh_layout = BVHParams::best_bvh_layout(params->bvh_layout,
- device->get_bvh_layout_mask());
- bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
- params->use_bvh_unaligned_nodes;
- bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
- bparams.num_motion_curve_steps = params->num_bvh_time_steps;
- bparams.bvh_type = params->bvh_type;
- bparams.curve_flags = dscene->data.curve.curveflags;
- bparams.curve_subdivisions = dscene->data.curve.subdivisions;
-
- delete bvh;
- bvh = BVH::create(bparams, objects);
- MEM_GUARDED_CALL(progress, bvh->build, *progress);
- }
- }
-
- need_update = false;
- need_update_rebuild = false;
-}
-
-void Mesh::tag_update(Scene *scene, bool rebuild)
-{
- need_update = true;
-
- if (rebuild) {
- need_update_rebuild = true;
- scene->light_manager->need_update = true;
- }
- else {
- foreach (Shader *shader, used_shaders)
- if (shader->has_surface_emission)
- scene->light_manager->need_update = true;
- }
-
- scene->mesh_manager->need_update = true;
- scene->object_manager->need_update = true;
-}
-
-bool Mesh::has_motion_blur() const
-{
- return (use_motion_blur && (attributes.find(ATTR_STD_MOTION_VERTEX_POSITION) ||
- curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)));
-}
-
-bool Mesh::has_true_displacement() const
-{
- foreach (Shader *shader, used_shaders) {
- if (shader->has_displacement && shader->displacement_method != DISPLACE_BUMP) {
- return true;
- }
- }
-
- return false;
-}
-
-float Mesh::motion_time(int step) const
-{
- return (motion_steps > 1) ? 2.0f * step / (motion_steps - 1) - 1.0f : 0.0f;
-}
-
-int Mesh::motion_step(float time) const
-{
- if (motion_steps > 1) {
- int attr_step = 0;
-
- for (int step = 0; step < motion_steps; step++) {
- float step_time = motion_time(step);
- if (step_time == time) {
- return attr_step;
- }
-
- /* Center step is stored in a separate attribute. */
- if (step != motion_steps / 2) {
- attr_step++;
- }
- }
- }
-
- return -1;
-}
-
-bool Mesh::need_build_bvh() const
-{
- return !transform_applied || has_surface_bssrdf;
-}
-
-bool Mesh::is_instanced() const
-{
- /* Currently we treat subsurface objects as instanced.
- *
- * While it might be not very optimal for ray traversal, it avoids having
- * duplicated BVH in the memory, saving quite some space.
- */
- return !transform_applied || has_surface_bssrdf;
-}
-
-/* Mesh Manager */
-
-MeshManager::MeshManager()
-{
- need_update = true;
- need_flags_update = true;
-}
-
-MeshManager::~MeshManager()
-{
-}
-
-void MeshManager::update_osl_attributes(Device *device,
- Scene *scene,
- vector<AttributeRequestSet> &mesh_attributes)
-{
-#ifdef WITH_OSL
- /* for OSL, a hash map is used to lookup the attribute by name. */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
-
- og->object_name_map.clear();
- og->attribute_map.clear();
- og->object_names.clear();
-
- og->attribute_map.resize(scene->objects.size() * ATTR_PRIM_TYPES);
-
- for (size_t i = 0; i < scene->objects.size(); i++) {
- /* set object name to object index map */
- Object *object = scene->objects[i];
- og->object_name_map[object->name] = i;
- og->object_names.push_back(object->name);
-
- /* set object attributes */
- foreach (ParamValue &attr, object->attributes) {
- OSLGlobals::Attribute osl_attr;
-
- osl_attr.type = attr.type();
- osl_attr.desc.element = ATTR_ELEMENT_OBJECT;
- osl_attr.value = attr;
- osl_attr.desc.offset = 0;
- osl_attr.desc.flags = 0;
-
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][attr.name()] = osl_attr;
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][attr.name()] = osl_attr;
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][attr.name()] = osl_attr;
- }
-
- /* find mesh attributes */
- size_t j;
-
- for (j = 0; j < scene->meshes.size(); j++)
- if (scene->meshes[j] == object->mesh)
- break;
-
- AttributeRequestSet &attributes = mesh_attributes[j];
-
- /* set object attributes */
- foreach (AttributeRequest &req, attributes.requests) {
- OSLGlobals::Attribute osl_attr;
-
- if (req.triangle_desc.element != ATTR_ELEMENT_NONE) {
- osl_attr.desc = req.triangle_desc;
-
- if (req.triangle_type == TypeDesc::TypeFloat)
- osl_attr.type = TypeDesc::TypeFloat;
- else if (req.triangle_type == TypeDesc::TypeMatrix)
- osl_attr.type = TypeDesc::TypeMatrix;
- else if (req.triangle_type == TypeFloat2)
- osl_attr.type = TypeFloat2;
- else
- osl_attr.type = TypeDesc::TypeColor;
-
- if (req.std != ATTR_STD_NONE) {
- /* if standard attribute, add lookup by geom: name convention */
- ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][stdname] = osl_attr;
- }
- else if (req.name != ustring()) {
- /* add lookup by mesh attribute name */
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][req.name] = osl_attr;
- }
- }
-
- if (req.curve_desc.element != ATTR_ELEMENT_NONE) {
- osl_attr.desc = req.curve_desc;
-
- if (req.curve_type == TypeDesc::TypeFloat)
- osl_attr.type = TypeDesc::TypeFloat;
- else if (req.curve_type == TypeDesc::TypeMatrix)
- osl_attr.type = TypeDesc::TypeMatrix;
- else
- osl_attr.type = TypeDesc::TypeColor;
-
- if (req.std != ATTR_STD_NONE) {
- /* if standard attribute, add lookup by geom: name convention */
- ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][stdname] = osl_attr;
- }
- else if (req.name != ustring()) {
- /* add lookup by mesh attribute name */
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][req.name] = osl_attr;
- }
- }
-
- if (req.subd_desc.element != ATTR_ELEMENT_NONE) {
- osl_attr.desc = req.subd_desc;
-
- if (req.subd_type == TypeDesc::TypeFloat)
- osl_attr.type = TypeDesc::TypeFloat;
- else if (req.subd_type == TypeDesc::TypeMatrix)
- osl_attr.type = TypeDesc::TypeMatrix;
- else
- osl_attr.type = TypeDesc::TypeColor;
-
- if (req.std != ATTR_STD_NONE) {
- /* if standard attribute, add lookup by geom: name convention */
- ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][stdname] = osl_attr;
- }
- else if (req.name != ustring()) {
- /* add lookup by mesh attribute name */
- og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][req.name] = osl_attr;
- }
- }
- }
- }
-#else
- (void)device;
- (void)scene;
- (void)mesh_attributes;
-#endif
-}
-
-void MeshManager::update_svm_attributes(Device *,
- DeviceScene *dscene,
- Scene *scene,
- vector<AttributeRequestSet> &mesh_attributes)
-{
- /* for SVM, the attributes_map table is used to lookup the offset of an
- * attribute, based on a unique shader attribute id. */
-
- /* compute array stride */
- int attr_map_size = 0;
-
- for (size_t i = 0; i < scene->meshes.size(); i++) {
- Mesh *mesh = scene->meshes[i];
- mesh->attr_map_offset = attr_map_size;
- attr_map_size += (mesh_attributes[i].size() + 1) * ATTR_PRIM_TYPES;
- }
-
- if (attr_map_size == 0)
- return;
-
- /* create attribute map */
- uint4 *attr_map = dscene->attributes_map.alloc(attr_map_size);
- memset(attr_map, 0, dscene->attributes_map.size() * sizeof(uint));
-
- for (size_t i = 0; i < scene->meshes.size(); i++) {
- Mesh *mesh = scene->meshes[i];
- AttributeRequestSet &attributes = mesh_attributes[i];
-
- /* set object attributes */
- int index = mesh->attr_map_offset;
-
- foreach (AttributeRequest &req, attributes.requests) {
- uint id;
-
- if (req.std == ATTR_STD_NONE)
- id = scene->shader_manager->get_attribute_id(req.name);
- else
- id = scene->shader_manager->get_attribute_id(req.std);
-
- if (mesh->num_triangles()) {
- attr_map[index].x = id;
- attr_map[index].y = req.triangle_desc.element;
- attr_map[index].z = as_uint(req.triangle_desc.offset);
-
- if (req.triangle_type == TypeDesc::TypeFloat)
- attr_map[index].w = NODE_ATTR_FLOAT;
- else if (req.triangle_type == TypeDesc::TypeMatrix)
- attr_map[index].w = NODE_ATTR_MATRIX;
- else if (req.triangle_type == TypeFloat2)
- attr_map[index].w = NODE_ATTR_FLOAT2;
- else
- attr_map[index].w = NODE_ATTR_FLOAT3;
-
- attr_map[index].w |= req.triangle_desc.flags << 8;
- }
-
- index++;
-
- if (mesh->num_curves()) {
- attr_map[index].x = id;
- attr_map[index].y = req.curve_desc.element;
- attr_map[index].z = as_uint(req.curve_desc.offset);
-
- if (req.curve_type == TypeDesc::TypeFloat)
- attr_map[index].w = NODE_ATTR_FLOAT;
- else if (req.curve_type == TypeDesc::TypeMatrix)
- attr_map[index].w = NODE_ATTR_MATRIX;
- else if (req.curve_type == TypeFloat2)
- attr_map[index].w = NODE_ATTR_FLOAT2;
- else
- attr_map[index].w = NODE_ATTR_FLOAT3;
-
- attr_map[index].w |= req.curve_desc.flags << 8;
- }
-
- index++;
-
- if (mesh->subd_faces.size()) {
- attr_map[index].x = id;
- attr_map[index].y = req.subd_desc.element;
- attr_map[index].z = as_uint(req.subd_desc.offset);
-
- if (req.subd_type == TypeDesc::TypeFloat)
- attr_map[index].w = NODE_ATTR_FLOAT;
- else if (req.subd_type == TypeDesc::TypeMatrix)
- attr_map[index].w = NODE_ATTR_MATRIX;
- else if (req.subd_type == TypeFloat2)
- attr_map[index].w = NODE_ATTR_FLOAT2;
- else
- attr_map[index].w = NODE_ATTR_FLOAT3;
-
- attr_map[index].w |= req.subd_desc.flags << 8;
- }
-
- index++;
- }
-
- /* terminator */
- for (int j = 0; j < ATTR_PRIM_TYPES; j++) {
- attr_map[index].x = ATTR_STD_NONE;
- attr_map[index].y = 0;
- attr_map[index].z = 0;
- attr_map[index].w = 0;
-
- index++;
- }
- }
-
- /* copy to device */
- dscene->attributes_map.copy_to_device();
-}
-
-static void update_attribute_element_size(Mesh *mesh,
- Attribute *mattr,
- AttributePrimitive prim,
- size_t *attr_float_size,
- size_t *attr_float2_size,
- size_t *attr_float3_size,
- size_t *attr_uchar4_size)
-{
- if (mattr) {
- size_t size = mattr->element_size(mesh, prim);
-
- if (mattr->element == ATTR_ELEMENT_VOXEL) {
- /* pass */
- }
- else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
- *attr_uchar4_size += size;
- }
- else if (mattr->type == TypeDesc::TypeFloat) {
- *attr_float_size += size;
- }
- else if (mattr->type == TypeFloat2) {
- *attr_float2_size += size;
- }
- else if (mattr->type == TypeDesc::TypeMatrix) {
- *attr_float3_size += size * 4;
- }
- else {
- *attr_float3_size += size;
- }
- }
-}
-
-static void update_attribute_element_offset(Mesh *mesh,
- device_vector<float> &attr_float,
- size_t &attr_float_offset,
- device_vector<float2> &attr_float2,
- size_t &attr_float2_offset,
- device_vector<float4> &attr_float3,
- size_t &attr_float3_offset,
- device_vector<uchar4> &attr_uchar4,
- size_t &attr_uchar4_offset,
- Attribute *mattr,
- AttributePrimitive prim,
- TypeDesc &type,
- AttributeDescriptor &desc)
-{
- if (mattr) {
- /* store element and type */
- desc.element = mattr->element;
- desc.flags = mattr->flags;
- type = mattr->type;
-
- /* store attribute data in arrays */
- size_t size = mattr->element_size(mesh, prim);
-
- AttributeElement &element = desc.element;
- int &offset = desc.offset;
-
- if (mattr->element == ATTR_ELEMENT_VOXEL) {
- /* store slot in offset value */
- VoxelAttribute *voxel_data = mattr->data_voxel();
- offset = voxel_data->slot;
- }
- else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
- uchar4 *data = mattr->data_uchar4();
- offset = attr_uchar4_offset;
-
- assert(attr_uchar4.size() >= offset + size);
- for (size_t k = 0; k < size; k++) {
- attr_uchar4[offset + k] = data[k];
- }
- attr_uchar4_offset += size;
- }
- else if (mattr->type == TypeDesc::TypeFloat) {
- float *data = mattr->data_float();
- offset = attr_float_offset;
-
- assert(attr_float.size() >= offset + size);
- for (size_t k = 0; k < size; k++) {
- attr_float[offset + k] = data[k];
- }
- attr_float_offset += size;
- }
- else if (mattr->type == TypeFloat2) {
- float2 *data = mattr->data_float2();
- offset = attr_float2_offset;
-
- assert(attr_float2.size() >= offset + size);
- for (size_t k = 0; k < size; k++) {
- attr_float2[offset + k] = data[k];
- }
- attr_float2_offset += size;
- }
- else if (mattr->type == TypeDesc::TypeMatrix) {
- Transform *tfm = mattr->data_transform();
- offset = attr_float3_offset;
-
- assert(attr_float3.size() >= offset + size * 3);
- for (size_t k = 0; k < size * 3; k++) {
- attr_float3[offset + k] = (&tfm->x)[k];
- }
- attr_float3_offset += size * 3;
- }
- else {
- float4 *data = mattr->data_float4();
- offset = attr_float3_offset;
-
- assert(attr_float3.size() >= offset + size);
- for (size_t k = 0; k < size; k++) {
- attr_float3[offset + k] = data[k];
- }
- attr_float3_offset += size;
- }
-
- /* mesh vertex/curve index is global, not per object, so we sneak
- * a correction for that in here */
- if (mesh->subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK &&
- desc.flags & ATTR_SUBDIVIDED) {
- /* indices for subdivided attributes are retrieved
- * from the patch table, so no need for correction here */
- }
- else if (element == ATTR_ELEMENT_VERTEX)
- offset -= mesh->vert_offset;
- else if (element == ATTR_ELEMENT_VERTEX_MOTION)
- offset -= mesh->vert_offset;
- else if (element == ATTR_ELEMENT_FACE) {
- if (prim == ATTR_PRIM_TRIANGLE)
- offset -= mesh->tri_offset;
- else
- offset -= mesh->face_offset;
- }
- else if (element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) {
- if (prim == ATTR_PRIM_TRIANGLE)
- offset -= 3 * mesh->tri_offset;
- else
- offset -= mesh->corner_offset;
- }
- else if (element == ATTR_ELEMENT_CURVE)
- offset -= mesh->curve_offset;
- else if (element == ATTR_ELEMENT_CURVE_KEY)
- offset -= mesh->curvekey_offset;
- else if (element == ATTR_ELEMENT_CURVE_KEY_MOTION)
- offset -= mesh->curvekey_offset;
- }
- else {
- /* attribute not found */
- desc.element = ATTR_ELEMENT_NONE;
- desc.offset = 0;
- }
-}
-
-void MeshManager::device_update_attributes(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- Progress &progress)
-{
- progress.set_status("Updating Mesh", "Computing attributes");
-
- /* gather per-mesh requested attributes. As meshes may have multiple
- * shaders assigned, this merges the requested attributes that have
- * been set per shader by the shader manager */
- vector<AttributeRequestSet> mesh_attributes(scene->meshes.size());
-
- for (size_t i = 0; i < scene->meshes.size(); i++) {
- Mesh *mesh = scene->meshes[i];
-
- scene->need_global_attributes(mesh_attributes[i]);
-
- foreach (Shader *shader, mesh->used_shaders) {
- mesh_attributes[i].add(shader->attributes);
- }
- }
-
- /* mesh attributes are stored in a single array per data type. Here we fill
- * those arrays, and set the offset and element type to create attribute
- * maps next */
-
- /* Pre-allocate attributes to avoid array re-allocation, which would
- * take 2x the overall attribute memory usage.
- */
- size_t attr_float_size = 0;
- size_t attr_float2_size = 0;
- size_t attr_float3_size = 0;
- size_t attr_uchar4_size = 0;
- for (size_t i = 0; i < scene->meshes.size(); i++) {
- Mesh *mesh = scene->meshes[i];
- AttributeRequestSet &attributes = mesh_attributes[i];
- foreach (AttributeRequest &req, attributes.requests) {
- Attribute *triangle_mattr = mesh->attributes.find(req);
- Attribute *curve_mattr = mesh->curve_attributes.find(req);
- Attribute *subd_mattr = mesh->subd_attributes.find(req);
-
- update_attribute_element_size(mesh,
- triangle_mattr,
- ATTR_PRIM_TRIANGLE,
- &attr_float_size,
- &attr_float2_size,
- &attr_float3_size,
- &attr_uchar4_size);
- update_attribute_element_size(mesh,
- curve_mattr,
- ATTR_PRIM_CURVE,
- &attr_float_size,
- &attr_float2_size,
- &attr_float3_size,
- &attr_uchar4_size);
- update_attribute_element_size(mesh,
- subd_mattr,
- ATTR_PRIM_SUBD,
- &attr_float_size,
- &attr_float2_size,
- &attr_float3_size,
- &attr_uchar4_size);
- }
- }
-
- dscene->attributes_float.alloc(attr_float_size);
- dscene->attributes_float2.alloc(attr_float2_size);
- dscene->attributes_float3.alloc(attr_float3_size);
- dscene->attributes_uchar4.alloc(attr_uchar4_size);
-
- size_t attr_float_offset = 0;
- size_t attr_float2_offset = 0;
- size_t attr_float3_offset = 0;
- size_t attr_uchar4_offset = 0;
-
- /* Fill in attributes. */
- for (size_t i = 0; i < scene->meshes.size(); i++) {
- Mesh *mesh = scene->meshes[i];
- AttributeRequestSet &attributes = mesh_attributes[i];
-
- /* todo: we now store std and name attributes from requests even if
- * they actually refer to the same mesh attributes, optimize */
- foreach (AttributeRequest &req, attributes.requests) {
- Attribute *triangle_mattr = mesh->attributes.find(req);
- Attribute *curve_mattr = mesh->curve_attributes.find(req);
- Attribute *subd_mattr = mesh->subd_attributes.find(req);
-
- update_attribute_element_offset(mesh,
- dscene->attributes_float,
- attr_float_offset,
- dscene->attributes_float2,
- attr_float2_offset,
- dscene->attributes_float3,
- attr_float3_offset,
- dscene->attributes_uchar4,
- attr_uchar4_offset,
- triangle_mattr,
- ATTR_PRIM_TRIANGLE,
- req.triangle_type,
- req.triangle_desc);
-
- update_attribute_element_offset(mesh,
- dscene->attributes_float,
- attr_float_offset,
- dscene->attributes_float2,
- attr_float2_offset,
- dscene->attributes_float3,
- attr_float3_offset,
- dscene->attributes_uchar4,
- attr_uchar4_offset,
- curve_mattr,
- ATTR_PRIM_CURVE,
- req.curve_type,
- req.curve_desc);
-
- update_attribute_element_offset(mesh,
- dscene->attributes_float,
- attr_float_offset,
- dscene->attributes_float2,
- attr_float2_offset,
- dscene->attributes_float3,
- attr_float3_offset,
- dscene->attributes_uchar4,
- attr_uchar4_offset,
- subd_mattr,
- ATTR_PRIM_SUBD,
- req.subd_type,
- req.subd_desc);
-
- if (progress.get_cancel())
- return;
- }
- }
-
- /* create attribute lookup maps */
- if (scene->shader_manager->use_osl())
- update_osl_attributes(device, scene, mesh_attributes);
-
- update_svm_attributes(device, dscene, scene, mesh_attributes);
-
- if (progress.get_cancel())
- return;
-
- /* copy to device */
- progress.set_status("Updating Mesh", "Copying Attributes to device");
-
- if (dscene->attributes_float.size()) {
- dscene->attributes_float.copy_to_device();
- }
- if (dscene->attributes_float2.size()) {
- dscene->attributes_float2.copy_to_device();
- }
- if (dscene->attributes_float3.size()) {
- dscene->attributes_float3.copy_to_device();
- }
- if (dscene->attributes_uchar4.size()) {
- dscene->attributes_uchar4.copy_to_device();
- }
-
- if (progress.get_cancel())
- return;
-
- /* After mesh attributes and patch tables have been copied to device memory,
- * we need to update offsets in the objects. */
- scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
-}
-
-void MeshManager::mesh_calc_offset(Scene *scene)
-{
- size_t vert_size = 0;
- size_t tri_size = 0;
-
- size_t curve_key_size = 0;
- size_t curve_size = 0;
-
- size_t patch_size = 0;
- size_t face_size = 0;
- size_t corner_size = 0;
-
- foreach (Mesh *mesh, scene->meshes) {
- mesh->vert_offset = vert_size;
- mesh->tri_offset = tri_size;
-
- mesh->curvekey_offset = curve_key_size;
- mesh->curve_offset = curve_size;
-
- mesh->patch_offset = patch_size;
- mesh->face_offset = face_size;
- mesh->corner_offset = corner_size;
-
- vert_size += mesh->verts.size();
- tri_size += mesh->num_triangles();
-
- curve_key_size += mesh->curve_keys.size();
- curve_size += mesh->num_curves();
-
- if (mesh->subd_faces.size()) {
- Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
- patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
-
- /* patch tables are stored in the same array, so include them in patch_size */
- if (mesh->patch_table) {
- mesh->patch_table_offset = patch_size;
- patch_size += mesh->patch_table->total_size();
- }
- }
- face_size += mesh->subd_faces.size();
- corner_size += mesh->subd_face_corners.size();
- }
-}
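The offset bookkeeping above is an exclusive prefix sum over per-mesh element counts: each mesh's offset is the total size of all meshes processed before it. A generic sketch of the same pattern (hypothetical helper, not from the source):

/* Hypothetical illustration of the exclusive prefix-sum pattern used by
 * mesh_calc_offset(). */
#include <cstddef>
#include <vector>

static std::vector<size_t> exclusive_offsets(const std::vector<size_t> &sizes)
{
  std::vector<size_t> offsets(sizes.size());
  size_t running = 0;
  for (size_t i = 0; i < sizes.size(); i++) {
    offsets[i] = running; /* each mesh starts where the previous ones end */
    running += sizes[i];
  }
  return offsets;
}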
-
-void MeshManager::device_update_mesh(
- Device *, DeviceScene *dscene, Scene *scene, bool for_displacement, Progress &progress)
-{
- /* Count. */
- size_t vert_size = 0;
- size_t tri_size = 0;
-
- size_t curve_key_size = 0;
- size_t curve_size = 0;
-
- size_t patch_size = 0;
-
- foreach (Mesh *mesh, scene->meshes) {
- vert_size += mesh->verts.size();
- tri_size += mesh->num_triangles();
-
- curve_key_size += mesh->curve_keys.size();
- curve_size += mesh->num_curves();
-
- if (mesh->subd_faces.size()) {
- Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
- patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
-
- /* patch tables are stored in the same array, so include them in patch_size */
- if (mesh->patch_table) {
- mesh->patch_table_offset = patch_size;
- patch_size += mesh->patch_table->total_size();
- }
- }
- }
-
- /* Create mapping from triangle to primitive triangle array. */
- vector<uint> tri_prim_index(tri_size);
- if (for_displacement) {
- /* For displacement kernels we do some trickery to make them believe
- * we've got all required data ready. However, that data is different
- * from final render kernels since we don't have a BVH yet, so we can't
- * really use the same semantics for the arrays.
- */
- foreach (Mesh *mesh, scene->meshes) {
- for (size_t i = 0; i < mesh->num_triangles(); ++i) {
- tri_prim_index[i + mesh->tri_offset] = 3 * (i + mesh->tri_offset);
- }
- }
- }
- else {
- for (size_t i = 0; i < dscene->prim_index.size(); ++i) {
- if ((dscene->prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
- tri_prim_index[dscene->prim_index[i]] = dscene->prim_tri_index[i];
- }
- }
- }
-
- /* Fill in all the arrays. */
- if (tri_size != 0) {
- /* normals */
- progress.set_status("Updating Mesh", "Computing normals");
-
- uint *tri_shader = dscene->tri_shader.alloc(tri_size);
- float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
- uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
- uint *tri_patch = dscene->tri_patch.alloc(tri_size);
- float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
-
- foreach (Mesh *mesh, scene->meshes) {
- mesh->pack_shaders(scene, &tri_shader[mesh->tri_offset]);
- mesh->pack_normals(&vnormal[mesh->vert_offset]);
- mesh->pack_verts(tri_prim_index,
- &tri_vindex[mesh->tri_offset],
- &tri_patch[mesh->tri_offset],
- &tri_patch_uv[mesh->vert_offset],
- mesh->vert_offset,
- mesh->tri_offset);
- if (progress.get_cancel())
- return;
- }
-
- /* vertex coordinates */
- progress.set_status("Updating Mesh", "Copying Mesh to device");
-
- dscene->tri_shader.copy_to_device();
- dscene->tri_vnormal.copy_to_device();
- dscene->tri_vindex.copy_to_device();
- dscene->tri_patch.copy_to_device();
- dscene->tri_patch_uv.copy_to_device();
- }
-
- if (curve_size != 0) {
- progress.set_status("Updating Mesh", "Copying Strands to device");
-
- float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size);
- float4 *curves = dscene->curves.alloc(curve_size);
-
- foreach (Mesh *mesh, scene->meshes) {
- mesh->pack_curves(scene,
- &curve_keys[mesh->curvekey_offset],
- &curves[mesh->curve_offset],
- mesh->curvekey_offset);
- if (progress.get_cancel())
- return;
- }
-
- dscene->curve_keys.copy_to_device();
- dscene->curves.copy_to_device();
- }
-
- if (patch_size != 0) {
- progress.set_status("Updating Mesh", "Copying Patches to device");
-
- uint *patch_data = dscene->patches.alloc(patch_size);
-
- foreach (Mesh *mesh, scene->meshes) {
- mesh->pack_patches(&patch_data[mesh->patch_offset],
- mesh->vert_offset,
- mesh->face_offset,
- mesh->corner_offset);
-
- if (mesh->patch_table) {
- mesh->patch_table->copy_adjusting_offsets(&patch_data[mesh->patch_table_offset],
- mesh->patch_table_offset);
- }
-
- if (progress.get_cancel())
- return;
- }
-
- dscene->patches.copy_to_device();
- }
-
- if (for_displacement) {
- float4 *prim_tri_verts = dscene->prim_tri_verts.alloc(tri_size * 3);
- foreach (Mesh *mesh, scene->meshes) {
- for (size_t i = 0; i < mesh->num_triangles(); ++i) {
- Mesh::Triangle t = mesh->get_triangle(i);
- size_t offset = 3 * (i + mesh->tri_offset);
- prim_tri_verts[offset + 0] = float3_to_float4(mesh->verts[t.v[0]]);
- prim_tri_verts[offset + 1] = float3_to_float4(mesh->verts[t.v[1]]);
- prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]);
- }
- }
- dscene->prim_tri_verts.copy_to_device();
- }
-}
-
-void MeshManager::device_update_bvh(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- Progress &progress)
-{
- /* bvh build */
- progress.set_status("Updating Scene BVH", "Building");
-
- BVHParams bparams;
- bparams.top_level = true;
- bparams.bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
- device->get_bvh_layout_mask());
- bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
- bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
- scene->params.use_bvh_unaligned_nodes;
- bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
- bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
- bparams.bvh_type = scene->params.bvh_type;
- bparams.curve_flags = dscene->data.curve.curveflags;
- bparams.curve_subdivisions = dscene->data.curve.subdivisions;
-
- VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout.";
-
-#ifdef WITH_EMBREE
- if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
- if (dscene->data.bvh.scene) {
- BVHEmbree::destroy(dscene->data.bvh.scene);
- }
- }
-#endif
-
- BVH *bvh = BVH::create(bparams, scene->objects);
- bvh->build(progress, &device->stats);
-
- if (progress.get_cancel()) {
-#ifdef WITH_EMBREE
- if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
- if (dscene->data.bvh.scene) {
- BVHEmbree::destroy(dscene->data.bvh.scene);
- }
- }
-#endif
- delete bvh;
- return;
- }
-
- /* copy to device */
- progress.set_status("Updating Scene BVH", "Copying BVH to device");
-
- PackedBVH &pack = bvh->pack;
-
- if (pack.nodes.size()) {
- dscene->bvh_nodes.steal_data(pack.nodes);
- dscene->bvh_nodes.copy_to_device();
- }
- if (pack.leaf_nodes.size()) {
- dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes);
- dscene->bvh_leaf_nodes.copy_to_device();
- }
- if (pack.object_node.size()) {
- dscene->object_node.steal_data(pack.object_node);
- dscene->object_node.copy_to_device();
- }
- if (pack.prim_tri_index.size()) {
- dscene->prim_tri_index.steal_data(pack.prim_tri_index);
- dscene->prim_tri_index.copy_to_device();
- }
- if (pack.prim_tri_verts.size()) {
- dscene->prim_tri_verts.steal_data(pack.prim_tri_verts);
- dscene->prim_tri_verts.copy_to_device();
- }
- if (pack.prim_type.size()) {
- dscene->prim_type.steal_data(pack.prim_type);
- dscene->prim_type.copy_to_device();
- }
- if (pack.prim_visibility.size()) {
- dscene->prim_visibility.steal_data(pack.prim_visibility);
- dscene->prim_visibility.copy_to_device();
- }
- if (pack.prim_index.size()) {
- dscene->prim_index.steal_data(pack.prim_index);
- dscene->prim_index.copy_to_device();
- }
- if (pack.prim_object.size()) {
- dscene->prim_object.steal_data(pack.prim_object);
- dscene->prim_object.copy_to_device();
- }
- if (pack.prim_time.size()) {
- dscene->prim_time.steal_data(pack.prim_time);
- dscene->prim_time.copy_to_device();
- }
-
- dscene->data.bvh.root = pack.root_index;
- dscene->data.bvh.bvh_layout = bparams.bvh_layout;
- dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
-
-#ifdef WITH_EMBREE
- if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
- dscene->data.bvh.scene = ((BVHEmbree *)bvh)->scene;
- }
- else {
- dscene->data.bvh.scene = NULL;
- }
-#endif
-
- delete bvh;
-}
-
-void MeshManager::device_update_preprocess(Device *device, Scene *scene, Progress &progress)
-{
- if (!need_update && !need_flags_update) {
- return;
- }
-
- progress.set_status("Updating Meshes Flags");
-
- /* Update flags. */
- bool volume_images_updated = false;
-
- foreach (Mesh *mesh, scene->meshes) {
- mesh->has_volume = false;
-
- foreach (const Shader *shader, mesh->used_shaders) {
- if (shader->has_volume) {
- mesh->has_volume = true;
- }
- if (shader->has_surface_bssrdf) {
- mesh->has_surface_bssrdf = true;
- }
- }
-
- if (need_update && mesh->has_volume) {
- /* Create volume meshes if there is voxel data. */
- bool has_voxel_attributes = false;
-
- foreach (Attribute &attr, mesh->attributes.attributes) {
- if (attr.element == ATTR_ELEMENT_VOXEL) {
- has_voxel_attributes = true;
- }
- }
-
- if (has_voxel_attributes) {
- if (!volume_images_updated) {
- progress.set_status("Updating Meshes Volume Bounds");
- device_update_volume_images(device, scene, progress);
- volume_images_updated = true;
- }
-
- create_volume_mesh(scene, mesh, progress);
- }
- }
- }
-
- need_flags_update = false;
-}
-
-void MeshManager::device_update_displacement_images(Device *device,
- Scene *scene,
- Progress &progress)
-{
- progress.set_status("Updating Displacement Images");
- TaskPool pool;
- ImageManager *image_manager = scene->image_manager;
- set<int> bump_images;
- foreach (Mesh *mesh, scene->meshes) {
- if (mesh->need_update) {
- foreach (Shader *shader, mesh->used_shaders) {
- if (!shader->has_displacement || shader->displacement_method == DISPLACE_BUMP) {
- continue;
- }
- foreach (ShaderNode *node, shader->graph->nodes) {
- if (node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
- continue;
- }
-
- ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode *>(node);
- int slot = image_node->slot;
- if (slot != -1) {
- bump_images.insert(slot);
- }
- }
- }
- }
- }
- foreach (int slot, bump_images) {
- pool.push(function_bind(
- &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
- }
- pool.wait_work();
-}
-
-void MeshManager::device_update_volume_images(Device *device, Scene *scene, Progress &progress)
-{
- progress.set_status("Updating Volume Images");
- TaskPool pool;
- ImageManager *image_manager = scene->image_manager;
- set<int> volume_images;
-
- foreach (Mesh *mesh, scene->meshes) {
- if (!mesh->need_update) {
- continue;
- }
-
- foreach (Attribute &attr, mesh->attributes.attributes) {
- if (attr.element != ATTR_ELEMENT_VOXEL) {
- continue;
- }
-
- VoxelAttribute *voxel = attr.data_voxel();
-
- if (voxel->slot != -1) {
- volume_images.insert(voxel->slot);
- }
- }
- }
-
- foreach (int slot, volume_images) {
- pool.push(function_bind(
- &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
- }
- pool.wait_work();
-}
-
-void MeshManager::device_update(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- Progress &progress)
-{
- if (!need_update)
- return;
-
- VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
-
- bool true_displacement_used = false;
- size_t total_tess_needed = 0;
-
- foreach (Mesh *mesh, scene->meshes) {
- foreach (Shader *shader, mesh->used_shaders) {
- if (shader->need_update_mesh)
- mesh->need_update = true;
- }
-
- if (mesh->need_update) {
- /* Update normals. */
- mesh->add_face_normals();
- mesh->add_vertex_normals();
-
- if (mesh->need_attribute(scene, ATTR_STD_POSITION_UNDISPLACED)) {
- mesh->add_undisplaced();
- }
-
- /* Test if we need tessellation. */
- if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE && mesh->num_subd_verts == 0 &&
- mesh->subd_params) {
- total_tess_needed++;
- }
-
- /* Test if we need displacement. */
- if (mesh->has_true_displacement()) {
- true_displacement_used = true;
- }
-
- if (progress.get_cancel())
- return;
- }
- }
-
- /* Tessellate meshes that are using subdivision */
- if (total_tess_needed) {
- Camera *dicing_camera = scene->dicing_camera;
- dicing_camera->update(scene);
-
- size_t i = 0;
- foreach (Mesh *mesh, scene->meshes) {
- if (mesh->need_update && mesh->subdivision_type != Mesh::SUBDIVISION_NONE &&
- mesh->num_subd_verts == 0 && mesh->subd_params) {
- string msg = "Tessellating ";
- if (mesh->name == "")
- msg += string_printf("%u/%u", (uint)(i + 1), (uint)total_tess_needed);
- else
- msg += string_printf(
- "%s %u/%u", mesh->name.c_str(), (uint)(i + 1), (uint)total_tess_needed);
-
- progress.set_status("Updating Mesh", msg);
-
- mesh->subd_params->camera = dicing_camera;
- DiagSplit dsplit(*mesh->subd_params);
- mesh->tessellate(&dsplit);
-
- i++;
-
- if (progress.get_cancel())
- return;
- }
- }
- }
-
- /* Update images needed for true displacement. */
- bool old_need_object_flags_update = false;
- if (true_displacement_used) {
- VLOG(1) << "Updating images used for true displacement.";
- device_update_displacement_images(device, scene, progress);
- old_need_object_flags_update = scene->object_manager->need_flags_update;
- scene->object_manager->device_update_flags(device, dscene, scene, progress, false);
- }
-
- /* Device update. */
- device_free(device, dscene);
-
- mesh_calc_offset(scene);
- if (true_displacement_used) {
- device_update_mesh(device, dscene, scene, true, progress);
- }
- if (progress.get_cancel())
- return;
-
- device_update_attributes(device, dscene, scene, progress);
- if (progress.get_cancel())
- return;
-
- /* Update displacement. */
- bool displacement_done = false;
- size_t num_bvh = 0;
-
- foreach (Mesh *mesh, scene->meshes) {
- if (mesh->need_update) {
- if (displace(device, dscene, scene, mesh, progress)) {
- displacement_done = true;
- }
-
- if (mesh->need_build_bvh()) {
- num_bvh++;
- }
- }
-
- if (progress.get_cancel())
- return;
- }
-
- /* Device re-update after displacement. */
- if (displacement_done) {
- device_free(device, dscene);
-
- device_update_attributes(device, dscene, scene, progress);
- if (progress.get_cancel())
- return;
- }
-
- TaskPool pool;
-
- size_t i = 0;
- foreach (Mesh *mesh, scene->meshes) {
- if (mesh->need_update) {
- pool.push(function_bind(
- &Mesh::compute_bvh, mesh, device, dscene, &scene->params, &progress, i, num_bvh));
- if (mesh->need_build_bvh()) {
- i++;
- }
- }
- }
-
- TaskPool::Summary summary;
- pool.wait_work(&summary);
- VLOG(2) << "Objects BVH build pool statistics:\n" << summary.full_report();
-
- foreach (Shader *shader, scene->shaders) {
- shader->need_update_mesh = false;
- }
-
- Scene::MotionType need_motion = scene->need_motion();
- bool motion_blur = need_motion == Scene::MOTION_BLUR;
-
- /* Update objects. */
- vector<Object *> volume_objects;
- foreach (Object *object, scene->objects) {
- object->compute_bounds(motion_blur);
- }
-
- if (progress.get_cancel())
- return;
-
- device_update_bvh(device, dscene, scene, progress);
- if (progress.get_cancel())
- return;
-
- device_update_mesh(device, dscene, scene, false, progress);
- if (progress.get_cancel())
- return;
-
- need_update = false;
-
- if (true_displacement_used) {
- /* Re-tag flags for update, so they're re-evaluated
- * for meshes with correct bounding boxes.
- *
- * This wouldn't cause wrong results, just that true
- * displacement might be less optimal to calculate.
- */
- scene->object_manager->need_flags_update = old_need_object_flags_update;
- }
-}
-
-void MeshManager::device_free(Device *device, DeviceScene *dscene)
-{
- dscene->bvh_nodes.free();
- dscene->bvh_leaf_nodes.free();
- dscene->object_node.free();
- dscene->prim_tri_verts.free();
- dscene->prim_tri_index.free();
- dscene->prim_type.free();
- dscene->prim_visibility.free();
- dscene->prim_index.free();
- dscene->prim_object.free();
- dscene->prim_time.free();
- dscene->tri_shader.free();
- dscene->tri_vnormal.free();
- dscene->tri_vindex.free();
- dscene->tri_patch.free();
- dscene->tri_patch_uv.free();
- dscene->curves.free();
- dscene->curve_keys.free();
- dscene->patches.free();
- dscene->attributes_map.free();
- dscene->attributes_float.free();
- dscene->attributes_float2.free();
- dscene->attributes_float3.free();
- dscene->attributes_uchar4.free();
-
-#ifdef WITH_OSL
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
-
- if (og) {
- og->object_name_map.clear();
- og->attribute_map.clear();
- og->object_names.clear();
- }
-#else
- (void)device;
-#endif
-}
-
-void MeshManager::tag_update(Scene *scene)
-{
- need_update = true;
- scene->object_manager->need_update = true;
-}
-
-void MeshManager::collect_statistics(const Scene *scene, RenderStats *stats)
-{
- foreach (Mesh *mesh, scene->meshes) {
- stats->mesh.geometry.add_entry(
- NamedSizeEntry(string(mesh->name.c_str()), mesh->get_total_size_in_bytes()));
- }
-}
-
-bool Mesh::need_attribute(Scene *scene, AttributeStandard std)
-{
- if (std == ATTR_STD_NONE)
- return false;
-
- if (scene->need_global_attribute(std))
- return true;
-
- foreach (Shader *shader, used_shaders)
- if (shader->attributes.find(std))
- return true;
-
- return false;
-}
-
-bool Mesh::need_attribute(Scene * /*scene*/, ustring name)
-{
- if (name == ustring())
- return false;
-
- foreach (Shader *shader, used_shaders)
- if (shader->attributes.find(name))
- return true;
-
- return false;
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 2bf7b3972e5..3654732b13d 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -19,7 +19,9 @@
#include "graph/node.h"
+#include "bvh/bvh_params.h"
#include "render/attribute.h"
+#include "render/geometry.h"
#include "render/shader.h"
#include "util/util_array.h"
@@ -27,7 +29,7 @@
#include "util/util_list.h"
#include "util/util_map.h"
#include "util/util_param.h"
-#include "util/util_transform.h"
+#include "util/util_set.h"
#include "util/util_types.h"
#include "util/util_vector.h"
@@ -49,7 +51,7 @@ struct PackedPatchTable;
/* Mesh */
-class Mesh : public Node {
+class Mesh : public Geometry {
public:
NODE_DECLARE
@@ -108,84 +110,6 @@ class Mesh : public Node {
return triangles.size() / 3;
}
- /* Mesh Curve */
- struct Curve {
- int first_key;
- int num_keys;
-
- int num_segments()
- {
- return num_keys - 1;
- }
-
- void bounds_grow(const int k,
- const float3 *curve_keys,
- const float *curve_radius,
- BoundBox &bounds) const;
- void bounds_grow(float4 keys[4], BoundBox &bounds) const;
- void bounds_grow(const int k,
- const float3 *curve_keys,
- const float *curve_radius,
- const Transform &aligned_space,
- BoundBox &bounds) const;
-
- void motion_keys(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- float time,
- size_t k0,
- size_t k1,
- float4 r_keys[2]) const;
- void cardinal_motion_keys(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- float time,
- size_t k0,
- size_t k1,
- size_t k2,
- size_t k3,
- float4 r_keys[4]) const;
-
- void keys_for_step(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- size_t step,
- size_t k0,
- size_t k1,
- float4 r_keys[2]) const;
- void cardinal_keys_for_step(const float3 *curve_keys,
- const float *curve_radius,
- const float3 *key_steps,
- size_t num_curve_keys,
- size_t num_steps,
- size_t step,
- size_t k0,
- size_t k1,
- size_t k2,
- size_t k3,
- float4 r_keys[4]) const;
- };
-
- Curve get_curve(size_t i) const
- {
- int first = curve_first_key[i];
- int next_first = (i + 1 < curve_first_key.size()) ? curve_first_key[i + 1] : curve_keys.size();
-
- Curve curve = {first, next_first - first};
- return curve;
- }
-
- size_t num_curves() const
- {
- return curve_first_key.size();
- }
-
/* Mesh SubdFace */
struct SubdFace {
int start_corner;
@@ -219,14 +143,6 @@ class Mesh : public Node {
SubdivisionType subdivision_type;
/* Mesh Data */
- enum GeometryFlags {
- GEOMETRY_NONE = 0,
- GEOMETRY_TRIANGLES = (1 << 0),
- GEOMETRY_CURVES = (1 << 1),
- };
- int geometry_flags; /* used to distinguish meshes with no verts
- and meshes for which geometry is not created */
-
array<int> triangles;
array<float3> verts;
array<int> shader;
@@ -236,14 +152,9 @@ class Mesh : public Node {
array<int> triangle_patch; /* must be < 0 for non subd triangles */
array<float2> vert_patch_uv;
- float volume_isovalue;
- bool has_volume; /* Set in the device_update_flags(). */
- bool has_surface_bssrdf; /* Set in the device_update_flags(). */
-
- array<float3> curve_keys;
- array<float> curve_radius;
- array<int> curve_first_key;
- array<int> curve_shader;
+ float volume_clipping;
+ float volume_step_size;
+ bool volume_object_space;
array<SubdFace> subd_faces;
array<int> subd_face_corners;
@@ -253,65 +164,53 @@ class Mesh : public Node {
SubdParams *subd_params;
- vector<Shader *> used_shaders;
- AttributeSet attributes;
- AttributeSet curve_attributes;
AttributeSet subd_attributes;
- BoundBox bounds;
- bool transform_applied;
- bool transform_negative_scaled;
- Transform transform_normal;
-
PackedPatchTable *patch_table;
- uint motion_steps;
- bool use_motion_blur;
-
- /* Update Flags */
- bool need_update;
- bool need_update_rebuild;
-
/* BVH */
- BVH *bvh;
- size_t tri_offset;
size_t vert_offset;
- size_t curve_offset;
- size_t curvekey_offset;
-
size_t patch_offset;
size_t patch_table_offset;
size_t face_offset;
size_t corner_offset;
- size_t attr_map_offset;
-
size_t num_subd_verts;
+ private:
+ unordered_map<int, int> vert_to_stitching_key_map; /* real vert index -> stitching index */
+ unordered_multimap<int, int>
+ vert_stitching_map; /* stitching index -> multiple real vert indices */
+ friend class DiagSplit;
+ friend class GeometryManager;
+
+ public:
/* Functions */
Mesh();
~Mesh();
void resize_mesh(int numverts, int numfaces);
void reserve_mesh(int numverts, int numfaces);
- void resize_curves(int numcurves, int numkeys);
- void reserve_curves(int numcurves, int numkeys);
void resize_subd_faces(int numfaces, int num_ngons, int numcorners);
void reserve_subd_faces(int numfaces, int num_ngons, int numcorners);
- void clear(bool preserve_voxel_data = false);
+ void clear(bool preserve_voxel_data);
+ void clear() override;
void add_vertex(float3 P);
void add_vertex_slow(float3 P);
void add_triangle(int v0, int v1, int v2, int shader, bool smooth);
- void add_curve_key(float3 loc, float radius);
- void add_curve(int first_key, int shader);
void add_subd_face(int *corners, int num_corners, int shader_, bool smooth_);
- void compute_bounds();
+ void copy_center_to_motion_step(const int motion_step);
+
+ void compute_bounds() override;
+ void apply_transform(const Transform &tfm, const bool apply_to_motion) override;
void add_face_normals();
void add_vertex_normals();
void add_undisplaced();
+ void get_uv_tiles(ustring map, unordered_set<int> &tiles) override;
+
void pack_shaders(Scene *scene, uint *shader);
void pack_normals(float4 *vnormal);
void pack_verts(const vector<uint> &tri_prim_index,
@@ -320,101 +219,11 @@ class Mesh : public Node {
float2 *tri_patch_uv,
size_t vert_offset,
size_t tri_offset);
- void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
void pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset);
- void compute_bvh(Device *device,
- DeviceScene *dscene,
- SceneParams *params,
- Progress *progress,
- int n,
- int total);
-
- bool need_attribute(Scene *scene, AttributeStandard std);
- bool need_attribute(Scene *scene, ustring name);
-
- void tag_update(Scene *scene, bool rebuild);
-
- bool has_motion_blur() const;
- bool has_true_displacement() const;
-
- /* Convert between normalized -1..1 motion time and index
- * in the VERTEX_MOTION attribute. */
- float motion_time(int step) const;
- int motion_step(float time) const;
-
- /* Check whether the mesh should have its own BVH built separately. Briefly,
- * its own BVH is needed for a mesh if:
- *
- * - It is instanced multiple times, so each instance object should share the
- * same BVH tree.
- * - Special ray intersection is needed, for example to limit subsurface rays
- * to only the mesh itself.
- */
- bool need_build_bvh() const;
-
- /* Check if the mesh should be treated as instanced. */
- bool is_instanced() const;
-
void tessellate(DiagSplit *split);
};
-/* Mesh Manager */
-
-class MeshManager {
- public:
- bool need_update;
- bool need_flags_update;
-
- MeshManager();
- ~MeshManager();
-
- bool displace(Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress);
-
- /* attributes */
- void update_osl_attributes(Device *device,
- Scene *scene,
- vector<AttributeRequestSet> &mesh_attributes);
- void update_svm_attributes(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- vector<AttributeRequestSet> &mesh_attributes);
-
- void device_update_preprocess(Device *device, Scene *scene, Progress &progress);
- void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-
- void device_free(Device *device, DeviceScene *dscene);
-
- void tag_update(Scene *scene);
-
- void create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progress);
-
- void collect_statistics(const Scene *scene, RenderStats *stats);
-
- protected:
- /* Calculate verts/triangles/curves offsets in global arrays. */
- void mesh_calc_offset(Scene *scene);
-
- void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-
- void device_update_mesh(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- bool for_displacement,
- Progress &progress);
-
- void device_update_attributes(Device *device,
- DeviceScene *dscene,
- Scene *scene,
- Progress &progress);
-
- void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-
- void device_update_displacement_images(Device *device, Scene *scene, Progress &progress);
-
- void device_update_volume_images(Device *device, Scene *scene, Progress &progress);
-};
-
CCL_NAMESPACE_END
#endif /* __MESH_H__ */
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 5ae9348d83e..467810f9273 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -22,7 +22,9 @@
#include "render/shader.h"
#include "util/util_foreach.h"
+#include "util/util_map.h"
#include "util/util_progress.h"
+#include "util/util_set.h"
CCL_NAMESPACE_BEGIN
@@ -41,7 +43,7 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts)
return norm / normlen;
}
-bool MeshManager::displace(
+bool GeometryManager::displace(
Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
{
/* verify if we have a displacement shader */
@@ -56,7 +58,7 @@ bool MeshManager::displace(
size_t object_index = OBJECT_NONE;
for (size_t i = 0; i < scene->objects.size(); i++) {
- if (scene->objects[i]->mesh == mesh) {
+ if (scene->objects[i]->geometry == mesh) {
object_index = i;
break;
}
@@ -89,7 +91,7 @@ bool MeshManager::displace(
/* set up object, primitive and barycentric coordinates */
int object = object_index;
- int prim = mesh->tri_offset + i;
+ int prim = mesh->prim_offset + i;
float u, v;
switch (j) {
@@ -184,6 +186,38 @@ bool MeshManager::displace(
d_output.free();
+ /* stitch */
+ unordered_set<int> stitch_keys;
+ for (pair<int, int> i : mesh->vert_to_stitching_key_map) {
+ stitch_keys.insert(i.second); /* stitching index */
+ }
+
+ typedef unordered_multimap<int, int>::iterator map_it_t;
+
+ for (int key : stitch_keys) {
+ pair<map_it_t, map_it_t> verts = mesh->vert_stitching_map.equal_range(key);
+
+ float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+ int num = 0;
+
+ for (map_it_t v = verts.first; v != verts.second; ++v) {
+ int vert = v->second;
+
+ pos += mesh->verts[vert];
+ num++;
+ }
+
+ if (num <= 1) {
+ continue;
+ }
+
+ pos *= 1.0f / num;
+
+ for (map_it_t v = verts.first; v != verts.second; ++v) {
+ mesh->verts[v->second] = pos;
+ }
+ }
+
/* for displacement method both, we only need to recompute the face
* normals, as bump mapping in the shader will already alter the
* vertex normal, so we start from the non-displaced vertex normals
@@ -238,7 +272,25 @@ bool MeshManager::displace(
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
for (size_t j = 0; j < 3; j++) {
- vN[mesh->get_triangle(i).v[j]] += fN[i];
+ int vert = mesh->get_triangle(i).v[j];
+ vN[vert] += fN[i];
+
+ /* add face normals to stitched vertices */
+ if (stitch_keys.size()) {
+ map_it_t key = mesh->vert_to_stitching_key_map.find(vert);
+
+ if (key != mesh->vert_to_stitching_key_map.end()) {
+ pair<map_it_t, map_it_t> verts = mesh->vert_stitching_map.equal_range(key->second);
+
+ for (map_it_t v = verts.first; v != verts.second; ++v) {
+ if (v->second == vert) {
+ continue;
+ }
+
+ vN[v->second] += fN[i];
+ }
+ }
+ }
}
}
}
@@ -289,8 +341,27 @@ bool MeshManager::displace(
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
for (size_t j = 0; j < 3; j++) {
+ int vert = mesh->get_triangle(i).v[j];
float3 fN = compute_face_normal(mesh->get_triangle(i), mP);
- mN[mesh->get_triangle(i).v[j]] += fN;
+ mN[vert] += fN;
+
+ /* add face normals to stitched vertices */
+ if (stitch_keys.size()) {
+ map_it_t key = mesh->vert_to_stitching_key_map.find(vert);
+
+ if (key != mesh->vert_to_stitching_key_map.end()) {
+ pair<map_it_t, map_it_t> verts = mesh->vert_stitching_map.equal_range(
+ key->second);
+
+ for (map_it_t v = verts.first; v != verts.second; ++v) {
+ if (v->second == vert) {
+ continue;
+ }
+
+ mN[v->second] += fN;
+ }
+ }
+ }
}
}
}
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
index 46c8240fb71..3d72b2fab91 100644
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -14,16 +14,17 @@
* limitations under the License.
*/
-#include "render/mesh.h"
#include "render/attribute.h"
#include "render/camera.h"
+#include "render/mesh.h"
-#include "subd/subd_split.h"
#include "subd/subd_patch.h"
#include "subd/subd_patch_table.h"
+#include "subd/subd_split.h"
-#include "util/util_foreach.h"
#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
CCL_NAMESPACE_BEGIN
@@ -31,10 +32,10 @@ CCL_NAMESPACE_BEGIN
CCL_NAMESPACE_END
-# include <opensubdiv/far/topologyRefinerFactory.h>
-# include <opensubdiv/far/primvarRefiner.h>
-# include <opensubdiv/far/patchTableFactory.h>
# include <opensubdiv/far/patchMap.h>
+# include <opensubdiv/far/patchTableFactory.h>
+# include <opensubdiv/far/primvarRefiner.h>
+# include <opensubdiv/far/topologyRefinerFactory.h>
/* specializations of TopologyRefinerFactory for ccl::Mesh */
@@ -318,6 +319,9 @@ class OsdData {
struct OsdPatch : Patch {
OsdData *osd_data;
+ OsdPatch()
+ {
+ }
OsdPatch(OsdData *data) : osd_data(data)
{
}
@@ -358,11 +362,6 @@ struct OsdPatch : Patch {
*N = (t != 0.0f) ? *N / t : make_float3(0.0f, 0.0f, 1.0f);
}
}
-
- BoundBox bound()
- {
- return BoundBox::empty;
- }
};
#endif
@@ -395,31 +394,64 @@ void Mesh::tessellate(DiagSplit *split)
int num_faces = subd_faces.size();
Attribute *attr_vN = subd_attributes.find(ATTR_STD_VERTEX_NORMAL);
- float3 *vN = attr_vN->data_float3();
+ float3 *vN = (attr_vN) ? attr_vN->data_float3() : NULL;
+ /* count patches */
+ int num_patches = 0;
for (int f = 0; f < num_faces; f++) {
SubdFace &face = subd_faces[f];
if (face.is_quad()) {
- /* quad */
- QuadDice::SubPatch subpatch;
+ num_patches++;
+ }
+ else {
+ num_patches += face.num_corners;
+ }
+ }
- LinearQuadPatch quad_patch;
+ /* build patches from faces */
#ifdef WITH_OPENSUBDIV
- OsdPatch osd_patch(&osd_data);
+ if (subdivision_type == SUBDIVISION_CATMULL_CLARK) {
+ vector<OsdPatch> osd_patches(num_patches, &osd_data);
+ OsdPatch *patch = osd_patches.data();
- if (subdivision_type == SUBDIVISION_CATMULL_CLARK) {
- osd_patch.patch_index = face.ptex_offset;
+ for (int f = 0; f < num_faces; f++) {
+ SubdFace &face = subd_faces[f];
- subpatch.patch = &osd_patch;
+ if (face.is_quad()) {
+ patch->patch_index = face.ptex_offset;
+ patch->from_ngon = false;
+ patch->shader = face.shader;
+ patch++;
+ }
+ else {
+ for (int corner = 0; corner < face.num_corners; corner++) {
+ patch->patch_index = face.ptex_offset + corner;
+ patch->from_ngon = true;
+ patch->shader = face.shader;
+ patch++;
+ }
}
- else
+ }
+
+ /* split patches */
+ split->split_patches(osd_patches.data(), sizeof(OsdPatch));
+ }
+ else
#endif
- {
- float3 *hull = quad_patch.hull;
- float3 *normals = quad_patch.normals;
+ {
+ vector<LinearQuadPatch> linear_patches(num_patches);
+ LinearQuadPatch *patch = linear_patches.data();
+
+ for (int f = 0; f < num_faces; f++) {
+ SubdFace &face = subd_faces[f];
- quad_patch.patch_index = face.ptex_offset;
+ if (face.is_quad()) {
+ float3 *hull = patch->hull;
+ float3 *normals = patch->normals;
+
+ patch->patch_index = face.ptex_offset;
+ patch->from_ngon = false;
for (int i = 0; i < 4; i++) {
hull[i] = verts[subd_face_corners[face.start_corner + i]];
@@ -440,55 +472,11 @@ void Mesh::tessellate(DiagSplit *split)
swap(hull[2], hull[3]);
swap(normals[2], normals[3]);
- subpatch.patch = &quad_patch;
+ patch->shader = face.shader;
+ patch++;
}
-
- subpatch.patch->shader = face.shader;
-
- /* Quad faces need to be split at least once to line up with split ngons; we do this
- * here in this manner because if we do it later, edge factors may end up slightly off.
- */
- subpatch.P00 = make_float2(0.0f, 0.0f);
- subpatch.P10 = make_float2(0.5f, 0.0f);
- subpatch.P01 = make_float2(0.0f, 0.5f);
- subpatch.P11 = make_float2(0.5f, 0.5f);
- split->split_quad(subpatch.patch, &subpatch);
-
- subpatch.P00 = make_float2(0.5f, 0.0f);
- subpatch.P10 = make_float2(1.0f, 0.0f);
- subpatch.P01 = make_float2(0.5f, 0.5f);
- subpatch.P11 = make_float2(1.0f, 0.5f);
- split->split_quad(subpatch.patch, &subpatch);
-
- subpatch.P00 = make_float2(0.0f, 0.5f);
- subpatch.P10 = make_float2(0.5f, 0.5f);
- subpatch.P01 = make_float2(0.0f, 1.0f);
- subpatch.P11 = make_float2(0.5f, 1.0f);
- split->split_quad(subpatch.patch, &subpatch);
-
- subpatch.P00 = make_float2(0.5f, 0.5f);
- subpatch.P10 = make_float2(1.0f, 0.5f);
- subpatch.P01 = make_float2(0.5f, 1.0f);
- subpatch.P11 = make_float2(1.0f, 1.0f);
- split->split_quad(subpatch.patch, &subpatch);
- }
- else {
- /* ngon */
-#ifdef WITH_OPENSUBDIV
- if (subdivision_type == SUBDIVISION_CATMULL_CLARK) {
- OsdPatch patch(&osd_data);
-
- patch.shader = face.shader;
-
- for (int corner = 0; corner < face.num_corners; corner++) {
- patch.patch_index = face.ptex_offset + corner;
-
- split->split_quad(&patch);
- }
- }
- else
-#endif
- {
+ else {
+ /* ngon */
float3 center_vert = make_float3(0.0f, 0.0f, 0.0f);
float3 center_normal = make_float3(0.0f, 0.0f, 0.0f);
@@ -499,13 +487,13 @@ void Mesh::tessellate(DiagSplit *split)
}
for (int corner = 0; corner < face.num_corners; corner++) {
- LinearQuadPatch patch;
- float3 *hull = patch.hull;
- float3 *normals = patch.normals;
+ float3 *hull = patch->hull;
+ float3 *normals = patch->normals;
- patch.patch_index = face.ptex_offset + corner;
+ patch->patch_index = face.ptex_offset + corner;
+ patch->from_ngon = true;
- patch.shader = face.shader;
+ patch->shader = face.shader;
hull[0] =
verts[subd_face_corners[face.start_corner + mod(corner + 0, face.num_corners)]];
@@ -537,10 +525,13 @@ void Mesh::tessellate(DiagSplit *split)
}
}
- split->split_quad(&patch);
+ patch++;
}
}
}
+
+ /* split patches */
+ split->split_patches(linear_patches.data(), sizeof(LinearQuadPatch));
}
/* interpolate center points for attributes */
diff --git a/intern/cycles/render/mesh_volume.cpp b/intern/cycles/render/mesh_volume.cpp
index a1d61fd4db7..607363d01c6 100644
--- a/intern/cycles/render/mesh_volume.cpp
+++ b/intern/cycles/render/mesh_volume.cpp
@@ -14,29 +14,30 @@
* limitations under the License.
*/
-#include "render/mesh.h"
#include "render/attribute.h"
+#include "render/mesh.h"
#include "render/scene.h"
#include "util/util_foreach.h"
+#include "util/util_hash.h"
#include "util/util_logging.h"
#include "util/util_progress.h"
#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
-static size_t compute_voxel_index(const int3 &resolution, size_t x, size_t y, size_t z)
+const int64_t VOXEL_INDEX_NONE = -1;
+
+static int64_t compute_voxel_index(const int3 &resolution, int64_t x, int64_t y, int64_t z)
{
- if (x == -1 || x >= resolution.x) {
- return -1;
+ if (x < 0 || x >= resolution.x) {
+ return VOXEL_INDEX_NONE;
}
-
- if (y == -1 || y >= resolution.y) {
- return -1;
+ else if (y < 0 || y >= resolution.y) {
+ return VOXEL_INDEX_NONE;
}
-
- if (z == -1 || z >= resolution.z) {
- return -1;
+ else if (z < 0 || z >= resolution.z) {
+ return VOXEL_INDEX_NONE;
}
return x + y * resolution.x + z * resolution.x * resolution.y;
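As a quick sanity check of the indexing above: for a resolution of (4, 4, 4), voxel (1, 2, 3) maps to index 1 + 2 * 4 + 3 * 4 * 4 = 57, while any coordinate outside [0, resolution) now returns VOXEL_INDEX_NONE instead of relying on the previous unsigned wrap-around of -1.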
@@ -135,18 +136,18 @@ static const int CUBE_SIZE = 8;
*
* The way the algorithm works is as follows:
*
- * - the coordinates of active voxels from a dense volume (or 3d image) are
- * gathered inside an auxialliary volume.
- * - each set of coordinates of an CUBE_SIZE cube are mapped to the same
- * coordinate of the auxilliary volume.
- * - quads are created between active and non-active voxels in the auxialliary
- * volume to generate a tight mesh around the volume.
+ * - The coordinates of active voxels from a dense volume (or 3d image) are
+ * gathered inside an auxiliary volume.
+ * - Each set of coordinates of a CUBE_SIZE cube is mapped to the same
+ * coordinate of the auxiliary volume.
+ * - Quads are created between active and non-active voxels in the auxiliary
+ * volume to generate a tight mesh around the volume.
*/
class VolumeMeshBuilder {
- /* Auxilliary volume that is used to check if a node already added. */
+ /* Auxiliary volume that is used to check if a node was already added. */
vector<char> grid;
- /* The resolution of the auxilliary volume, set to be equal to 1/CUBE_SIZE
+ /* The resolution of the auxiliary volume, set to be equal to 1/CUBE_SIZE
* of the original volume on each axis. */
int3 res;
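In other words, with CUBE_SIZE = 8 the builder downsamples the dense grid by 8 on each axis (ignoring the padding offset): voxels (17, 3, 40) and (23, 7, 47) both land in auxiliary cell (2, 0, 5), and a quad is later emitted wherever such an occupied cell borders an empty one.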
@@ -184,15 +185,15 @@ VolumeMeshBuilder::VolumeMeshBuilder(VolumeParams *volume_params)
params = volume_params;
number_of_nodes = 0;
- const size_t x = divide_up(params->resolution.x, CUBE_SIZE);
- const size_t y = divide_up(params->resolution.y, CUBE_SIZE);
- const size_t z = divide_up(params->resolution.z, CUBE_SIZE);
+ const int64_t x = divide_up(params->resolution.x, CUBE_SIZE);
+ const int64_t y = divide_up(params->resolution.y, CUBE_SIZE);
+ const int64_t z = divide_up(params->resolution.z, CUBE_SIZE);
/* Adding 2*pad_size since we pad in both positive and negative directions
* along the axis. */
- const size_t px = divide_up(params->resolution.x + 2 * params->pad_size, CUBE_SIZE);
- const size_t py = divide_up(params->resolution.y + 2 * params->pad_size, CUBE_SIZE);
- const size_t pz = divide_up(params->resolution.z + 2 * params->pad_size, CUBE_SIZE);
+ const int64_t px = divide_up(params->resolution.x + 2 * params->pad_size, CUBE_SIZE);
+ const int64_t py = divide_up(params->resolution.y + 2 * params->pad_size, CUBE_SIZE);
+ const int64_t pz = divide_up(params->resolution.z + 2 * params->pad_size, CUBE_SIZE);
res = make_int3(px, py, pz);
pad_offset = make_int3(px - x, py - y, pz - z);
@@ -209,7 +210,10 @@ void VolumeMeshBuilder::add_node(int x, int y, int z)
assert((index_x >= 0) && (index_y >= 0) && (index_z >= 0));
- const size_t index = compute_voxel_index(res, index_x, index_y, index_z);
+ const int64_t index = compute_voxel_index(res, index_x, index_y, index_z);
+ if (index == VOXEL_INDEX_NONE) {
+ return;
+ }
/* We already have a node here. */
if (grid[index] == 1) {
@@ -256,7 +260,7 @@ void VolumeMeshBuilder::generate_vertices_and_quads(vector<ccl::int3> &vertices_
for (int z = 0; z < res.z; ++z) {
for (int y = 0; y < res.y; ++y) {
for (int x = 0; x < res.x; ++x) {
- size_t voxel_index = compute_voxel_index(res, x, y, z);
+ int64_t voxel_index = compute_voxel_index(res, x, y, z);
if (grid[voxel_index] == 0) {
continue;
}
@@ -285,32 +289,32 @@ void VolumeMeshBuilder::generate_vertices_and_quads(vector<ccl::int3> &vertices_
*/
voxel_index = compute_voxel_index(res, x - 1, y, z);
- if (voxel_index == -1 || grid[voxel_index] == 0) {
+ if (voxel_index == VOXEL_INDEX_NONE || grid[voxel_index] == 0) {
create_quad(corners, vertices_is, quads, res, used_verts, QUAD_X_MIN);
}
voxel_index = compute_voxel_index(res, x + 1, y, z);
- if (voxel_index == -1 || grid[voxel_index] == 0) {
+ if (voxel_index == VOXEL_INDEX_NONE || grid[voxel_index] == 0) {
create_quad(corners, vertices_is, quads, res, used_verts, QUAD_X_MAX);
}
voxel_index = compute_voxel_index(res, x, y - 1, z);
- if (voxel_index == -1 || grid[voxel_index] == 0) {
+ if (voxel_index == VOXEL_INDEX_NONE || grid[voxel_index] == 0) {
create_quad(corners, vertices_is, quads, res, used_verts, QUAD_Y_MIN);
}
voxel_index = compute_voxel_index(res, x, y + 1, z);
- if (voxel_index == -1 || grid[voxel_index] == 0) {
+ if (voxel_index == VOXEL_INDEX_NONE || grid[voxel_index] == 0) {
create_quad(corners, vertices_is, quads, res, used_verts, QUAD_Y_MAX);
}
voxel_index = compute_voxel_index(res, x, y, z - 1);
- if (voxel_index == -1 || grid[voxel_index] == 0) {
+ if (voxel_index == VOXEL_INDEX_NONE || grid[voxel_index] == 0) {
create_quad(corners, vertices_is, quads, res, used_verts, QUAD_Z_MIN);
}
voxel_index = compute_voxel_index(res, x, y, z + 1);
- if (voxel_index == -1 || grid[voxel_index] == 0) {
+ if (voxel_index == VOXEL_INDEX_NONE || grid[voxel_index] == 0) {
create_quad(corners, vertices_is, quads, res, used_verts, QUAD_Z_MAX);
}
}
@@ -362,7 +366,7 @@ struct VoxelAttributeGrid {
int channels;
};
-void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progress)
+void GeometryManager::create_volume_mesh(Mesh *mesh, Progress &progress)
{
string msg = string_printf("Computing Volume Mesh %s", mesh->name.c_str());
progress.set_status("Updating Mesh", msg);
@@ -373,13 +377,15 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
VolumeParams volume_params;
volume_params.resolution = make_int3(0, 0, 0);
+ Transform transform = transform_identity();
+
foreach (Attribute &attr, mesh->attributes.attributes) {
if (attr.element != ATTR_ELEMENT_VOXEL) {
continue;
}
- VoxelAttribute *voxel = attr.data_voxel();
- device_memory *image_memory = scene->image_manager->image_memory(voxel->slot);
+ ImageHandle &handle = attr.data_voxel();
+ device_texture *image_memory = handle.image_memory();
int3 resolution = make_int3(
image_memory->data_width, image_memory->data_height, image_memory->data_depth);
@@ -387,14 +393,20 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
volume_params.resolution = resolution;
}
else if (volume_params.resolution != resolution) {
- VLOG(1) << "Can't create volume mesh, all voxel grid resolutions must be equal\n";
- return;
+ /* TODO: support this as it's common for OpenVDB. */
+ VLOG(1) << "Can't create accurate volume mesh, all voxel grid resolutions must be equal\n";
+ continue;
}
VoxelAttributeGrid voxel_grid;
voxel_grid.data = static_cast<float *>(image_memory->host_pointer);
voxel_grid.channels = image_memory->data_elements;
voxel_grids.push_back(voxel_grid);
+
+ /* TODO: support multiple transforms. */
+ if (image_memory->info.use_transform_3d) {
+ transform = image_memory->info.transform_3d;
+ }
}
if (voxel_grids.empty()) {
@@ -427,37 +439,41 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
}
/* Compute start point and cell size from transform. */
- Attribute *attr = mesh->attributes.find(ATTR_STD_GENERATED_TRANSFORM);
const int3 resolution = volume_params.resolution;
float3 start_point = make_float3(0.0f, 0.0f, 0.0f);
float3 cell_size = make_float3(1.0f / resolution.x, 1.0f / resolution.y, 1.0f / resolution.z);
- if (attr) {
- const Transform *tfm = attr->data_transform();
- const Transform itfm = transform_inverse(*tfm);
- start_point = transform_point(&itfm, start_point);
- cell_size = transform_direction(&itfm, cell_size);
- }
+ /* TODO: support arbitrary transforms, not just scale + translate. */
+ const Transform itfm = transform_inverse(transform);
+ start_point = transform_point(&itfm, start_point);
+ cell_size = transform_direction(&itfm, cell_size);
+
+ /* Slightly offset vertex coordinates to avoid overlapping faces with other
+ * volumes or meshes. The proper solution would be to improve intersection in
+ * the kernel to support robust handling of multiple overlapping faces or use
+ * an all-hit intersection similar to shadows. */
+ const float3 face_overlap_avoidance = cell_size * 0.1f *
+ hash_uint_to_float(hash_string(mesh->name.c_str()));
- volume_params.start_point = start_point;
+ volume_params.start_point = start_point + face_overlap_avoidance;
volume_params.cell_size = cell_size;
volume_params.pad_size = pad_size;
/* Build bounding mesh around non-empty volume cells. */
VolumeMeshBuilder builder(&volume_params);
- const float isovalue = mesh->volume_isovalue;
+ const float clipping = mesh->volume_clipping;
for (int z = 0; z < resolution.z; ++z) {
for (int y = 0; y < resolution.y; ++y) {
for (int x = 0; x < resolution.x; ++x) {
- size_t voxel_index = compute_voxel_index(resolution, x, y, z);
+ int64_t voxel_index = compute_voxel_index(resolution, x, y, z);
for (size_t i = 0; i < voxel_grids.size(); ++i) {
const VoxelAttributeGrid &voxel_grid = voxel_grids[i];
const int channels = voxel_grid.channels;
for (int c = 0; c < channels; c++) {
- if (voxel_grid.data[voxel_index * channels + c] >= isovalue) {
+ if (voxel_grid.data[voxel_index * channels + c] >= clipping) {
builder.add_node_with_padding(x, y, z);
break;
}
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index f3572ee1585..d5f65fb54db 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -14,24 +14,30 @@
* limitations under the License.
*/
+#include "render/nodes.h"
+#include "render/colorspace.h"
+#include "render/constant_fold.h"
#include "render/film.h"
#include "render/image.h"
+#include "render/image_sky.h"
#include "render/integrator.h"
#include "render/light.h"
-#include "render/nodes.h"
+#include "render/mesh.h"
+#include "render/osl.h"
#include "render/scene.h"
#include "render/svm.h"
-#include "kernel/svm/svm_color_util.h"
-#include "kernel/svm/svm_ramp_util.h"
-#include "kernel/svm/svm_math_util.h"
-#include "render/osl.h"
-#include "render/constant_fold.h"
-#include "util/util_sky_model.h"
+#include "sky_model.h"
+
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_transform.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_mapping_util.h"
+#include "kernel/svm/svm_math_util.h"
+#include "kernel/svm/svm_ramp_util.h"
+
CCL_NAMESPACE_BEGIN
/* Texture Mapping */
@@ -148,7 +154,7 @@ bool TextureMapping::skip()
void TextureMapping::compile(SVMCompiler &compiler, int offset_in, int offset_out)
{
- compiler.add_node(NODE_MAPPING, offset_in, offset_out);
+ compiler.add_node(NODE_TEXTURE_MAPPING, offset_in, offset_out);
Transform tfm = compute_transform();
compiler.add_node(tfm.x);
@@ -162,8 +168,10 @@ void TextureMapping::compile(SVMCompiler &compiler, int offset_in, int offset_ou
}
if (type == NORMAL) {
- compiler.add_node(NODE_VECTOR_MATH, NODE_VECTOR_MATH_NORMALIZE, offset_out, offset_out);
- compiler.add_node(NODE_VECTOR_MATH, SVM_STACK_INVALID, offset_out);
+ compiler.add_node(NODE_VECTOR_MATH,
+ NODE_VECTOR_MATH_NORMALIZE,
+ compiler.encode_uchar4(offset_out, offset_out, offset_out),
+ compiler.encode_uchar4(SVM_STACK_INVALID, offset_out));
}
}
@@ -207,13 +215,15 @@ NODE_DEFINE(ImageTextureNode)
TEXTURE_MAPPING_DEFINE(ImageTextureNode);
SOCKET_STRING(filename, "Filename", ustring());
+ SOCKET_STRING(colorspace, "Colorspace", u_colorspace_auto);
- static NodeEnum color_space_enum;
- color_space_enum.insert("none", NODE_COLOR_SPACE_NONE);
- color_space_enum.insert("color", NODE_COLOR_SPACE_COLOR);
- SOCKET_ENUM(color_space, "Color Space", color_space_enum, NODE_COLOR_SPACE_COLOR);
-
- SOCKET_BOOLEAN(use_alpha, "Use Alpha", true);
+ static NodeEnum alpha_type_enum;
+ alpha_type_enum.insert("auto", IMAGE_ALPHA_AUTO);
+ alpha_type_enum.insert("unassociated", IMAGE_ALPHA_UNASSOCIATED);
+ alpha_type_enum.insert("associated", IMAGE_ALPHA_ASSOCIATED);
+ alpha_type_enum.insert("channel_packed", IMAGE_ALPHA_CHANNEL_PACKED);
+ alpha_type_enum.insert("ignore", IMAGE_ALPHA_IGNORE);
+ SOCKET_ENUM(alpha_type, "Alpha Type", alpha_type_enum, IMAGE_ALPHA_AUTO);
static NodeEnum interpolation_enum;
interpolation_enum.insert("closest", INTERPOLATION_CLOSEST);
@@ -247,32 +257,90 @@ NODE_DEFINE(ImageTextureNode)
ImageTextureNode::ImageTextureNode() : ImageSlotTextureNode(node_type)
{
- image_manager = NULL;
- slot = -1;
- is_float = -1;
- is_linear = false;
- builtin_data = NULL;
+ colorspace = u_colorspace_raw;
animated = false;
-}
-
-ImageTextureNode::~ImageTextureNode()
-{
- if (image_manager) {
- image_manager->remove_image(
- filename.string(), builtin_data, interpolation, extension, use_alpha);
- }
+ tiles.push_back(1001);
}
ShaderNode *ImageTextureNode::clone() const
{
ImageTextureNode *node = new ImageTextureNode(*this);
- node->image_manager = NULL;
- node->slot = -1;
- node->is_float = -1;
- node->is_linear = false;
+ node->handle = handle;
return node;
}
+ImageParams ImageTextureNode::image_params() const
+{
+ ImageParams params;
+ params.animated = animated;
+ params.interpolation = interpolation;
+ params.extension = extension;
+ params.alpha_type = alpha_type;
+ params.colorspace = colorspace;
+ return params;
+}
+
+void ImageTextureNode::cull_tiles(Scene *scene, ShaderGraph *graph)
+{
+ /* Box projection computes its own UVs that always lie in the
+ * 1001 tile, so there's no point in loading any others. */
+ if (projection == NODE_IMAGE_PROJ_BOX) {
+ tiles.clear();
+ tiles.push_back(1001);
+ return;
+ }
+
+ if (!scene->params.background) {
+ /* During interactive renders, all tiles are loaded.
+ * While we could support updating this when UVs change, that could lead
+ * to annoying interruptions when loading images while editing UVs. */
+ return;
+ }
+
+ /* Only check UVs for tile culling if there are multiple tiles. */
+ if (tiles.size() < 2) {
+ return;
+ }
+
+ ShaderInput *vector_in = input("Vector");
+ ustring attribute;
+ if (vector_in->link) {
+ ShaderNode *node = vector_in->link->parent;
+ if (node->type == UVMapNode::node_type) {
+ UVMapNode *uvmap = (UVMapNode *)node;
+ attribute = uvmap->attribute;
+ }
+ else if (node->type == TextureCoordinateNode::node_type) {
+ if (vector_in->link != node->output("UV")) {
+ return;
+ }
+ }
+ else {
+ return;
+ }
+ }
+
+ unordered_set<int> used_tiles;
+ /* TODO(lukas): This is quite inefficient. A fairly simple improvement would
+ * be to have a cache in each mesh that is indexed by attribute.
+ * Additionally, building a graph-to-meshes list once could help. */
+ foreach (Geometry *geom, scene->geometry) {
+ foreach (Shader *shader, geom->used_shaders) {
+ if (shader->graph == graph) {
+ geom->get_uv_tiles(attribute, used_tiles);
+ }
+ }
+ }
+
+ ccl::vector<int> new_tiles;
+ foreach (int tile, tiles) {
+ if (used_tiles.count(tile)) {
+ new_tiles.push_back(tile);
+ }
+ }
+ tiles.swap(new_tiles);
+}
+
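cull_tiles() above asks every mesh that uses this shader graph which UDIM tiles its UVs actually touch and keeps only those. For reference, a minimal sketch of the conventional UDIM numbering that get_uv_tiles() is assumed to follow (10 tiles per row, starting at 1001); the helper name is hypothetical:

#include <cmath>

/* Map a UV coordinate to its UDIM tile number under the common 10-column
 * convention: tile = 1001 + floor(u) + 10 * floor(v). Coordinates outside
 * the supported grid return -1. */
static int udim_tile_from_uv(float u, float v)
{
  const int x = (int)std::floor(u);
  const int y = (int)std::floor(v);
  if (x < 0 || x > 9 || y < 0) {
    return -1;
  }
  return 1001 + x + y * 10;
}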
void ImageTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
{
#ifdef WITH_PTEX
@@ -294,58 +362,80 @@ void ImageTextureNode::compile(SVMCompiler &compiler)
ShaderOutput *color_out = output("Color");
ShaderOutput *alpha_out = output("Alpha");
- image_manager = compiler.image_manager;
- if (is_float == -1) {
- ImageMetaData metadata;
- slot = image_manager->add_image(filename.string(),
- builtin_data,
- animated,
- 0,
- interpolation,
- extension,
- use_alpha,
- metadata);
- is_float = metadata.is_float;
- is_linear = metadata.is_linear;
- }
-
- if (slot != -1) {
- int srgb = (is_linear || color_space != NODE_COLOR_SPACE_COLOR) ? 0 : 1;
- int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
-
- if (projection != NODE_IMAGE_PROJ_BOX) {
- compiler.add_node(NODE_TEX_IMAGE,
- slot,
- compiler.encode_uchar4(vector_offset,
- compiler.stack_assign_if_linked(color_out),
- compiler.stack_assign_if_linked(alpha_out),
- srgb),
- projection);
+ if (handle.empty()) {
+ cull_tiles(compiler.scene, compiler.current_graph);
+ ImageManager *image_manager = compiler.scene->image_manager;
+ handle = image_manager->add_image(filename.string(), image_params(), tiles);
+ }
+
+ /* All tiles have the same metadata. */
+ const ImageMetaData metadata = handle.metadata();
+ const bool compress_as_srgb = metadata.compress_as_srgb;
+ const ustring known_colorspace = metadata.colorspace;
+
+ int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+ uint flags = 0;
+
+ if (compress_as_srgb) {
+ flags |= NODE_IMAGE_COMPRESS_AS_SRGB;
+ }
+ if (!alpha_out->links.empty()) {
+ const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
+ alpha_type == IMAGE_ALPHA_CHANNEL_PACKED ||
+ alpha_type == IMAGE_ALPHA_IGNORE);
+
+ if (unassociate_alpha) {
+ flags |= NODE_IMAGE_ALPHA_UNASSOCIATE;
+ }
+ }
+
+ if (projection != NODE_IMAGE_PROJ_BOX) {
+ /* If there is only one image (a very common case), we encode it as a negative value (see the decoding sketch after this function). */
+ int num_nodes;
+ if (handle.num_tiles() == 1) {
+ num_nodes = -handle.svm_slot();
}
else {
- compiler.add_node(NODE_TEX_IMAGE_BOX,
- slot,
- compiler.encode_uchar4(vector_offset,
- compiler.stack_assign_if_linked(color_out),
- compiler.stack_assign_if_linked(alpha_out),
- srgb),
- __float_as_int(projection_blend));
+ num_nodes = divide_up(handle.num_tiles(), 2);
}
- tex_mapping.compile_end(compiler, vector_in, vector_offset);
+ compiler.add_node(NODE_TEX_IMAGE,
+ num_nodes,
+ compiler.encode_uchar4(vector_offset,
+ compiler.stack_assign_if_linked(color_out),
+ compiler.stack_assign_if_linked(alpha_out),
+ flags),
+ projection);
+
+ if (num_nodes > 0) {
+ for (int i = 0; i < num_nodes; i++) {
+ int4 node;
+ node.x = tiles[2 * i];
+ node.y = handle.svm_slot(2 * i);
+ if (2 * i + 1 < tiles.size()) {
+ node.z = tiles[2 * i + 1];
+ node.w = handle.svm_slot(2 * i + 1);
+ }
+ else {
+ node.z = -1;
+ node.w = -1;
+ }
+ compiler.add_node(node.x, node.y, node.z, node.w);
+ }
+ }
}
else {
- /* image not found */
- if (!color_out->links.empty()) {
- compiler.add_node(NODE_VALUE_V, compiler.stack_assign(color_out));
- compiler.add_node(
- NODE_VALUE_V,
- make_float3(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B));
- }
- if (!alpha_out->links.empty())
- compiler.add_node(
- NODE_VALUE_F, __float_as_int(TEX_IMAGE_MISSING_A), compiler.stack_assign(alpha_out));
+ assert(handle.num_tiles() == 1);
+ compiler.add_node(NODE_TEX_IMAGE_BOX,
+ handle.svm_slot(),
+ compiler.encode_uchar4(vector_offset,
+ compiler.stack_assign_if_linked(color_out),
+ compiler.stack_assign_if_linked(alpha_out),
+ flags),
+ __float_as_int(projection_blend));
}
+
+ tex_mapping.compile_end(compiler, vector_in, vector_offset);
}
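The branch above writes the UDIM table two (tile, slot) pairs per extra SVM node, with a single image signalled by a negative slot in the header word. A rough sketch of how that packing can be read back; the Int4 type and the decode function are illustrative only, and the actual kernel-side reader may differ:

#include <utility>
#include <vector>

struct Int4 {
  int x, y, z, w;
};

/* Expand the packed tile table back into (tile, slot) pairs.
 * num_nodes < 0 encodes a single image whose slot is -num_nodes (its tile
 * number is implicit; 1001 is used here purely for illustration). Otherwise
 * each extra node carries up to two pairs, with z == -1 marking an unused
 * second entry. */
static std::vector<std::pair<int, int>> decode_tile_slots(int num_nodes, const Int4 *nodes)
{
  std::vector<std::pair<int, int>> result;
  if (num_nodes < 0) {
    result.push_back({1001, -num_nodes});
    return result;
  }
  for (int i = 0; i < num_nodes; i++) {
    result.push_back({nodes[i].x, nodes[i].y});
    if (nodes[i].z != -1) {
      result.push_back({nodes[i].z, nodes[i].w});
    }
  }
  return result;
}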
void ImageTextureNode::compile(OSLCompiler &compiler)
@@ -354,46 +444,36 @@ void ImageTextureNode::compile(OSLCompiler &compiler)
tex_mapping.compile(compiler);
- image_manager = compiler.image_manager;
- if (is_float == -1) {
- ImageMetaData metadata;
- if (builtin_data == NULL) {
- image_manager->get_image_metadata(filename.string(), NULL, metadata);
- }
- else {
- slot = image_manager->add_image(filename.string(),
- builtin_data,
- animated,
- 0,
- interpolation,
- extension,
- use_alpha,
- metadata);
- }
- is_float = metadata.is_float;
- is_linear = metadata.is_linear;
+ if (handle.empty()) {
+ ImageManager *image_manager = compiler.scene->image_manager;
+ handle = image_manager->add_image(filename.string(), image_params());
}
- if (slot == -1) {
- compiler.parameter(this, "filename");
+ const ImageMetaData metadata = handle.metadata();
+ const bool is_float = metadata.is_float();
+ const bool compress_as_srgb = metadata.compress_as_srgb;
+ const ustring known_colorspace = metadata.colorspace;
+
+ if (handle.svm_slot() == -1) {
+ compiler.parameter_texture(
+ "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
}
else {
- /* TODO(sergey): It's not so simple to pass custom attribute
- * to the texture() function in order to make builtin images
- * support more clear. So we use special file name which is
- * "@i<slot_number>" and check whether file name matches this
- * mask in the OSLRenderServices::texture().
- */
- compiler.parameter("filename", string_printf("@i%d", slot).c_str());
+ compiler.parameter_texture("filename", handle.svm_slot());
}
- if (is_linear || color_space != NODE_COLOR_SPACE_COLOR)
- compiler.parameter("color_space", "linear");
- else
- compiler.parameter("color_space", "sRGB");
+
+ const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
+ alpha_type == IMAGE_ALPHA_CHANNEL_PACKED ||
+ alpha_type == IMAGE_ALPHA_IGNORE);
+ const bool is_tiled = (filename.find("<UDIM>") != string::npos);
+
compiler.parameter(this, "projection");
compiler.parameter(this, "projection_blend");
+ compiler.parameter("compress_as_srgb", compress_as_srgb);
+ compiler.parameter("ignore_alpha", alpha_type == IMAGE_ALPHA_IGNORE);
+ compiler.parameter("unassociate_alpha", !alpha_out->links.empty() && unassociate_alpha);
compiler.parameter("is_float", is_float);
- compiler.parameter("use_alpha", !alpha_out->links.empty());
+ compiler.parameter("is_tiled", is_tiled);
compiler.parameter(this, "interpolation");
compiler.parameter(this, "extension");
@@ -409,13 +489,15 @@ NODE_DEFINE(EnvironmentTextureNode)
TEXTURE_MAPPING_DEFINE(EnvironmentTextureNode);
SOCKET_STRING(filename, "Filename", ustring());
+ SOCKET_STRING(colorspace, "Colorspace", u_colorspace_auto);
- static NodeEnum color_space_enum;
- color_space_enum.insert("none", NODE_COLOR_SPACE_NONE);
- color_space_enum.insert("color", NODE_COLOR_SPACE_COLOR);
- SOCKET_ENUM(color_space, "Color Space", color_space_enum, NODE_COLOR_SPACE_COLOR);
-
- SOCKET_BOOLEAN(use_alpha, "Use Alpha", true);
+ static NodeEnum alpha_type_enum;
+ alpha_type_enum.insert("auto", IMAGE_ALPHA_AUTO);
+ alpha_type_enum.insert("unassociated", IMAGE_ALPHA_UNASSOCIATED);
+ alpha_type_enum.insert("associated", IMAGE_ALPHA_ASSOCIATED);
+ alpha_type_enum.insert("channel_packed", IMAGE_ALPHA_CHANNEL_PACKED);
+ alpha_type_enum.insert("ignore", IMAGE_ALPHA_IGNORE);
+ SOCKET_ENUM(alpha_type, "Alpha Type", alpha_type_enum, IMAGE_ALPHA_AUTO);
static NodeEnum interpolation_enum;
interpolation_enum.insert("closest", INTERPOLATION_CLOSEST);
@@ -439,32 +521,28 @@ NODE_DEFINE(EnvironmentTextureNode)
EnvironmentTextureNode::EnvironmentTextureNode() : ImageSlotTextureNode(node_type)
{
- image_manager = NULL;
- slot = -1;
- is_float = -1;
- is_linear = false;
- builtin_data = NULL;
+ colorspace = u_colorspace_raw;
animated = false;
}
-EnvironmentTextureNode::~EnvironmentTextureNode()
-{
- if (image_manager) {
- image_manager->remove_image(
- filename.string(), builtin_data, interpolation, EXTENSION_REPEAT, use_alpha);
- }
-}
-
ShaderNode *EnvironmentTextureNode::clone() const
{
EnvironmentTextureNode *node = new EnvironmentTextureNode(*this);
- node->image_manager = NULL;
- node->slot = -1;
- node->is_float = -1;
- node->is_linear = false;
+ node->handle = handle;
return node;
}
+ImageParams EnvironmentTextureNode::image_params() const
+{
+ ImageParams params;
+ params.animated = animated;
+ params.interpolation = interpolation;
+ params.extension = EXTENSION_REPEAT;
+ params.alpha_type = alpha_type;
+ params.colorspace = colorspace;
+ return params;
+}
+
void EnvironmentTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
{
#ifdef WITH_PTEX
@@ -484,93 +562,60 @@ void EnvironmentTextureNode::compile(SVMCompiler &compiler)
ShaderOutput *color_out = output("Color");
ShaderOutput *alpha_out = output("Alpha");
- image_manager = compiler.image_manager;
- if (slot == -1) {
- ImageMetaData metadata;
- slot = image_manager->add_image(filename.string(),
- builtin_data,
- animated,
- 0,
- interpolation,
- EXTENSION_REPEAT,
- use_alpha,
- metadata);
- is_float = metadata.is_float;
- is_linear = metadata.is_linear;
- }
-
- if (slot != -1) {
- int srgb = (is_linear || color_space != NODE_COLOR_SPACE_COLOR) ? 0 : 1;
- int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
-
- compiler.add_node(NODE_TEX_ENVIRONMENT,
- slot,
- compiler.encode_uchar4(vector_offset,
- compiler.stack_assign_if_linked(color_out),
- compiler.stack_assign_if_linked(alpha_out),
- srgb),
- projection);
-
- tex_mapping.compile_end(compiler, vector_in, vector_offset);
+ if (handle.empty()) {
+ ImageManager *image_manager = compiler.scene->image_manager;
+ handle = image_manager->add_image(filename.string(), image_params());
}
- else {
- /* image not found */
- if (!color_out->links.empty()) {
- compiler.add_node(NODE_VALUE_V, compiler.stack_assign(color_out));
- compiler.add_node(
- NODE_VALUE_V,
- make_float3(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B));
- }
- if (!alpha_out->links.empty())
- compiler.add_node(
- NODE_VALUE_F, __float_as_int(TEX_IMAGE_MISSING_A), compiler.stack_assign(alpha_out));
+
+ const ImageMetaData metadata = handle.metadata();
+ const bool compress_as_srgb = metadata.compress_as_srgb;
+ const ustring known_colorspace = metadata.colorspace;
+
+ int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+ uint flags = 0;
+
+ if (compress_as_srgb) {
+ flags |= NODE_IMAGE_COMPRESS_AS_SRGB;
}
+
+ compiler.add_node(NODE_TEX_ENVIRONMENT,
+ handle.svm_slot(),
+ compiler.encode_uchar4(vector_offset,
+ compiler.stack_assign_if_linked(color_out),
+ compiler.stack_assign_if_linked(alpha_out),
+ flags),
+ projection);
+
+ tex_mapping.compile_end(compiler, vector_in, vector_offset);
}
void EnvironmentTextureNode::compile(OSLCompiler &compiler)
{
- ShaderOutput *alpha_out = output("Alpha");
+ if (handle.empty()) {
+ ImageManager *image_manager = compiler.scene->image_manager;
+ handle = image_manager->add_image(filename.string(), image_params());
+ }
tex_mapping.compile(compiler);
- /* See comments in ImageTextureNode::compile about support
- * of builtin images.
- */
- image_manager = compiler.image_manager;
- if (is_float == -1) {
- ImageMetaData metadata;
- if (builtin_data == NULL) {
- image_manager->get_image_metadata(filename.string(), NULL, metadata);
- }
- else {
- slot = image_manager->add_image(filename.string(),
- builtin_data,
- animated,
- 0,
- interpolation,
- EXTENSION_REPEAT,
- use_alpha,
- metadata);
- }
- is_float = metadata.is_float;
- is_linear = metadata.is_linear;
- }
+ const ImageMetaData metadata = handle.metadata();
+ const bool is_float = metadata.is_float();
+ const bool compress_as_srgb = metadata.compress_as_srgb;
+ const ustring known_colorspace = metadata.colorspace;
- if (slot == -1) {
- compiler.parameter(this, "filename");
+ if (handle.svm_slot() == -1) {
+ compiler.parameter_texture(
+ "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
}
else {
- compiler.parameter("filename", string_printf("@i%d", slot).c_str());
+ compiler.parameter_texture("filename", handle.svm_slot());
}
- compiler.parameter(this, "projection");
- if (is_linear || color_space != NODE_COLOR_SPACE_COLOR)
- compiler.parameter("color_space", "linear");
- else
- compiler.parameter("color_space", "sRGB");
+ compiler.parameter(this, "projection");
compiler.parameter(this, "interpolation");
+ compiler.parameter("compress_as_srgb", compress_as_srgb);
+ compiler.parameter("ignore_alpha", alpha_type == IMAGE_ALPHA_IGNORE);
compiler.parameter("is_float", is_float);
- compiler.parameter("use_alpha", !alpha_out->links.empty());
compiler.add(this, "node_environment_texture");
}
@@ -587,7 +632,7 @@ typedef struct SunSky {
/* Parameter */
float radiance_x, radiance_y, radiance_z;
- float config_x[9], config_y[9], config_z[9];
+ float config_x[9], config_y[9], config_z[9], nishita_data[10];
} SunSky;
/* Preetham model */
@@ -597,7 +642,7 @@ static float sky_perez_function(float lam[6], float theta, float gamma)
(1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cosf(gamma) * cosf(gamma));
}
-static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidity)
+static void sky_texture_precompute_preetham(SunSky *sunsky, float3 dir, float turbidity)
{
/*
* We re-use the SunSky struct of the new model, to avoid extra variables
@@ -660,10 +705,10 @@ static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidity)
}
/* Hosek / Wilkie */
-static void sky_texture_precompute_new(SunSky *sunsky,
- float3 dir,
- float turbidity,
- float ground_albedo)
+static void sky_texture_precompute_hosek(SunSky *sunsky,
+ float3 dir,
+ float turbidity,
+ float ground_albedo)
{
/* Calculate Sun Direction and save coordinates */
float2 spherical = sky_spherical_coordinates(dir);
@@ -682,8 +727,8 @@ static void sky_texture_precompute_new(SunSky *sunsky,
float solarElevation = M_PI_2_F - theta;
/* Initialize Sky Model */
- ArHosekSkyModelState *sky_state;
- sky_state = arhosek_xyz_skymodelstate_alloc_init(
+ SKY_ArHosekSkyModelState *sky_state;
+ sky_state = SKY_arhosek_xyz_skymodelstate_alloc_init(
(double)turbidity, (double)ground_albedo, (double)solarElevation);
/* Copy values from sky_state to SunSky */
@@ -697,7 +742,42 @@ static void sky_texture_precompute_new(SunSky *sunsky,
sunsky->radiance_z = (float)sky_state->radiances[2];
/* Free sky_state */
- arhosekskymodelstate_free(sky_state);
+ SKY_arhosekskymodelstate_free(sky_state);
+}
+
+/* Nishita improved */
+static void sky_texture_precompute_nishita(SunSky *sunsky,
+ bool sun_disc,
+ float sun_size,
+ float sun_intensity,
+ float sun_elevation,
+ float sun_rotation,
+ float altitude,
+ float air_density,
+ float dust_density)
+{
+ /* sample 2 sun pixels */
+ float pixel_bottom[3];
+ float pixel_top[3];
+ SKY_nishita_skymodel_precompute_sun(
+ sun_elevation, sun_size, altitude, air_density, dust_density, pixel_bottom, pixel_top);
+ /* Wrap the sun rotation into the range [0, 2*pi). */
+ sun_rotation = fmodf(sun_rotation, M_2PI_F);
+ if (sun_rotation < 0.0f) {
+ sun_rotation += M_2PI_F;
+ }
+ sun_rotation = M_2PI_F - sun_rotation;
+ /* send data to svm_sky */
+ sunsky->nishita_data[0] = pixel_bottom[0];
+ sunsky->nishita_data[1] = pixel_bottom[1];
+ sunsky->nishita_data[2] = pixel_bottom[2];
+ sunsky->nishita_data[3] = pixel_top[0];
+ sunsky->nishita_data[4] = pixel_top[1];
+ sunsky->nishita_data[5] = pixel_top[2];
+ sunsky->nishita_data[6] = sun_elevation;
+ sunsky->nishita_data[7] = sun_rotation;
+ sunsky->nishita_data[8] = sun_disc ? sun_size : 0.0f;
+ sunsky->nishita_data[9] = sun_intensity;
}
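For reference, the ten floats packed into nishita_data above are the bottom and top sun pixel colors (RGB each), followed by the sun elevation, the wrapped sun rotation, the sun disc size (zero when the disc is disabled) and the sun intensity. A small enum makes the layout explicit; the names are illustrative, not part of this patch:

/* Illustrative index names for the nishita_data[10] layout filled in above. */
enum NishitaDataIndex {
  NISHITA_PIXEL_BOTTOM_R = 0,
  NISHITA_PIXEL_BOTTOM_G = 1,
  NISHITA_PIXEL_BOTTOM_B = 2,
  NISHITA_PIXEL_TOP_R = 3,
  NISHITA_PIXEL_TOP_G = 4,
  NISHITA_PIXEL_TOP_B = 5,
  NISHITA_SUN_ELEVATION = 6,
  NISHITA_SUN_ROTATION = 7,
  NISHITA_SUN_SIZE = 8, /* 0.0f when the sun disc is disabled. */
  NISHITA_SUN_INTENSITY = 9,
};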
NODE_DEFINE(SkyTextureNode)
@@ -707,13 +787,23 @@ NODE_DEFINE(SkyTextureNode)
TEXTURE_MAPPING_DEFINE(SkyTextureNode);
static NodeEnum type_enum;
- type_enum.insert("preetham", NODE_SKY_OLD);
- type_enum.insert("hosek_wilkie", NODE_SKY_NEW);
- SOCKET_ENUM(type, "Type", type_enum, NODE_SKY_NEW);
+ type_enum.insert("preetham", NODE_SKY_PREETHAM);
+ type_enum.insert("hosek_wilkie", NODE_SKY_HOSEK);
+ type_enum.insert("nishita_improved", NODE_SKY_NISHITA);
+ SOCKET_ENUM(type, "Type", type_enum, NODE_SKY_NISHITA);
SOCKET_VECTOR(sun_direction, "Sun Direction", make_float3(0.0f, 0.0f, 1.0f));
SOCKET_FLOAT(turbidity, "Turbidity", 2.2f);
SOCKET_FLOAT(ground_albedo, "Ground Albedo", 0.3f);
+ SOCKET_BOOLEAN(sun_disc, "Sun Disc", true);
+ SOCKET_FLOAT(sun_size, "Sun Size", 0.009512f);
+ SOCKET_FLOAT(sun_intensity, "Sun Intensity", 1.0f);
+ SOCKET_FLOAT(sun_elevation, "Sun Elevation", 15.0f * M_PI_F / 180.0f);
+ SOCKET_FLOAT(sun_rotation, "Sun Rotation", 0.0f);
+ SOCKET_FLOAT(altitude, "Altitude", 1.0f);
+ SOCKET_FLOAT(air_density, "Air", 1.0f);
+ SOCKET_FLOAT(dust_density, "Dust", 1.0f);
+ SOCKET_FLOAT(ozone_density, "Ozone", 1.0f);
SOCKET_IN_POINT(
vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
@@ -733,10 +823,37 @@ void SkyTextureNode::compile(SVMCompiler &compiler)
ShaderOutput *color_out = output("Color");
SunSky sunsky;
- if (type == NODE_SKY_OLD)
- sky_texture_precompute_old(&sunsky, sun_direction, turbidity);
- else if (type == NODE_SKY_NEW)
- sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo);
+ if (type == NODE_SKY_PREETHAM)
+ sky_texture_precompute_preetham(&sunsky, sun_direction, turbidity);
+ else if (type == NODE_SKY_HOSEK)
+ sky_texture_precompute_hosek(&sunsky, sun_direction, turbidity, ground_albedo);
+ else if (type == NODE_SKY_NISHITA) {
+ /* Clamp altitude to reasonable values.
+ * Below 1m causes numerical issues and above 60km is space. */
+ float clamped_altitude = clamp(altitude, 1.0f, 59999.0f);
+
+ sky_texture_precompute_nishita(&sunsky,
+ sun_disc,
+ sun_size,
+ sun_intensity,
+ sun_elevation,
+ sun_rotation,
+ clamped_altitude,
+ air_density,
+ dust_density);
+ /* precomputed texture image parameters */
+ ImageManager *image_manager = compiler.scene->image_manager;
+ ImageParams impar;
+ impar.interpolation = INTERPOLATION_LINEAR;
+ impar.extension = EXTENSION_EXTEND;
+
+ /* precompute sky texture */
+ if (handle.empty()) {
+ SkyLoader *loader = new SkyLoader(
+ sun_elevation, clamped_altitude, air_density, dust_density, ozone_density);
+ handle = image_manager->add_image(loader, impar);
+ }
+ }
else
assert(false);
@@ -744,38 +861,55 @@ void SkyTextureNode::compile(SVMCompiler &compiler)
compiler.stack_assign(color_out);
compiler.add_node(NODE_TEX_SKY, vector_offset, compiler.stack_assign(color_out), type);
- compiler.add_node(__float_as_uint(sunsky.phi),
- __float_as_uint(sunsky.theta),
- __float_as_uint(sunsky.radiance_x),
- __float_as_uint(sunsky.radiance_y));
- compiler.add_node(__float_as_uint(sunsky.radiance_z),
- __float_as_uint(sunsky.config_x[0]),
- __float_as_uint(sunsky.config_x[1]),
- __float_as_uint(sunsky.config_x[2]));
- compiler.add_node(__float_as_uint(sunsky.config_x[3]),
- __float_as_uint(sunsky.config_x[4]),
- __float_as_uint(sunsky.config_x[5]),
- __float_as_uint(sunsky.config_x[6]));
- compiler.add_node(__float_as_uint(sunsky.config_x[7]),
- __float_as_uint(sunsky.config_x[8]),
- __float_as_uint(sunsky.config_y[0]),
- __float_as_uint(sunsky.config_y[1]));
- compiler.add_node(__float_as_uint(sunsky.config_y[2]),
- __float_as_uint(sunsky.config_y[3]),
- __float_as_uint(sunsky.config_y[4]),
- __float_as_uint(sunsky.config_y[5]));
- compiler.add_node(__float_as_uint(sunsky.config_y[6]),
- __float_as_uint(sunsky.config_y[7]),
- __float_as_uint(sunsky.config_y[8]),
- __float_as_uint(sunsky.config_z[0]));
- compiler.add_node(__float_as_uint(sunsky.config_z[1]),
- __float_as_uint(sunsky.config_z[2]),
- __float_as_uint(sunsky.config_z[3]),
- __float_as_uint(sunsky.config_z[4]));
- compiler.add_node(__float_as_uint(sunsky.config_z[5]),
- __float_as_uint(sunsky.config_z[6]),
- __float_as_uint(sunsky.config_z[7]),
- __float_as_uint(sunsky.config_z[8]));
+ /* The Nishita model does not need this data. */
+ if (type != NODE_SKY_NISHITA) {
+ compiler.add_node(__float_as_uint(sunsky.phi),
+ __float_as_uint(sunsky.theta),
+ __float_as_uint(sunsky.radiance_x),
+ __float_as_uint(sunsky.radiance_y));
+ compiler.add_node(__float_as_uint(sunsky.radiance_z),
+ __float_as_uint(sunsky.config_x[0]),
+ __float_as_uint(sunsky.config_x[1]),
+ __float_as_uint(sunsky.config_x[2]));
+ compiler.add_node(__float_as_uint(sunsky.config_x[3]),
+ __float_as_uint(sunsky.config_x[4]),
+ __float_as_uint(sunsky.config_x[5]),
+ __float_as_uint(sunsky.config_x[6]));
+ compiler.add_node(__float_as_uint(sunsky.config_x[7]),
+ __float_as_uint(sunsky.config_x[8]),
+ __float_as_uint(sunsky.config_y[0]),
+ __float_as_uint(sunsky.config_y[1]));
+ compiler.add_node(__float_as_uint(sunsky.config_y[2]),
+ __float_as_uint(sunsky.config_y[3]),
+ __float_as_uint(sunsky.config_y[4]),
+ __float_as_uint(sunsky.config_y[5]));
+ compiler.add_node(__float_as_uint(sunsky.config_y[6]),
+ __float_as_uint(sunsky.config_y[7]),
+ __float_as_uint(sunsky.config_y[8]),
+ __float_as_uint(sunsky.config_z[0]));
+ compiler.add_node(__float_as_uint(sunsky.config_z[1]),
+ __float_as_uint(sunsky.config_z[2]),
+ __float_as_uint(sunsky.config_z[3]),
+ __float_as_uint(sunsky.config_z[4]));
+ compiler.add_node(__float_as_uint(sunsky.config_z[5]),
+ __float_as_uint(sunsky.config_z[6]),
+ __float_as_uint(sunsky.config_z[7]),
+ __float_as_uint(sunsky.config_z[8]));
+ }
+ else {
+ compiler.add_node(__float_as_uint(sunsky.nishita_data[0]),
+ __float_as_uint(sunsky.nishita_data[1]),
+ __float_as_uint(sunsky.nishita_data[2]),
+ __float_as_uint(sunsky.nishita_data[3]));
+ compiler.add_node(__float_as_uint(sunsky.nishita_data[4]),
+ __float_as_uint(sunsky.nishita_data[5]),
+ __float_as_uint(sunsky.nishita_data[6]),
+ __float_as_uint(sunsky.nishita_data[7]));
+ compiler.add_node(__float_as_uint(sunsky.nishita_data[8]),
+ __float_as_uint(sunsky.nishita_data[9]),
+ handle.svm_slot(),
+ 0);
+ }
tex_mapping.compile_end(compiler, vector_in, vector_offset);
}
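The add_node() calls above pass float constants through uint slots via __float_as_uint. That helper is assumed to be a plain bit reinterpretation, roughly as sketched below (memcpy keeps the cast within defined behaviour); the kernel side would mirror it with the inverse __uint_as_float:

#include <cstdint>
#include <cstring>

/* Reinterpret the bits of a float as a 32-bit unsigned integer without
 * changing them, so float parameters survive the uint4 SVM nodes intact. */
static uint32_t float_as_uint_sketch(float f)
{
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return u;
}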
@@ -785,10 +919,37 @@ void SkyTextureNode::compile(OSLCompiler &compiler)
tex_mapping.compile(compiler);
SunSky sunsky;
- if (type == NODE_SKY_OLD)
- sky_texture_precompute_old(&sunsky, sun_direction, turbidity);
- else if (type == NODE_SKY_NEW)
- sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo);
+ if (type == NODE_SKY_PREETHAM)
+ sky_texture_precompute_preetham(&sunsky, sun_direction, turbidity);
+ else if (type == NODE_SKY_HOSEK)
+ sky_texture_precompute_hosek(&sunsky, sun_direction, turbidity, ground_albedo);
+ else if (type == NODE_SKY_NISHITA) {
+ /* Clamp altitude to reasonable values.
+ * Below 1m causes numerical issues and above 60km is space. */
+ float clamped_altitude = clamp(altitude, 1.0f, 59999.0f);
+
+ sky_texture_precompute_nishita(&sunsky,
+ sun_disc,
+ sun_size,
+ sun_intensity,
+ sun_elevation,
+ sun_rotation,
+ clamped_altitude,
+ air_density,
+ dust_density);
+ /* precomputed texture image parameters */
+ ImageManager *image_manager = compiler.scene->image_manager;
+ ImageParams impar;
+ impar.interpolation = INTERPOLATION_LINEAR;
+ impar.extension = EXTENSION_EXTEND;
+
+ /* precompute sky texture */
+ if (handle.empty()) {
+ SkyLoader *loader = new SkyLoader(
+ sun_elevation, clamped_altitude, air_density, dust_density, ozone_density);
+ handle = image_manager->add_image(loader, impar);
+ }
+ }
else
assert(false);
@@ -800,6 +961,11 @@ void SkyTextureNode::compile(OSLCompiler &compiler)
compiler.parameter_array("config_x", sunsky.config_x, 9);
compiler.parameter_array("config_y", sunsky.config_y, 9);
compiler.parameter_array("config_z", sunsky.config_z, 9);
+ compiler.parameter_array("nishita_data", sunsky.nishita_data, 10);
+ /* Precomputed Nishita sky texture. */
+ if (type == NODE_SKY_NISHITA) {
+ compiler.parameter_texture("filename", handle.svm_slot());
+ }
compiler.add(this, "node_sky_texture");
}
@@ -867,14 +1033,23 @@ NODE_DEFINE(NoiseTextureNode)
TEXTURE_MAPPING_DEFINE(NoiseTextureNode);
+ static NodeEnum dimensions_enum;
+ dimensions_enum.insert("1D", 1);
+ dimensions_enum.insert("2D", 2);
+ dimensions_enum.insert("3D", 3);
+ dimensions_enum.insert("4D", 4);
+ SOCKET_ENUM(dimensions, "Dimensions", dimensions_enum, 3);
+
+ SOCKET_IN_POINT(
+ vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
+ SOCKET_IN_FLOAT(w, "W", 0.0f);
SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
SOCKET_IN_FLOAT(detail, "Detail", 2.0f);
+ SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
SOCKET_IN_FLOAT(distortion, "Distortion", 0.0f);
- SOCKET_IN_POINT(
- vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
- SOCKET_OUT_COLOR(color, "Color");
SOCKET_OUT_FLOAT(fac, "Fac");
+ SOCKET_OUT_COLOR(color, "Color");
return type;
}
@@ -885,31 +1060,44 @@ NoiseTextureNode::NoiseTextureNode() : TextureNode(node_type)
void NoiseTextureNode::compile(SVMCompiler &compiler)
{
- ShaderInput *distortion_in = input("Distortion");
- ShaderInput *detail_in = input("Detail");
- ShaderInput *scale_in = input("Scale");
ShaderInput *vector_in = input("Vector");
- ShaderOutput *color_out = output("Color");
+ ShaderInput *w_in = input("W");
+ ShaderInput *scale_in = input("Scale");
+ ShaderInput *detail_in = input("Detail");
+ ShaderInput *roughness_in = input("Roughness");
+ ShaderInput *distortion_in = input("Distortion");
ShaderOutput *fac_out = output("Fac");
+ ShaderOutput *color_out = output("Color");
- int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+ int vector_stack_offset = tex_mapping.compile_begin(compiler, vector_in);
+ int w_stack_offset = compiler.stack_assign_if_linked(w_in);
+ int scale_stack_offset = compiler.stack_assign_if_linked(scale_in);
+ int detail_stack_offset = compiler.stack_assign_if_linked(detail_in);
+ int roughness_stack_offset = compiler.stack_assign_if_linked(roughness_in);
+ int distortion_stack_offset = compiler.stack_assign_if_linked(distortion_in);
+ int fac_stack_offset = compiler.stack_assign_if_linked(fac_out);
+ int color_stack_offset = compiler.stack_assign_if_linked(color_out);
- compiler.add_node(NODE_TEX_NOISE,
- compiler.encode_uchar4(vector_offset,
- compiler.stack_assign_if_linked(scale_in),
- compiler.stack_assign_if_linked(detail_in),
- compiler.stack_assign_if_linked(distortion_in)),
- compiler.encode_uchar4(compiler.stack_assign_if_linked(color_out),
- compiler.stack_assign_if_linked(fac_out)));
- compiler.add_node(__float_as_int(scale), __float_as_int(detail), __float_as_int(distortion));
+ compiler.add_node(
+ NODE_TEX_NOISE,
+ dimensions,
+ compiler.encode_uchar4(
+ vector_stack_offset, w_stack_offset, scale_stack_offset, detail_stack_offset),
+ compiler.encode_uchar4(
+ roughness_stack_offset, distortion_stack_offset, fac_stack_offset, color_stack_offset));
+ compiler.add_node(
+ __float_as_int(w), __float_as_int(scale), __float_as_int(detail), __float_as_int(roughness));
- tex_mapping.compile_end(compiler, vector_in, vector_offset);
+ compiler.add_node(
+ __float_as_int(distortion), SVM_STACK_INVALID, SVM_STACK_INVALID, SVM_STACK_INVALID);
+
+ tex_mapping.compile_end(compiler, vector_in, vector_stack_offset);
}
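Like most compile() methods in this patch, the noise node funnels its stack offsets through encode_uchar4(), which is assumed to pack four small values into one 32-bit SVM word, one byte each; a sketch under that assumption (the byte order and the _sketch helper name are not from the patch):

/* Pack four 8-bit values into a single uint, first argument in the lowest
 * byte. SVM stack offsets fit in a byte, so four of them share one word. */
static unsigned int encode_uchar4_sketch(unsigned int x,
                                         unsigned int y,
                                         unsigned int z,
                                         unsigned int w)
{
  return (x & 0xFF) | ((y & 0xFF) << 8) | ((z & 0xFF) << 16) | ((w & 0xFF) << 24);
}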
void NoiseTextureNode::compile(OSLCompiler &compiler)
{
tex_mapping.compile(compiler);
-
+ compiler.parameter(this, "dimensions");
compiler.add(this, "node_noise_texture");
}
@@ -921,33 +1109,41 @@ NODE_DEFINE(VoronoiTextureNode)
TEXTURE_MAPPING_DEFINE(VoronoiTextureNode);
- static NodeEnum coloring_enum;
- coloring_enum.insert("intensity", NODE_VORONOI_INTENSITY);
- coloring_enum.insert("cells", NODE_VORONOI_CELLS);
- SOCKET_ENUM(coloring, "Coloring", coloring_enum, NODE_VORONOI_INTENSITY);
+ static NodeEnum dimensions_enum;
+ dimensions_enum.insert("1D", 1);
+ dimensions_enum.insert("2D", 2);
+ dimensions_enum.insert("3D", 3);
+ dimensions_enum.insert("4D", 4);
+ SOCKET_ENUM(dimensions, "Dimensions", dimensions_enum, 3);
- static NodeEnum metric;
- metric.insert("distance", NODE_VORONOI_DISTANCE);
- metric.insert("manhattan", NODE_VORONOI_MANHATTAN);
- metric.insert("chebychev", NODE_VORONOI_CHEBYCHEV);
- metric.insert("minkowski", NODE_VORONOI_MINKOWSKI);
- SOCKET_ENUM(metric, "Distance Metric", metric, NODE_VORONOI_INTENSITY);
+ static NodeEnum metric_enum;
+ metric_enum.insert("euclidean", NODE_VORONOI_EUCLIDEAN);
+ metric_enum.insert("manhattan", NODE_VORONOI_MANHATTAN);
+ metric_enum.insert("chebychev", NODE_VORONOI_CHEBYCHEV);
+ metric_enum.insert("minkowski", NODE_VORONOI_MINKOWSKI);
+ SOCKET_ENUM(metric, "Distance Metric", metric_enum, NODE_VORONOI_EUCLIDEAN);
static NodeEnum feature_enum;
- feature_enum.insert("F1", NODE_VORONOI_F1);
- feature_enum.insert("F2", NODE_VORONOI_F2);
- feature_enum.insert("F3", NODE_VORONOI_F3);
- feature_enum.insert("F4", NODE_VORONOI_F4);
- feature_enum.insert("F2F1", NODE_VORONOI_F2F1);
- SOCKET_ENUM(feature, "Feature", feature_enum, NODE_VORONOI_INTENSITY);
+ feature_enum.insert("f1", NODE_VORONOI_F1);
+ feature_enum.insert("f2", NODE_VORONOI_F2);
+ feature_enum.insert("smooth_f1", NODE_VORONOI_SMOOTH_F1);
+ feature_enum.insert("distance_to_edge", NODE_VORONOI_DISTANCE_TO_EDGE);
+ feature_enum.insert("n_sphere_radius", NODE_VORONOI_N_SPHERE_RADIUS);
+ SOCKET_ENUM(feature, "Feature", feature_enum, NODE_VORONOI_F1);
- SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
- SOCKET_IN_FLOAT(exponent, "Exponent", 0.5f);
SOCKET_IN_POINT(
vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
+ SOCKET_IN_FLOAT(w, "W", 0.0f);
+ SOCKET_IN_FLOAT(scale, "Scale", 5.0f);
+ SOCKET_IN_FLOAT(smoothness, "Smoothness", 5.0f);
+ SOCKET_IN_FLOAT(exponent, "Exponent", 0.5f);
+ SOCKET_IN_FLOAT(randomness, "Randomness", 1.0f);
+ SOCKET_OUT_FLOAT(distance, "Distance");
SOCKET_OUT_COLOR(color, "Color");
- SOCKET_OUT_FLOAT(fac, "Fac");
+ SOCKET_OUT_POINT(position, "Position");
+ SOCKET_OUT_FLOAT(w, "W");
+ SOCKET_OUT_FLOAT(radius, "Radius");
return type;
}
@@ -958,39 +1154,57 @@ VoronoiTextureNode::VoronoiTextureNode() : TextureNode(node_type)
void VoronoiTextureNode::compile(SVMCompiler &compiler)
{
- ShaderInput *scale_in = input("Scale");
ShaderInput *vector_in = input("Vector");
+ ShaderInput *w_in = input("W");
+ ShaderInput *scale_in = input("Scale");
+ ShaderInput *smoothness_in = input("Smoothness");
ShaderInput *exponent_in = input("Exponent");
- ShaderOutput *color_out = output("Color");
- ShaderOutput *fac_out = output("Fac");
+ ShaderInput *randomness_in = input("Randomness");
- if (vector_in->link)
- compiler.stack_assign(vector_in);
- if (scale_in->link)
- compiler.stack_assign(scale_in);
- if (exponent_in->link)
- compiler.stack_assign(exponent_in);
-
- int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+ ShaderOutput *distance_out = output("Distance");
+ ShaderOutput *color_out = output("Color");
+ ShaderOutput *position_out = output("Position");
+ ShaderOutput *w_out = output("W");
+ ShaderOutput *radius_out = output("Radius");
+
+ int vector_stack_offset = tex_mapping.compile_begin(compiler, vector_in);
+ int w_in_stack_offset = compiler.stack_assign_if_linked(w_in);
+ int scale_stack_offset = compiler.stack_assign_if_linked(scale_in);
+ int smoothness_stack_offset = compiler.stack_assign_if_linked(smoothness_in);
+ int exponent_stack_offset = compiler.stack_assign_if_linked(exponent_in);
+ int randomness_stack_offset = compiler.stack_assign_if_linked(randomness_in);
+ int distance_stack_offset = compiler.stack_assign_if_linked(distance_out);
+ int color_stack_offset = compiler.stack_assign_if_linked(color_out);
+ int position_stack_offset = compiler.stack_assign_if_linked(position_out);
+ int w_out_stack_offset = compiler.stack_assign_if_linked(w_out);
+ int radius_stack_offset = compiler.stack_assign_if_linked(radius_out);
+
+ compiler.add_node(NODE_TEX_VORONOI, dimensions, feature, metric);
+ compiler.add_node(
+ compiler.encode_uchar4(
+ vector_stack_offset, w_in_stack_offset, scale_stack_offset, smoothness_stack_offset),
+ compiler.encode_uchar4(exponent_stack_offset,
+ randomness_stack_offset,
+ distance_stack_offset,
+ color_stack_offset),
+ compiler.encode_uchar4(position_stack_offset, w_out_stack_offset, radius_stack_offset),
+ __float_as_int(w));
- compiler.add_node(NODE_TEX_VORONOI,
- compiler.encode_uchar4(vector_offset, coloring, metric, feature),
- compiler.encode_uchar4(compiler.stack_assign_if_linked(scale_in),
- compiler.stack_assign_if_linked(exponent_in),
- compiler.stack_assign(fac_out),
- compiler.stack_assign(color_out)));
- compiler.add_node(__float_as_int(scale), __float_as_int(exponent));
+ compiler.add_node(__float_as_int(scale),
+ __float_as_int(smoothness),
+ __float_as_int(exponent),
+ __float_as_int(randomness));
- tex_mapping.compile_end(compiler, vector_in, vector_offset);
+ tex_mapping.compile_end(compiler, vector_in, vector_stack_offset);
}
void VoronoiTextureNode::compile(OSLCompiler &compiler)
{
tex_mapping.compile(compiler);
- compiler.parameter(this, "coloring");
- compiler.parameter(this, "metric");
+ compiler.parameter(this, "dimensions");
compiler.parameter(this, "feature");
+ compiler.parameter(this, "metric");
compiler.add(this, "node_voronoi_texture");
}
@@ -1043,17 +1257,17 @@ void IESLightNode::get_slot()
if (slot == -1) {
if (ies.empty()) {
- slot = light_manager->add_ies_from_file(filename);
+ slot = light_manager->add_ies_from_file(filename.string());
}
else {
- slot = light_manager->add_ies(ies);
+ slot = light_manager->add_ies(ies.string());
}
}
}
void IESLightNode::compile(SVMCompiler &compiler)
{
- light_manager = compiler.light_manager;
+ light_manager = compiler.scene->light_manager;
get_slot();
ShaderInput *strength_in = input("Strength");
@@ -1075,15 +1289,65 @@ void IESLightNode::compile(SVMCompiler &compiler)
void IESLightNode::compile(OSLCompiler &compiler)
{
- light_manager = compiler.light_manager;
+ light_manager = compiler.scene->light_manager;
get_slot();
tex_mapping.compile(compiler);
- compiler.parameter("slot", slot);
+ compiler.parameter_texture_ies("filename", slot);
compiler.add(this, "node_ies_light");
}
+/* White Noise Texture */
+
+NODE_DEFINE(WhiteNoiseTextureNode)
+{
+ NodeType *type = NodeType::add("white_noise_texture", create, NodeType::SHADER);
+
+ static NodeEnum dimensions_enum;
+ dimensions_enum.insert("1D", 1);
+ dimensions_enum.insert("2D", 2);
+ dimensions_enum.insert("3D", 3);
+ dimensions_enum.insert("4D", 4);
+ SOCKET_ENUM(dimensions, "Dimensions", dimensions_enum, 3);
+
+ SOCKET_IN_POINT(vector, "Vector", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_FLOAT(w, "W", 0.0f);
+
+ SOCKET_OUT_FLOAT(value, "Value");
+ SOCKET_OUT_COLOR(color, "Color");
+
+ return type;
+}
+
+WhiteNoiseTextureNode::WhiteNoiseTextureNode() : ShaderNode(node_type)
+{
+}
+
+void WhiteNoiseTextureNode::compile(SVMCompiler &compiler)
+{
+ ShaderInput *vector_in = input("Vector");
+ ShaderInput *w_in = input("W");
+ ShaderOutput *value_out = output("Value");
+ ShaderOutput *color_out = output("Color");
+
+ int vector_stack_offset = compiler.stack_assign(vector_in);
+ int w_stack_offset = compiler.stack_assign(w_in);
+ int value_stack_offset = compiler.stack_assign(value_out);
+ int color_stack_offset = compiler.stack_assign(color_out);
+
+ compiler.add_node(NODE_TEX_WHITE_NOISE,
+ dimensions,
+ compiler.encode_uchar4(vector_stack_offset, w_stack_offset),
+ compiler.encode_uchar4(value_stack_offset, color_stack_offset));
+}
+
+void WhiteNoiseTextureNode::compile(OSLCompiler &compiler)
+{
+ compiler.parameter(this, "dimensions");
+ compiler.add(this, "node_white_noise_texture");
+}
+
/* Musgrave Texture */
NODE_DEFINE(MusgraveTextureNode)
@@ -1092,6 +1356,13 @@ NODE_DEFINE(MusgraveTextureNode)
TEXTURE_MAPPING_DEFINE(MusgraveTextureNode);
+ static NodeEnum dimensions_enum;
+ dimensions_enum.insert("1D", 1);
+ dimensions_enum.insert("2D", 2);
+ dimensions_enum.insert("3D", 3);
+ dimensions_enum.insert("4D", 4);
+ SOCKET_ENUM(dimensions, "Dimensions", dimensions_enum, 3);
+
static NodeEnum type_enum;
type_enum.insert("multifractal", NODE_MUSGRAVE_MULTIFRACTAL);
type_enum.insert("fBM", NODE_MUSGRAVE_FBM);
@@ -1100,16 +1371,16 @@ NODE_DEFINE(MusgraveTextureNode)
type_enum.insert("hetero_terrain", NODE_MUSGRAVE_HETERO_TERRAIN);
SOCKET_ENUM(type, "Type", type_enum, NODE_MUSGRAVE_FBM);
+ SOCKET_IN_POINT(
+ vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
+ SOCKET_IN_FLOAT(w, "W", 0.0f);
SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
SOCKET_IN_FLOAT(detail, "Detail", 2.0f);
SOCKET_IN_FLOAT(dimension, "Dimension", 2.0f);
- SOCKET_IN_FLOAT(lacunarity, "Lacunarity", 1.0f);
+ SOCKET_IN_FLOAT(lacunarity, "Lacunarity", 2.0f);
SOCKET_IN_FLOAT(offset, "Offset", 0.0f);
SOCKET_IN_FLOAT(gain, "Gain", 1.0f);
- SOCKET_IN_POINT(
- vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
- SOCKET_OUT_COLOR(color, "Color");
SOCKET_OUT_FLOAT(fac, "Fac");
return type;
@@ -1122,35 +1393,38 @@ MusgraveTextureNode::MusgraveTextureNode() : TextureNode(node_type)
void MusgraveTextureNode::compile(SVMCompiler &compiler)
{
ShaderInput *vector_in = input("Vector");
+ ShaderInput *w_in = input("W");
ShaderInput *scale_in = input("Scale");
+ ShaderInput *detail_in = input("Detail");
ShaderInput *dimension_in = input("Dimension");
ShaderInput *lacunarity_in = input("Lacunarity");
- ShaderInput *detail_in = input("Detail");
ShaderInput *offset_in = input("Offset");
ShaderInput *gain_in = input("Gain");
ShaderOutput *fac_out = output("Fac");
- ShaderOutput *color_out = output("Color");
- int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+ int vector_stack_offset = tex_mapping.compile_begin(compiler, vector_in);
+ int w_stack_offset = compiler.stack_assign_if_linked(w_in);
+ int scale_stack_offset = compiler.stack_assign_if_linked(scale_in);
+ int detail_stack_offset = compiler.stack_assign_if_linked(detail_in);
+ int dimension_stack_offset = compiler.stack_assign_if_linked(dimension_in);
+ int lacunarity_stack_offset = compiler.stack_assign_if_linked(lacunarity_in);
+ int offset_stack_offset = compiler.stack_assign_if_linked(offset_in);
+ int gain_stack_offset = compiler.stack_assign_if_linked(gain_in);
+ int fac_stack_offset = compiler.stack_assign(fac_out);
- compiler.add_node(NODE_TEX_MUSGRAVE,
- compiler.encode_uchar4(type,
- vector_offset,
- compiler.stack_assign_if_linked(color_out),
- compiler.stack_assign_if_linked(fac_out)),
- compiler.encode_uchar4(compiler.stack_assign_if_linked(dimension_in),
- compiler.stack_assign_if_linked(lacunarity_in),
- compiler.stack_assign_if_linked(detail_in),
- compiler.stack_assign_if_linked(offset_in)),
- compiler.encode_uchar4(compiler.stack_assign_if_linked(gain_in),
- compiler.stack_assign_if_linked(scale_in)));
- compiler.add_node(__float_as_int(dimension),
- __float_as_int(lacunarity),
- __float_as_int(detail),
- __float_as_int(offset));
- compiler.add_node(__float_as_int(gain), __float_as_int(scale));
+ compiler.add_node(
+ NODE_TEX_MUSGRAVE,
+ compiler.encode_uchar4(type, dimensions, vector_stack_offset, w_stack_offset),
+ compiler.encode_uchar4(scale_stack_offset,
+ detail_stack_offset,
+ dimension_stack_offset,
+ lacunarity_stack_offset),
+ compiler.encode_uchar4(offset_stack_offset, gain_stack_offset, fac_stack_offset));
+ compiler.add_node(
+ __float_as_int(w), __float_as_int(scale), __float_as_int(detail), __float_as_int(dimension));
+ compiler.add_node(__float_as_int(lacunarity), __float_as_int(offset), __float_as_int(gain));
- tex_mapping.compile_end(compiler, vector_in, vector_offset);
+ tex_mapping.compile_end(compiler, vector_in, vector_stack_offset);
}
void MusgraveTextureNode::compile(OSLCompiler &compiler)
@@ -1158,6 +1432,7 @@ void MusgraveTextureNode::compile(OSLCompiler &compiler)
tex_mapping.compile(compiler);
compiler.parameter(this, "type");
+ compiler.parameter(this, "dimensions");
compiler.add(this, "node_musgrave_texture");
}
@@ -1174,18 +1449,36 @@ NODE_DEFINE(WaveTextureNode)
type_enum.insert("rings", NODE_WAVE_RINGS);
SOCKET_ENUM(type, "Type", type_enum, NODE_WAVE_BANDS);
+ static NodeEnum bands_direction_enum;
+ bands_direction_enum.insert("x", NODE_WAVE_BANDS_DIRECTION_X);
+ bands_direction_enum.insert("y", NODE_WAVE_BANDS_DIRECTION_Y);
+ bands_direction_enum.insert("z", NODE_WAVE_BANDS_DIRECTION_Z);
+ bands_direction_enum.insert("diagonal", NODE_WAVE_BANDS_DIRECTION_DIAGONAL);
+ SOCKET_ENUM(
+ bands_direction, "Bands Direction", bands_direction_enum, NODE_WAVE_BANDS_DIRECTION_X);
+
+ static NodeEnum rings_direction_enum;
+ rings_direction_enum.insert("x", NODE_WAVE_RINGS_DIRECTION_X);
+ rings_direction_enum.insert("y", NODE_WAVE_RINGS_DIRECTION_Y);
+ rings_direction_enum.insert("z", NODE_WAVE_RINGS_DIRECTION_Z);
+ rings_direction_enum.insert("spherical", NODE_WAVE_RINGS_DIRECTION_SPHERICAL);
+ SOCKET_ENUM(
+ rings_direction, "Rings Direction", rings_direction_enum, NODE_WAVE_RINGS_DIRECTION_X);
+
static NodeEnum profile_enum;
profile_enum.insert("sine", NODE_WAVE_PROFILE_SIN);
profile_enum.insert("saw", NODE_WAVE_PROFILE_SAW);
+ profile_enum.insert("tri", NODE_WAVE_PROFILE_TRI);
SOCKET_ENUM(profile, "Profile", profile_enum, NODE_WAVE_PROFILE_SIN);
+ SOCKET_IN_POINT(
+ vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
SOCKET_IN_FLOAT(distortion, "Distortion", 0.0f);
SOCKET_IN_FLOAT(detail, "Detail", 2.0f);
SOCKET_IN_FLOAT(detail_scale, "Detail Scale", 0.0f);
- SOCKET_IN_POINT(
- vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
-
+ SOCKET_IN_FLOAT(detail_roughness, "Detail Roughness", 0.5f);
+ SOCKET_IN_FLOAT(phase, "Phase Offset", 0.0f);
SOCKET_OUT_COLOR(color, "Color");
SOCKET_OUT_FLOAT(fac, "Fac");
@@ -1198,31 +1491,38 @@ WaveTextureNode::WaveTextureNode() : TextureNode(node_type)
void WaveTextureNode::compile(SVMCompiler &compiler)
{
+ ShaderInput *vector_in = input("Vector");
ShaderInput *scale_in = input("Scale");
ShaderInput *distortion_in = input("Distortion");
- ShaderInput *dscale_in = input("Detail Scale");
ShaderInput *detail_in = input("Detail");
- ShaderInput *vector_in = input("Vector");
- ShaderOutput *fac_out = output("Fac");
+ ShaderInput *dscale_in = input("Detail Scale");
+ ShaderInput *droughness_in = input("Detail Roughness");
+ ShaderInput *phase_in = input("Phase Offset");
ShaderOutput *color_out = output("Color");
+ ShaderOutput *fac_out = output("Fac");
int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
compiler.add_node(NODE_TEX_WAVE,
- compiler.encode_uchar4(type,
- compiler.stack_assign_if_linked(color_out),
- compiler.stack_assign_if_linked(fac_out),
- compiler.stack_assign_if_linked(dscale_in)),
+ compiler.encode_uchar4(type, bands_direction, rings_direction, profile),
compiler.encode_uchar4(vector_offset,
compiler.stack_assign_if_linked(scale_in),
- compiler.stack_assign_if_linked(detail_in),
compiler.stack_assign_if_linked(distortion_in)),
- profile);
+ compiler.encode_uchar4(compiler.stack_assign_if_linked(detail_in),
+ compiler.stack_assign_if_linked(dscale_in),
+ compiler.stack_assign_if_linked(droughness_in),
+ compiler.stack_assign_if_linked(phase_in)));
- compiler.add_node(__float_as_int(scale),
- __float_as_int(detail),
+ compiler.add_node(compiler.encode_uchar4(compiler.stack_assign_if_linked(color_out),
+ compiler.stack_assign_if_linked(fac_out)),
+ __float_as_int(scale),
__float_as_int(distortion),
- __float_as_int(detail_scale));
+ __float_as_int(detail));
+
+ compiler.add_node(__float_as_int(detail_scale),
+ __float_as_int(detail_roughness),
+ __float_as_int(phase),
+ SVM_STACK_INVALID);
tex_mapping.compile_end(compiler, vector_in, vector_offset);
}
@@ -1232,6 +1532,8 @@ void WaveTextureNode::compile(OSLCompiler &compiler)
tex_mapping.compile(compiler);
compiler.parameter(this, "type");
+ compiler.parameter(this, "bands_direction");
+ compiler.parameter(this, "rings_direction");
compiler.parameter(this, "profile");
compiler.add(this, "node_wave_texture");
@@ -1474,24 +1776,19 @@ NODE_DEFINE(PointDensityTextureNode)
PointDensityTextureNode::PointDensityTextureNode() : ShaderNode(node_type)
{
- image_manager = NULL;
- slot = -1;
- builtin_data = NULL;
}
PointDensityTextureNode::~PointDensityTextureNode()
{
- if (image_manager) {
- image_manager->remove_image(
- filename.string(), builtin_data, interpolation, EXTENSION_CLIP, true);
- }
}
ShaderNode *PointDensityTextureNode::clone() const
{
+ /* Increase image user count for new node. We need to ensure we do not call
+ * add_image again, to work around access of freed data on the Blender
+ * side. A better solution should be found to avoid this. */
PointDensityTextureNode *node = new PointDensityTextureNode(*this);
- node->image_manager = NULL;
- node->slot = -1;
+ node->handle = handle; /* TODO: not needed? */
return node;
}
@@ -1503,13 +1800,11 @@ void PointDensityTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
ShaderNode::attributes(shader, attributes);
}
-void PointDensityTextureNode::add_image()
+ImageParams PointDensityTextureNode::image_params() const
{
- if (slot == -1) {
- ImageMetaData metadata;
- slot = image_manager->add_image(
- filename.string(), builtin_data, false, 0, interpolation, EXTENSION_CLIP, true, metadata);
- }
+ ImageParams params;
+ params.interpolation = interpolation;
+ return params;
}
void PointDensityTextureNode::compile(SVMCompiler &compiler)
@@ -1521,11 +1816,13 @@ void PointDensityTextureNode::compile(SVMCompiler &compiler)
const bool use_density = !density_out->links.empty();
const bool use_color = !color_out->links.empty();
- image_manager = compiler.image_manager;
-
if (use_density || use_color) {
- add_image();
+ if (handle.empty()) {
+ ImageManager *image_manager = compiler.scene->image_manager;
+ handle = image_manager->add_image(filename.string(), image_params());
+ }
+ const int slot = handle.svm_slot();
if (slot != -1) {
compiler.stack_assign(vector_in);
compiler.add_node(NODE_TEX_VOXEL,
@@ -1562,14 +1859,13 @@ void PointDensityTextureNode::compile(OSLCompiler &compiler)
const bool use_density = !density_out->links.empty();
const bool use_color = !color_out->links.empty();
- image_manager = compiler.image_manager;
-
if (use_density || use_color) {
- add_image();
-
- if (slot != -1) {
- compiler.parameter("filename", string_printf("@i%d", slot).c_str());
+ if (handle.empty()) {
+ ImageManager *image_manager = compiler.scene->image_manager;
+ handle = image_manager->add_image(filename.string(), image_params());
}
+
+ compiler.parameter_texture("filename", handle.svm_slot());
if (space == NODE_TEX_VOXEL_SPACE_WORLD) {
compiler.parameter("mapping", tfm);
compiler.parameter("use_mapping", 1);
@@ -1625,9 +1921,18 @@ NODE_DEFINE(MappingNode)
{
NodeType *type = NodeType::add("mapping", create, NodeType::SHADER);
- TEXTURE_MAPPING_DEFINE(MappingNode);
+ static NodeEnum type_enum;
+ type_enum.insert("point", NODE_MAPPING_TYPE_POINT);
+ type_enum.insert("texture", NODE_MAPPING_TYPE_TEXTURE);
+ type_enum.insert("vector", NODE_MAPPING_TYPE_VECTOR);
+ type_enum.insert("normal", NODE_MAPPING_TYPE_NORMAL);
+ SOCKET_ENUM(type, "Type", type_enum, NODE_MAPPING_TYPE_POINT);
SOCKET_IN_POINT(vector, "Vector", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_POINT(location, "Location", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_POINT(rotation, "Rotation", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_POINT(scale, "Scale", make_float3(1.0f, 1.0f, 1.0f));
+
SOCKET_OUT_POINT(vector, "Vector");
return type;
@@ -1637,22 +1942,42 @@ MappingNode::MappingNode() : ShaderNode(node_type)
{
}
+void MappingNode::constant_fold(const ConstantFolder &folder)
+{
+ if (folder.all_inputs_constant()) {
+ float3 result = svm_mapping((NodeMappingType)type, vector, location, rotation, scale);
+ folder.make_constant(result);
+ }
+ else {
+ folder.fold_mapping((NodeMappingType)type);
+ }
+}
+
void MappingNode::compile(SVMCompiler &compiler)
{
ShaderInput *vector_in = input("Vector");
+ ShaderInput *location_in = input("Location");
+ ShaderInput *rotation_in = input("Rotation");
+ ShaderInput *scale_in = input("Scale");
ShaderOutput *vector_out = output("Vector");
- tex_mapping.compile(
- compiler, compiler.stack_assign(vector_in), compiler.stack_assign(vector_out));
+ int vector_stack_offset = compiler.stack_assign(vector_in);
+ int location_stack_offset = compiler.stack_assign(location_in);
+ int rotation_stack_offset = compiler.stack_assign(rotation_in);
+ int scale_stack_offset = compiler.stack_assign(scale_in);
+ int result_stack_offset = compiler.stack_assign(vector_out);
+
+ compiler.add_node(
+ NODE_MAPPING,
+ type,
+ compiler.encode_uchar4(
+ vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset),
+ result_stack_offset);
}
void MappingNode::compile(OSLCompiler &compiler)
{
- compiler.parameter("Matrix", tex_mapping.compute_transform());
- compiler.parameter_point("mapping_min", tex_mapping.min);
- compiler.parameter_point("mapping_max", tex_mapping.max);
- compiler.parameter("use_minmax", tex_mapping.use_minmax);
-
+ compiler.parameter(this, "type");
compiler.add(this, "node_mapping");
}
@@ -1964,12 +2289,11 @@ NODE_DEFINE(AnisotropicBsdfNode)
SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
static NodeEnum distribution_enum;
- distribution_enum.insert("beckmann", CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID);
- distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID);
- distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID);
- distribution_enum.insert("ashikhmin_shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID);
- SOCKET_ENUM(
- distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID);
+ distribution_enum.insert("beckmann", CLOSURE_BSDF_MICROFACET_BECKMANN_ID);
+ distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_ID);
+ distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID);
+ distribution_enum.insert("ashikhmin_shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID);
+ SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_GGX_ID);
SOCKET_IN_VECTOR(tangent, "Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT);
@@ -1984,7 +2308,7 @@ NODE_DEFINE(AnisotropicBsdfNode)
AnisotropicBsdfNode::AnisotropicBsdfNode() : BsdfNode(node_type)
{
- closure = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
+ closure = CLOSURE_BSDF_MICROFACET_GGX_ID;
}
void AnisotropicBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes)
@@ -2003,7 +2327,7 @@ void AnisotropicBsdfNode::compile(SVMCompiler &compiler)
{
closure = distribution;
- if (closure == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID)
+ if (closure == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID)
BsdfNode::compile(
compiler, input("Roughness"), input("Anisotropy"), input("Rotation"), input("Color"));
else
@@ -2097,7 +2421,7 @@ void GlossyBsdfNode::compile(SVMCompiler &compiler)
if (closure == CLOSURE_BSDF_REFLECTION_ID)
BsdfNode::compile(compiler, NULL, NULL);
else if (closure == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID)
- BsdfNode::compile(compiler, input("Roughness"), NULL, input("Color"));
+ BsdfNode::compile(compiler, input("Roughness"), NULL, NULL, input("Color"));
else
BsdfNode::compile(compiler, input("Roughness"), NULL);
}
@@ -2430,6 +2754,8 @@ NODE_DEFINE(PrincipledBsdfNode)
SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f);
SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f);
SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f);
+ SOCKET_IN_COLOR(emission, "Emission", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_FLOAT(alpha, "Alpha", 1.0f);
SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL);
SOCKET_IN_NORMAL(clearcoat_normal,
"Clearcoat Normal",
@@ -2450,6 +2776,48 @@ PrincipledBsdfNode::PrincipledBsdfNode() : BsdfBaseNode(node_type)
distribution_orig = NBUILTIN_CLOSURES;
}
+void PrincipledBsdfNode::expand(ShaderGraph *graph)
+{
+ ShaderOutput *principled_out = output("BSDF");
+
+ ShaderInput *emission_in = input("Emission");
+ if (emission_in->link || emission != make_float3(0.0f, 0.0f, 0.0f)) {
+ /* Create add closure and emission. */
+ AddClosureNode *add = new AddClosureNode();
+ EmissionNode *emission_node = new EmissionNode();
+ ShaderOutput *new_out = add->output("Closure");
+
+ graph->add(add);
+ graph->add(emission_node);
+
+ emission_node->strength = 1.0f;
+ graph->relink(emission_in, emission_node->input("Color"));
+ graph->relink(principled_out, new_out);
+ graph->connect(emission_node->output("Emission"), add->input("Closure1"));
+ graph->connect(principled_out, add->input("Closure2"));
+
+ principled_out = new_out;
+ }
+
+ ShaderInput *alpha_in = input("Alpha");
+ if (alpha_in->link || alpha != 1.0f) {
+ /* Create mix and transparent BSDF for alpha transparency. */
+ MixClosureNode *mix = new MixClosureNode();
+ TransparentBsdfNode *transparent = new TransparentBsdfNode();
+
+ graph->add(mix);
+ graph->add(transparent);
+
+ graph->relink(alpha_in, mix->input("Fac"));
+ graph->relink(principled_out, mix->output("Closure"));
+ graph->connect(transparent->output("BSDF"), mix->input("Closure1"));
+ graph->connect(principled_out, mix->input("Closure2"));
+ }
+
+ remove_input(emission_in);
+ remove_input(alpha_in);
+}
+
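The expansion above handles emission and alpha by rewiring the graph rather than extending the closure itself. Sketch of the resulting wiring (an illustration, not code from the patch):

/*
 *   Principled BSDF ------------------+
 *                                     +--> AddClosure -----> Closure2 --+
 *   Emission color (strength 1.0) ----+                                 +--> MixClosure --> original BSDF links
 *   Transparent BSDF ---------------------------------------> Closure1 --+
 *   Alpha input -------------------------------------------------> Fac --+
 *
 * With Alpha = 1 the mix returns the emissive Principled result unchanged;
 * with Alpha = 0 only the Transparent BSDF remains.
 */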
bool PrincipledBsdfNode::has_surface_bssrdf()
{
ShaderInput *subsurface_in = input("Subsurface");
@@ -2630,7 +2998,7 @@ NODE_DEFINE(TransparentBsdfNode)
{
NodeType *type = NodeType::add("transparent_bsdf", create, NodeType::SHADER);
- SOCKET_IN_COLOR(color, "Color", make_float3(0.8f, 0.8f, 0.8f));
+ SOCKET_IN_COLOR(color, "Color", make_float3(1.0f, 1.0f, 1.0f));
SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
SOCKET_OUT_CLOSURE(BSDF, "BSDF");
@@ -3010,7 +3378,7 @@ NODE_DEFINE(PrincipledVolumeNode)
SOCKET_IN_COLOR(emission_color, "Emission Color", make_float3(1.0f, 1.0f, 1.0f));
SOCKET_IN_FLOAT(blackbody_intensity, "Blackbody Intensity", 0.0f);
SOCKET_IN_COLOR(blackbody_tint, "Blackbody Tint", make_float3(1.0f, 1.0f, 1.0f));
- SOCKET_IN_FLOAT(temperature, "Temperature", 1500.0f);
+ SOCKET_IN_FLOAT(temperature, "Temperature", 1000.0f);
SOCKET_IN_FLOAT(volume_mix_weight, "VolumeMixWeight", 0.0f, SocketType::SVM_INTERNAL);
SOCKET_OUT_CLOSURE(volume, "Volume");
@@ -3021,6 +3389,8 @@ NODE_DEFINE(PrincipledVolumeNode)
PrincipledVolumeNode::PrincipledVolumeNode() : VolumeNode(node_type)
{
closure = CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID;
+ density_attribute = ustring("density");
+ temperature_attribute = ustring("temperature");
}
void PrincipledVolumeNode::attributes(Shader *shader, AttributeRequestSet *attributes)
@@ -3289,6 +3659,7 @@ NODE_DEFINE(GeometryNode)
SOCKET_OUT_POINT(parametric, "Parametric");
SOCKET_OUT_FLOAT(backfacing, "Backfacing");
SOCKET_OUT_FLOAT(pointiness, "Pointiness");
+ SOCKET_OUT_FLOAT(random_per_island, "Random Per Island");
return type;
}
@@ -3307,6 +3678,9 @@ void GeometryNode::attributes(Shader *shader, AttributeRequestSet *attributes)
if (!output("Pointiness")->links.empty()) {
attributes->add(ATTR_STD_POINTINESS);
}
+ if (!output("Random Per Island")->links.empty()) {
+ attributes->add(ATTR_STD_RANDOM_PER_ISLAND);
+ }
}
ShaderNode::attributes(shader, attributes);
@@ -3372,6 +3746,17 @@ void GeometryNode::compile(SVMCompiler &compiler)
compiler.add_node(NODE_VALUE_F, __float_as_int(0.0f), compiler.stack_assign(out));
}
}
+
+ out = output("Random Per Island");
+ if (!out->links.empty()) {
+ if (compiler.output_type() != SHADER_TYPE_VOLUME) {
+ compiler.add_node(
+ attr_node, ATTR_STD_RANDOM_PER_ISLAND, compiler.stack_assign(out), NODE_ATTR_FLOAT);
+ }
+ else {
+ compiler.add_node(NODE_VALUE_F, __float_as_int(0.0f), compiler.stack_assign(out));
+ }
+ }
}
void GeometryNode::compile(OSLCompiler &compiler)
@@ -3811,6 +4196,7 @@ NODE_DEFINE(ObjectInfoNode)
NodeType *type = NodeType::add("object_info", create, NodeType::SHADER);
SOCKET_OUT_VECTOR(location, "Location");
+ SOCKET_OUT_COLOR(color, "Color");
SOCKET_OUT_FLOAT(object_index, "Object Index");
SOCKET_OUT_FLOAT(material_index, "Material Index");
SOCKET_OUT_FLOAT(random, "Random");
@@ -3829,6 +4215,11 @@ void ObjectInfoNode::compile(SVMCompiler &compiler)
compiler.add_node(NODE_OBJECT_INFO, NODE_INFO_OB_LOCATION, compiler.stack_assign(out));
}
+ out = output("Color");
+ if (!out->links.empty()) {
+ compiler.add_node(NODE_OBJECT_INFO, NODE_INFO_OB_COLOR, compiler.stack_assign(out));
+ }
+
out = output("Object Index");
if (!out->links.empty()) {
compiler.add_node(NODE_OBJECT_INFO, NODE_INFO_OB_INDEX, compiler.stack_assign(out));
@@ -3888,7 +4279,7 @@ void ParticleInfoNode::attributes(Shader *shader, AttributeRequestSet *attribute
if (!output("Location")->links.empty())
attributes->add(ATTR_STD_PARTICLE);
#if 0 /* not yet supported */
- if(!output("Rotation")->links.empty())
+ if (!output("Rotation")->links.empty())
attributes->add(ATTR_STD_PARTICLE);
#endif
if (!output("Size")->links.empty())
@@ -3933,7 +4324,7 @@ void ParticleInfoNode::compile(SVMCompiler &compiler)
/* quaternion data is not yet supported by Cycles */
#if 0
out = output("Rotation");
- if(!out->links.empty()) {
+ if (!out->links.empty()) {
compiler.add_node(NODE_PARTICLE_INFO, NODE_INFO_PAR_ROTATION, compiler.stack_assign(out));
}
#endif
@@ -4039,6 +4430,170 @@ void HairInfoNode::compile(OSLCompiler &compiler)
compiler.add(this, "node_hair_info");
}
+/* Volume Info */
+
+NODE_DEFINE(VolumeInfoNode)
+{
+ NodeType *type = NodeType::add("volume_info", create, NodeType::SHADER);
+
+ SOCKET_OUT_COLOR(color, "Color");
+ SOCKET_OUT_FLOAT(density, "Density");
+ SOCKET_OUT_FLOAT(flame, "Flame");
+ SOCKET_OUT_FLOAT(temperature, "Temperature");
+
+ return type;
+}
+
+VolumeInfoNode::VolumeInfoNode() : ShaderNode(node_type)
+{
+}
+
+/* The requested attributes are not updated after node expansion.
+ * So we explicitly request the required attributes.
+ */
+void VolumeInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+ if (shader->has_volume) {
+ if (!output("Color")->links.empty()) {
+ attributes->add(ATTR_STD_VOLUME_COLOR);
+ }
+ if (!output("Density")->links.empty()) {
+ attributes->add(ATTR_STD_VOLUME_DENSITY);
+ }
+ if (!output("Flame")->links.empty()) {
+ attributes->add(ATTR_STD_VOLUME_FLAME);
+ }
+ if (!output("Temperature")->links.empty()) {
+ attributes->add(ATTR_STD_VOLUME_TEMPERATURE);
+ }
+ attributes->add(ATTR_STD_GENERATED_TRANSFORM);
+ }
+ ShaderNode::attributes(shader, attributes);
+}
+
+void VolumeInfoNode::expand(ShaderGraph *graph)
+{
+ ShaderOutput *color_out = output("Color");
+ if (!color_out->links.empty()) {
+ AttributeNode *attr = new AttributeNode();
+ attr->attribute = "color";
+ graph->add(attr);
+ graph->relink(color_out, attr->output("Color"));
+ }
+
+ ShaderOutput *density_out = output("Density");
+ if (!density_out->links.empty()) {
+ AttributeNode *attr = new AttributeNode();
+ attr->attribute = "density";
+ graph->add(attr);
+ graph->relink(density_out, attr->output("Fac"));
+ }
+
+ ShaderOutput *flame_out = output("Flame");
+ if (!flame_out->links.empty()) {
+ AttributeNode *attr = new AttributeNode();
+ attr->attribute = "flame";
+ graph->add(attr);
+ graph->relink(flame_out, attr->output("Fac"));
+ }
+
+ ShaderOutput *temperature_out = output("Temperature");
+ if (!temperature_out->links.empty()) {
+ AttributeNode *attr = new AttributeNode();
+ attr->attribute = "temperature";
+ graph->add(attr);
+ graph->relink(temperature_out, attr->output("Fac"));
+ }
+}
+
+void VolumeInfoNode::compile(SVMCompiler &)
+{
+}
+
+void VolumeInfoNode::compile(OSLCompiler &)
+{
+}
+
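All four outputs follow the same pattern: each linked output is replaced by an AttributeNode lookup on a conventionally named volume grid, with Color relinked to the attribute's Color output and the scalar sockets relinked to Fac. A compact restatement of that mapping, as a sketch of the table the code above effectively encodes:

// Sketch only: summarizes the relinking done in VolumeInfoNode::expand() above.
struct VolumeInfoMapping {
  const char *output;       // VolumeInfoNode output socket
  const char *attribute;    // volume attribute requested from the grid
  const char *attr_output;  // AttributeNode output used for the relink
};

static const VolumeInfoMapping volume_info_map[] = {
    {"Color", "color", "Color"},
    {"Density", "density", "Fac"},
    {"Flame", "flame", "Fac"},
    {"Temperature", "temperature", "Fac"},
};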
+NODE_DEFINE(VertexColorNode)
+{
+ NodeType *type = NodeType::add("vertex_color", create, NodeType::SHADER);
+
+ SOCKET_STRING(layer_name, "Layer Name", ustring());
+ SOCKET_OUT_COLOR(color, "Color");
+ SOCKET_OUT_FLOAT(alpha, "Alpha");
+
+ return type;
+}
+
+VertexColorNode::VertexColorNode() : ShaderNode(node_type)
+{
+}
+
+void VertexColorNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+ if (!(output("Color")->links.empty() && output("Alpha")->links.empty())) {
+ if (layer_name != "")
+ attributes->add_standard(layer_name);
+ else
+ attributes->add(ATTR_STD_VERTEX_COLOR);
+ }
+ ShaderNode::attributes(shader, attributes);
+}
+
+void VertexColorNode::compile(SVMCompiler &compiler)
+{
+ ShaderOutput *color_out = output("Color");
+ ShaderOutput *alpha_out = output("Alpha");
+ int layer_id = 0;
+
+ if (layer_name != "") {
+ layer_id = compiler.attribute(layer_name);
+ }
+ else {
+ layer_id = compiler.attribute(ATTR_STD_VERTEX_COLOR);
+ }
+
+ ShaderNodeType node;
+
+ if (bump == SHADER_BUMP_DX)
+ node = NODE_VERTEX_COLOR_BUMP_DX;
+ else if (bump == SHADER_BUMP_DY)
+ node = NODE_VERTEX_COLOR_BUMP_DY;
+ else {
+ node = NODE_VERTEX_COLOR;
+ }
+
+ compiler.add_node(
+ node, layer_id, compiler.stack_assign(color_out), compiler.stack_assign(alpha_out));
+}
+
+void VertexColorNode::compile(OSLCompiler &compiler)
+{
+ if (bump == SHADER_BUMP_DX) {
+ compiler.parameter("bump_offset", "dx");
+ }
+ else if (bump == SHADER_BUMP_DY) {
+ compiler.parameter("bump_offset", "dy");
+ }
+ else {
+ compiler.parameter("bump_offset", "center");
+ }
+
+ if (layer_name.empty()) {
+ compiler.parameter("layer_name", ustring("geom:vertex_color"));
+ }
+ else {
+ if (Attribute::name_standard(layer_name.c_str()) != ATTR_STD_NONE) {
+ compiler.parameter("name", (string("geom:") + layer_name.c_str()).c_str());
+ }
+ else {
+ compiler.parameter("layer_name", layer_name.c_str());
+ }
+ }
+
+ compiler.add(this, "node_vertex_color");
+}
+
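The OSL path above resolves the layer name in three ways: an empty name falls back to the standard vertex color attribute, a name recognized as a standard attribute is passed with a geom: prefix through the "name" parameter, and any other name is passed through as a per-mesh layer name. A small sketch of that resolution, where name_is_standard stands in for the Attribute::name_standard() check:

#include <string>

// Sketch of the layer-name resolution in VertexColorNode::compile(OSLCompiler &) above.
static std::string resolve_vertex_color_name(const std::string &layer_name, bool name_is_standard)
{
  if (layer_name.empty())
    return "geom:vertex_color";   // default standard attribute
  if (name_is_standard)
    return "geom:" + layer_name;  // standard attribute, passed via the "name" parameter
  return layer_name;              // custom layer, passed via "layer_name"
}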
/* Value */
NODE_DEFINE(ValueNode)
@@ -5179,6 +5734,212 @@ void OutputNode::compile(OSLCompiler &compiler)
compiler.add(this, "node_output_displacement");
}
+/* Map Range Node */
+
+NODE_DEFINE(MapRangeNode)
+{
+ NodeType *type = NodeType::add("map_range", create, NodeType::SHADER);
+
+ static NodeEnum type_enum;
+ type_enum.insert("linear", NODE_MAP_RANGE_LINEAR);
+ type_enum.insert("stepped", NODE_MAP_RANGE_STEPPED);
+ type_enum.insert("smoothstep", NODE_MAP_RANGE_SMOOTHSTEP);
+ type_enum.insert("smootherstep", NODE_MAP_RANGE_SMOOTHERSTEP);
+ SOCKET_ENUM(type, "Type", type_enum, NODE_MAP_RANGE_LINEAR);
+
+ SOCKET_IN_FLOAT(value, "Value", 1.0f);
+ SOCKET_IN_FLOAT(from_min, "From Min", 0.0f);
+ SOCKET_IN_FLOAT(from_max, "From Max", 1.0f);
+ SOCKET_IN_FLOAT(to_min, "To Min", 0.0f);
+ SOCKET_IN_FLOAT(to_max, "To Max", 1.0f);
+ SOCKET_IN_FLOAT(steps, "Steps", 4.0f);
+
+ SOCKET_OUT_FLOAT(result, "Result");
+
+ return type;
+}
+
+MapRangeNode::MapRangeNode() : ShaderNode(node_type)
+{
+}
+
+void MapRangeNode::expand(ShaderGraph *graph)
+{
+ if (clamp) {
+ ShaderOutput *result_out = output("Result");
+ if (!result_out->links.empty()) {
+ ClampNode *clamp_node = new ClampNode();
+ clamp_node->type = NODE_CLAMP_RANGE;
+ graph->add(clamp_node);
+ graph->relink(result_out, clamp_node->output("Result"));
+ graph->connect(result_out, clamp_node->input("Value"));
+ if (input("To Min")->link) {
+ graph->connect(input("To Min")->link, clamp_node->input("Min"));
+ }
+ else {
+ clamp_node->min = to_min;
+ }
+ if (input("To Max")->link) {
+ graph->connect(input("To Max")->link, clamp_node->input("Max"));
+ }
+ else {
+ clamp_node->max = to_max;
+ }
+ }
+ }
+}
+
+void MapRangeNode::compile(SVMCompiler &compiler)
+{
+ ShaderInput *value_in = input("Value");
+ ShaderInput *from_min_in = input("From Min");
+ ShaderInput *from_max_in = input("From Max");
+ ShaderInput *to_min_in = input("To Min");
+ ShaderInput *to_max_in = input("To Max");
+ ShaderInput *steps_in = input("Steps");
+ ShaderOutput *result_out = output("Result");
+
+ int value_stack_offset = compiler.stack_assign(value_in);
+ int from_min_stack_offset = compiler.stack_assign_if_linked(from_min_in);
+ int from_max_stack_offset = compiler.stack_assign_if_linked(from_max_in);
+ int to_min_stack_offset = compiler.stack_assign_if_linked(to_min_in);
+ int to_max_stack_offset = compiler.stack_assign_if_linked(to_max_in);
+ int steps_stack_offset = compiler.stack_assign(steps_in);
+ int result_stack_offset = compiler.stack_assign(result_out);
+
+ compiler.add_node(
+ NODE_MAP_RANGE,
+ value_stack_offset,
+ compiler.encode_uchar4(
+ from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset),
+ compiler.encode_uchar4(type, steps_stack_offset, result_stack_offset));
+
+ compiler.add_node(__float_as_int(from_min),
+ __float_as_int(from_max),
+ __float_as_int(to_min),
+ __float_as_int(to_max));
+ compiler.add_node(__float_as_int(steps));
+}
+
+void MapRangeNode::compile(OSLCompiler &compiler)
+{
+ compiler.parameter(this, "type");
+ compiler.add(this, "node_map_range");
+}
+
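In the linear mode the node evaluates the usual remap, result = to_min + (value - from_min) * (to_max - to_min) / (from_max - from_min); clamping is not done here but by the ClampNode that expand() appends. A standalone sketch of that math, not the SVM kernel itself, and assuming the degenerate from_min == from_max case is guarded elsewhere:

#include <algorithm>

// Linear map range plus the optional range clamp appended by MapRangeNode::expand().
static float map_range_linear(
    float value, float from_min, float from_max, float to_min, float to_max, bool use_clamp)
{
  float factor = (value - from_min) / (from_max - from_min);
  float result = to_min + factor * (to_max - to_min);
  if (use_clamp) {
    // NODE_CLAMP_RANGE clamps against the ordered bounds even when to_min > to_max.
    result = std::clamp(result, std::min(to_min, to_max), std::max(to_min, to_max));
  }
  return result;
}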
+/* Clamp Node */
+
+NODE_DEFINE(ClampNode)
+{
+ NodeType *type = NodeType::add("clamp", create, NodeType::SHADER);
+
+ static NodeEnum type_enum;
+ type_enum.insert("minmax", NODE_CLAMP_MINMAX);
+ type_enum.insert("range", NODE_CLAMP_RANGE);
+ SOCKET_ENUM(type, "Type", type_enum, NODE_CLAMP_MINMAX);
+
+ SOCKET_IN_FLOAT(value, "Value", 1.0f);
+ SOCKET_IN_FLOAT(min, "Min", 0.0f);
+ SOCKET_IN_FLOAT(max, "Max", 1.0f);
+
+ SOCKET_OUT_FLOAT(result, "Result");
+
+ return type;
+}
+
+ClampNode::ClampNode() : ShaderNode(node_type)
+{
+}
+
+void ClampNode::constant_fold(const ConstantFolder &folder)
+{
+ if (folder.all_inputs_constant()) {
+ if (type == NODE_CLAMP_RANGE && (min > max)) {
+ folder.make_constant(clamp(value, max, min));
+ }
+ else {
+ folder.make_constant(clamp(value, min, max));
+ }
+ }
+}
+
+void ClampNode::compile(SVMCompiler &compiler)
+{
+ ShaderInput *value_in = input("Value");
+ ShaderInput *min_in = input("Min");
+ ShaderInput *max_in = input("Max");
+ ShaderOutput *result_out = output("Result");
+
+ int value_stack_offset = compiler.stack_assign(value_in);
+ int min_stack_offset = compiler.stack_assign(min_in);
+ int max_stack_offset = compiler.stack_assign(max_in);
+ int result_stack_offset = compiler.stack_assign(result_out);
+
+ compiler.add_node(NODE_CLAMP,
+ value_stack_offset,
+ compiler.encode_uchar4(min_stack_offset, max_stack_offset, type),
+ result_stack_offset);
+ compiler.add_node(__float_as_int(min), __float_as_int(max));
+}
+
+void ClampNode::compile(OSLCompiler &compiler)
+{
+ compiler.parameter(this, "type");
+ compiler.add(this, "node_clamp");
+}
+
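The two clamp modes only differ when the bounds are inverted. For example, with value = 0.5, min = 1.0 and max = 0.0, and assuming the usual clamp(v, lo, hi) = min(max(v, lo), hi): the Min Max mode pins the result to 0.0, while the Range mode takes the min > max branch above, swaps the bounds, and returns 0.5.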
+/* AOV Output */
+
+NODE_DEFINE(OutputAOVNode)
+{
+ NodeType *type = NodeType::add("aov_output", create, NodeType::SHADER);
+
+ SOCKET_IN_COLOR(color, "Color", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_FLOAT(value, "Value", 0.0f);
+
+ SOCKET_STRING(name, "AOV Name", ustring(""));
+
+ return type;
+}
+
+OutputAOVNode::OutputAOVNode() : ShaderNode(node_type)
+{
+ special_type = SHADER_SPECIAL_TYPE_OUTPUT_AOV;
+ slot = -1;
+}
+
+void OutputAOVNode::simplify_settings(Scene *scene)
+{
+ slot = scene->film->get_aov_offset(name.string(), is_color);
+ if (slot == -1) {
+ slot = scene->film->get_aov_offset(name.string(), is_color);
+ }
+
+ if (slot == -1 || is_color) {
+ input("Value")->disconnect();
+ }
+ if (slot == -1 || !is_color) {
+ input("Color")->disconnect();
+ }
+}
+
+void OutputAOVNode::compile(SVMCompiler &compiler)
+{
+ assert(slot >= 0);
+
+ if (is_color) {
+ compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), slot);
+ }
+ else {
+ compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), slot);
+ }
+}
+
+void OutputAOVNode::compile(OSLCompiler & /*compiler*/)
+{
+ /* TODO */
+}
+
/* Math */
NODE_DEFINE(MathNode)
@@ -5190,9 +5951,13 @@ NODE_DEFINE(MathNode)
type_enum.insert("subtract", NODE_MATH_SUBTRACT);
type_enum.insert("multiply", NODE_MATH_MULTIPLY);
type_enum.insert("divide", NODE_MATH_DIVIDE);
+ type_enum.insert("multiply_add", NODE_MATH_MULTIPLY_ADD);
type_enum.insert("sine", NODE_MATH_SINE);
type_enum.insert("cosine", NODE_MATH_COSINE);
type_enum.insert("tangent", NODE_MATH_TANGENT);
+ type_enum.insert("sinh", NODE_MATH_SINH);
+ type_enum.insert("cosh", NODE_MATH_COSH);
+ type_enum.insert("tanh", NODE_MATH_TANH);
type_enum.insert("arcsine", NODE_MATH_ARCSINE);
type_enum.insert("arccosine", NODE_MATH_ARCCOSINE);
type_enum.insert("arctangent", NODE_MATH_ARCTANGENT);
@@ -5208,14 +5973,27 @@ NODE_DEFINE(MathNode)
type_enum.insert("arctan2", NODE_MATH_ARCTAN2);
type_enum.insert("floor", NODE_MATH_FLOOR);
type_enum.insert("ceil", NODE_MATH_CEIL);
- type_enum.insert("fract", NODE_MATH_FRACT);
+ type_enum.insert("fraction", NODE_MATH_FRACTION);
+ type_enum.insert("trunc", NODE_MATH_TRUNC);
+ type_enum.insert("snap", NODE_MATH_SNAP);
+ type_enum.insert("wrap", NODE_MATH_WRAP);
+ type_enum.insert("pingpong", NODE_MATH_PINGPONG);
type_enum.insert("sqrt", NODE_MATH_SQRT);
+ type_enum.insert("inversesqrt", NODE_MATH_INV_SQRT);
+ type_enum.insert("sign", NODE_MATH_SIGN);
+ type_enum.insert("exponent", NODE_MATH_EXPONENT);
+ type_enum.insert("radians", NODE_MATH_RADIANS);
+ type_enum.insert("degrees", NODE_MATH_DEGREES);
+ type_enum.insert("smoothmin", NODE_MATH_SMOOTH_MIN);
+ type_enum.insert("smoothmax", NODE_MATH_SMOOTH_MAX);
+ type_enum.insert("compare", NODE_MATH_COMPARE);
SOCKET_ENUM(type, "Type", type_enum, NODE_MATH_ADD);
SOCKET_BOOLEAN(use_clamp, "Use Clamp", false);
- SOCKET_IN_FLOAT(value1, "Value1", 0.0f);
- SOCKET_IN_FLOAT(value2, "Value2", 0.0f);
+ SOCKET_IN_FLOAT(value1, "Value1", 0.5f);
+ SOCKET_IN_FLOAT(value2, "Value2", 0.5f);
+ SOCKET_IN_FLOAT(value3, "Value3", 0.0f);
SOCKET_OUT_FLOAT(value, "Value");
@@ -5226,13 +6004,29 @@ MathNode::MathNode() : ShaderNode(node_type)
{
}
+void MathNode::expand(ShaderGraph *graph)
+{
+ if (use_clamp) {
+ ShaderOutput *result_out = output("Value");
+ if (!result_out->links.empty()) {
+ ClampNode *clamp_node = new ClampNode();
+ clamp_node->type = NODE_CLAMP_MINMAX;
+ clamp_node->min = 0.0f;
+ clamp_node->max = 1.0f;
+ graph->add(clamp_node);
+ graph->relink(result_out, clamp_node->output("Result"));
+ graph->connect(result_out, clamp_node->input("Value"));
+ }
+ }
+}
+
void MathNode::constant_fold(const ConstantFolder &folder)
{
if (folder.all_inputs_constant()) {
- folder.make_constant_clamp(svm_math(type, value1, value2), use_clamp);
+ folder.make_constant(svm_math(type, value1, value2, value3));
}
else {
- folder.fold_math(type, use_clamp);
+ folder.fold_math(type);
}
}
@@ -5240,22 +6034,24 @@ void MathNode::compile(SVMCompiler &compiler)
{
ShaderInput *value1_in = input("Value1");
ShaderInput *value2_in = input("Value2");
+ ShaderInput *value3_in = input("Value3");
ShaderOutput *value_out = output("Value");
- compiler.add_node(
- NODE_MATH, type, compiler.stack_assign(value1_in), compiler.stack_assign(value2_in));
- compiler.add_node(NODE_MATH, compiler.stack_assign(value_out));
+ int value1_stack_offset = compiler.stack_assign(value1_in);
+ int value2_stack_offset = compiler.stack_assign(value2_in);
+ int value3_stack_offset = compiler.stack_assign(value3_in);
+ int value_stack_offset = compiler.stack_assign(value_out);
- if (use_clamp) {
- compiler.add_node(NODE_MATH, NODE_MATH_CLAMP, compiler.stack_assign(value_out));
- compiler.add_node(NODE_MATH, compiler.stack_assign(value_out));
- }
+ compiler.add_node(
+ NODE_MATH,
+ type,
+ compiler.encode_uchar4(value1_stack_offset, value2_stack_offset, value3_stack_offset),
+ value_stack_offset);
}
void MathNode::compile(OSLCompiler &compiler)
{
compiler.parameter(this, "type");
- compiler.parameter(this, "use_clamp");
compiler.add(this, "node_math");
}
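The rewritten SVM emission packs the three value stack offsets into one uint with compiler.encode_uchar4(), which is why each offset has to fit in a byte. A rough sketch of that kind of packing, assuming a low-byte-first layout (illustration only, not the actual Cycles helper):

#include <cassert>
#include <cstdint>

// Four 8-bit operands packed into a 32-bit word; stack offsets are limited to 0..255.
static uint32_t pack_uchar4(uint32_t x, uint32_t y, uint32_t z, uint32_t w = 0)
{
  assert(x <= 0xFF && y <= 0xFF && z <= 0xFF && w <= 0xFF);
  return x | (y << 8) | (z << 16) | (w << 24);
}
// The kernel side would unpack with the matching shifts and masks.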
@@ -5268,14 +6064,38 @@ NODE_DEFINE(VectorMathNode)
static NodeEnum type_enum;
type_enum.insert("add", NODE_VECTOR_MATH_ADD);
type_enum.insert("subtract", NODE_VECTOR_MATH_SUBTRACT);
- type_enum.insert("average", NODE_VECTOR_MATH_AVERAGE);
- type_enum.insert("dot_product", NODE_VECTOR_MATH_DOT_PRODUCT);
+ type_enum.insert("multiply", NODE_VECTOR_MATH_MULTIPLY);
+ type_enum.insert("divide", NODE_VECTOR_MATH_DIVIDE);
+
type_enum.insert("cross_product", NODE_VECTOR_MATH_CROSS_PRODUCT);
+ type_enum.insert("project", NODE_VECTOR_MATH_PROJECT);
+ type_enum.insert("reflect", NODE_VECTOR_MATH_REFLECT);
+ type_enum.insert("dot_product", NODE_VECTOR_MATH_DOT_PRODUCT);
+
+ type_enum.insert("distance", NODE_VECTOR_MATH_DISTANCE);
+ type_enum.insert("length", NODE_VECTOR_MATH_LENGTH);
+ type_enum.insert("scale", NODE_VECTOR_MATH_SCALE);
type_enum.insert("normalize", NODE_VECTOR_MATH_NORMALIZE);
+
+ type_enum.insert("snap", NODE_VECTOR_MATH_SNAP);
+ type_enum.insert("floor", NODE_VECTOR_MATH_FLOOR);
+ type_enum.insert("ceil", NODE_VECTOR_MATH_CEIL);
+ type_enum.insert("modulo", NODE_VECTOR_MATH_MODULO);
+ type_enum.insert("wrap", NODE_VECTOR_MATH_WRAP);
+ type_enum.insert("fraction", NODE_VECTOR_MATH_FRACTION);
+ type_enum.insert("absolute", NODE_VECTOR_MATH_ABSOLUTE);
+ type_enum.insert("minimum", NODE_VECTOR_MATH_MINIMUM);
+ type_enum.insert("maximum", NODE_VECTOR_MATH_MAXIMUM);
+
+ type_enum.insert("sine", NODE_VECTOR_MATH_SINE);
+ type_enum.insert("cosine", NODE_VECTOR_MATH_COSINE);
+ type_enum.insert("tangent", NODE_VECTOR_MATH_TANGENT);
SOCKET_ENUM(type, "Type", type_enum, NODE_VECTOR_MATH_ADD);
SOCKET_IN_VECTOR(vector1, "Vector1", make_float3(0.0f, 0.0f, 0.0f));
SOCKET_IN_VECTOR(vector2, "Vector2", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_VECTOR(vector3, "Vector3", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
SOCKET_OUT_FLOAT(value, "Value");
SOCKET_OUT_VECTOR(vector, "Vector");
@@ -5289,12 +6109,11 @@ VectorMathNode::VectorMathNode() : ShaderNode(node_type)
void VectorMathNode::constant_fold(const ConstantFolder &folder)
{
- float value;
- float3 vector;
+ float value = 0.0f;
+ float3 vector = make_float3(0.0f, 0.0f, 0.0f);
if (folder.all_inputs_constant()) {
- svm_vector_math(&value, &vector, type, vector1, vector2);
-
+ svm_vector_math(&value, &vector, type, vector1, vector2, vector3, scale);
if (folder.output == output("Value")) {
folder.make_constant(value);
}
@@ -5311,15 +6130,34 @@ void VectorMathNode::compile(SVMCompiler &compiler)
{
ShaderInput *vector1_in = input("Vector1");
ShaderInput *vector2_in = input("Vector2");
+ ShaderInput *scale_in = input("Scale");
ShaderOutput *value_out = output("Value");
ShaderOutput *vector_out = output("Vector");
- compiler.add_node(NODE_VECTOR_MATH,
- type,
- compiler.stack_assign(vector1_in),
- compiler.stack_assign(vector2_in));
- compiler.add_node(
- NODE_VECTOR_MATH, compiler.stack_assign(value_out), compiler.stack_assign(vector_out));
+ int vector1_stack_offset = compiler.stack_assign(vector1_in);
+ int vector2_stack_offset = compiler.stack_assign(vector2_in);
+ int scale_stack_offset = compiler.stack_assign(scale_in);
+ int value_stack_offset = compiler.stack_assign_if_linked(value_out);
+ int vector_stack_offset = compiler.stack_assign_if_linked(vector_out);
+
+ /* 3 Vector Operators */
+ if (type == NODE_VECTOR_MATH_WRAP) {
+ ShaderInput *vector3_in = input("Vector3");
+ int vector3_stack_offset = compiler.stack_assign(vector3_in);
+ compiler.add_node(
+ NODE_VECTOR_MATH,
+ type,
+ compiler.encode_uchar4(vector1_stack_offset, vector2_stack_offset, scale_stack_offset),
+ compiler.encode_uchar4(value_stack_offset, vector_stack_offset));
+ compiler.add_node(vector3_stack_offset);
+ }
+ else {
+ compiler.add_node(
+ NODE_VECTOR_MATH,
+ type,
+ compiler.encode_uchar4(vector1_stack_offset, vector2_stack_offset, scale_stack_offset),
+ compiler.encode_uchar4(value_stack_offset, vector_stack_offset));
+ }
}
void VectorMathNode::compile(OSLCompiler &compiler)
@@ -5328,6 +6166,62 @@ void VectorMathNode::compile(OSLCompiler &compiler)
compiler.add(this, "node_vector_math");
}
+/* Vector Rotate */
+
+NODE_DEFINE(VectorRotateNode)
+{
+ NodeType *type = NodeType::add("vector_rotate", create, NodeType::SHADER);
+
+ static NodeEnum type_enum;
+ type_enum.insert("axis", NODE_VECTOR_ROTATE_TYPE_AXIS);
+ type_enum.insert("x_axis", NODE_VECTOR_ROTATE_TYPE_AXIS_X);
+ type_enum.insert("y_axis", NODE_VECTOR_ROTATE_TYPE_AXIS_Y);
+ type_enum.insert("z_axis", NODE_VECTOR_ROTATE_TYPE_AXIS_Z);
+ type_enum.insert("euler_xyz", NODE_VECTOR_ROTATE_TYPE_EULER_XYZ);
+ SOCKET_ENUM(type, "Type", type_enum, NODE_VECTOR_ROTATE_TYPE_AXIS);
+
+ SOCKET_BOOLEAN(invert, "Invert", false);
+
+ SOCKET_IN_VECTOR(vector, "Vector", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_POINT(rotation, "Rotation", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_POINT(center, "Center", make_float3(0.0f, 0.0f, 0.0f));
+ SOCKET_IN_VECTOR(axis, "Axis", make_float3(0.0f, 0.0f, 1.0f));
+ SOCKET_IN_FLOAT(angle, "Angle", 0.0f);
+ SOCKET_OUT_VECTOR(vector, "Vector");
+
+ return type;
+}
+
+VectorRotateNode::VectorRotateNode() : ShaderNode(node_type)
+{
+}
+
+void VectorRotateNode::compile(SVMCompiler &compiler)
+{
+ ShaderInput *vector_in = input("Vector");
+ ShaderInput *rotation_in = input("Rotation");
+ ShaderInput *center_in = input("Center");
+ ShaderInput *axis_in = input("Axis");
+ ShaderInput *angle_in = input("Angle");
+ ShaderOutput *vector_out = output("Vector");
+
+ compiler.add_node(
+ NODE_VECTOR_ROTATE,
+ compiler.encode_uchar4(
+ type, compiler.stack_assign(vector_in), compiler.stack_assign(rotation_in), invert),
+ compiler.encode_uchar4(compiler.stack_assign(center_in),
+ compiler.stack_assign(axis_in),
+ compiler.stack_assign(angle_in)),
+ compiler.stack_assign(vector_out));
+}
+
+void VectorRotateNode::compile(OSLCompiler &compiler)
+{
+ compiler.parameter(this, "type");
+ compiler.parameter(this, "invert");
+ compiler.add(this, "node_vector_rotate");
+}
+
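For the axis/angle mode the node conceptually translates the input to the rotation center, rotates it about the (normalized) axis, and translates back; the Invert option can be modeled by negating the angle. A standalone sketch using Rodrigues' rotation formula, as an illustration of the convention rather than the kernel code:

#include <array>
#include <cmath>

using float3_t = std::array<float, 3>;

static float3_t cross3(const float3_t &a, const float3_t &b)
{
  return {a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]};
}

static float dot3(const float3_t &a, const float3_t &b)
{
  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

// Rotate p around `axis` (assumed normalized) by `angle`, about point `center`.
static float3_t rotate_around_axis(const float3_t &p,
                                   const float3_t &center,
                                   const float3_t &axis,
                                   float angle)
{
  float3_t v = {p[0] - center[0], p[1] - center[1], p[2] - center[2]};
  const float c = std::cos(angle), s = std::sin(angle);
  const float3_t k_cross_v = cross3(axis, v);
  const float k_dot_v = dot3(axis, v);
  float3_t r;
  for (int i = 0; i < 3; i++) {
    // Rodrigues: v' = v cos + (k x v) sin + k (k . v)(1 - cos), then translate back.
    r[i] = v[i] * c + k_cross_v[i] * s + axis[i] * k_dot_v * (1.0f - c) + center[i];
  }
  return r;
}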
/* VectorTransform */
NODE_DEFINE(VectorTransformNode)
@@ -5718,7 +6612,7 @@ void SetNormalNode::compile(OSLCompiler &compiler)
OSLNode::OSLNode() : ShaderNode(new NodeType(NodeType::SHADER))
{
- special_type = SHADER_SPECIAL_TYPE_SCRIPT;
+ special_type = SHADER_SPECIAL_TYPE_OSL;
}
OSLNode::~OSLNode()
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 7796711115e..326f1d14168 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -17,8 +17,9 @@
#ifndef __NODES_H__
#define __NODES_H__
-#include "render/graph.h"
#include "graph/node.h"
+#include "render/graph.h"
+#include "render/image.h"
#include "util/util_array.h"
#include "util/util_string.h"
@@ -78,13 +79,19 @@ class ImageSlotTextureNode : public TextureNode {
{
special_type = SHADER_SPECIAL_TYPE_IMAGE_SLOT;
}
- int slot;
+
+ virtual bool equals(const ShaderNode &other)
+ {
+ const ImageSlotTextureNode &other_node = (const ImageSlotTextureNode &)other;
+ return TextureNode::equals(other) && handle == other_node.handle;
+ }
+
+ ImageHandle handle;
};
class ImageTextureNode : public ImageSlotTextureNode {
public:
SHADER_NODE_NO_CLONE_CLASS(ImageTextureNode)
- ~ImageTextureNode();
ShaderNode *clone() const;
void attributes(Shader *shader, AttributeRequestSet *attributes);
bool has_attribute_dependency()
@@ -92,32 +99,33 @@ class ImageTextureNode : public ImageSlotTextureNode {
return true;
}
- ImageManager *image_manager;
- int is_float;
- bool is_linear;
- bool use_alpha;
+ virtual bool equals(const ShaderNode &other)
+ {
+ const ImageTextureNode &other_node = (const ImageTextureNode &)other;
+ return ImageSlotTextureNode::equals(other) && animated == other_node.animated;
+ }
+
+ ImageParams image_params() const;
+
+ /* Parameters. */
ustring filename;
- void *builtin_data;
- NodeImageColorSpace color_space;
+ ustring colorspace;
+ ImageAlphaType alpha_type;
NodeImageProjection projection;
InterpolationType interpolation;
ExtensionType extension;
float projection_blend;
bool animated;
float3 vector;
+ ccl::vector<int> tiles;
- virtual bool equals(const ShaderNode &other)
- {
- const ImageTextureNode &image_node = (const ImageTextureNode &)other;
- return ImageSlotTextureNode::equals(other) && builtin_data == image_node.builtin_data &&
- animated == image_node.animated;
- }
+ protected:
+ void cull_tiles(Scene *scene, ShaderGraph *graph);
};
class EnvironmentTextureNode : public ImageSlotTextureNode {
public:
SHADER_NODE_NO_CLONE_CLASS(EnvironmentTextureNode)
- ~EnvironmentTextureNode();
ShaderNode *clone() const;
void attributes(Shader *shader, AttributeRequestSet *attributes);
bool has_attribute_dependency()
@@ -129,24 +137,22 @@ class EnvironmentTextureNode : public ImageSlotTextureNode {
return NODE_GROUP_LEVEL_2;
}
- ImageManager *image_manager;
- int is_float;
- bool is_linear;
- bool use_alpha;
+ virtual bool equals(const ShaderNode &other)
+ {
+ const EnvironmentTextureNode &other_node = (const EnvironmentTextureNode &)other;
+ return ImageSlotTextureNode::equals(other) && animated == other_node.animated;
+ }
+
+ ImageParams image_params() const;
+
+ /* Parameters. */
ustring filename;
- void *builtin_data;
- NodeImageColorSpace color_space;
+ ustring colorspace;
+ ImageAlphaType alpha_type;
NodeEnvironmentProjection projection;
InterpolationType interpolation;
bool animated;
float3 vector;
-
- virtual bool equals(const ShaderNode &other)
- {
- const EnvironmentTextureNode &env_node = (const EnvironmentTextureNode &)other;
- return ImageSlotTextureNode::equals(other) && builtin_data == env_node.builtin_data &&
- animated == env_node.animated;
- }
};
class SkyTextureNode : public TextureNode {
@@ -162,7 +168,17 @@ class SkyTextureNode : public TextureNode {
float3 sun_direction;
float turbidity;
float ground_albedo;
+ bool sun_disc;
+ float sun_size;
+ float sun_intensity;
+ float sun_elevation;
+ float sun_rotation;
+ float altitude;
+ float air_density;
+ float dust_density;
+ float ozone_density;
float3 vector;
+ ImageHandle handle;
};
class OutputNode : public ShaderNode {
@@ -181,6 +197,31 @@ class OutputNode : public ShaderNode {
}
};
+class OutputAOVNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(OutputAOVNode)
+ virtual void simplify_settings(Scene *scene);
+
+ float value;
+ float3 color;
+
+ ustring name;
+
+ virtual int get_group()
+ {
+ return NODE_GROUP_LEVEL_4;
+ }
+
+ /* Don't allow output node de-duplication. */
+ virtual bool equals(const ShaderNode & /*other*/)
+ {
+ return false;
+ }
+
+ int slot;
+ bool is_color;
+};
+
class GradientTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(GradientTextureNode)
@@ -198,7 +239,8 @@ class NoiseTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(NoiseTextureNode)
- float scale, detail, distortion;
+ int dimensions;
+ float w, scale, detail, roughness, distortion;
float3 vector;
};
@@ -211,10 +253,22 @@ class VoronoiTextureNode : public TextureNode {
return NODE_GROUP_LEVEL_2;
}
- NodeVoronoiColoring coloring;
+ virtual int get_feature()
+ {
+ int result = ShaderNode::get_feature();
+ if (dimensions == 4) {
+ result |= NODE_FEATURE_VORONOI_EXTRA;
+ }
+ else if (dimensions >= 2 && feature == NODE_VORONOI_SMOOTH_F1) {
+ result |= NODE_FEATURE_VORONOI_EXTRA;
+ }
+ return result;
+ }
+
+ int dimensions;
NodeVoronoiDistanceMetric metric;
NodeVoronoiFeature feature;
- float scale, exponent;
+ float w, scale, exponent, smoothness, randomness;
float3 vector;
};
@@ -227,8 +281,9 @@ class MusgraveTextureNode : public TextureNode {
return NODE_GROUP_LEVEL_2;
}
+ int dimensions;
NodeMusgraveType type;
- float scale, detail, dimension, lacunarity, offset, gain;
+ float w, scale, detail, dimension, lacunarity, offset, gain;
float3 vector;
};
@@ -242,9 +297,11 @@ class WaveTextureNode : public TextureNode {
}
NodeWaveType type;
+ NodeWaveBandsDirection bands_direction;
+ NodeWaveRingsDirection rings_direction;
NodeWaveProfile profile;
- float scale, distortion, detail, detail_scale;
+ float scale, distortion, detail, detail_scale, detail_roughness, phase;
float3 vector;
};
@@ -297,7 +354,7 @@ class PointDensityTextureNode : public ShaderNode {
SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
virtual int get_group()
{
- return NODE_GROUP_LEVEL_3;
+ return NODE_GROUP_LEVEL_4;
}
~PointDensityTextureNode();
@@ -312,27 +369,23 @@ class PointDensityTextureNode : public ShaderNode {
{
return true;
}
- bool has_object_dependency()
- {
- return true;
- }
-
- void add_image();
+ /* Parameters. */
ustring filename;
NodeTexVoxelSpace space;
InterpolationType interpolation;
Transform tfm;
float3 vector;
- ImageManager *image_manager;
- int slot;
- void *builtin_data;
+ /* Runtime. */
+ ImageHandle handle;
+
+ ImageParams image_params() const;
virtual bool equals(const ShaderNode &other)
{
- const PointDensityTextureNode &point_dendity_node = (const PointDensityTextureNode &)other;
- return ShaderNode::equals(other) && builtin_data == point_dendity_node.builtin_data;
+ const PointDensityTextureNode &other_node = (const PointDensityTextureNode &)other;
+ return ShaderNode::equals(other) && handle == other_node.handle;
}
};
@@ -360,6 +413,19 @@ class IESLightNode : public TextureNode {
void get_slot();
};
+class WhiteNoiseTextureNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(WhiteNoiseTextureNode)
+ virtual int get_group()
+ {
+ return NODE_GROUP_LEVEL_2;
+ }
+
+ int dimensions;
+ float3 vector;
+ float w;
+};
+
class MappingNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MappingNode)
@@ -367,9 +433,10 @@ class MappingNode : public ShaderNode {
{
return NODE_GROUP_LEVEL_2;
}
+ void constant_fold(const ConstantFolder &folder);
- float3 vector;
- TextureMapping tex_mapping;
+ float3 vector, location, rotation, scale;
+ NodeMappingType type;
};
class RGBToBWNode : public ShaderNode {
@@ -477,6 +544,7 @@ class PrincipledBsdfNode : public BsdfBaseNode {
public:
SHADER_NODE_CLASS(PrincipledBsdfNode)
+ void expand(ShaderGraph *graph);
bool has_surface_bssrdf();
bool has_bssrdf_bump();
void compile(SVMCompiler &compiler,
@@ -505,6 +573,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
float surface_mix_weight;
ClosureType distribution, distribution_orig;
ClosureType subsurface_method;
+ float3 emission;
+ float alpha;
bool has_integrator_dependency();
void attributes(Shader *shader, AttributeRequestSet *attributes);
@@ -832,10 +902,6 @@ class TextureCoordinateNode : public ShaderNode {
{
return true;
}
- bool has_object_dependency()
- {
- return use_transform;
- }
float3 normal_osl;
bool from_dupli;
@@ -935,6 +1001,37 @@ class HairInfoNode : public ShaderNode {
}
};
+class VolumeInfoNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(VolumeInfoNode)
+ void attributes(Shader *shader, AttributeRequestSet *attributes);
+ bool has_attribute_dependency()
+ {
+ return true;
+ }
+ bool has_spatial_varying()
+ {
+ return true;
+ }
+ void expand(ShaderGraph *graph);
+};
+
+class VertexColorNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(VertexColorNode)
+ void attributes(Shader *shader, AttributeRequestSet *attributes);
+ bool has_attribute_dependency()
+ {
+ return true;
+ }
+ bool has_spatial_varying()
+ {
+ return true;
+ }
+
+ ustring layer_name;
+};
+
class ValueNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ValueNode)
@@ -1215,6 +1312,32 @@ class BlackbodyNode : public ShaderNode {
float temperature;
};
+class MapRangeNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(MapRangeNode)
+ virtual int get_group()
+ {
+ return NODE_GROUP_LEVEL_3;
+ }
+ void expand(ShaderGraph *graph);
+
+ float value, from_min, from_max, to_min, to_max, steps;
+ NodeMapRangeType type;
+ bool clamp;
+};
+
+class ClampNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(ClampNode)
+ void constant_fold(const ConstantFolder &folder);
+ virtual int get_group()
+ {
+ return NODE_GROUP_LEVEL_3;
+ }
+ float value, min, max;
+ NodeClampType type;
+};
+
class MathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MathNode)
@@ -1222,11 +1345,13 @@ class MathNode : public ShaderNode {
{
return NODE_GROUP_LEVEL_1;
}
+ void expand(ShaderGraph *graph);
void constant_fold(const ConstantFolder &folder);
float value1;
float value2;
- NodeMath type;
+ float value3;
+ NodeMathType type;
bool use_clamp;
};
@@ -1253,7 +1378,26 @@ class VectorMathNode : public ShaderNode {
float3 vector1;
float3 vector2;
- NodeVectorMath type;
+ float3 vector3;
+ float scale;
+ NodeVectorMathType type;
+};
+
+class VectorRotateNode : public ShaderNode {
+ public:
+ SHADER_NODE_CLASS(VectorRotateNode)
+
+ virtual int get_group()
+ {
+ return NODE_GROUP_LEVEL_3;
+ }
+ NodeVectorRotateType type;
+ bool invert;
+ float3 vector;
+ float3 center;
+ float3 axis;
+ float angle;
+ float3 rotation;
};
class VectorTransformNode : public ShaderNode {
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 6c6f8810412..f200e409b9e 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -14,22 +14,25 @@
* limitations under the License.
*/
-#include "render/camera.h"
+#include "render/object.h"
#include "device/device.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "render/hair.h"
+#include "render/integrator.h"
#include "render/light.h"
#include "render/mesh.h"
-#include "render/curves.h"
-#include "render/object.h"
#include "render/particles.h"
#include "render/scene.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_map.h"
+#include "util/util_murmurhash.h"
#include "util/util_progress.h"
#include "util/util_set.h"
+#include "util/util_task.h"
#include "util/util_vector.h"
-#include "util/util_murmurhash.h"
#include "subd/subd_patch_table.h"
@@ -64,6 +67,7 @@ struct UpdateObjectTransformState {
KernelObject *objects;
Transform *object_motion_pass;
DecomposedTransform *object_motion;
+ float *object_volume_step;
/* Flags which will be synchronized to Integrator. */
bool have_motion;
@@ -74,7 +78,6 @@ struct UpdateObjectTransformState {
Scene *scene;
/* Some locks to keep everything thread-safe. */
- thread_spin_lock queue_lock;
thread_spin_lock surface_area_lock;
/* First unused object index in the queue. */
@@ -87,9 +90,10 @@ NODE_DEFINE(Object)
{
NodeType *type = NodeType::add("object", create);
- SOCKET_NODE(mesh, "Mesh", &Mesh::node_type);
+ SOCKET_NODE(geometry, "Geometry", &Geometry::node_base_type);
SOCKET_TRANSFORM(tfm, "Transform", transform_identity());
SOCKET_UINT(visibility, "Visibility", ~0);
+ SOCKET_COLOR(color, "Color", make_float3(0.0f, 0.0f, 0.0f));
SOCKET_UINT(random_id, "Random ID", 0);
SOCKET_INT(pass_id, "Pass ID", 0);
SOCKET_BOOLEAN(use_holdout, "Use Holdout", false);
@@ -97,6 +101,7 @@ NODE_DEFINE(Object)
SOCKET_POINT(dupli_generated, "Dupli Generated", make_float3(0.0f, 0.0f, 0.0f));
SOCKET_POINT2(dupli_uv, "Dupli UV", make_float2(0.0f, 0.0f));
SOCKET_TRANSFORM_ARRAY(motion, "Motion", array<Transform>());
+ SOCKET_FLOAT(shadow_terminator_offset, "Terminator Offset", 0.0f);
SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", false);
@@ -151,7 +156,7 @@ void Object::update_motion()
void Object::compute_bounds(bool motion_blur)
{
- BoundBox mbounds = mesh->bounds;
+ BoundBox mbounds = geometry->bounds;
if (motion_blur && use_motion()) {
array<DecomposedTransform> decomp(motion.size());
@@ -171,7 +176,7 @@ void Object::compute_bounds(bool motion_blur)
}
else {
/* No motion blur case. */
- if (mesh->transform_applied) {
+ if (geometry->transform_applied) {
bounds = mbounds;
}
else {
@@ -182,89 +187,18 @@ void Object::compute_bounds(bool motion_blur)
void Object::apply_transform(bool apply_to_motion)
{
- if (!mesh || tfm == transform_identity())
+ if (!geometry || tfm == transform_identity())
return;
- /* triangles */
- if (mesh->verts.size()) {
- /* store matrix to transform later. when accessing these as attributes we
- * do not want the transform to be applied for consistency between static
- * and dynamic BVH, so we do it on packing. */
- mesh->transform_normal = transform_transposed_inverse(tfm);
-
- /* apply to mesh vertices */
- for (size_t i = 0; i < mesh->verts.size(); i++)
- mesh->verts[i] = transform_point(&tfm, mesh->verts[i]);
-
- if (apply_to_motion) {
- Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if (attr) {
- size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
- float3 *vert_steps = attr->data_float3();
-
- for (size_t i = 0; i < steps_size; i++)
- vert_steps[i] = transform_point(&tfm, vert_steps[i]);
- }
-
- Attribute *attr_N = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
-
- if (attr_N) {
- Transform ntfm = mesh->transform_normal;
- size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
- float3 *normal_steps = attr_N->data_float3();
-
- for (size_t i = 0; i < steps_size; i++)
- normal_steps[i] = normalize(transform_direction(&ntfm, normal_steps[i]));
- }
- }
- }
-
- /* curves */
- if (mesh->curve_keys.size()) {
- /* compute uniform scale */
- float3 c0 = transform_get_column(&tfm, 0);
- float3 c1 = transform_get_column(&tfm, 1);
- float3 c2 = transform_get_column(&tfm, 2);
- float scalar = powf(fabsf(dot(cross(c0, c1), c2)), 1.0f / 3.0f);
-
- /* apply transform to curve keys */
- for (size_t i = 0; i < mesh->curve_keys.size(); i++) {
- float3 co = transform_point(&tfm, mesh->curve_keys[i]);
- float radius = mesh->curve_radius[i] * scalar;
-
- /* scale for curve radius is only correct for uniform scale */
- mesh->curve_keys[i] = co;
- mesh->curve_radius[i] = radius;
- }
-
- if (apply_to_motion) {
- Attribute *curve_attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
- if (curve_attr) {
- /* apply transform to motion curve keys */
- size_t steps_size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
- float4 *key_steps = curve_attr->data_float4();
-
- for (size_t i = 0; i < steps_size; i++) {
- float3 co = transform_point(&tfm, float4_to_float3(key_steps[i]));
- float radius = key_steps[i].w * scalar;
-
- /* scale for curve radius is only correct for uniform scale */
- key_steps[i] = float3_to_float4(co);
- key_steps[i].w = radius;
- }
- }
- }
- }
+ geometry->apply_transform(tfm, apply_to_motion);
/* we keep normals pointing in same direction on negative scale, notify
- * mesh about this in it (re)calculates normals */
+ * geometry about this in it (re)calculates normals */
if (transform_negative_scale(tfm))
- mesh->transform_negative_scaled = true;
+ geometry->transform_negative_scaled = true;
if (bounds.valid()) {
- mesh->compute_bounds();
+ geometry->compute_bounds();
compute_bounds(false);
}
@@ -274,19 +208,18 @@ void Object::apply_transform(bool apply_to_motion)
void Object::tag_update(Scene *scene)
{
- if (mesh) {
- if (mesh->transform_applied)
- mesh->need_update = true;
+ if (geometry) {
+ if (geometry->transform_applied)
+ geometry->need_update = true;
- foreach (Shader *shader, mesh->used_shaders) {
+ foreach (Shader *shader, geometry->used_shaders) {
if (shader->use_mis && shader->has_surface_emission)
scene->light_manager->need_update = true;
}
}
scene->camera->need_flags_update = true;
- scene->curve_system_manager->need_update = true;
- scene->mesh_manager->need_update = true;
+ scene->geometry_manager->need_update = true;
scene->object_manager->need_update = true;
}
@@ -335,6 +268,82 @@ uint Object::visibility_for_tracing() const
return trace_visibility;
}
+float Object::compute_volume_step_size() const
+{
+ if (geometry->type != Geometry::MESH) {
+ return FLT_MAX;
+ }
+
+ Mesh *mesh = static_cast<Mesh *>(geometry);
+
+ if (!mesh->has_volume) {
+ return FLT_MAX;
+ }
+
+ /* Compute step rate from shaders. */
+ float step_rate = FLT_MAX;
+
+ foreach (Shader *shader, mesh->used_shaders) {
+ if (shader->has_volume) {
+ if ((shader->heterogeneous_volume && shader->has_volume_spatial_varying) ||
+ (shader->has_volume_attribute_dependency)) {
+ step_rate = fminf(shader->volume_step_rate, step_rate);
+ }
+ }
+ }
+
+ if (step_rate == FLT_MAX) {
+ return FLT_MAX;
+ }
+
+ /* Compute step size from voxel grids. */
+ float step_size = FLT_MAX;
+
+ foreach (Attribute &attr, mesh->attributes.attributes) {
+ if (attr.element == ATTR_ELEMENT_VOXEL) {
+ ImageHandle &handle = attr.data_voxel();
+ const ImageMetaData &metadata = handle.metadata();
+ if (metadata.width == 0 || metadata.height == 0 || metadata.depth == 0) {
+ continue;
+ }
+
+ /* User specified step size. */
+ float voxel_step_size = mesh->volume_step_size;
+
+ if (voxel_step_size == 0.0f) {
+ /* Auto detect step size. */
+ float3 size = make_float3(
+ 1.0f / metadata.width, 1.0f / metadata.height, 1.0f / metadata.depth);
+
+ /* Step size is transformed from voxel to world space. */
+ Transform voxel_tfm = tfm;
+ if (metadata.use_transform_3d) {
+ voxel_tfm = tfm * transform_inverse(metadata.transform_3d);
+ }
+ voxel_step_size = min3(fabs(transform_direction(&voxel_tfm, size)));
+ }
+ else if (mesh->volume_object_space) {
+ /* User specified step size in object space. */
+ float3 size = make_float3(voxel_step_size, voxel_step_size, voxel_step_size);
+ voxel_step_size = min3(fabs(transform_direction(&tfm, size)));
+ }
+
+ if (voxel_step_size > 0.0f) {
+ step_size = fminf(voxel_step_size, step_size);
+ }
+ }
+ }
+
+ if (step_size == FLT_MAX) {
+ /* Fall back to 1/10th of bounds for procedural volumes. */
+ step_size = 0.1f * average(bounds.size());
+ }
+
+ step_size *= step_rate;
+
+ return step_size;
+}
+
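As a concrete reading of the auto-detect branch above: the voxel size in texture space is (1/width, 1/height, 1/depth), it is pushed through the object (and optional grid) transform, the smallest resulting component becomes the base step, and the shader's step rate scales it at the end. A worked sketch for the simple case of an axis-aligned object scale and no extra grid transform:

#include <algorithm>
#include <cmath>

// Illustration only; the real code uses the full object/grid transforms.
static float auto_volume_step(int width, int height, int depth,
                              float sx, float sy, float sz,  // per-axis object scale
                              float step_rate)
{
  const float wx = std::fabs(sx) / width;
  const float wy = std::fabs(sy) / height;
  const float wz = std::fabs(sz) / depth;
  // min3(fabs(transform_direction(...))) collapses to the smallest component here.
  return std::min({wx, wy, wz}) * step_rate;
}
// e.g. a 64^3 grid, uniform scale 2.0 and step rate 1.0 gives 2.0 / 64 = 0.03125.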
int Object::get_device_index() const
{
return index;
@@ -352,31 +361,33 @@ ObjectManager::~ObjectManager()
{
}
-void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, Object *ob)
+static float object_surface_area(UpdateObjectTransformState *state,
+ const Transform &tfm,
+ Geometry *geom)
{
- KernelObject &kobject = state->objects[ob->index];
- Transform *object_motion_pass = state->object_motion_pass;
-
- Mesh *mesh = ob->mesh;
- uint flag = 0;
+ if (geom->type != Geometry::MESH) {
+ return 0.0f;
+ }
- /* Compute transformations. */
- Transform tfm = ob->tfm;
- Transform itfm = transform_inverse(tfm);
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->has_volume) {
+ /* Volume density automatically adjusts to object scale. */
+ if (mesh->volume_object_space) {
+ const float3 unit = normalize(make_float3(1.0f, 1.0f, 1.0f));
+ return 1.0f / len(transform_direction(&tfm, unit));
+ }
+ else {
+ return 1.0f;
+ }
+ }
  /* Compute surface area. For uniform scale we can avoid the many
* transform calls and share computation for instances.
*
* TODO(brecht): Correct for displacement, and move to a better place.
*/
- float uniform_scale;
float surface_area = 0.0f;
- float pass_id = ob->pass_id;
- float random_number = (float)ob->random_id * (1.0f / (float)0xFFFFFFFF);
- int particle_index = (ob->particle_system) ?
- ob->particle_index + state->particle_offset[ob->particle_system] :
- 0;
-
+ float uniform_scale;
if (transform_uniform_scale(tfm, uniform_scale)) {
map<Mesh *, float>::iterator it;
@@ -422,19 +433,49 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
}
}
+ return surface_area;
+}
+
+void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, Object *ob)
+{
+ KernelObject &kobject = state->objects[ob->index];
+ Transform *object_motion_pass = state->object_motion_pass;
+
+ Geometry *geom = ob->geometry;
+ uint flag = 0;
+
+ /* Compute transformations. */
+ Transform tfm = ob->tfm;
+ Transform itfm = transform_inverse(tfm);
+
+ float3 color = ob->color;
+ float pass_id = ob->pass_id;
+ float random_number = (float)ob->random_id * (1.0f / (float)0xFFFFFFFF);
+ int particle_index = (ob->particle_system) ?
+ ob->particle_index + state->particle_offset[ob->particle_system] :
+ 0;
+
kobject.tfm = tfm;
kobject.itfm = itfm;
- kobject.surface_area = surface_area;
+ kobject.surface_area = object_surface_area(state, tfm, geom);
+ kobject.color[0] = color.x;
+ kobject.color[1] = color.y;
+ kobject.color[2] = color.z;
kobject.pass_id = pass_id;
kobject.random_number = random_number;
kobject.particle_index = particle_index;
kobject.motion_offset = 0;
- if (mesh->use_motion_blur) {
+ if (geom->use_motion_blur) {
state->have_motion = true;
}
- if (mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
- flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+
+ if (geom->type == Geometry::MESH) {
+ /* TODO: why only mesh? */
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+ flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+ }
}
if (state->need_motion == Scene::MOTION_PASS) {
@@ -455,7 +496,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
/* Motion transformations, is world/object space depending if mesh
* comes with deformed position in object space, or if we transform
* the shading point in world space. */
- if (!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+ if (!(flag & SD_OBJECT_HAS_VERTEX_MOTION)) {
tfm_pre = tfm_pre * itfm;
tfm_post = tfm_post * itfm;
}
@@ -480,66 +521,34 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
kobject.dupli_generated[0] = ob->dupli_generated[0];
kobject.dupli_generated[1] = ob->dupli_generated[1];
kobject.dupli_generated[2] = ob->dupli_generated[2];
- kobject.numkeys = mesh->curve_keys.size();
+ kobject.numkeys = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom)->curve_keys.size() :
+ 0;
kobject.dupli_uv[0] = ob->dupli_uv[0];
kobject.dupli_uv[1] = ob->dupli_uv[1];
- int totalsteps = mesh->motion_steps;
+ int totalsteps = geom->motion_steps;
kobject.numsteps = (totalsteps - 1) / 2;
- kobject.numverts = mesh->verts.size();
+ kobject.numverts = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom)->verts.size() : 0;
kobject.patch_map_offset = 0;
kobject.attribute_map_offset = 0;
uint32_t hash_name = util_murmur_hash3(ob->name.c_str(), ob->name.length(), 0);
uint32_t hash_asset = util_murmur_hash3(ob->asset_name.c_str(), ob->asset_name.length(), 0);
kobject.cryptomatte_object = util_hash_to_float(hash_name);
kobject.cryptomatte_asset = util_hash_to_float(hash_asset);
+ kobject.shadow_terminator_offset = 1.0f / (1.0f - 0.5f * ob->shadow_terminator_offset);
/* Object flag. */
if (ob->use_holdout) {
flag |= SD_OBJECT_HOLDOUT_MASK;
}
state->object_flag[ob->index] = flag;
+ state->object_volume_step[ob->index] = FLT_MAX;
/* Have curves. */
- if (mesh->num_curves()) {
+ if (geom->type == Geometry::HAIR) {
state->have_curves = true;
}
}
-bool ObjectManager::device_update_object_transform_pop_work(UpdateObjectTransformState *state,
- int *start_index,
- int *num_objects)
-{
- /* Tweakable parameter, number of objects per chunk.
- * Too small value will cause some extra overhead due to spin lock,
- * too big value might not use all threads nicely.
- */
- static const int OBJECTS_PER_TASK = 32;
- bool have_work = false;
- state->queue_lock.lock();
- int num_scene_objects = state->scene->objects.size();
- if (state->queue_start_object < num_scene_objects) {
- int count = min(OBJECTS_PER_TASK, num_scene_objects - state->queue_start_object);
- *start_index = state->queue_start_object;
- *num_objects = count;
- state->queue_start_object += count;
- have_work = true;
- }
- state->queue_lock.unlock();
- return have_work;
-}
-
-void ObjectManager::device_update_object_transform_task(UpdateObjectTransformState *state)
-{
- int start_index, num_objects;
- while (device_update_object_transform_pop_work(state, &start_index, &num_objects)) {
- for (int i = 0; i < num_objects; ++i) {
- const int object_index = start_index + i;
- Object *ob = state->scene->objects[object_index];
- device_update_object_transform(state, ob);
- }
- }
-}
-
void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, Progress &progress)
{
UpdateObjectTransformState state;
@@ -551,6 +560,7 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
state.objects = dscene->objects.alloc(scene->objects.size());
state.object_flag = dscene->object_flag.alloc(scene->objects.size());
+ state.object_volume_step = dscene->object_volume_step.alloc(scene->objects.size());
state.object_motion = NULL;
state.object_motion_pass = NULL;
@@ -584,28 +594,19 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
numparticles += psys->particles.size();
}
- /* NOTE: If it's just a handful of objects we deal with them in a single
- * thread to avoid threading overhead. However, this threshold is might
- * need some tweaks to make mid-complex scenes optimal.
- */
- if (scene->objects.size() < 64) {
- foreach (Object *ob, scene->objects) {
- device_update_object_transform(&state, ob);
- if (progress.get_cancel()) {
- return;
- }
- }
- }
- else {
- const int num_threads = TaskScheduler::num_threads();
- TaskPool pool;
- for (int i = 0; i < num_threads; ++i) {
- pool.push(function_bind(&ObjectManager::device_update_object_transform_task, this, &state));
- }
- pool.wait_work();
- if (progress.get_cancel()) {
- return;
- }
+ /* Parallel object update, with grain size to avoid too much threading overhead
+ * for individual objects. */
+ static const int OBJECTS_PER_TASK = 32;
+ parallel_for(blocked_range<size_t>(0, scene->objects.size(), OBJECTS_PER_TASK),
+ [&](const blocked_range<size_t> &r) {
+ for (size_t i = r.begin(); i != r.end(); i++) {
+ Object *ob = state.scene->objects[i];
+ device_update_object_transform(&state, ob);
+ }
+ });
+
+ if (progress.get_cancel()) {
+ return;
}
dscene->objects.copy_to_device();
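The grain-sized parallel_for replaces the old hand-rolled work queue: small scenes effectively stay on one thread, while large scenes split into chunks of OBJECTS_PER_TASK objects. A standalone sketch of the same pattern written directly against TBB, which exposes the blocked_range/grain-size interface used above:

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <vector>

int main()
{
  std::vector<float> results(1000);
  const size_t GRAIN = 32;  // analogous to OBJECTS_PER_TASK

  tbb::parallel_for(tbb::blocked_range<size_t>(0, results.size(), GRAIN),
                    [&](const tbb::blocked_range<size_t> &r) {
                      for (size_t i = r.begin(); i != r.end(); i++) {
                        results[i] = float(i) * 2.0f;  // per-object update goes here
                      }
                    });
  return 0;
}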
@@ -618,7 +619,6 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
dscene->data.bvh.have_motion = state.have_motion;
dscene->data.bvh.have_curves = state.have_curves;
- dscene->data.bvh.have_instancing = true;
}
void ObjectManager::device_update(Device *device,
@@ -671,25 +671,30 @@ void ObjectManager::device_update_flags(
/* Object info flag. */
uint *object_flag = dscene->object_flag.data();
+ float *object_volume_step = dscene->object_volume_step.data();
/* Object volume intersection. */
vector<Object *> volume_objects;
bool has_volume_objects = false;
foreach (Object *object, scene->objects) {
- if (object->mesh->has_volume) {
+ if (object->geometry->has_volume) {
if (bounds_valid) {
volume_objects.push_back(object);
}
has_volume_objects = true;
+ object_volume_step[object->index] = object->compute_volume_step_size();
+ }
+ else {
+ object_volume_step[object->index] = FLT_MAX;
}
}
foreach (Object *object, scene->objects) {
- if (object->mesh->has_volume) {
+ if (object->geometry->has_volume) {
object_flag[object->index] |= SD_OBJECT_HAS_VOLUME;
object_flag[object->index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
- foreach (Attribute &attr, object->mesh->attributes.attributes) {
+ foreach (Attribute &attr, object->geometry->attributes.attributes) {
if (attr.element == ATTR_ELEMENT_VOXEL) {
object_flag[object->index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
}
@@ -698,6 +703,7 @@ void ObjectManager::device_update_flags(
else {
object_flag[object->index] &= ~(SD_OBJECT_HAS_VOLUME | SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
}
+
if (object->is_shadow_catcher) {
object_flag[object->index] |= SD_OBJECT_SHADOW_CATCHER;
}
@@ -726,6 +732,7 @@ void ObjectManager::device_update_flags(
/* Copy object flag. */
dscene->object_flag.copy_to_device();
+ dscene->object_volume_step.copy_to_device();
}
void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Scene *scene)
@@ -739,21 +746,24 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc
bool update = false;
foreach (Object *object, scene->objects) {
- Mesh *mesh = object->mesh;
-
- if (mesh->patch_table) {
- uint patch_map_offset = 2 * (mesh->patch_table_offset + mesh->patch_table->total_size() -
- mesh->patch_table->num_nodes * PATCH_NODE_SIZE) -
- mesh->patch_offset;
-
- if (kobjects[object->index].patch_map_offset != patch_map_offset) {
- kobjects[object->index].patch_map_offset = patch_map_offset;
- update = true;
+ Geometry *geom = object->geometry;
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ if (mesh->patch_table) {
+ uint patch_map_offset = 2 * (mesh->patch_table_offset + mesh->patch_table->total_size() -
+ mesh->patch_table->num_nodes * PATCH_NODE_SIZE) -
+ mesh->patch_offset;
+
+ if (kobjects[object->index].patch_map_offset != patch_map_offset) {
+ kobjects[object->index].patch_map_offset = patch_map_offset;
+ update = true;
+ }
}
}
- if (kobjects[object->index].attribute_map_offset != mesh->attr_map_offset) {
- kobjects[object->index].attribute_map_offset = mesh->attr_map_offset;
+ if (kobjects[object->index].attribute_map_offset != geom->attr_map_offset) {
+ kobjects[object->index].attribute_map_offset = geom->attr_map_offset;
update = true;
}
}
@@ -769,26 +779,26 @@ void ObjectManager::device_free(Device *, DeviceScene *dscene)
dscene->object_motion_pass.free();
dscene->object_motion.free();
dscene->object_flag.free();
+ dscene->object_volume_step.free();
}
void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress &progress)
{
/* todo: normals and displacement should be done before applying transform! */
- /* todo: create objects/meshes in right order! */
+ /* todo: create objects/geometry in right order! */
- /* counter mesh users */
- map<Mesh *, int> mesh_users;
+ /* counter geometry users */
+ map<Geometry *, int> geometry_users;
Scene::MotionType need_motion = scene->need_motion();
bool motion_blur = need_motion == Scene::MOTION_BLUR;
bool apply_to_motion = need_motion != Scene::MOTION_PASS;
int i = 0;
- bool have_instancing = false;
foreach (Object *object, scene->objects) {
- map<Mesh *, int>::iterator it = mesh_users.find(object->mesh);
+ map<Geometry *, int>::iterator it = geometry_users.find(object->geometry);
- if (it == mesh_users.end())
- mesh_users[object->mesh] = 1;
+ if (it == geometry_users.end())
+ geometry_users[object->geometry] = 1;
else
it->second++;
}
@@ -798,46 +808,52 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P
uint *object_flag = dscene->object_flag.data();
- /* apply transforms for objects with single user meshes */
+ /* apply transforms for objects with single user geometry */
foreach (Object *object, scene->objects) {
/* Annoying feedback loop here: we can't use is_instanced() because
* it'll use uninitialized transform_applied flag.
*
- * Could be solved by moving reference counter to Mesh.
+ * Could be solved by moving reference counter to Geometry.
*/
- if ((mesh_users[object->mesh] == 1 && !object->mesh->has_surface_bssrdf) &&
- !object->mesh->has_true_displacement() &&
- object->mesh->subdivision_type == Mesh::SUBDIVISION_NONE) {
+ Geometry *geom = object->geometry;
+ bool apply = (geometry_users[geom] == 1) && !geom->has_surface_bssrdf &&
+ !geom->has_true_displacement();
+
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+ apply = apply && mesh->subdivision_type == Mesh::SUBDIVISION_NONE;
+ }
+ else if (geom->type == Geometry::HAIR) {
+ /* Can't apply non-uniform scale to curves, this can't be represented by
+ * control points and radius alone. */
+ float scale;
+ apply = apply && transform_uniform_scale(object->tfm, scale);
+ }
+
+ if (apply) {
if (!(motion_blur && object->use_motion())) {
- if (!object->mesh->transform_applied) {
+ if (!geom->transform_applied) {
object->apply_transform(apply_to_motion);
- object->mesh->transform_applied = true;
+ geom->transform_applied = true;
if (progress.get_cancel())
return;
}
object_flag[i] |= SD_OBJECT_TRANSFORM_APPLIED;
- if (object->mesh->transform_negative_scaled)
+ if (geom->transform_negative_scaled)
object_flag[i] |= SD_OBJECT_NEGATIVE_SCALE_APPLIED;
}
- else
- have_instancing = true;
}
- else
- have_instancing = true;
i++;
}
-
- dscene->data.bvh.have_instancing = have_instancing;
}
void ObjectManager::tag_update(Scene *scene)
{
need_update = true;
- scene->curve_system_manager->need_update = true;
- scene->mesh_manager->need_update = true;
+ scene->geometry_manager->need_update = true;
scene->light_manager->need_update = true;
}
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 2fd43900da1..ac9b4c331f5 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -23,8 +23,8 @@
#include "util/util_array.h"
#include "util/util_boundbox.h"
#include "util/util_param.h"
-#include "util/util_transform.h"
#include "util/util_thread.h"
+#include "util/util_transform.h"
#include "util/util_types.h"
#include "util/util_vector.h"
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
class Device;
class DeviceScene;
-class Mesh;
+class Geometry;
class ParticleSystem;
class Progress;
class Scene;
@@ -46,11 +46,12 @@ class Object : public Node {
public:
NODE_DECLARE
- Mesh *mesh;
+ Geometry *geometry;
Transform tfm;
BoundBox bounds;
uint random_id;
int pass_id;
+ float3 color;
ustring asset_name;
vector<ParamValue> attributes;
uint visibility;
@@ -58,6 +59,7 @@ class Object : public Node {
bool hide_on_missing_motion;
bool use_holdout;
bool is_shadow_catcher;
+ float shadow_terminator_offset;
float3 dupli_generated;
float2 dupli_uv;
@@ -80,6 +82,9 @@ class Object : public Node {
int motion_step(float time) const;
void update_motion();
+ /* Maximum number of motion steps supported (due to Embree). */
+ static const uint MAX_MOTION_STEPS = 129;
+
/* Check whether object is traceable and worth adding it to
* kernel scene.
*/
@@ -93,6 +98,9 @@ class Object : public Node {
/* Returns the index that is used in the kernel for this object. */
int get_device_index() const;
+ /* Compute step size from attributes, shaders, transforms. */
+ float compute_volume_step_size() const;
+
protected:
/* Specifies the position of the object in scene->objects and
* in the device vectors. Gets set in device_update. */
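The object_volume_step device vector freed in object.cpp above pairs with the new Object::compute_volume_step_size() declaration. The code that fills it is not part of this excerpt; a hypothetical sketch of how the object manager could populate it during its device update (names taken from this diff, the loop itself is assumed):

/* Hypothetical sketch: upload one volume ray-marching step size per object. */
float *object_volume_step = dscene->object_volume_step.alloc(scene->objects.size());

int index = 0;
foreach (Object *object, scene->objects) {
  object_volume_step[index++] = object->compute_volume_step_size();
}

dscene->object_volume_step.copy_to_device();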
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index b66a46938be..5c62ae73e47 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -16,12 +16,14 @@
#include "device/device.h"
+#include "render/background.h"
+#include "render/colorspace.h"
#include "render/graph.h"
#include "render/light.h"
+#include "render/nodes.h"
#include "render/osl.h"
#include "render/scene.h"
#include "render/shader.h"
-#include "render/nodes.h"
#ifdef WITH_OSL
@@ -29,6 +31,7 @@
# include "kernel/osl/osl_services.h"
# include "kernel/osl/osl_shader.h"
+# include "util/util_aligned_malloc.h"
# include "util/util_foreach.h"
# include "util/util_logging.h"
# include "util/util_md5.h"
@@ -53,6 +56,7 @@ OSLRenderServices *OSLShaderManager::services_shared = NULL;
int OSLShaderManager::ss_shared_users = 0;
thread_mutex OSLShaderManager::ss_shared_mutex;
thread_mutex OSLShaderManager::ss_mutex;
+int OSLCompiler::texture_shared_unique_id = 0;
/* Shader Manager */
@@ -98,11 +102,12 @@ void OSLShaderManager::device_update(Device *device,
device_free(device, dscene, scene);
- /* determine which shaders are in use */
- device_update_shaders_used(scene);
+ /* set texture system */
+ scene->image_manager->set_osl_texture_system((void *)ts);
/* create shaders */
OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ Shader *background_shader = scene->background->get_shader(scene);
foreach (Shader *shader, scene->shaders) {
assert(shader->graph);
@@ -115,9 +120,9 @@ void OSLShaderManager::device_update(Device *device,
* compile shaders alternating */
thread_scoped_lock lock(ss_mutex);
- OSLCompiler compiler((void *)this, (void *)ss, scene->image_manager, scene->light_manager);
- compiler.background = (shader == scene->default_background);
- compiler.compile(scene, og, shader);
+ OSLCompiler compiler(this, services, ss, scene);
+ compiler.background = (shader == background_shader);
+ compiler.compile(og, shader);
if (shader->use_mis && shader->has_surface_emission)
scene->light_manager->need_update = true;
@@ -128,7 +133,7 @@ void OSLShaderManager::device_update(Device *device,
og->ts = ts;
og->services = services;
- int background_id = scene->shader_manager->get_shader_id(scene->default_background);
+ int background_id = scene->shader_manager->get_shader_id(background_shader);
og->background_state = og->surface_state[background_id & SHADER_MASK];
og->use = true;
@@ -137,15 +142,16 @@ void OSLShaderManager::device_update(Device *device,
need_update = false;
- /* set texture system */
- scene->image_manager->set_osl_texture_system((void *)ts);
+ /* add special builtin texture types */
+ services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
+ services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
device_update_common(device, dscene, scene, progress);
{
/* Perform greedyjit optimization.
*
- * This might waste time on optimizing gorups which are never actually
+ * This might waste time on optimizing groups which are never actually
* used, but this prevents OSL from allocating data on TLS at render
* time.
*
@@ -218,7 +224,8 @@ void OSLShaderManager::shading_system_init()
thread_scoped_lock lock(ss_shared_mutex);
if (ss_shared_users == 0) {
- services_shared = new OSLRenderServices();
+ /* Must use aligned new due to concurrent hash map. */
+ services_shared = util_aligned_new<OSLRenderServices>(ts_shared);
string shader_path = path_get("shader");
# ifdef _WIN32
@@ -287,7 +294,7 @@ void OSLShaderManager::shading_system_free()
delete ss_shared;
ss_shared = NULL;
- delete services_shared;
+ util_aligned_delete(services_shared);
services_shared = NULL;
}
@@ -309,7 +316,7 @@ bool OSLShaderManager::osl_compile(const string &inputfile, const string &output
string include_path_arg = string("-I") + shader_path;
options.push_back(include_path_arg);
- stdosl_path = path_get("shader/stdosl.h");
+ stdosl_path = path_get("shader/stdcycles.h");
/* compile */
OSL::OSLCompiler *compiler = new OSL::OSLCompiler(&OSL::ErrorHandler::default_handler());
@@ -430,27 +437,35 @@ const char *OSLShaderManager::shader_load_bytecode(const string &hash, const str
return loaded_shaders.find(hash)->first.c_str();
}
-OSLNode *OSLShaderManager::osl_node(const std::string &filepath,
+/* This is a static function to avoid RTTI link errors with only this
+ * file being compiled without RTTI to match OSL and LLVM libraries. */
+OSLNode *OSLShaderManager::osl_node(ShaderManager *manager,
+ const std::string &filepath,
const std::string &bytecode_hash,
const std::string &bytecode)
{
+ if (!manager->use_osl()) {
+ return NULL;
+ }
+
/* create query */
+ OSLShaderManager *osl_manager = static_cast<OSLShaderManager *>(manager);
const char *hash;
if (!filepath.empty()) {
- hash = shader_load_filepath(filepath);
+ hash = osl_manager->shader_load_filepath(filepath);
}
else {
- hash = shader_test_loaded(bytecode_hash);
+ hash = osl_manager->shader_test_loaded(bytecode_hash);
if (!hash)
- hash = shader_load_bytecode(bytecode_hash, bytecode);
+ hash = osl_manager->shader_load_bytecode(bytecode_hash, bytecode);
}
if (!hash) {
return NULL;
}
- OSLShaderInfo *info = shader_loaded_info(hash);
+ OSLShaderInfo *info = osl_manager->shader_loaded_info(hash);
/* count number of inputs */
size_t num_inputs = 0;
@@ -555,15 +570,12 @@ OSLNode *OSLShaderManager::osl_node(const std::string &filepath,
/* Graph Compiler */
-OSLCompiler::OSLCompiler(void *manager_,
- void *shadingsys_,
- ImageManager *image_manager_,
- LightManager *light_manager_)
+OSLCompiler::OSLCompiler(OSLShaderManager *manager,
+ OSLRenderServices *services,
+ OSL::ShadingSystem *ss,
+ Scene *scene)
+ : scene(scene), manager(manager), services(services), ss(ss)
{
- manager = manager_;
- shadingsys = shadingsys_;
- image_manager = image_manager_;
- light_manager = light_manager_;
current_type = SHADER_TYPE_SURFACE;
current_shader = NULL;
background = false;
@@ -649,11 +661,9 @@ bool OSLCompiler::node_skip_input(ShaderNode *node, ShaderInput *input)
void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
-
/* load filepath */
if (isfilepath) {
- name = ((OSLShaderManager *)manager)->shader_load_filepath(name);
+ name = manager->shader_load_filepath(name);
if (name == NULL)
return;
@@ -665,9 +675,6 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
/* checks to untangle graphs */
if (node_skip_input(node, input))
continue;
- /* already has default value assigned */
- else if (input->flags() & SocketType::DEFAULT_LINK_MASK)
- continue;
string param_name = compatible_name(node, input);
const SocketType &socket = input->socket_type;
@@ -731,7 +738,7 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
}
/* test if the shader contains specific closures */
- OSLShaderInfo *info = ((OSLShaderManager *)manager)->shader_loaded_info(name);
+ OSLShaderInfo *info = manager->shader_loaded_info(name);
if (current_type == SHADER_TYPE_SURFACE) {
if (info) {
@@ -753,14 +760,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
else if (current_type == SHADER_TYPE_VOLUME) {
if (node->has_spatial_varying())
current_shader->has_volume_spatial_varying = true;
- }
-
- if (node->has_object_dependency()) {
- current_shader->has_object_dependency = true;
- }
-
- if (node->has_attribute_dependency()) {
- current_shader->has_attribute_dependency = true;
+ if (node->has_attribute_dependency())
+ current_shader->has_volume_attribute_dependency = true;
}
if (node->has_integrator_dependency()) {
@@ -778,7 +779,6 @@ static TypeDesc array_typedesc(TypeDesc typedesc, int arraylength)
void OSLCompiler::parameter(ShaderNode *node, const char *name)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ustring uname = ustring(name);
const SocketType &socket = *(node->type->find_input(uname));
@@ -930,56 +930,47 @@ void OSLCompiler::parameter(ShaderNode *node, const char *name)
void OSLCompiler::parameter(const char *name, float f)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypeFloat, &f);
}
void OSLCompiler::parameter_color(const char *name, float3 f)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypeColor, &f);
}
void OSLCompiler::parameter_point(const char *name, float3 f)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypePoint, &f);
}
void OSLCompiler::parameter_normal(const char *name, float3 f)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypeNormal, &f);
}
void OSLCompiler::parameter_vector(const char *name, float3 f)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypeVector, &f);
}
void OSLCompiler::parameter(const char *name, int f)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypeInt, &f);
}
void OSLCompiler::parameter(const char *name, const char *s)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ss->Parameter(name, TypeDesc::TypeString, &s);
}
void OSLCompiler::parameter(const char *name, ustring s)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
const char *str = s.c_str();
ss->Parameter(name, TypeDesc::TypeString, &str);
}
void OSLCompiler::parameter(const char *name, const Transform &tfm)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
ProjectionTransform projection(tfm);
projection = projection_transpose(projection);
ss->Parameter(name, TypeDesc::TypeMatrix, (float *)&projection);
@@ -987,7 +978,6 @@ void OSLCompiler::parameter(const char *name, const Transform &tfm)
void OSLCompiler::parameter_array(const char *name, const float f[], int arraylen)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
TypeDesc type = TypeDesc::TypeFloat;
type.arraylen = arraylen;
ss->Parameter(name, type, f);
@@ -1004,7 +994,6 @@ void OSLCompiler::parameter_color_array(const char *name, const array<float3> &f
table[i][2] = f[i].z;
}
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
TypeDesc type = TypeDesc::TypeColor;
type.arraylen = table.size();
ss->Parameter(name, type, table.data());
@@ -1082,8 +1071,6 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet &nodes)
OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType type)
{
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)shadingsys;
-
current_type = type;
OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str());
@@ -1123,7 +1110,7 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph
return group;
}
-void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
+void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
{
if (shader->need_update) {
ShaderGraph *graph = shader->graph;
@@ -1150,8 +1137,7 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
shader->has_displacement = false;
shader->has_surface_spatial_varying = false;
shader->has_volume_spatial_varying = false;
- shader->has_object_dependency = false;
- shader->has_attribute_dependency = false;
+ shader->has_volume_attribute_dependency = false;
shader->has_integrator_dependency = false;
/* generate surface shader */
@@ -1194,6 +1180,35 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
og->bump_state.push_back(shader->osl_surface_bump_ref);
}
+void OSLCompiler::parameter_texture(const char *name, ustring filename, ustring colorspace)
+{
+ /* Texture loaded through the OpenImageIO texture cache. For this
+ * case we need to do runtime color space conversion. */
+ OSLTextureHandle *handle = new OSLTextureHandle(OSLTextureHandle::OIIO);
+ handle->processor = ColorSpaceManager::get_processor(colorspace);
+ services->textures.insert(filename, handle);
+ parameter(name, filename);
+}
+
+void OSLCompiler::parameter_texture(const char *name, int svm_slot)
+{
+ /* Texture loaded through SVM image texture system. We generate a unique
+ * name, which ends up being used in OSLRenderServices::get_texture_handle
+ * to get handle again. Note that this name must be unique between multiple
+ * render sessions as the render services are shared. */
+ ustring filename(string_printf("@svm%d", texture_shared_unique_id++).c_str());
+ services->textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::SVM, svm_slot));
+ parameter(name, filename);
+}
+
+void OSLCompiler::parameter_texture_ies(const char *name, int svm_slot)
+{
+ /* IES light textures stored in SVM. */
+ ustring filename(string_printf("@svm%d", texture_shared_unique_id++).c_str());
+ services->textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::IES, svm_slot));
+ parameter(name, filename);
+}
+
#else
void OSLCompiler::add(ShaderNode * /*node*/, const char * /*name*/, bool /*isfilepath*/)
@@ -1248,6 +1263,20 @@ void OSLCompiler::parameter_color_array(const char * /*name*/, const array<float
{
}
+void OSLCompiler::parameter_texture(const char * /* name */,
+ ustring /* filename */,
+ ustring /* colorspace */)
+{
+}
+
+void OSLCompiler::parameter_texture(const char * /* name */, int /* svm_slot */)
+{
+}
+
+void OSLCompiler::parameter_texture_ies(const char * /* name */, int /* svm_slot */)
+{
+}
+
#endif /* WITH_OSL */
CCL_NAMESPACE_END
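The parameter_texture() overloads added above are called from shader node compilation; a hedged usage sketch showing how a node could pick between the OIIO path and the SVM path (the function and variable names here are illustrative, not the actual image node code):

/* Hypothetical caller from a ShaderNode::compile(OSLCompiler &compiler) override. */
static void compile_image_parameter_sketch(OSLCompiler &compiler,
                                           ustring filename,
                                           ustring colorspace,
                                           int svm_slot)
{
  if (svm_slot >= 0) {
    /* Image lives in the SVM image system: OSL only sees a unique "@svm<N>" name
     * that OSLRenderServices later maps back to the slot. */
    compiler.parameter_texture("filename", svm_slot);
  }
  else {
    /* Image goes through the OpenImageIO texture cache: record the color space
     * so it can be converted at texture lookup time. */
    compiler.parameter_texture("filename", filename, colorspace);
  }
}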
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index aec518a6c2b..4dd9f6630f2 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -93,9 +93,10 @@ class OSLShaderManager : public ShaderManager {
OSLShaderInfo *shader_loaded_info(const string &hash);
/* create OSL node using OSLQuery */
- OSLNode *osl_node(const std::string &filepath,
- const std::string &bytecode_hash = "",
- const std::string &bytecode = "");
+ static OSLNode *osl_node(ShaderManager *manager,
+ const std::string &filepath,
+ const std::string &bytecode_hash = "",
+ const std::string &bytecode = "");
protected:
void texture_system_init();
@@ -127,11 +128,13 @@ class OSLShaderManager : public ShaderManager {
class OSLCompiler {
public:
- OSLCompiler(void *manager,
- void *shadingsys,
- ImageManager *image_manager,
- LightManager *light_manager);
- void compile(Scene *scene, OSLGlobals *og, Shader *shader);
+#ifdef WITH_OSL
+ OSLCompiler(OSLShaderManager *manager,
+ OSLRenderServices *services,
+ OSL::ShadingSystem *shadingsys,
+ Scene *scene);
+#endif
+ void compile(OSLGlobals *og, Shader *shader);
void add(ShaderNode *node, const char *name, bool isfilepath = false);
@@ -152,14 +155,17 @@ class OSLCompiler {
void parameter_attribute(const char *name, ustring s);
+ void parameter_texture(const char *name, ustring filename, ustring colorspace);
+ void parameter_texture(const char *name, int svm_slot);
+ void parameter_texture_ies(const char *name, int svm_slot);
+
ShaderType output_type()
{
return current_type;
}
bool background;
- ImageManager *image_manager;
- LightManager *light_manager;
+ Scene *scene;
private:
#ifdef WITH_OSL
@@ -171,12 +177,16 @@ class OSLCompiler {
void find_dependencies(ShaderNodeSet &dependencies, ShaderInput *input);
void generate_nodes(const ShaderNodeSet &nodes);
+
+ OSLShaderManager *manager;
+ OSLRenderServices *services;
+ OSL::ShadingSystem *ss;
#endif
- void *shadingsys;
- void *manager;
ShaderType current_type;
Shader *current_shader;
+
+ static int texture_shared_unique_id;
};
CCL_NAMESPACE_END
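With osl_node() turned into a static factory above, callers only need a ShaderManager pointer and let the function reject the call when OSL is not in use. A hedged sketch of the calling pattern (the wrapper name is invented; the real caller is the script node sync on the Blender side):

/* Hypothetical wrapper: returns NULL when the scene is not using OSL or the
 * shader failed to load. */
static OSLNode *script_node_sketch(Scene *scene, const std::string &filepath)
{
  OSLNode *node = OSLShaderManager::osl_node(scene->shader_manager, filepath);
  if (node == NULL) {
    /* SVM backend or compile failure: caller falls back or skips the node. */
    return NULL;
  }
  return node;
}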
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 8335404b197..ec9276eff86 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "device/device.h"
#include "render/particles.h"
+#include "device/device.h"
#include "render/scene.h"
#include "util/util_foreach.h"
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 98fccf406d4..843d4738a95 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -16,11 +16,11 @@
#include <stdlib.h>
+#include "device/device.h"
#include "render/background.h"
#include "render/bake.h"
#include "render/camera.h"
#include "render/curves.h"
-#include "device/device.h"
#include "render/film.h"
#include "render/integrator.h"
#include "render/light.h"
@@ -41,38 +41,45 @@
CCL_NAMESPACE_BEGIN
DeviceScene::DeviceScene(Device *device)
- : bvh_nodes(device, "__bvh_nodes", MEM_TEXTURE),
- bvh_leaf_nodes(device, "__bvh_leaf_nodes", MEM_TEXTURE),
- object_node(device, "__object_node", MEM_TEXTURE),
- prim_tri_index(device, "__prim_tri_index", MEM_TEXTURE),
- prim_tri_verts(device, "__prim_tri_verts", MEM_TEXTURE),
- prim_type(device, "__prim_type", MEM_TEXTURE),
- prim_visibility(device, "__prim_visibility", MEM_TEXTURE),
- prim_index(device, "__prim_index", MEM_TEXTURE),
- prim_object(device, "__prim_object", MEM_TEXTURE),
- prim_time(device, "__prim_time", MEM_TEXTURE),
- tri_shader(device, "__tri_shader", MEM_TEXTURE),
- tri_vnormal(device, "__tri_vnormal", MEM_TEXTURE),
- tri_vindex(device, "__tri_vindex", MEM_TEXTURE),
- tri_patch(device, "__tri_patch", MEM_TEXTURE),
- tri_patch_uv(device, "__tri_patch_uv", MEM_TEXTURE),
- curves(device, "__curves", MEM_TEXTURE),
- curve_keys(device, "__curve_keys", MEM_TEXTURE),
- patches(device, "__patches", MEM_TEXTURE),
- objects(device, "__objects", MEM_TEXTURE),
- object_motion_pass(device, "__object_motion_pass", MEM_TEXTURE),
- object_motion(device, "__object_motion", MEM_TEXTURE),
- object_flag(device, "__object_flag", MEM_TEXTURE),
- camera_motion(device, "__camera_motion", MEM_TEXTURE),
- attributes_map(device, "__attributes_map", MEM_TEXTURE),
- attributes_float(device, "__attributes_float", MEM_TEXTURE),
- attributes_float2(device, "__attributes_float2", MEM_TEXTURE),
- attributes_float3(device, "__attributes_float3", MEM_TEXTURE),
- attributes_uchar4(device, "__attributes_uchar4", MEM_TEXTURE),
- light_distribution(device, "__light_distribution", MEM_TEXTURE),
- lights(device, "__lights", MEM_TEXTURE),
- light_background_marginal_cdf(device, "__light_background_marginal_cdf", MEM_TEXTURE),
- light_background_conditional_cdf(device, "__light_background_conditional_cdf", MEM_TEXTURE),
+ : bvh_nodes(device, "__bvh_nodes", MEM_GLOBAL),
+ bvh_leaf_nodes(device, "__bvh_leaf_nodes", MEM_GLOBAL),
+ object_node(device, "__object_node", MEM_GLOBAL),
+ prim_tri_index(device, "__prim_tri_index", MEM_GLOBAL),
+ prim_tri_verts(device, "__prim_tri_verts", MEM_GLOBAL),
+ prim_type(device, "__prim_type", MEM_GLOBAL),
+ prim_visibility(device, "__prim_visibility", MEM_GLOBAL),
+ prim_index(device, "__prim_index", MEM_GLOBAL),
+ prim_object(device, "__prim_object", MEM_GLOBAL),
+ prim_time(device, "__prim_time", MEM_GLOBAL),
+ tri_shader(device, "__tri_shader", MEM_GLOBAL),
+ tri_vnormal(device, "__tri_vnormal", MEM_GLOBAL),
+ tri_vindex(device, "__tri_vindex", MEM_GLOBAL),
+ tri_patch(device, "__tri_patch", MEM_GLOBAL),
+ tri_patch_uv(device, "__tri_patch_uv", MEM_GLOBAL),
+ curves(device, "__curves", MEM_GLOBAL),
+ curve_keys(device, "__curve_keys", MEM_GLOBAL),
+ patches(device, "__patches", MEM_GLOBAL),
+ objects(device, "__objects", MEM_GLOBAL),
+ object_motion_pass(device, "__object_motion_pass", MEM_GLOBAL),
+ object_motion(device, "__object_motion", MEM_GLOBAL),
+ object_flag(device, "__object_flag", MEM_GLOBAL),
+ object_volume_step(device, "__object_volume_step", MEM_GLOBAL),
+ camera_motion(device, "__camera_motion", MEM_GLOBAL),
+ attributes_map(device, "__attributes_map", MEM_GLOBAL),
+ attributes_float(device, "__attributes_float", MEM_GLOBAL),
+ attributes_float2(device, "__attributes_float2", MEM_GLOBAL),
+ attributes_float3(device, "__attributes_float3", MEM_GLOBAL),
+ attributes_uchar4(device, "__attributes_uchar4", MEM_GLOBAL),
+ light_distribution(device, "__light_distribution", MEM_GLOBAL),
+ lights(device, "__lights", MEM_GLOBAL),
+ light_background_marginal_cdf(device, "__light_background_marginal_cdf", MEM_GLOBAL),
+ light_background_conditional_cdf(device, "__light_background_conditional_cdf", MEM_GLOBAL),
+ particles(device, "__particles", MEM_GLOBAL),
+ svm_nodes(device, "__svm_nodes", MEM_GLOBAL),
+ shaders(device, "__shaders", MEM_GLOBAL),
+ lookup_table(device, "__lookup_table", MEM_GLOBAL),
+ sample_pattern_lut(device, "__sample_pattern_lut", MEM_GLOBAL),
+ ies_lights(device, "__ies", MEM_GLOBAL),
light_tree_nodes(device, "__light_tree_nodes", MEM_TEXTURE),
light_distribution_to_node(device, "__light_distribution_to_node", MEM_TEXTURE),
lamp_to_distribution(device, "__lamp_to_distribution", MEM_TEXTURE),
@@ -81,18 +88,20 @@ DeviceScene::DeviceScene(Device *device)
light_group_sample_prob(device, "__light_group_sample_prob", MEM_TEXTURE),
leaf_to_first_emitter(device, "__leaf_to_first_emitter", MEM_TEXTURE),
light_tree_leaf_emitters(device, "__light_tree_leaf_emitters", MEM_TEXTURE),
- particles(device, "__particles", MEM_TEXTURE),
- svm_nodes(device, "__svm_nodes", MEM_TEXTURE),
- shaders(device, "__shaders", MEM_TEXTURE),
- lookup_table(device, "__lookup_table", MEM_TEXTURE),
- sobol_directions(device, "__sobol_directions", MEM_TEXTURE),
- ies_lights(device, "__ies", MEM_TEXTURE)
{
memset((void *)&data, 0, sizeof(data));
}
Scene::Scene(const SceneParams &params_, Device *device)
- : name("Scene"), device(device), dscene(device), params(params_)
+ : name("Scene"),
+ default_surface(NULL),
+ default_volume(NULL),
+ default_light(NULL),
+ default_background(NULL),
+ default_empty(NULL),
+ device(device),
+ dscene(device),
+ params(params_)
{
memset((void *)&dscene.data, 0, sizeof(dscene.data));
@@ -102,19 +111,20 @@ Scene::Scene(const SceneParams &params_, Device *device)
film = new Film();
background = new Background();
light_manager = new LightManager();
- mesh_manager = new MeshManager();
+ geometry_manager = new GeometryManager();
object_manager = new ObjectManager();
integrator = new Integrator();
image_manager = new ImageManager(device->info);
particle_system_manager = new ParticleSystemManager();
- curve_system_manager = new CurveSystemManager();
bake_manager = new BakeManager();
/* OSL only works on the CPU */
if (device->info.has_osl)
- shader_manager = ShaderManager::create(this, params.shadingsystem);
+ shader_manager = ShaderManager::create(params.shadingsystem);
else
- shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM);
+ shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
+
+ shader_manager->add_default(this);
}
Scene::~Scene()
@@ -126,8 +136,8 @@ void Scene::free_memory(bool final)
{
foreach (Shader *s, shaders)
delete s;
- foreach (Mesh *m, meshes)
- delete m;
+ foreach (Geometry *g, geometry)
+ delete g;
foreach (Object *o, objects)
delete o;
foreach (Light *l, lights)
@@ -136,7 +146,7 @@ void Scene::free_memory(bool final)
delete p;
shaders.clear();
- meshes.clear();
+ geometry.clear();
objects.clear();
lights.clear();
particle_systems.clear();
@@ -148,12 +158,11 @@ void Scene::free_memory(bool final)
integrator->device_free(device, &dscene);
object_manager->device_free(device, &dscene);
- mesh_manager->device_free(device, &dscene);
+ geometry_manager->device_free(device, &dscene);
shader_manager->device_free(device, &dscene, this);
light_manager->device_free(device, &dscene);
particle_system_manager->device_free(device, &dscene);
- curve_system_manager->device_free(device, &dscene);
bake_manager->device_free(device, &dscene);
@@ -173,11 +182,10 @@ void Scene::free_memory(bool final)
delete background;
delete integrator;
delete object_manager;
- delete mesh_manager;
+ delete geometry_manager;
delete shader_manager;
delete light_manager;
delete particle_system_manager;
- delete curve_system_manager;
delete image_manager;
delete bake_manager;
}
@@ -219,7 +227,7 @@ void Scene::device_update(Device *device_, Progress &progress)
if (progress.get_cancel() || device->have_error())
return;
- mesh_manager->device_update_preprocess(device, this, progress);
+ geometry_manager->device_update_preprocess(device, this, progress);
if (progress.get_cancel() || device->have_error())
return;
@@ -230,12 +238,6 @@ void Scene::device_update(Device *device_, Progress &progress)
if (progress.get_cancel() || device->have_error())
return;
- progress.set_status("Updating Hair Systems");
- curve_system_manager->device_update(device, &dscene, this, progress);
-
- if (progress.get_cancel() || device->have_error())
- return;
-
progress.set_status("Updating Particle Systems");
particle_system_manager->device_update(device, &dscene, this, progress);
@@ -243,7 +245,7 @@ void Scene::device_update(Device *device_, Progress &progress)
return;
progress.set_status("Updating Meshes");
- mesh_manager->device_update(device, &dscene, this, progress);
+ geometry_manager->device_update(device, &dscene, this, progress);
if (progress.get_cancel() || device->have_error())
return;
@@ -364,10 +366,9 @@ bool Scene::need_update()
bool Scene::need_data_update()
{
return (background->need_update || image_manager->need_update || object_manager->need_update ||
- mesh_manager->need_update || light_manager->need_update || lookup_tables->need_update ||
- integrator->need_update || shader_manager->need_update ||
- particle_system_manager->need_update || curve_system_manager->need_update ||
- bake_manager->need_update || film->need_update);
+ geometry_manager->need_update || light_manager->need_update ||
+ lookup_tables->need_update || integrator->need_update || shader_manager->need_update ||
+ particle_system_manager->need_update || bake_manager->need_update || film->need_update);
}
bool Scene::need_reset()
@@ -387,10 +388,9 @@ void Scene::reset()
background->tag_update(this);
integrator->tag_update(this);
object_manager->tag_update(this);
- mesh_manager->tag_update(this);
+ geometry_manager->tag_update(this);
light_manager->tag_update(this);
particle_system_manager->tag_update(this);
- curve_system_manager->tag_update(this);
}
void Scene::device_free()
@@ -400,7 +400,7 @@ void Scene::device_free()
void Scene::collect_statistics(RenderStats *stats)
{
- mesh_manager->collect_statistics(this, stats);
+ geometry_manager->collect_statistics(this, stats);
image_manager->collect_statistics(stats);
}
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index e48cec1fd57..721ecbe14ac 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -44,8 +44,8 @@ class Integrator;
class Light;
class LightManager;
class LookupTables;
-class Mesh;
-class MeshManager;
+class Geometry;
+class GeometryManager;
class Object;
class ObjectManager;
class ParticleSystemManager;
@@ -91,6 +91,7 @@ class DeviceScene {
device_vector<Transform> object_motion_pass;
device_vector<DecomposedTransform> object_motion;
device_vector<uint> object_flag;
+ device_vector<float> object_volume_step;
/* cameras */
device_vector<DecomposedTransform> camera_motion;
@@ -127,7 +128,7 @@ class DeviceScene {
device_vector<float> lookup_table;
/* integrator */
- device_vector<uint> sobol_directions;
+ device_vector<uint> sample_pattern_lut;
/* ies lights */
device_vector<float> ies_lights;
@@ -175,9 +176,13 @@ class SceneParams {
bool use_bvh_spatial_split;
bool use_bvh_unaligned_nodes;
int num_bvh_time_steps;
+ int hair_subdivisions;
+ CurveShapeType hair_shape;
bool persistent_data;
int texture_limit;
+ bool background;
+
SceneParams()
{
shadingsystem = SHADINGSYSTEM_SVM;
@@ -186,8 +191,11 @@ class SceneParams {
use_bvh_spatial_split = false;
use_bvh_unaligned_nodes = true;
num_bvh_time_steps = 0;
+ hair_subdivisions = 3;
+ hair_shape = CURVE_RIBBON;
persistent_data = false;
texture_limit = 0;
+ background = true;
}
bool modified(const SceneParams &params)
@@ -197,8 +205,15 @@ class SceneParams {
use_bvh_spatial_split == params.use_bvh_spatial_split &&
use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes &&
num_bvh_time_steps == params.num_bvh_time_steps &&
+ hair_subdivisions == params.hair_subdivisions && hair_shape == params.hair_shape &&
persistent_data == params.persistent_data && texture_limit == params.texture_limit);
}
+
+ int curve_subdivisions()
+ {
+ /* Matching the tessellation rate limit in Embree. */
+ return clamp(1 << hair_subdivisions, 1, 16);
+ }
};
/* Scene */
@@ -218,7 +233,7 @@ class Scene {
/* data lists */
vector<Object *> objects;
- vector<Mesh *> meshes;
+ vector<Geometry *> geometry;
vector<Shader *> shaders;
vector<Light *> lights;
vector<ParticleSystem *> particle_systems;
@@ -227,14 +242,14 @@ class Scene {
ImageManager *image_manager;
LightManager *light_manager;
ShaderManager *shader_manager;
- MeshManager *mesh_manager;
+ GeometryManager *geometry_manager;
ObjectManager *object_manager;
ParticleSystemManager *particle_system_manager;
- CurveSystemManager *curve_system_manager;
BakeManager *bake_manager;
/* default shaders */
Shader *default_surface;
+ Shader *default_volume;
Shader *default_light;
Shader *default_background;
Shader *default_empty;
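A quick check of the new SceneParams::curve_subdivisions() helper above, which clamps the power-of-two subdivision count to Embree's tessellation limit:

/* Worked examples (values are illustrative):
 *   hair_subdivisions = 0  ->  1 << 0 = 1   ->  clamp(1, 1, 16)  = 1
 *   hair_subdivisions = 3  ->  1 << 3 = 8   ->  clamp(8, 1, 16)  = 8   (default)
 *   hair_subdivisions = 5  ->  1 << 5 = 32  ->  clamp(32, 1, 16) = 16
 */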
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 29eb779a7d6..c5033359c6b 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -14,12 +14,13 @@
* limitations under the License.
*/
-#include <string.h>
#include <limits.h>
+#include <string.h>
+#include "device/device.h"
+#include "render/bake.h"
#include "render/buffers.h"
#include "render/camera.h"
-#include "device/device.h"
#include "render/graph.h"
#include "render/integrator.h"
#include "render/light.h"
@@ -27,7 +28,6 @@
#include "render/object.h"
#include "render/scene.h"
#include "render/session.h"
-#include "render/bake.h"
#include "util/util_foreach.h"
#include "util/util_function.h"
@@ -61,8 +61,10 @@ Session::Session(const SessionParams &params_)
TaskScheduler::init(params.threads);
+ /* Create CPU/GPU devices. */
device = Device::create(params.device, stats, profiler, params.background);
+ /* Create buffers for interactive rendering. */
if (params.background && !params.write_render_cb) {
buffers = NULL;
display = NULL;
@@ -72,6 +74,9 @@ Session::Session(const SessionParams &params_)
display = new DisplayBuffer(device, params.display_buffer_linear);
}
+ /* Validate denoising parameters. */
+ set_denoising(params.denoising);
+
session_thread = NULL;
scene = NULL;
@@ -83,7 +88,7 @@ Session::Session(const SessionParams &params_)
display_outdated = false;
gpu_draw_ready = false;
- gpu_need_tonemap = false;
+ gpu_need_display_buffer_update = false;
pause = false;
kernels_loaded = false;
@@ -97,8 +102,8 @@ Session::~Session()
/* wait for session thread to end */
progress.set_cancel("Exiting");
- gpu_need_tonemap = false;
- gpu_need_tonemap_cond.notify_all();
+ gpu_need_display_buffer_update = false;
+ gpu_need_display_buffer_update_cond.notify_all();
{
thread_scoped_lock pause_lock(pause_mutex);
@@ -110,12 +115,12 @@ Session::~Session()
}
if (params.write_render_cb) {
- /* tonemap and write out image if requested */
+ /* Copy to display buffer and write out image if requested */
delete display;
display = new DisplayBuffer(device, false);
display->reset(buffers->params);
- tonemap(params.samples);
+ copy_to_display_buffer(params.samples);
int w = display->draw_width;
int h = display->draw_height;
@@ -168,8 +173,8 @@ void Session::reset_gpu(BufferParams &buffer_params, int samples)
reset_(buffer_params, samples);
- gpu_need_tonemap = false;
- gpu_need_tonemap_cond.notify_all();
+ gpu_need_display_buffer_update = false;
+ gpu_need_display_buffer_update_cond.notify_all();
pause_cond.notify_all();
}
@@ -183,14 +188,15 @@ bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
if (gpu_draw_ready) {
/* then verify the buffers have the expected size, so we don't
* draw previous results in a resized window */
- if (!buffer_params.modified(display->params)) {
- /* for CUDA we need to do tonemapping still, since we can
- * only access GL buffers from the main thread */
- if (gpu_need_tonemap) {
+ if (buffer_params.width == display->params.width &&
+ buffer_params.height == display->params.height) {
+ /* for CUDA we need to do tone-mapping still, since we can
+ * only access GL buffers from the main thread. */
+ if (gpu_need_display_buffer_update) {
thread_scoped_lock buffers_lock(buffers_mutex);
- tonemap(tile_manager.state.sample);
- gpu_need_tonemap = false;
- gpu_need_tonemap_cond.notify_all();
+ copy_to_display_buffer(tile_manager.state.sample);
+ gpu_need_display_buffer_update = false;
+ gpu_need_display_buffer_update_cond.notify_all();
}
display->draw(device, draw_params);
@@ -211,6 +217,7 @@ void Session::run_gpu()
reset_time = time_dt();
last_update_time = time_dt();
+ last_display_time = last_update_time;
progress.set_render_start_time();
@@ -232,7 +239,7 @@ void Session::run_gpu()
}
/* Don't go in pause mode when image was rendered with preview kernels
- * When feature kernels become available the session will be resetted. */
+ * When feature kernels become available the session will be reset. */
else if (no_tiles && kernel_state == DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL) {
time_sleep(0.1);
}
@@ -285,9 +292,7 @@ void Session::run_gpu()
if (progress.get_cancel())
break;
- }
- if (!no_tiles) {
/* buffers mutex is locked entirely while rendering each
* sample, and released/reacquired on each iteration to allow
* reset and draw in between */
@@ -297,7 +302,9 @@ void Session::run_gpu()
update_status_time();
/* render */
- render();
+ bool delayed_denoise = false;
+ const bool need_denoise = render_need_denoise(delayed_denoise);
+ render(need_denoise);
device->task_wait();
@@ -307,17 +314,17 @@ void Session::run_gpu()
/* update status and timing */
update_status_time();
- gpu_need_tonemap = true;
+ gpu_need_display_buffer_update = !delayed_denoise;
gpu_draw_ready = true;
progress.set_update();
- /* wait for tonemap */
+ /* wait until the display buffer is updated */
if (!params.background) {
- while (gpu_need_tonemap) {
+ while (gpu_need_display_buffer_update) {
if (progress.get_cancel())
break;
- gpu_need_tonemap_cond.wait(buffers_lock);
+ gpu_need_display_buffer_update_cond.wait(buffers_lock);
}
}
@@ -361,7 +368,8 @@ bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
if (display->draw_ready()) {
/* then verify the buffers have the expected size, so we don't
* draw previous results in a resized window */
- if (!buffer_params.modified(display->params)) {
+ if (buffer_params.width == display->params.width &&
+ buffer_params.height == display->params.height) {
display->draw(device, draw_params);
if (display_outdated && (time_dt() - reset_time) > params.text_timeout)
@@ -374,7 +382,7 @@ bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
return false;
}
-bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
+bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types)
{
if (progress.get_cancel()) {
if (params.progressive_refine == false) {
@@ -389,8 +397,14 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
Tile *tile;
int device_num = device->device_number(tile_device);
- if (!tile_manager.next_tile(tile, device_num))
+ while (!tile_manager.next_tile(tile, device_num, tile_types)) {
+ /* Wait for denoising tiles to become available */
+ if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) {
+ denoising_cond.wait(tile_lock);
+ continue;
+ }
return false;
+ }
/* fill render tile */
rtile.x = tile_manager.state.buffer.full_x + tile->x;
@@ -401,7 +415,16 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
rtile.num_samples = tile_manager.state.num_samples;
rtile.resolution = tile_manager.state.resolution_divider;
rtile.tile_index = tile->index;
- rtile.task = (tile->state == Tile::DENOISE) ? RenderTile::DENOISE : RenderTile::PATH_TRACE;
+
+ if (tile->state == Tile::DENOISE) {
+ rtile.task = RenderTile::DENOISE;
+ }
+ else if (read_bake_tile_cb) {
+ rtile.task = RenderTile::BAKE;
+ }
+ else {
+ rtile.task = RenderTile::PATH_TRACE;
+ }
tile_lock.unlock();
@@ -415,6 +438,15 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
device->map_tile(tile_device, rtile);
+ /* Reset copy state, since buffer contents change after the tile was acquired */
+ buffers->map_neighbor_copied = false;
+
+ /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts
+ * for the buffer resolution divider. */
+ buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) /
+ tile_manager.state.resolution_divider;
+ buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider;
+
return true;
}
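To make the data_width/data_height remapping above concrete, a worked example with assumed numbers (a 1920x1080 buffer, 16 floats of passes per pixel, resolution divider 2):

/* Illustration only, hypothetical values. */
const int width = 1920, height = 1080;    /* buffers->params */
const int passes_size = 16;               /* floats per pixel across all passes */
const int resolution_divider = 2;         /* tile_manager.state.resolution_divider */

const int data_width = (width * passes_size) / resolution_divider;  /* 15360 */
const int data_height = height / resolution_divider;                /* 540 */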
@@ -431,17 +463,28 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
tile->buffers->reset(buffer_params);
}
+ tile->buffers->map_neighbor_copied = false;
+
tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
rtile.buffer = tile->buffers->buffer.device_pointer;
rtile.buffers = tile->buffers;
rtile.sample = tile_manager.state.sample;
- /* this will tag tile as IN PROGRESS in blender-side render pipeline,
- * which is needed to highlight currently rendering tile before first
- * sample was processed for it
- */
- update_tile_sample(rtile);
+ if (read_bake_tile_cb) {
+ /* This will read any passes needed as input for baking. */
+ {
+ thread_scoped_lock tile_lock(tile_mutex);
+ read_bake_tile_cb(rtile);
+ }
+ rtile.buffers->buffer.copy_to_device();
+ }
+ else {
+ /* This will tag tile as IN PROGRESS in blender-side render pipeline,
+ * which is needed to highlight currently rendering tile before first
+ * sample was processed for it. */
+ update_tile_sample(rtile);
+ }
return true;
}
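The wait loop added to acquire_tile() above is the consumer half of a small producer/consumer handshake: it blocks on denoising_cond until the tile manager can hand out a tile of the requested type, and release_tile() (later in this diff) notifies the condition once a finished render tile may have made a neighbor eligible for denoising. Condensed, using the names from this diff:

/* Consumer, inside acquire_tile(): */
while (!tile_manager.next_tile(tile, device_num, tile_types)) {
  if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) {
    denoising_cond.wait(tile_lock); /* woken by release_tile() */
    continue;
  }
  return false;
}

/* Producer, at the end of release_tile(): */
denoising_cond.notify_all();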
@@ -461,7 +504,7 @@ void Session::update_tile_sample(RenderTile &rtile)
update_status_time();
}
-void Session::release_tile(RenderTile &rtile)
+void Session::release_tile(RenderTile &rtile, const bool need_denoise)
{
thread_scoped_lock tile_lock(tile_mutex);
@@ -469,7 +512,8 @@ void Session::release_tile(RenderTile &rtile)
bool delete_tile;
- if (tile_manager.finish_tile(rtile.tile_index, delete_tile)) {
+ if (tile_manager.finish_tile(rtile.tile_index, need_denoise, delete_tile)) {
+ /* Finished tile pixels write. */
if (write_render_tile_cb && params.progressive_refine == false) {
write_render_tile_cb(rtile);
}
@@ -480,66 +524,99 @@ void Session::release_tile(RenderTile &rtile)
}
}
else {
+ /* In progress tile pixels update. */
if (update_render_tile_cb && params.progressive_refine == false) {
update_render_tile_cb(rtile, false);
}
}
update_status_time();
+
+ /* Notify denoising thread that a tile was finished. */
+ denoising_cond.notify_all();
}
-void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+void Session::map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
{
thread_scoped_lock tile_lock(tile_mutex);
- int center_idx = tiles[4].tile_index;
- assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
- BufferParams buffer_params = tile_manager.params;
- int4 image_region = make_int4(buffer_params.full_x,
- buffer_params.full_y,
- buffer_params.full_x + buffer_params.width,
- buffer_params.full_y + buffer_params.height);
-
- for (int dy = -1, i = 0; dy <= 1; dy++) {
- for (int dx = -1; dx <= 1; dx++, i++) {
- int px = tiles[4].x + dx * params.tile_size.x;
- int py = tiles[4].y + dy * params.tile_size.y;
- if (px >= image_region.x && py >= image_region.y && px < image_region.z &&
- py < image_region.w) {
- int tile_index = center_idx + dy * tile_manager.state.tile_stride + dx;
- Tile *tile = &tile_manager.state.tiles[tile_index];
- assert(tile->buffers);
-
- tiles[i].buffer = tile->buffers->buffer.device_pointer;
- tiles[i].x = tile_manager.state.buffer.full_x + tile->x;
- tiles[i].y = tile_manager.state.buffer.full_y + tile->y;
- tiles[i].w = tile->w;
- tiles[i].h = tile->h;
- tiles[i].buffers = tile->buffers;
-
- tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
- }
- else {
- tiles[i].buffer = (device_ptr)NULL;
- tiles[i].buffers = NULL;
- tiles[i].x = clamp(px, image_region.x, image_region.z);
- tiles[i].y = clamp(py, image_region.y, image_region.w);
- tiles[i].w = tiles[i].h = 0;
+ const int4 image_region = make_int4(
+ tile_manager.state.buffer.full_x,
+ tile_manager.state.buffer.full_y,
+ tile_manager.state.buffer.full_x + tile_manager.state.buffer.width,
+ tile_manager.state.buffer.full_y + tile_manager.state.buffer.height);
+
+ RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+
+ if (!tile_manager.schedule_denoising) {
+ /* Fix up tile slices with overlap. */
+ if (tile_manager.slice_overlap != 0) {
+ int y = max(center_tile.y - tile_manager.slice_overlap, image_region.y);
+ center_tile.h = min(center_tile.y + center_tile.h + tile_manager.slice_overlap,
+ image_region.w) -
+ y;
+ center_tile.y = y;
+ }
+
+ /* Tiles are not being denoised individually, which means the entire image is processed. */
+ neighbors.set_bounds_from_center();
+ }
+ else {
+ int center_idx = center_tile.tile_index;
+ assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
+
+ for (int dy = -1, i = 0; dy <= 1; dy++) {
+ for (int dx = -1; dx <= 1; dx++, i++) {
+ RenderTile &rtile = neighbors.tiles[i];
+ int nindex = tile_manager.get_neighbor_index(center_idx, i);
+ if (nindex >= 0) {
+ Tile *tile = &tile_manager.state.tiles[nindex];
+
+ rtile.x = image_region.x + tile->x;
+ rtile.y = image_region.y + tile->y;
+ rtile.w = tile->w;
+ rtile.h = tile->h;
+
+ if (buffers) {
+ tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
+
+ rtile.buffer = buffers->buffer.device_pointer;
+ rtile.buffers = buffers;
+ }
+ else {
+ assert(tile->buffers);
+ tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
+
+ rtile.buffer = tile->buffers->buffer.device_pointer;
+ rtile.buffers = tile->buffers;
+ }
+ }
+ else {
+ int px = center_tile.x + dx * params.tile_size.x;
+ int py = center_tile.y + dy * params.tile_size.y;
+
+ rtile.x = clamp(px, image_region.x, image_region.z);
+ rtile.y = clamp(py, image_region.y, image_region.w);
+ rtile.w = rtile.h = 0;
+
+ rtile.buffer = (device_ptr)NULL;
+ rtile.buffers = NULL;
+ }
}
}
}
- assert(tiles[4].buffers);
- device->map_neighbor_tiles(tile_device, tiles);
+ assert(center_tile.buffers);
+ device->map_neighbor_tiles(tile_device, neighbors);
/* The denoised result is written back to the original tile. */
- tiles[9] = tiles[4];
+ neighbors.target = center_tile;
}
-void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device)
+void Session::unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
{
thread_scoped_lock tile_lock(tile_mutex);
- device->unmap_neighbor_tiles(tile_device, tiles);
+ device->unmap_neighbor_tiles(tile_device, neighbors);
}
void Session::run_cpu()
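For the 3x3 loop in map_neighbor_tiles() above, the flat index i relates to the offsets (dx, dy) as i = (dy + 1) * 3 + (dx + 1), so the center tile is index 4 (RenderTileNeighbors::CENTER) and the denoised result is written back to it via neighbors.target. A small sketch of that mapping, with hypothetical helper names:

/* Hypothetical helpers illustrating the neighbor indexing used above. */
inline int neighbor_index_sketch(int dx, int dy) /* dx, dy in [-1, 1] */
{
  return (dy + 1) * 3 + (dx + 1); /* 0..8, with (0, 0) -> 4, the center */
}

inline void neighbor_offset_sketch(int i, int &dx, int &dy)
{
  dx = (i % 3) - 1;
  dy = (i / 3) - 1;
}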
@@ -547,6 +624,7 @@ void Session::run_cpu()
bool tiles_written = false;
last_update_time = time_dt();
+ last_display_time = last_update_time;
{
/* reset once to start */
@@ -561,7 +639,7 @@ void Session::run_cpu()
while (!progress.get_cancel()) {
/* advance to next tile */
bool no_tiles = !tile_manager.next();
- bool need_tonemap = false;
+ bool need_copy_to_display_buffer = false;
DeviceKernelStatus kernel_state = DEVICE_KERNEL_UNKNOWN;
if (no_tiles) {
@@ -577,7 +655,7 @@ void Session::run_cpu()
}
/* Don't go in pause mode when preview kernels are used
- * When feature kernels become available the session will be resetted. */
+ * When feature kernels become available the session will be reset. */
else if (no_tiles && kernel_state == DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL) {
time_sleep(0.1);
}
@@ -622,11 +700,6 @@ void Session::run_cpu()
}
if (!no_tiles) {
- /* buffers mutex is locked entirely while rendering each
- * sample, and released/reacquired on each iteration to allow
- * reset and draw in between */
- thread_scoped_lock buffers_lock(buffers_mutex);
-
/* update scene */
scoped_timer update_timer;
if (update_scene()) {
@@ -640,17 +713,24 @@ void Session::run_cpu()
if (progress.get_cancel())
break;
+ /* buffers mutex is locked entirely while rendering each
+ * sample, and released/reacquired on each iteration to allow
+ * reset and draw in between */
+ thread_scoped_lock buffers_lock(buffers_mutex);
+
/* update status and timing */
update_status_time();
/* render */
- render();
+ bool delayed_denoise = false;
+ const bool need_denoise = render_need_denoise(delayed_denoise);
+ render(need_denoise);
/* update status and timing */
update_status_time();
if (!params.background)
- need_tonemap = true;
+ need_copy_to_display_buffer = !delayed_denoise;
if (!device->error_message().empty())
progress.set_error(device->error_message());
@@ -668,10 +748,10 @@ void Session::run_cpu()
delayed_reset.do_reset = false;
reset_(delayed_reset.params, delayed_reset.samples);
}
- else if (need_tonemap) {
- /* tonemap only if we do not reset, we don't we don't
+ else if (need_copy_to_display_buffer) {
+ /* Only copy to display_buffer if we do not reset, we don't
* want to show the result of an incomplete sample */
- tonemap(tile_manager.state.sample);
+ copy_to_display_buffer(tile_manager.state.sample);
}
if (!device->error_message().empty())
@@ -700,26 +780,30 @@ DeviceRequestedFeatures Session::get_requested_device_features()
*/
bool use_motion = scene->need_motion() == Scene::MotionType::MOTION_BLUR;
requested_features.use_hair = false;
+ requested_features.use_hair_thick = (scene->params.hair_shape == CURVE_THICK);
requested_features.use_object_motion = false;
requested_features.use_camera_motion = use_motion && scene->camera->use_motion();
foreach (Object *object, scene->objects) {
- Mesh *mesh = object->mesh;
- if (mesh->num_curves()) {
- requested_features.use_hair = true;
- }
+ Geometry *geom = object->geometry;
if (use_motion) {
- requested_features.use_object_motion |= object->use_motion() | mesh->use_motion_blur;
- requested_features.use_camera_motion |= mesh->use_motion_blur;
+ requested_features.use_object_motion |= object->use_motion() | geom->use_motion_blur;
+ requested_features.use_camera_motion |= geom->use_motion_blur;
}
-#ifdef WITH_OPENSUBDIV
- if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
- requested_features.use_patch_evaluation = true;
- }
-#endif
if (object->is_shadow_catcher) {
requested_features.use_shadow_tricks = true;
}
- requested_features.use_true_displacement |= mesh->has_true_displacement();
+ if (geom->type == Geometry::MESH) {
+ Mesh *mesh = static_cast<Mesh *>(geom);
+#ifdef WITH_OPENSUBDIV
+ if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
+ requested_features.use_patch_evaluation = true;
+ }
+#endif
+ requested_features.use_true_displacement |= mesh->has_true_displacement();
+ }
+ else if (geom->type == Geometry::HAIR) {
+ requested_features.use_hair = true;
+ }
}
requested_features.use_background_light = scene->light_manager->has_background_light(scene);
@@ -728,7 +812,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
requested_features.use_baking = bake_manager->get_baking();
requested_features.use_integrator_branched = (scene->integrator->method ==
Integrator::BRANCHED_PATH);
- if (params.run_denoising) {
+ if (params.denoising.use || params.denoising.store_passes) {
requested_features.use_denoising = true;
requested_features.use_shadow_tricks = true;
}
@@ -757,7 +841,7 @@ bool Session::load_kernels(bool lock_scene)
message = "Failed loading render kernel, see console for errors";
progress.set_error(message);
- progress.set_status("Error", message);
+ progress.set_status(message);
progress.set_update();
return false;
}
@@ -796,7 +880,7 @@ void Session::run()
/* progress update */
if (progress.get_cancel())
- progress.set_status("Cancel", progress.get_cancel_message());
+ progress.set_status(progress.get_cancel_message());
else
progress.set_update();
}
@@ -844,9 +928,6 @@ void Session::set_samples(int samples)
params.samples = samples;
tile_manager.set_samples(samples);
- {
- thread_scoped_lock pause_lock(pause_mutex);
- }
pause_cond.notify_all();
}
}
@@ -868,6 +949,40 @@ void Session::set_pause(bool pause_)
pause_cond.notify_all();
}
+void Session::set_denoising(const DenoiseParams &denoising)
+{
+ bool need_denoise = denoising.need_denoising_task();
+
+ /* Lock buffers so no denoising operation is triggered while the settings are changed here. */
+ thread_scoped_lock buffers_lock(buffers_mutex);
+ params.denoising = denoising;
+
+ if (!(params.device.denoisers & denoising.type)) {
+ if (need_denoise) {
+ progress.set_error("Denoiser type not supported by compute device");
+ }
+
+ params.denoising.use = false;
+ need_denoise = false;
+ }
+
+ // TODO(pmours): Query the required overlap value for denoising from the device?
+ tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0;
+
+ /* Schedule per tile denoising for final renders if we are either denoising or
+ * need prefiltered passes for the native denoiser. */
+ tile_manager.schedule_denoising = need_denoise && !buffers;
+}
+
+void Session::set_denoising_start_sample(int sample)
+{
+ if (sample != params.denoising.start_sample) {
+ params.denoising.start_sample = sample;
+
+ pause_cond.notify_all();
+ }
+}
+
void Session::wait()
{
if (session_thread) {
@@ -890,7 +1005,7 @@ bool Session::update_scene()
int height = tile_manager.state.buffer.full_height;
int resolution = tile_manager.state.resolution_divider;
- if (width != cam->width || height != cam->height) {
+ if (width != cam->width || height != cam->height || resolution != cam->resolution) {
cam->width = width;
cam->height = height;
cam->resolution = resolution;
@@ -902,7 +1017,7 @@ bool Session::update_scene()
Integrator *integrator = scene->integrator;
BakeManager *bake_manager = scene->bake_manager;
- if (integrator->sampling_pattern == SAMPLING_PATTERN_CMJ || bake_manager->get_baking()) {
+ if (integrator->sampling_pattern != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) {
int aa_samples = tile_manager.num_samples;
if (aa_samples != integrator->aa_samples) {
@@ -913,7 +1028,8 @@ bool Session::update_scene()
/* update scene */
if (scene->need_update()) {
- bool new_kernels_needed = load_kernels(false);
+ /* Update the used-shaders tag so we know which features are needed for the kernel. */
+ scene->shader_manager->update_shaders_used(scene);
/* Update max_closures. */
KernelIntegrator *kintegrator = &scene->dscene.data.integrator;
@@ -925,6 +1041,9 @@ bool Session::update_scene()
kintegrator->max_closures = MAX_CLOSURE;
}
+ /* Load render kernels, before device update where we upload data to the GPU. */
+ bool new_kernels_needed = load_kernels(false);
+
progress.set_status("Updating Scene");
MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
@@ -978,14 +1097,14 @@ void Session::update_status_time(bool show_pause, bool show_done)
*/
substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
}
- if (params.full_denoising) {
+ if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) {
substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
}
- else if (params.run_denoising) {
+ else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) {
substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles());
}
}
- else if (tile_manager.num_samples == INT_MAX)
+ else if (tile_manager.num_samples == Integrator::MAX_SAMPLES)
substatus = string_printf("Path Tracing Sample %d", progressive_sample + 1);
else
substatus = string_printf("Path Tracing Sample %d/%d", progressive_sample + 1, num_samples);
@@ -1005,18 +1124,59 @@ void Session::update_status_time(bool show_pause, bool show_done)
progress.set_status(status, substatus);
}
-void Session::render()
+bool Session::render_need_denoise(bool &delayed)
+{
+ delayed = false;
+
+ /* Not supported yet for baking. */
+ if (read_bake_tile_cb) {
+ return false;
+ }
+
+ /* Denoising enabled? */
+ if (!params.denoising.need_denoising_task()) {
+ return false;
+ }
+
+ if (params.background) {
+ /* Background render, only denoise when rendering the last sample. */
+ return tile_manager.done();
+ }
+
+ /* Viewport render. */
+
+ /* It can happen that denoising was already enabled, but the scene still needs an update. */
+ if (scene->film->need_update || !scene->film->denoising_data_offset) {
+ return false;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (tile_manager.state.sample < min(params.denoising.start_sample, params.samples - 1)) {
+ return false;
+ }
+
+ /* Avoid excessive denoising in viewport after reaching a certain amount of samples. */
+ delayed = (tile_manager.state.sample >= 20 &&
+ (time_dt() - last_display_time) < params.progressive_update_timeout);
+ return !delayed;
+}
+
+void Session::render(bool need_denoise)
{
- /* Clear buffers. */
if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) {
+ /* Clear buffers. */
buffers->zero();
}
+ if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) {
+ return; /* Avoid empty launches. */
+ }
+
/* Add path trace task. */
DeviceTask task(DeviceTask::RENDER);
- task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&Session::release_tile, this, _1);
+ task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3);
+ task.release_tile = function_bind(&Session::release_tile, this, _1, need_denoise);
task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
@@ -1024,29 +1184,56 @@ void Session::render()
task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
task.need_finish_queue = params.progressive_refine;
task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
- task.requested_tile_size = params.tile_size;
- task.passes_size = tile_manager.params.get_passes_size();
- if (params.run_denoising) {
+ task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
+ scene->dscene.data.film.pass_adaptive_aux_buffer;
+ task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+ task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step;
+
+ /* Acquire render tiles by default. */
+ task.tile_types = RenderTile::PATH_TRACE;
+
+ if (need_denoise) {
task.denoising = params.denoising;
- assert(!scene->film->need_update);
task.pass_stride = scene->film->pass_stride;
task.target_pass_stride = task.pass_stride;
task.pass_denoising_data = scene->film->denoising_data_offset;
task.pass_denoising_clean = scene->film->denoising_clean_offset;
task.denoising_from_render = true;
- task.denoising_do_filter = params.full_denoising;
- task.denoising_write_passes = params.write_denoising_passes;
+
+ if (tile_manager.schedule_denoising) {
+ /* Acquire denoising tiles during rendering. */
+ task.tile_types |= RenderTile::DENOISE;
+ }
+ else {
+ assert(buffers);
+
+ /* Schedule rendering and wait for it to finish. */
+ device->task_add(task);
+ device->task_wait();
+
+ /* Then run denoising on the whole image at once. */
+ task.type = DeviceTask::DENOISE_BUFFER;
+ task.x = tile_manager.state.buffer.full_x;
+ task.y = tile_manager.state.buffer.full_y;
+ task.w = tile_manager.state.buffer.width;
+ task.h = tile_manager.state.buffer.height;
+ task.buffer = buffers->buffer.device_pointer;
+ task.sample = tile_manager.state.sample;
+ task.num_samples = tile_manager.state.num_samples;
+ tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
+ task.buffers = buffers;
+ }
}
device->task_add(task);
}
-void Session::tonemap(int sample)
+void Session::copy_to_display_buffer(int sample)
{
- /* add tonemap task */
+ /* add film conversion task */
DeviceTask task(DeviceTask::FILM_CONVERT);
task.x = tile_manager.state.buffer.full_x;
@@ -1065,6 +1252,8 @@ void Session::tonemap(int sample)
/* set display to new size */
display->draw_set(task.w, task.h);
+
+ last_display_time = time_dt();
}
display_outdated = false;
@@ -1142,8 +1331,11 @@ int Session::get_max_closure_count()
int max_closures = 0;
for (int i = 0; i < scene->shaders.size(); i++) {
- int num_closures = scene->shaders[i]->graph->get_num_closures();
- max_closures = max(max_closures, num_closures);
+ Shader *shader = scene->shaders[i];
+ if (shader->used) {
+ int num_closures = shader->graph->get_num_closures();
+ max_closures = max(max_closures, num_closures);
+ }
}
max_closure_global = max(max_closure_global, max_closures);
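
For reference, the viewport branch of the new render_need_denoise() above reduces to a start-sample gate plus a rate limit once 20 samples have accumulated. A minimal standalone sketch of that decision, using plain parameters in place of the Session/TileManager state (the function and parameter names here are illustrative, not the Cycles API):

#include <algorithm>

/* Decide whether to denoise the current viewport sample: wait until the
 * configured start sample is reached, then, after 20 samples, only denoise
 * when enough wall-clock time has passed since the last display update.
 * Otherwise report the denoise as delayed so the caller can retry later. */
static bool need_denoise_viewport(int sample,
                                  int denoising_start_sample,
                                  int total_samples,
                                  double seconds_since_last_display,
                                  double progressive_update_timeout,
                                  bool &delayed)
{
  delayed = false;

  if (sample < std::min(denoising_start_sample, total_samples - 1)) {
    return false; /* Denoising has not started yet. */
  }

  /* Avoid excessive denoising once many samples have accumulated. */
  delayed = (sample >= 20 && seconds_since_last_display < progressive_update_timeout);
  return !delayed;
}
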
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 60d8f7a8b14..e3ac054ead3 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -17,8 +17,8 @@
#ifndef __SESSION_H__
#define __SESSION_H__
-#include "render/buffers.h"
#include "device/device.h"
+#include "render/buffers.h"
#include "render/shader.h"
#include "render/stats.h"
#include "render/tile.h"
@@ -53,16 +53,15 @@ class SessionParams {
int2 tile_size;
TileOrder tile_order;
int start_resolution;
+ int denoising_start_sample;
int pixel_size;
int threads;
+ bool adaptive_sampling;
bool use_profiling;
bool display_buffer_linear;
- bool run_denoising;
- bool write_denoising_passes;
- bool full_denoising;
DenoiseParams denoising;
double cancel_timeout;
@@ -84,15 +83,13 @@ class SessionParams {
samples = 1024;
tile_size = make_int2(64, 64);
start_resolution = INT_MAX;
+ denoising_start_sample = 0;
pixel_size = 1;
threads = 0;
+ adaptive_sampling = false;
use_profiling = false;
- run_denoising = false;
- write_denoising_passes = false;
- full_denoising = false;
-
display_buffer_linear = false;
cancel_timeout = 0.1;
@@ -107,17 +104,20 @@ class SessionParams {
bool modified(const SessionParams &params)
{
return !(device == params.device && background == params.background &&
- progressive_refine == params.progressive_refine
- /* && samples == params.samples */
- && progressive == params.progressive && experimental == params.experimental &&
+ progressive_refine == params.progressive_refine &&
+ /* samples == params.samples && denoising_start_sample ==
+ params.denoising_start_sample && */
+ progressive == params.progressive && experimental == params.experimental &&
tile_size == params.tile_size && start_resolution == params.start_resolution &&
pixel_size == params.pixel_size && threads == params.threads &&
+ adaptive_sampling == params.adaptive_sampling &&
use_profiling == params.use_profiling &&
display_buffer_linear == params.display_buffer_linear &&
cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
text_timeout == params.text_timeout &&
progressive_update_timeout == params.progressive_update_timeout &&
- tile_order == params.tile_order && shadingsystem == params.shadingsystem);
+ tile_order == params.tile_order && shadingsystem == params.shadingsystem &&
+ denoising.type == params.denoising.type);
}
};
@@ -140,6 +140,7 @@ class Session {
function<void(RenderTile &)> write_render_tile_cb;
function<void(RenderTile &, bool)> update_render_tile_cb;
+ function<void(RenderTile &)> read_bake_tile_cb;
explicit Session(const SessionParams &params);
~Session();
@@ -150,8 +151,10 @@ class Session {
bool ready_to_reset();
void reset(BufferParams &params, int samples);
- void set_samples(int samples);
void set_pause(bool pause);
+ void set_samples(int samples);
+ void set_denoising(const DenoiseParams &denoising);
+ void set_denoising_start_sample(int sample);
bool update_scene();
bool load_kernels(bool lock_scene = true);
@@ -176,8 +179,9 @@ class Session {
void update_status_time(bool show_pause = false, bool show_done = false);
- void tonemap(int sample);
- void render();
+ void render(bool use_denoise);
+ void copy_to_display_buffer(int sample);
+
void reset_(BufferParams &params, int samples);
void run_cpu();
@@ -188,12 +192,14 @@ class Session {
bool draw_gpu(BufferParams &params, DeviceDrawParams &draw_params);
void reset_gpu(BufferParams &params, int samples);
- bool acquire_tile(Device *tile_device, RenderTile &tile);
+ bool render_need_denoise(bool &delayed);
+
+ bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types);
void update_tile_sample(RenderTile &tile);
- void release_tile(RenderTile &tile);
+ void release_tile(RenderTile &tile, const bool need_denoise);
- void map_neighbor_tiles(RenderTile *tiles, Device *tile_device);
- void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device);
+ void map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
+ void unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
bool device_use_gl;
@@ -202,8 +208,8 @@ class Session {
volatile bool display_outdated;
volatile bool gpu_draw_ready;
- volatile bool gpu_need_tonemap;
- thread_condition_variable gpu_need_tonemap_cond;
+ volatile bool gpu_need_display_buffer_update;
+ thread_condition_variable gpu_need_display_buffer_update_cond;
bool pause;
thread_condition_variable pause_cond;
@@ -211,14 +217,16 @@ class Session {
thread_mutex tile_mutex;
thread_mutex buffers_mutex;
thread_mutex display_mutex;
+ thread_condition_variable denoising_cond;
bool kernels_loaded;
DeviceRequestedFeatures loaded_kernel_features;
double reset_time;
+ double last_update_time;
+ double last_display_time;
/* progressive refine */
- double last_update_time;
bool update_progressive_refine(bool cancel);
DeviceRequestedFeatures get_requested_device_features();
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index ac3303cbfeb..1120d909e98 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -14,9 +14,11 @@
* limitations under the License.
*/
+#include "device/device.h"
+
#include "render/background.h"
#include "render/camera.h"
-#include "device/device.h"
+#include "render/colorspace.h"
#include "render/graph.h"
#include "render/integrator.h"
#include "render/light.h"
@@ -31,6 +33,7 @@
#include "util/util_foreach.h"
#include "util/util_murmurhash.h"
+#include "util/util_task.h"
#ifdef WITH_OCIO
# include <OpenColorIO/OpenColorIO.h>
@@ -166,7 +169,7 @@ NODE_DEFINE(Shader)
SOCKET_ENUM(volume_sampling_method,
"Volume Sampling Method",
volume_sampling_method_enum,
- VOLUME_SAMPLING_DISTANCE);
+ VOLUME_SAMPLING_MULTIPLE_IMPORTANCE);
static NodeEnum volume_interpolation_method_enum;
volume_interpolation_method_enum.insert("linear", VOLUME_INTERPOLATION_LINEAR);
@@ -176,6 +179,8 @@ NODE_DEFINE(Shader)
volume_interpolation_method_enum,
VOLUME_INTERPOLATION_LINEAR);
+ SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
+
static NodeEnum displacement_method_enum;
displacement_method_enum.insert("bump", DISPLACE_BUMP);
displacement_method_enum.insert("true", DISPLACE_TRUE);
@@ -201,10 +206,10 @@ Shader::Shader() : Node(node_type)
has_bssrdf_bump = false;
has_surface_spatial_varying = false;
has_volume_spatial_varying = false;
- has_object_dependency = false;
- has_attribute_dependency = false;
+ has_volume_attribute_dependency = false;
has_integrator_dependency = false;
has_volume_connected = false;
+ prev_volume_step_rate = 0.0f;
displacement_method = DISPLACE_BUMP;
@@ -212,8 +217,7 @@ Shader::Shader() : Node(node_type)
used = false;
need_update = true;
- need_update_mesh = true;
- need_sync_object = false;
+ need_update_geometry = true;
}
Shader::~Shader()
@@ -223,6 +227,13 @@ Shader::~Shader()
bool Shader::is_constant_emission(float3 *emission)
{
+ /* If the shader has AOVs, they need to be evaluated, so we can't skip the shader. */
+ foreach (ShaderNode *node, graph->nodes) {
+ if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) {
+ return false;
+ }
+ }
+
ShaderInput *surf = graph->output()->input("Surface");
if (surf->link == NULL) {
@@ -279,7 +290,7 @@ void Shader::set_graph(ShaderGraph *graph_)
const char *new_hash = (graph_) ? graph_->displacement_hash.c_str() : "";
if (strcmp(old_hash, new_hash) != 0) {
- need_update_mesh = true;
+ need_update_geometry = true;
}
}
@@ -308,8 +319,11 @@ void Shader::tag_update(Scene *scene)
* has use_mis set to false. We are quite close to release now, so
* better to be safe.
*/
- if (this == scene->default_background && scene->light_manager->has_background_light(scene)) {
- scene->light_manager->need_update = true;
+ if (this == scene->background->get_shader(scene)) {
+ scene->light_manager->need_update_background = true;
+ if (scene->light_manager->has_background_light(scene)) {
+ scene->light_manager->need_update = true;
+ }
}
/* quick detection of which kind of shaders we have to avoid loading
@@ -337,15 +351,16 @@ void Shader::tag_update(Scene *scene)
}
/* compare if the attributes changed, mesh manager will check
- * need_update_mesh, update the relevant meshes and clear it. */
+ * need_update_geometry, update the relevant meshes and clear it. */
if (attributes.modified(prev_attributes)) {
- need_update_mesh = true;
- scene->mesh_manager->need_update = true;
+ need_update_geometry = true;
+ scene->geometry_manager->need_update = true;
}
- if (has_volume != prev_has_volume) {
- scene->mesh_manager->need_flags_update = true;
+ if (has_volume != prev_has_volume || volume_step_rate != prev_volume_step_rate) {
+ scene->geometry_manager->need_flags_update = true;
scene->object_manager->need_flags_update = true;
+ prev_volume_step_rate = volume_step_rate;
}
}
@@ -405,7 +420,7 @@ ShaderManager::~ShaderManager()
{
}
-ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
+ShaderManager *ShaderManager::create(int shadingsystem)
{
ShaderManager *manager;
@@ -421,8 +436,6 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
manager = new SVMShaderManager();
}
- add_default(scene);
-
return manager;
}
@@ -461,8 +474,12 @@ int ShaderManager::get_shader_id(Shader *shader, bool smooth)
return id;
}
-void ShaderManager::device_update_shaders_used(Scene *scene)
+void ShaderManager::update_shaders_used(Scene *scene)
{
+ if (!need_update) {
+ return;
+ }
+
/* figure out which shaders are in use, so SVM/OSL can skip compiling them
* for speed and avoid loading image textures into memory */
uint id = 0;
@@ -479,8 +496,8 @@ void ShaderManager::device_update_shaders_used(Scene *scene)
if (scene->background->shader)
scene->background->shader->used = true;
- foreach (Mesh *mesh, scene->meshes)
- foreach (Shader *shader, mesh->used_shaders)
+ foreach (Geometry *geom, scene->geometry)
+ foreach (Shader *shader, geom->used_shaders)
shader->used = true;
foreach (Light *light, scene->lights)
@@ -521,10 +538,12 @@ void ShaderManager::device_update_common(Device *device,
/* in this case we can assume transparent surface */
if (shader->has_volume_connected && !shader->has_surface)
flag |= SD_HAS_ONLY_VOLUME;
- if (shader->heterogeneous_volume && shader->has_volume_spatial_varying)
- flag |= SD_HETEROGENEOUS_VOLUME;
- if (shader->has_attribute_dependency)
- flag |= SD_NEED_ATTRIBUTES;
+ if (shader->has_volume) {
+ if (shader->heterogeneous_volume && shader->has_volume_spatial_varying)
+ flag |= SD_HETEROGENEOUS_VOLUME;
+ }
+ if (shader->has_volume_attribute_dependency)
+ flag |= SD_NEED_VOLUME_ATTRIBUTES;
if (shader->has_bssrdf_bump)
flag |= SD_HAS_BSSRDF_BUMP;
if (device->info.has_volume_decoupled) {
@@ -613,9 +632,27 @@ void ShaderManager::add_default(Scene *scene)
Shader *shader = new Shader();
shader->name = "default_surface";
- shader->graph = graph;
+ shader->set_graph(graph);
scene->shaders.push_back(shader);
scene->default_surface = shader;
+ shader->tag_update(scene);
+ }
+
+ /* default volume */
+ {
+ ShaderGraph *graph = new ShaderGraph();
+
+ PrincipledVolumeNode *principled = new PrincipledVolumeNode();
+ graph->add(principled);
+
+ graph->connect(principled->output("Volume"), graph->output()->input("Volume"));
+
+ Shader *shader = new Shader();
+ shader->name = "default_volume";
+ shader->set_graph(graph);
+ scene->shaders.push_back(shader);
+ scene->default_volume = shader;
+ shader->tag_update(scene);
}
/* default light */
@@ -631,9 +668,10 @@ void ShaderManager::add_default(Scene *scene)
Shader *shader = new Shader();
shader->name = "default_light";
- shader->graph = graph;
+ shader->set_graph(graph);
scene->shaders.push_back(shader);
scene->default_light = shader;
+ shader->tag_update(scene);
}
/* default background */
@@ -642,9 +680,10 @@ void ShaderManager::add_default(Scene *scene)
Shader *shader = new Shader();
shader->name = "default_background";
- shader->graph = graph;
+ shader->set_graph(graph);
scene->shaders.push_back(shader);
scene->default_background = shader;
+ shader->tag_update(scene);
}
/* default empty */
@@ -653,9 +692,10 @@ void ShaderManager::add_default(Scene *scene)
Shader *shader = new Shader();
shader->name = "default_empty";
- shader->graph = graph;
+ shader->set_graph(graph);
scene->shaders.push_back(shader);
scene->default_empty = shader;
+ shader->tag_update(scene);
}
}
@@ -694,6 +734,10 @@ void ShaderManager::get_requested_features(Scene *scene,
requested_features->nodes_features = 0;
for (int i = 0; i < scene->shaders.size(); i++) {
Shader *shader = scene->shaders[i];
+ if (!shader->used) {
+ continue;
+ }
+
/* Gather requested features from all the nodes from the graph nodes. */
get_requested_graph_features(shader->graph, requested_features);
ShaderNode *output_node = shader->graph->output();
@@ -701,6 +745,8 @@ void ShaderManager::get_requested_features(Scene *scene,
requested_features->nodes_features |= NODE_FEATURE_BUMP;
if (shader->displacement_method == DISPLACE_BOTH) {
requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE;
+ requested_features->max_nodes_group = max(requested_features->max_nodes_group,
+ NODE_GROUP_LEVEL_1);
}
}
/* On top of volume nodes, also check if we need volume sampling because
@@ -717,6 +763,8 @@ void ShaderManager::free_memory()
#ifdef WITH_OSL
OSLShaderManager::free_memory();
#endif
+
+ ColorSpaceManager::free_memory();
}
float ShaderManager::linear_rgb_to_gray(float3 c)
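
The AOV early-out added to Shader::is_constant_emission() above means the constant-emission shortcut (using color * strength directly during light sampling) is only attempted when the graph contains no AOV output node. A hedged sketch of just that check, with a stand-in node-kind enum instead of the real ShaderGraph types:

#include <vector>

/* Illustrative stand-in for the graph node list; in Cycles the check walks
 * graph->nodes looking for SHADER_SPECIAL_TYPE_OUTPUT_AOV. */
enum class NodeKind { Emission, OutputAOV, Other };

static bool may_use_constant_emission(const std::vector<NodeKind> &nodes)
{
  for (NodeKind kind : nodes) {
    if (kind == NodeKind::OutputAOV) {
      return false; /* AOVs must be written, so the shader cannot be skipped. */
    }
  }
  return true; /* Safe to try the constant-emission shortcut. */
}
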
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 600b0cc59d4..993b467b396 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -23,8 +23,8 @@
# include <OSL/oslexec.h>
#endif
-#include "render/attribute.h"
#include "kernel/kernel_types.h"
+#include "render/attribute.h"
#include "graph/node.h"
@@ -92,11 +92,12 @@ class Shader : public Node {
bool heterogeneous_volume;
VolumeSampling volume_sampling_method;
int volume_interpolation_method;
+ float volume_step_rate;
+ float prev_volume_step_rate;
/* synchronization */
bool need_update;
- bool need_update_mesh;
- bool need_sync_object;
+ bool need_update_geometry;
/* If the shader has only volume components, the surface is assumed to
* be transparent.
@@ -118,8 +119,7 @@ class Shader : public Node {
bool has_bssrdf_bump;
bool has_surface_spatial_varying;
bool has_volume_spatial_varying;
- bool has_object_dependency;
- bool has_attribute_dependency;
+ bool has_volume_attribute_dependency;
bool has_integrator_dependency;
/* displacement */
@@ -143,8 +143,10 @@ class Shader : public Node {
Shader();
~Shader();
- /* Checks whether the shader consists of just a emission node with fixed inputs that's connected directly to the output.
- * If yes, it sets the content of emission to the constant value (color * strength), which is then used for speeding up light evaluation. */
+ /* Checks whether the shader consists of just an emission node with fixed inputs that's connected
+ * directly to the output.
+ * If yes, it sets the content of emission to the constant value (color * strength), which is
+ * then used for speeding up light evaluation. */
bool is_constant_emission(float3 *emission);
void set_graph(ShaderGraph *graph);
@@ -161,7 +163,7 @@ class ShaderManager {
public:
bool need_update;
- static ShaderManager *create(Scene *scene, int shadingsystem);
+ static ShaderManager *create(int shadingsystem);
virtual ~ShaderManager();
virtual void reset(Scene *scene) = 0;
@@ -178,7 +180,6 @@ class ShaderManager {
Progress &progress) = 0;
virtual void device_free(Device *device, DeviceScene *dscene, Scene *scene) = 0;
- void device_update_shaders_used(Scene *scene);
void device_update_common(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
void device_free_common(Device *device, DeviceScene *dscene, Scene *scene);
@@ -194,6 +195,7 @@ class ShaderManager {
static void add_default(Scene *scene);
/* Selective nodes compilation. */
+ void update_shaders_used(Scene *scene);
void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features);
static void free_memory();
diff --git a/intern/cycles/render/sobol.cpp b/intern/cycles/render/sobol.cpp
index 487599476d4..c821249b239 100644
--- a/intern/cycles/render/sobol.cpp
+++ b/intern/cycles/render/sobol.cpp
@@ -54,15 +54,15 @@ CCL_NAMESPACE_BEGIN
#define SOBOL_MAX_NUMBER 32
typedef struct SobolDirectionNumbers {
- uint d, s, a;
- uint m[SOBOL_MAX_NUMBER];
+ uint d, s, a;
+ uint m[SOBOL_MAX_NUMBER];
} SobolDirectionNumbers;
/* Note: this file is skipped by clang-format. */
/* Keep simple alignment. */
/* clang-format off */
-static SobolDirectionNumbers SOBOL_NUMBERS[SOBOL_MAX_DIMENSIONS - 1] = {
+static const SobolDirectionNumbers SOBOL_NUMBERS[SOBOL_MAX_DIMENSIONS - 1] = {
{2, 1, 0, {1}},
{3, 2, 1, {1, 3}},
{4, 3, 1, {1, 3, 1}},
@@ -21268,40 +21268,40 @@ static SobolDirectionNumbers SOBOL_NUMBERS[SOBOL_MAX_DIMENSIONS - 1] = {
void sobol_generate_direction_vectors(uint vectors[][SOBOL_BITS], int dimensions)
{
- assert(dimensions <= SOBOL_MAX_DIMENSIONS);
+ assert(dimensions <= SOBOL_MAX_DIMENSIONS);
- const uint L = SOBOL_BITS;
+ const uint L = SOBOL_BITS;
- /* first dimension is exception */
- uint *v = vectors[0];
+ /* first dimension is exception */
+ uint *v = vectors[0];
- for(uint i = 0; i < L; i++)
- v[i] = 1 << (31-i); // all m's = 1
+ for (uint i = 0; i < L; i++)
+ v[i] = 1 << (31 - i); // all m's = 1
- for(int dim = 1; dim < dimensions; dim++) {
- SobolDirectionNumbers *numbers = &SOBOL_NUMBERS[dim-1];
- uint s = numbers->s;
- uint a = numbers->a;
- uint *m = numbers->m;
+ for (int dim = 1; dim < dimensions; dim++) {
+ const SobolDirectionNumbers *numbers = &SOBOL_NUMBERS[dim - 1];
+ const uint s = numbers->s;
+ const uint a = numbers->a;
+ const uint *m = numbers->m;
- v = vectors[dim];
+ v = vectors[dim];
- if(L <= s) {
- for(uint i = 0; i < L; i++)
- v[i] = m[i] << (31-i);
- }
- else {
- for(uint i = 0; i < s; i++)
- v[i] = m[i] << (31-i);
+ if (L <= s) {
+ for (uint i = 0; i < L; i++)
+ v[i] = m[i] << (31 - i);
+ }
+ else {
+ for (uint i = 0; i < s; i++)
+ v[i] = m[i] << (31 - i);
- for(uint i = s; i < L; i++) {
- v[i] = v[i-s] ^ (v[i-s] >> s);
+ for (uint i = s; i < L; i++) {
+ v[i] = v[i - s] ^ (v[i - s] >> s);
- for(uint k = 1; k < s; k++)
- v[i] ^= (((a >> (s-1-k)) & 1) * v[i-k]);
- }
- }
- }
+ for (uint k = 1; k < s; k++)
+ v[i] ^= (((a >> (s - 1 - k)) & 1) * v[i - k]);
+ }
+ }
+ }
}
CCL_NAMESPACE_END
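
The direction vectors built above follow the standard Sobol construction, with each v[i] stored so its leading bit sits at bit 31. As a generic illustration of how such vectors are typically consumed (this is not the Cycles kernel code), the index-th value of one dimension is the XOR of v[j] over the set bits of the index:

#include <cstdint>

/* Generic Sobol evaluation from 32 direction vectors of one dimension:
 * XOR together v[j] for every set bit j of the sample index, then map the
 * 32-bit result to [0, 1). */
static float sobol_sample(const uint32_t v[32], uint32_t index)
{
  uint32_t result = 0;
  for (uint32_t j = 0; index; index >>= 1, j++) {
    if (index & 1) {
      result ^= v[j];
    }
  }
  return result * (1.0f / 4294967296.0f); /* 2^-32 */
}
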
diff --git a/intern/cycles/render/stats.h b/intern/cycles/render/stats.h
index f1bf1903483..e45403a3754 100644
--- a/intern/cycles/render/stats.h
+++ b/intern/cycles/render/stats.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
* semantic around the units of size, it just should be the same for all
* entries.
*
- * This is a generic entry foi all size-related statistics, which helps
+ * This is a generic entry for all size-related statistics, which helps
* avoiding duplicating code for things like sorting.
*/
class NamedSizeEntry {
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index d8e3e24f39e..88714e20a90 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -15,6 +15,8 @@
*/
#include "device/device.h"
+
+#include "render/background.h"
#include "render/graph.h"
#include "render/light.h"
#include "render/mesh.h"
@@ -23,8 +25,8 @@
#include "render/shader.h"
#include "render/svm.h"
-#include "util/util_logging.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
#include "util/util_progress.h"
#include "util/util_task.h"
@@ -47,46 +49,23 @@ void SVMShaderManager::reset(Scene * /*scene*/)
void SVMShaderManager::device_update_shader(Scene *scene,
Shader *shader,
Progress *progress,
- array<int4> *global_svm_nodes)
+ array<int4> *svm_nodes)
{
if (progress->get_cancel()) {
return;
}
assert(shader->graph);
- array<int4> svm_nodes;
- svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
+ svm_nodes->push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
SVMCompiler::Summary summary;
- SVMCompiler compiler(scene->shader_manager, scene->image_manager, scene->light_manager);
- compiler.background = (shader == scene->default_background);
- compiler.compile(scene, shader, svm_nodes, 0, &summary);
+ SVMCompiler compiler(scene);
+ compiler.background = (shader == scene->background->get_shader(scene));
+ compiler.compile(shader, *svm_nodes, 0, &summary);
VLOG(2) << "Compilation summary:\n"
<< "Shader name: " << shader->name << "\n"
<< summary.full_report();
-
- nodes_lock_.lock();
- if (shader->use_mis && shader->has_surface_emission) {
- scene->light_manager->need_update = true;
- }
-
- /* The copy needs to be done inside the lock, if another thread resizes the array
- * while memcpy is running, it'll be copying into possibly invalid/freed ram.
- */
- size_t global_nodes_size = global_svm_nodes->size();
- global_svm_nodes->resize(global_nodes_size + svm_nodes.size());
-
- /* Offset local SVM nodes to a global address space. */
- int4 &jump_node = (*global_svm_nodes)[shader->id];
- jump_node.y = svm_nodes[0].y + global_nodes_size - 1;
- jump_node.z = svm_nodes[0].z + global_nodes_size - 1;
- jump_node.w = svm_nodes[0].w + global_nodes_size - 1;
- /* Copy new nodes to global storage. */
- memcpy(&(*global_svm_nodes)[global_nodes_size],
- &svm_nodes[1],
- sizeof(int4) * (svm_nodes.size() - 1));
- nodes_lock_.unlock();
}
void SVMShaderManager::device_update(Device *device,
@@ -97,30 +76,25 @@ void SVMShaderManager::device_update(Device *device,
if (!need_update)
return;
- VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+ const int num_shaders = scene->shaders.size();
+
+ VLOG(1) << "Total " << num_shaders << " shaders.";
double start_time = time_dt();
/* test if we need to update */
device_free(device, dscene, scene);
- /* determine which shaders are in use */
- device_update_shaders_used(scene);
-
- /* svm_nodes */
- array<int4> svm_nodes;
- size_t i;
-
- for (i = 0; i < scene->shaders.size(); i++) {
- svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
- }
-
+ /* Build all shaders. */
TaskPool task_pool;
- foreach (Shader *shader, scene->shaders) {
- task_pool.push(
- function_bind(
- &SVMShaderManager::device_update_shader, this, scene, shader, &progress, &svm_nodes),
- false);
+ vector<array<int4>> shader_svm_nodes(num_shaders);
+ for (int i = 0; i < num_shaders; i++) {
+ task_pool.push(function_bind(&SVMShaderManager::device_update_shader,
+ this,
+ scene,
+ scene->shaders[i],
+ &progress,
+ &shader_svm_nodes[i]));
}
task_pool.wait_work();
@@ -128,20 +102,60 @@ void SVMShaderManager::device_update(Device *device,
return;
}
- dscene->svm_nodes.steal_data(svm_nodes);
- dscene->svm_nodes.copy_to_device();
+ /* The global node list contains a jump table (one node per shader)
+ * followed by the nodes of all shaders. */
+ int svm_nodes_size = num_shaders;
+ for (int i = 0; i < num_shaders; i++) {
+ /* Since we're not copying the local jump node, the size ends up being one node lower. */
+ svm_nodes_size += shader_svm_nodes[i].size() - 1;
+ }
+
+ int4 *svm_nodes = dscene->svm_nodes.alloc(svm_nodes_size);
- for (i = 0; i < scene->shaders.size(); i++) {
+ int node_offset = num_shaders;
+ for (int i = 0; i < num_shaders; i++) {
Shader *shader = scene->shaders[i];
+
shader->need_update = false;
+ if (shader->use_mis && shader->has_surface_emission) {
+ scene->light_manager->need_update = true;
+ }
+
+ /* Update the global jump table.
+ * Each compiled shader starts with a jump node that has offsets local
+ * to the shader, so copy those and add the offset into the global node list. */
+ int4 &global_jump_node = svm_nodes[shader->id];
+ int4 &local_jump_node = shader_svm_nodes[i][0];
+
+ global_jump_node.x = NODE_SHADER_JUMP;
+ global_jump_node.y = local_jump_node.y - 1 + node_offset;
+ global_jump_node.z = local_jump_node.z - 1 + node_offset;
+ global_jump_node.w = local_jump_node.w - 1 + node_offset;
+
+ node_offset += shader_svm_nodes[i].size() - 1;
+ }
+
+ /* Copy the nodes of each shader into the correct location. */
+ svm_nodes += num_shaders;
+ for (int i = 0; i < num_shaders; i++) {
+ int shader_size = shader_svm_nodes[i].size() - 1;
+
+ memcpy(svm_nodes, &shader_svm_nodes[i][1], sizeof(int4) * shader_size);
+ svm_nodes += shader_size;
+ }
+
+ if (progress.get_cancel()) {
+ return;
}
+ dscene->svm_nodes.copy_to_device();
+
device_update_common(device, dscene, scene, progress);
need_update = false;
- VLOG(1) << "Shader manager updated " << scene->shaders.size() << " shaders in "
- << time_dt() - start_time << " seconds.";
+ VLOG(1) << "Shader manager updated " << num_shaders << " shaders in " << time_dt() - start_time
+ << " seconds.";
}
void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
@@ -153,13 +167,8 @@ void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
/* Graph Compiler */
-SVMCompiler::SVMCompiler(ShaderManager *shader_manager_,
- ImageManager *image_manager_,
- LightManager *light_manager_)
+SVMCompiler::SVMCompiler(Scene *scene) : scene(scene)
{
- shader_manager = shader_manager_;
- image_manager = image_manager_;
- light_manager = light_manager_;
max_stack_use = 0;
current_type = SHADER_TYPE_SURFACE;
current_shader = NULL;
@@ -392,12 +401,12 @@ void SVMCompiler::add_node(const float4 &f)
uint SVMCompiler::attribute(ustring name)
{
- return shader_manager->get_attribute_id(name);
+ return scene->shader_manager->get_attribute_id(name);
}
uint SVMCompiler::attribute(AttributeStandard std)
{
- return shader_manager->get_attribute_id(std);
+ return scene->shader_manager->get_attribute_id(std);
}
uint SVMCompiler::attribute_standard(ustring name)
@@ -434,14 +443,8 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done)
else if (current_type == SHADER_TYPE_VOLUME) {
if (node->has_spatial_varying())
current_shader->has_volume_spatial_varying = true;
- }
-
- if (node->has_object_dependency()) {
- current_shader->has_object_dependency = true;
- }
-
- if (node->has_attribute_dependency()) {
- current_shader->has_attribute_dependency = true;
+ if (node->has_attribute_dependency())
+ current_shader->has_volume_attribute_dependency = true;
}
if (node->has_integrator_dependency()) {
@@ -538,6 +541,24 @@ void SVMCompiler::generated_shared_closure_nodes(ShaderNode *root_node,
}
}
+void SVMCompiler::generate_aov_node(ShaderNode *node, CompilerState *state)
+{
+ /* execute dependencies for node */
+ foreach (ShaderInput *in, node->inputs) {
+ if (in->link != NULL) {
+ ShaderNodeSet dependencies;
+ find_dependencies(dependencies, state->nodes_done, in);
+ generate_svm_nodes(dependencies, state);
+ }
+ }
+
+ /* compile node itself */
+ generate_node(node, state->nodes_done);
+
+ state->nodes_done.insert(node);
+ state->nodes_done_flag[node->id] = true;
+}
+
void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
ShaderNode *node,
CompilerState *state)
@@ -687,21 +708,21 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
current_graph = graph;
/* get input in output node */
- ShaderNode *node = graph->output();
+ ShaderNode *output = graph->output();
ShaderInput *clin = NULL;
switch (type) {
case SHADER_TYPE_SURFACE:
- clin = node->input("Surface");
+ clin = output->input("Surface");
break;
case SHADER_TYPE_VOLUME:
- clin = node->input("Volume");
+ clin = output->input("Volume");
break;
case SHADER_TYPE_DISPLACEMENT:
- clin = node->input("Displacement");
+ clin = output->input("Displacement");
break;
case SHADER_TYPE_BUMP:
- clin = node->input("Normal");
+ clin = output->input("Normal");
break;
default:
assert(0);
@@ -712,10 +733,10 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
memset((void *)&active_stack, 0, sizeof(active_stack));
current_svm_nodes.clear();
- foreach (ShaderNode *node_iter, graph->nodes) {
- foreach (ShaderInput *input, node_iter->inputs)
+ foreach (ShaderNode *node, graph->nodes) {
+ foreach (ShaderInput *input, node->inputs)
input->stack_offset = SVM_STACK_INVALID;
- foreach (ShaderOutput *output, node_iter->outputs)
+ foreach (ShaderOutput *output, node->outputs)
output->stack_offset = SVM_STACK_INVALID;
}
@@ -729,6 +750,7 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
}
if (shader->used) {
+ CompilerState state(graph);
if (clin->link) {
bool generate = false;
@@ -753,13 +775,36 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
}
if (generate) {
- CompilerState state(graph);
generate_multi_closure(clin->link->parent, clin->link->parent, &state);
}
}
/* compile output node */
- node->compile(*this);
+ output->compile(*this);
+
+ if (type == SHADER_TYPE_SURFACE) {
+ vector<OutputAOVNode *> aov_outputs;
+ foreach (ShaderNode *node, graph->nodes) {
+ if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) {
+ OutputAOVNode *aov_node = static_cast<OutputAOVNode *>(node);
+ if (aov_node->slot >= 0) {
+ aov_outputs.push_back(aov_node);
+ }
+ }
+ }
+ if (aov_outputs.size() > 0) {
+ /* AOV passes are only written if the object is directly visible, so
+ * there is no point in evaluating all the nodes generated only for the
+ * AOV outputs if that's not the case. Therefore, we insert
+ * NODE_AOV_START into the shader before the AOV-only nodes are
+ * generated which tells the kernel that it can stop evaluation
+ * early if AOVs will not be written. */
+ add_node(NODE_AOV_START, 0, 0, 0);
+ foreach (OutputAOVNode *node, aov_outputs) {
+ generate_aov_node(node, &state);
+ }
+ }
+ }
}
/* add node to restore state after bump shader has finished */
@@ -773,14 +818,14 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
compile_failed = false;
}
- /* for bump shaders we fall thru to the surface shader, but if this is any other kind of shader it ends here */
+ /* for bump shaders we fall thru to the surface shader, but if this is any other kind of shader
+ * it ends here */
if (type != SHADER_TYPE_BUMP) {
add_node(NODE_END, 0, 0, 0);
}
}
-void SVMCompiler::compile(
- Scene *scene, Shader *shader, array<int4> &svm_nodes, int index, Summary *summary)
+void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Summary *summary)
{
/* copy graph for shader with bump mapping */
ShaderNode *output = shader->graph->output();
@@ -812,8 +857,7 @@ void SVMCompiler::compile(
shader->has_displacement = false;
shader->has_surface_spatial_varying = false;
shader->has_volume_spatial_varying = false;
- shader->has_object_dependency = false;
- shader->has_attribute_dependency = false;
+ shader->has_volume_attribute_dependency = false;
shader->has_integrator_dependency = false;
/* generate bump shader */
@@ -828,7 +872,8 @@ void SVMCompiler::compile(
{
scoped_timer timer((summary != NULL) ? &summary->time_generate_surface : NULL);
compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
- /* only set jump offset if there's no bump shader, as the bump shader will fall thru to this one if it exists */
+ /* only set jump offset if there's no bump shader, as the bump shader will fall thru to this
+ * one if it exists */
if (!has_bump) {
svm_nodes[index].y = svm_nodes.size();
}
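
The device_update() rewrite above lays the global node array out as a jump table with one entry per shader, followed by each shader's nodes minus its local jump node; a local offset L is rebased to L - 1 + node_offset. A small worked example of that arithmetic, with made-up shader sizes (illustrative only):

#include <cstdio>
#include <vector>

int main()
{
  /* Per-shader node counts as produced by device_update_shader(),
   * including each shader's local jump node. Sizes here are made up. */
  const std::vector<int> shader_node_counts = {6, 9};
  const int num_shaders = (int)shader_node_counts.size();

  /* Shader nodes start right after the jump table. */
  int node_offset = num_shaders;
  for (int i = 0; i < num_shaders; i++) {
    /* A local jump offset of 1 (the first node after the local jump node)
     * lands at 1 - 1 + node_offset in the global array. */
    printf("shader %d starts at global node %d\n", i, 1 - 1 + node_offset);
    node_offset += shader_node_counts[i] - 1; /* Local jump node is dropped. */
  }
  return 0;
}
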
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index d6964fb158b..61923fc40ac 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -50,13 +50,10 @@ class SVMShaderManager : public ShaderManager {
void device_free(Device *device, DeviceScene *dscene, Scene *scene);
protected:
- /* Lock used to synchronize threaded nodes compilation. */
- thread_spin_lock nodes_lock_;
-
void device_update_shader(Scene *scene,
Shader *shader,
Progress *progress,
- array<int4> *global_svm_nodes);
+ array<int4> *svm_nodes);
};
/* Graph Compiler */
@@ -96,11 +93,8 @@ class SVMCompiler {
string full_report() const;
};
- SVMCompiler(ShaderManager *shader_manager,
- ImageManager *image_manager,
- LightManager *light_manager);
- void compile(
- Scene *scene, Shader *shader, array<int4> &svm_nodes, int index, Summary *summary = NULL);
+ SVMCompiler(Scene *scene);
+ void compile(Shader *shader, array<int4> &svm_nodes, int index, Summary *summary = NULL);
int stack_assign(ShaderOutput *output);
int stack_assign(ShaderInput *input);
@@ -129,9 +123,8 @@ class SVMCompiler {
return current_type;
}
- ImageManager *image_manager;
- ShaderManager *shader_manager;
- LightManager *light_manager;
+ Scene *scene;
+ ShaderGraph *current_graph;
bool background;
protected:
@@ -207,6 +200,7 @@ class SVMCompiler {
ShaderInput *input,
ShaderNode *skip_node = NULL);
void generate_node(ShaderNode *node, ShaderNodeSet &done);
+ void generate_aov_node(ShaderNode *node, CompilerState *state);
void generate_closure_node(ShaderNode *node, CompilerState *state);
void generated_shared_closure_nodes(ShaderNode *root_node,
ShaderNode *node,
@@ -223,7 +217,6 @@ class SVMCompiler {
array<int4> current_svm_nodes;
ShaderType current_type;
Shader *current_shader;
- ShaderGraph *current_graph;
Stack active_stack;
int max_stack_use;
uint mix_weight_offset;
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index d88925939e3..270e05abe29 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
+#include "render/tables.h"
#include "device/device.h"
#include "render/scene.h"
-#include "render/tables.h"
#include "util/util_logging.h"
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 12b59bb0aeb..3ed2959ae59 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -18,6 +18,7 @@
#define __TABLES_H__
#include "util/util_list.h"
+#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 3148b5ef664..375c9fd8e09 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -101,6 +101,7 @@ TileManager::TileManager(bool progressive_,
tile_order = tile_order_;
start_resolution = start_resolution_;
pixel_size = pixel_size_;
+ slice_overlap = 0;
num_samples = num_samples_;
num_devices = num_devices_;
preserve_tile_device = preserve_tile_device_;
@@ -170,8 +171,9 @@ void TileManager::set_samples(int num_samples_)
}
else {
uint64_t pixel_samples = 0;
- /* While rendering in the viewport, the initial preview resolution is increased to the native resolution
- * before the actual rendering begins. Therefore, additional pixel samples will be rendered. */
+ /* While rendering in the viewport, the initial preview resolution is increased to the native
+ * resolution before the actual rendering begins. Therefore, additional pixel samples will be
+ * rendered. */
int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size);
while (divider > pixel_size) {
int image_w = max(1, params.width / divider);
@@ -190,8 +192,9 @@ void TileManager::set_samples(int num_samples_)
}
}
-/* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render device.
- * If sliced is true, slice image into as much pieces as how many devices are rendering this image. */
+/* If sliced is false, splits the image into tiles and assigns an equal number of tiles to every
+ * render device. If sliced is true, slices the image into as many pieces as there are devices
+ * rendering this image. */
int TileManager::gen_tiles(bool sliced)
{
int resolution = state.resolution_divider;
@@ -199,8 +202,7 @@ int TileManager::gen_tiles(bool sliced)
int image_h = max(1, params.height / resolution);
int2 center = make_int2(image_w / 2, image_h / 2);
- int num_logical_devices = preserve_tile_device ? num_devices : 1;
- int num = min(image_h, num_logical_devices);
+ int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1;
int slice_num = sliced ? num : 1;
int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
@@ -214,7 +216,7 @@ int TileManager::gen_tiles(bool sliced)
tile_list = state.render_tiles.begin();
if (tile_order == TILE_HILBERT_SPIRAL) {
- assert(!sliced);
+ assert(!sliced && slice_overlap == 0);
int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
state.tiles.resize(tile_w * tile_h);
@@ -255,7 +257,8 @@ int TileManager::gen_tiles(bool sliced)
}
int2 pos = block * block_size + tile * tile_size + offset;
- /* Only add tiles which are in the image (tiles outside of the image can be generated since the spiral is always square). */
+ /* Only add tiles which are in the image (tiles outside of the image can be generated since
+ * the spiral is always square). */
if (pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
int w = min(tile_size.x, image_w - pos.x);
int h = min(tile_size.y, image_h - pos.y);
@@ -316,6 +319,12 @@ int TileManager::gen_tiles(bool sliced)
int slice_h = (slice == slice_num - 1) ? image_h - slice * (image_h / slice_num) :
image_h / slice_num;
+ if (slice_overlap != 0) {
+ int slice_y_offset = max(slice_y - slice_overlap, 0);
+ slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset;
+ slice_y = slice_y_offset;
+ }
+
int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
int tiles_per_device = divide_up(tile_w * tile_h, num);
@@ -336,7 +345,8 @@ int TileManager::gen_tiles(bool sliced)
cur_tiles++;
if (cur_tiles == tiles_per_device) {
- /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */
+ /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that
+ * case. */
if (tile_order != TILE_BOTTOM_TO_TOP) {
tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
}
@@ -359,6 +369,7 @@ void TileManager::gen_render_tiles()
{
/* Regenerate just the render tiles for progressive render. */
foreach (Tile &tile, state.tiles) {
+ tile.state = Tile::RENDER;
state.render_tiles[tile.device].push_back(tile.index);
}
}
@@ -382,23 +393,36 @@ void TileManager::set_tiles()
int TileManager::get_neighbor_index(int index, int neighbor)
{
- static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
+ /* Neighbor indices:
+ * 0 1 2
+ * 3 4 5
+ * 6 7 8
+ */
+ static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1};
+ static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1};
int resolution = state.resolution_divider;
int image_w = max(1, params.width / resolution);
int image_h = max(1, params.height / resolution);
+
+ int num = min(image_h, num_devices);
+ int slice_num = !background ? num : 1;
+ int slice_h = image_h / slice_num;
+
int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
- int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
+ int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
- int nx = state.tiles[index].x / tile_size.x + dx[neighbor],
- ny = state.tiles[index].y / tile_size.y + dy[neighbor];
- if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h)
+ /* Tiles in the state tile list are always indexed from left to right, top to bottom. */
+ int nx = (index % tile_w) + dx[neighbor];
+ int ny = (index / tile_w) + dy[neighbor];
+ if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num)
return -1;
return ny * state.tile_stride + nx;
}
-/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */
+/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state
+ * min_state. */
bool TileManager::check_neighbor_state(int index, Tile::State min_state)
{
if (index < 0 || state.tiles[index].state < min_state) {
@@ -415,24 +439,22 @@ bool TileManager::check_neighbor_state(int index, Tile::State min_state)
return true;
}
-/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */
-bool TileManager::finish_tile(int index, bool &delete_tile)
+/* Returns whether the tile should be written (and freed if no denoising is used) instead of
+ * updating. */
+bool TileManager::finish_tile(const int index, const bool need_denoise, bool &delete_tile)
{
delete_tile = false;
- if (progressive) {
- return true;
- }
-
switch (state.tiles[index].state) {
case Tile::RENDER: {
- if (!schedule_denoising) {
+ if (!(schedule_denoising && need_denoise)) {
state.tiles[index].state = Tile::DONE;
- delete_tile = true;
+ delete_tile = !progressive;
return true;
}
state.tiles[index].state = Tile::RENDERED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */
+ /* For each neighbor and the tile itself, check whether all of its neighbors have been
+ * rendered. If yes, it can be denoised. */
for (int neighbor = 0; neighbor < 9; neighbor++) {
int nindex = get_neighbor_index(index, neighbor);
if (check_neighbor_state(nindex, Tile::RENDERED)) {
@@ -444,19 +466,24 @@ bool TileManager::finish_tile(int index, bool &delete_tile)
}
case Tile::DENOISE: {
state.tiles[index].state = Tile::DENOISED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */
+ /* For each neighbor and the tile itself, check whether all of its neighbors have been
+ * denoised. If yes, it can be freed. */
for (int neighbor = 0; neighbor < 9; neighbor++) {
int nindex = get_neighbor_index(index, neighbor);
if (check_neighbor_state(nindex, Tile::DENOISED)) {
state.tiles[nindex].state = Tile::DONE;
- /* It can happen that the tile just finished denoising and already can be freed here.
- * However, in that case it still has to be written before deleting, so we can't delete it yet. */
- if (neighbor == 8) {
- delete_tile = true;
- }
- else {
- delete state.tiles[nindex].buffers;
- state.tiles[nindex].buffers = NULL;
+ /* Do not delete finished tiles in progressive mode. */
+ if (!progressive) {
+ /* It can happen that the tile just finished denoising and already can be freed here.
+ * However, in that case it still has to be written before deleting, so we can't delete
+ * it yet. */
+ if (neighbor == 4) {
+ delete_tile = true;
+ }
+ else {
+ delete state.tiles[nindex].buffers;
+ state.tiles[nindex].buffers = NULL;
+ }
}
}
}
@@ -468,27 +495,65 @@ bool TileManager::finish_tile(int index, bool &delete_tile)
}
}
-bool TileManager::next_tile(Tile *&tile, int device)
+bool TileManager::next_tile(Tile *&tile, int device, uint tile_types)
{
- int logical_device = preserve_tile_device ? device : 0;
+ /* Preserve device if requested, unless this is a separate denoising device that just wants to
+ * grab any available tile. */
+ const bool preserve_device = preserve_tile_device && device < num_devices;
- if (logical_device >= state.render_tiles.size())
- return false;
+ if (tile_types & RenderTile::DENOISE) {
+ int tile_index = -1;
+ int logical_device = preserve_device ? device : 0;
- if (!state.denoising_tiles[logical_device].empty()) {
- int idx = state.denoising_tiles[logical_device].front();
- state.denoising_tiles[logical_device].pop_front();
- tile = &state.tiles[idx];
- return true;
+ while (logical_device < state.denoising_tiles.size()) {
+ if (state.denoising_tiles[logical_device].empty()) {
+ if (preserve_device) {
+ break;
+ }
+ else {
+ logical_device++;
+ continue;
+ }
+ }
+
+ tile_index = state.denoising_tiles[logical_device].front();
+ state.denoising_tiles[logical_device].pop_front();
+ break;
+ }
+
+ if (tile_index >= 0) {
+ tile = &state.tiles[tile_index];
+ return true;
+ }
}
- if (state.render_tiles[logical_device].empty())
- return false;
+ if (tile_types & RenderTile::PATH_TRACE) {
+ int tile_index = -1;
+ int logical_device = preserve_device ? device : 0;
- int idx = state.render_tiles[logical_device].front();
- state.render_tiles[logical_device].pop_front();
- tile = &state.tiles[idx];
- return true;
+ while (logical_device < state.render_tiles.size()) {
+ if (state.render_tiles[logical_device].empty()) {
+ if (preserve_device) {
+ break;
+ }
+ else {
+ logical_device++;
+ continue;
+ }
+ }
+
+ tile_index = state.render_tiles[logical_device].front();
+ state.render_tiles[logical_device].pop_front();
+ break;
+ }
+
+ if (tile_index >= 0) {
+ tile = &state.tiles[tile_index];
+ return true;
+ }
+ }
+
+ return false;
}
bool TileManager::done()
@@ -499,6 +564,16 @@ bool TileManager::done()
(state.sample + state.num_samples >= end_sample);
}
+bool TileManager::has_tiles()
+{
+ foreach (Tile &tile, state.tiles) {
+ if (tile.state != Tile::DONE) {
+ return true;
+ }
+ }
+ return false;
+}
+
bool TileManager::next()
{
if (done())
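
The rewritten get_neighbor_index() above documents the 3x3 neighbor layout (0..8, with 4 being the tile itself) and derives the tile's grid position from its list index. A simplified version for a plain row-major grid, without the slice handling (illustrative only):

/* Neighbor layout, matching the comment above:
 *   0 1 2
 *   3 4 5
 *   6 7 8
 * Index 4 is the tile itself. Returns -1 for neighbors outside the grid. */
static int neighbor_index(int index, int neighbor, int tile_w, int tile_h)
{
  static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1};
  static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1};

  const int nx = (index % tile_w) + dx[neighbor];
  const int ny = (index / tile_w) + dy[neighbor];
  if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h) {
    return -1;
  }
  return ny * tile_w + nx;
}
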
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 017c1af0ead..4858a275d5c 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -89,6 +89,7 @@ class TileManager {
} state;
int num_samples;
+ int slice_overlap;
TileManager(bool progressive,
int num_samples,
@@ -105,15 +106,19 @@ class TileManager {
void reset(BufferParams &params, int num_samples);
void set_samples(int num_samples);
bool next();
- bool next_tile(Tile *&tile, int device = 0);
- bool finish_tile(int index, bool &delete_tile);
+ bool next_tile(Tile *&tile, int device, uint tile_types);
+ bool finish_tile(const int index, const bool need_denoise, bool &delete_tile);
bool done();
+ bool has_tiles();
void set_tile_order(TileOrder tile_order_)
{
tile_order = tile_order_;
}
+ int get_neighbor_index(int index, int neighbor);
+ bool check_neighbor_state(int index, Tile::State state);
+
/* ** Sample range rendering. ** */
/* Start sample in the range. */
@@ -160,9 +165,6 @@ class TileManager {
/* Generate tile list, return number of tiles. */
int gen_tiles(bool sliced);
void gen_render_tiles();
-
- int get_neighbor_index(int index, int neighbor);
- bool check_neighbor_state(int index, Tile::State state);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt
index f5ceaa0436d..b874c5c3c2d 100644
--- a/intern/cycles/subd/CMakeLists.txt
+++ b/intern/cycles/subd/CMakeLists.txt
@@ -19,6 +19,7 @@ set(SRC_HEADERS
subd_patch.h
subd_patch_table.h
subd_split.h
+ subd_subpatch.h
)
set(LIB
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index 6b062ecfea2..91c7f4bea05 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -38,107 +38,91 @@ EdgeDice::EdgeDice(const SubdParams &params_) : params(params_)
}
}
-void EdgeDice::reserve(int num_verts)
+void EdgeDice::reserve(int num_verts, int num_triangles)
{
Mesh *mesh = params.mesh;
vert_offset = mesh->verts.size();
tri_offset = mesh->num_triangles();
- /* todo: optimize so we can reserve in advance, this is like push_back_slow() */
- if (vert_offset + num_verts > mesh->verts.capacity()) {
- mesh->reserve_mesh(size_t((vert_offset + num_verts) * 1.2), mesh->num_triangles());
- }
-
- mesh->resize_mesh(vert_offset + num_verts, tri_offset);
+ mesh->resize_mesh(mesh->verts.size() + num_verts, mesh->num_triangles());
+ mesh->reserve_mesh(mesh->verts.size() + num_verts, mesh->num_triangles() + num_triangles);
Attribute *attr_vN = mesh->attributes.add(ATTR_STD_VERTEX_NORMAL);
- mesh_P = mesh->verts.data();
- mesh_N = attr_vN->data_float3();
+ mesh_P = mesh->verts.data() + vert_offset;
+ mesh_N = attr_vN->data_float3() + vert_offset;
+
+ params.mesh->num_subd_verts += num_verts;
}
-int EdgeDice::add_vert(Patch *patch, float2 uv)
+void EdgeDice::set_vert(Patch *patch, int index, float2 uv)
{
float3 P, N;
patch->eval(&P, NULL, NULL, &N, uv.x, uv.y);
- assert(vert_offset < params.mesh->verts.size());
-
- mesh_P[vert_offset] = P;
- mesh_N[vert_offset] = N;
- params.mesh->vert_patch_uv[vert_offset] = make_float2(uv.x, uv.y);
-
- if (params.ptex) {
- Attribute *attr_ptex_uv = params.mesh->attributes.add(ATTR_STD_PTEX_UV);
- params.mesh->attributes.resize();
+ assert(index < params.mesh->verts.size());
- float3 *ptex_uv = attr_ptex_uv->data_float3();
- ptex_uv[vert_offset] = make_float3(uv.x, uv.y, 0.0f);
- }
-
- params.mesh->num_subd_verts++;
-
- return vert_offset++;
+ mesh_P[index] = P;
+ mesh_N[index] = N;
+ params.mesh->vert_patch_uv[index + vert_offset] = make_float2(uv.x, uv.y);
}
void EdgeDice::add_triangle(Patch *patch, int v0, int v1, int v2)
{
Mesh *mesh = params.mesh;
- /* todo: optimize so we can reserve in advance, this is like push_back_slow() */
- if (mesh->triangles.size() == mesh->triangles.capacity())
- mesh->reserve_mesh(mesh->verts.size(), size_t(max(mesh->num_triangles() + 1, 1) * 1.2));
-
- mesh->add_triangle(v0, v1, v2, patch->shader, true);
+ mesh->add_triangle(v0 + vert_offset, v1 + vert_offset, v2 + vert_offset, patch->shader, true);
params.mesh->triangle_patch[params.mesh->num_triangles() - 1] = patch->patch_index;
- if (params.ptex) {
- Attribute *attr_ptex_face_id = params.mesh->attributes.add(ATTR_STD_PTEX_FACE_ID);
- params.mesh->attributes.resize();
-
- float *ptex_face_id = attr_ptex_face_id->data_float();
- ptex_face_id[tri_offset] = (float)patch->ptex_face_id();
- }
-
tri_offset++;
}
-void EdgeDice::stitch_triangles(Patch *patch, vector<int> &outer, vector<int> &inner)
+void EdgeDice::stitch_triangles(Subpatch &sub, int edge)
{
- if (inner.size() == 0 || outer.size() == 0)
+ int Mu = max(sub.edge_u0.T, sub.edge_u1.T);
+ int Mv = max(sub.edge_v0.T, sub.edge_v1.T);
+ Mu = max(Mu, 2);
+ Mv = max(Mv, 2);
+
+ int outer_T = sub.edges[edge].T;
+ int inner_T = ((edge % 2) == 0) ? Mv - 2 : Mu - 2;
+
+ if (inner_T < 0 || outer_T < 0)
return; // XXX avoid crashes for Mu or Mv == 1, missing polygons
/* stitch together two arrays of verts with triangles. at each step,
* we compare using the next verts on both sides, to find the split
* direction with the smallest diagonal, and use that in order to keep
* the triangle shape reasonable. */
- for (size_t i = 0, j = 0; i + 1 < inner.size() || j + 1 < outer.size();) {
+ for (size_t i = 0, j = 0; i < inner_T || j < outer_T;) {
int v0, v1, v2;
- v0 = inner[i];
- v1 = outer[j];
+ v0 = sub.get_vert_along_grid_edge(edge, i);
+ v1 = sub.get_vert_along_edge(edge, j);
- if (j + 1 == outer.size()) {
- v2 = inner[++i];
+ if (j == outer_T) {
+ v2 = sub.get_vert_along_grid_edge(edge, ++i);
}
- else if (i + 1 == inner.size()) {
- v2 = outer[++j];
+ else if (i == inner_T) {
+ v2 = sub.get_vert_along_edge(edge, ++j);
}
else {
/* length of diagonals */
- float len1 = len_squared(mesh_P[inner[i]] - mesh_P[outer[j + 1]]);
- float len2 = len_squared(mesh_P[outer[j]] - mesh_P[inner[i + 1]]);
+ float len1 = len_squared(mesh_P[sub.get_vert_along_grid_edge(edge, i)] -
+ mesh_P[sub.get_vert_along_edge(edge, j + 1)]);
+ float len2 = len_squared(mesh_P[sub.get_vert_along_edge(edge, j)] -
+ mesh_P[sub.get_vert_along_grid_edge(edge, i + 1)]);
/* use smallest diagonal */
if (len1 < len2)
- v2 = outer[++j];
+ v2 = sub.get_vert_along_edge(edge, ++j);
else
- v2 = inner[++i];
+ v2 = sub.get_vert_along_grid_edge(edge, ++i);
}
- add_triangle(patch, v0, v1, v2);
+ add_triangle(sub.patch, v1, v0, v2);
}
}
@@ -148,22 +132,15 @@ QuadDice::QuadDice(const SubdParams &params_) : EdgeDice(params_)
{
}
-void QuadDice::reserve(EdgeFactors &ef, int Mu, int Mv)
-{
- /* XXX need to make this also work for edge factor 0 and 1 */
- int num_verts = (ef.tu0 + ef.tu1 + ef.tv0 + ef.tv1) + (Mu - 1) * (Mv - 1);
- EdgeDice::reserve(num_verts);
-}
-
-float2 QuadDice::map_uv(SubPatch &sub, float u, float v)
+float2 QuadDice::map_uv(Subpatch &sub, float u, float v)
{
/* map UV from subpatch to patch parametric coordinates */
- float2 d0 = interp(sub.P00, sub.P01, v);
- float2 d1 = interp(sub.P10, sub.P11, v);
+ float2 d0 = interp(sub.c00, sub.c01, v);
+ float2 d1 = interp(sub.c10, sub.c11, v);
return interp(d0, d1, u);
}
-float3 QuadDice::eval_projected(SubPatch &sub, float u, float v)
+float3 QuadDice::eval_projected(Subpatch &sub, float u, float v)
{
float2 uv = map_uv(sub, u, v);
float3 P;
@@ -175,70 +152,41 @@ float3 QuadDice::eval_projected(SubPatch &sub, float u, float v)
return P;
}
-int QuadDice::add_vert(SubPatch &sub, float u, float v)
+void QuadDice::set_vert(Subpatch &sub, int index, float u, float v)
{
- return EdgeDice::add_vert(sub.patch, map_uv(sub, u, v));
+ EdgeDice::set_vert(sub.patch, index, map_uv(sub, u, v));
}
-void QuadDice::add_side_u(SubPatch &sub,
- vector<int> &outer,
- vector<int> &inner,
- int Mu,
- int Mv,
- int tu,
- int side,
- int offset)
+void QuadDice::set_side(Subpatch &sub, int edge)
{
- outer.clear();
- inner.clear();
+ int t = sub.edges[edge].T;
/* set verts on the edge of the patch */
- outer.push_back(offset + ((side) ? 2 : 0));
-
- for (int i = 1; i < tu; i++) {
- float u = i / (float)tu;
- float v = (side) ? 1.0f : 0.0f;
-
- outer.push_back(add_vert(sub, u, v));
- }
-
- outer.push_back(offset + ((side) ? 3 : 1));
-
- /* set verts on the edge of the inner grid */
- for (int i = 0; i < Mu - 1; i++) {
- int j = (side) ? Mv - 1 - 1 : 0;
- inner.push_back(offset + 4 + i + j * (Mu - 1));
- }
-}
-
-void QuadDice::add_side_v(SubPatch &sub,
- vector<int> &outer,
- vector<int> &inner,
- int Mu,
- int Mv,
- int tv,
- int side,
- int offset)
-{
- outer.clear();
- inner.clear();
-
- /* set verts on the edge of the patch */
- outer.push_back(offset + ((side) ? 1 : 0));
-
- for (int j = 1; j < tv; j++) {
- float u = (side) ? 1.0f : 0.0f;
- float v = j / (float)tv;
-
- outer.push_back(add_vert(sub, u, v));
- }
-
- outer.push_back(offset + ((side) ? 3 : 2));
+ for (int i = 0; i < t; i++) {
+ float f = i / (float)t;
+
+ float u, v;
+ switch (edge) {
+ case 0:
+ u = 0;
+ v = f;
+ break;
+ case 1:
+ u = f;
+ v = 1;
+ break;
+ case 2:
+ u = 1;
+ v = 1.0f - f;
+ break;
+ case 3:
+ default:
+ u = 1.0f - f;
+ v = 0;
+ break;
+ }
- /* set verts on the edge of the inner grid */
- for (int j = 0; j < Mv - 1; j++) {
- int i = (side) ? Mu - 1 - 1 : 0;
- inner.push_back(offset + 4 + i + j * (Mu - 1));
+ set_vert(sub, sub.get_vert_along_edge(edge, i), u, v);
}
}
@@ -247,7 +195,7 @@ float QuadDice::quad_area(const float3 &a, const float3 &b, const float3 &c, con
return triangle_area(a, b, d) + triangle_area(a, d, c);
}
-float QuadDice::scale_factor(SubPatch &sub, EdgeFactors &ef, int Mu, int Mv)
+float QuadDice::scale_factor(Subpatch &sub, int Mu, int Mv)
{
/* estimate area as 4x largest of 4 quads */
float3 P[3][3];
@@ -269,23 +217,14 @@ float QuadDice::scale_factor(SubPatch &sub, EdgeFactors &ef, int Mu, int Mv)
// XXX does the -sqrt solution matter
// XXX max(D, 0.0) is highly suspicious, need to test cases
// where D goes negative
- float N = 0.5f * (Ntris - (ef.tu0 + ef.tu1 + ef.tv0 + ef.tv1));
+ float N = 0.5f * (Ntris - (sub.edge_u0.T + sub.edge_u1.T + sub.edge_v0.T + sub.edge_v1.T));
float D = 4.0f * N * Mu * Mv + (Mu + Mv) * (Mu + Mv);
float S = (Mu + Mv + sqrtf(max(D, 0.0f))) / (2 * Mu * Mv);
return S;
}
-void QuadDice::add_corners(SubPatch &sub)
-{
- /* add verts for patch corners */
- add_vert(sub, 0.0f, 0.0f);
- add_vert(sub, 1.0f, 0.0f);
- add_vert(sub, 0.0f, 1.0f);
- add_vert(sub, 1.0f, 1.0f);
-}
-
-void QuadDice::add_grid(SubPatch &sub, int Mu, int Mv, int offset)
+void QuadDice::add_grid(Subpatch &sub, int Mu, int Mv, int offset)
{
/* create inner grid */
float du = 1.0f / (float)Mu;
@@ -296,13 +235,13 @@ void QuadDice::add_grid(SubPatch &sub, int Mu, int Mv, int offset)
float u = i * du;
float v = j * dv;
- add_vert(sub, u, v);
+ set_vert(sub, offset + (i - 1) + (j - 1) * (Mu - 1), u, v);
if (i < Mu - 1 && j < Mv - 1) {
- int i1 = offset + 4 + (i - 1) + (j - 1) * (Mu - 1);
- int i2 = offset + 4 + i + (j - 1) * (Mu - 1);
- int i3 = offset + 4 + i + j * (Mu - 1);
- int i4 = offset + 4 + (i - 1) + j * (Mu - 1);
+ int i1 = offset + (i - 1) + (j - 1) * (Mu - 1);
+ int i2 = offset + i + (j - 1) * (Mu - 1);
+ int i3 = offset + i + j * (Mu - 1);
+ int i4 = offset + (i - 1) + j * (Mu - 1);
add_triangle(sub.patch, i1, i2, i3);
add_triangle(sub.patch, i1, i3, i4);
@@ -311,48 +250,34 @@ void QuadDice::add_grid(SubPatch &sub, int Mu, int Mv, int offset)
}
}
-void QuadDice::dice(SubPatch &sub, EdgeFactors &ef)
+void QuadDice::dice(Subpatch &sub)
{
/* compute inner grid size with scale factor */
- int Mu = max(ef.tu0, ef.tu1);
- int Mv = max(ef.tv0, ef.tv1);
+ int Mu = max(sub.edge_u0.T, sub.edge_u1.T);
+ int Mv = max(sub.edge_v0.T, sub.edge_v1.T);
-#if 0 /* Doesnt work very well, especially at grazing angles. */
+#if 0 /* Doesn't work very well, especially at grazing angles. */
float S = scale_factor(sub, ef, Mu, Mv);
#else
float S = 1.0f;
#endif
- Mu = max((int)ceil(S * Mu), 2); // XXX handle 0 & 1?
- Mv = max((int)ceil(S * Mv), 2); // XXX handle 0 & 1?
-
- /* reserve space for new verts */
- int offset = params.mesh->verts.size();
- reserve(ef, Mu, Mv);
-
- /* corners and inner grid */
- add_corners(sub);
- add_grid(sub, Mu, Mv, offset);
-
- /* bottom side */
- vector<int> outer, inner;
-
- add_side_u(sub, outer, inner, Mu, Mv, ef.tu0, 0, offset);
- stitch_triangles(sub.patch, outer, inner);
-
- /* top side */
- add_side_u(sub, outer, inner, Mu, Mv, ef.tu1, 1, offset);
- stitch_triangles(sub.patch, inner, outer);
+ Mu = max((int)ceilf(S * Mu), 2); // XXX handle 0 & 1?
+ Mv = max((int)ceilf(S * Mv), 2); // XXX handle 0 & 1?
- /* left side */
- add_side_v(sub, outer, inner, Mu, Mv, ef.tv0, 0, offset);
- stitch_triangles(sub.patch, inner, outer);
+ /* inner grid */
+ add_grid(sub, Mu, Mv, sub.inner_grid_vert_offset);
- /* right side */
- add_side_v(sub, outer, inner, Mu, Mv, ef.tv1, 1, offset);
- stitch_triangles(sub.patch, outer, inner);
+ /* sides */
+ set_side(sub, 0);
+ set_side(sub, 1);
+ set_side(sub, 2);
+ set_side(sub, 3);
- assert(vert_offset == params.mesh->verts.size());
+ stitch_triangles(sub, 0);
+ stitch_triangles(sub, 1);
+ stitch_triangles(sub, 2);
+ stitch_triangles(sub, 3);
}
CCL_NAMESPACE_END
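
The switch in QuadDice::set_side() above parameterizes each subpatch edge by a single fraction f in [0, 1). A minimal standalone sketch of that mapping (std::pair stands in for Cycles' float2; that substitution is an assumption of this illustration only):

  #include <utility>

  /* Maps (edge index, fraction along edge) to subpatch UV, mirroring the switch in
   * QuadDice::set_side(): the four edges walk the boundary c00 -> c01 -> c11 -> c10,
   * each edge starting at the corner with the same index. */
  static std::pair<float, float> edge_uv(int edge, float f)
  {
    switch (edge) {
      case 0: return {0.0f, f};          /* edge_v0: u = 0, v increases */
      case 1: return {f, 1.0f};          /* edge_u1: v = 1, u increases */
      case 2: return {1.0f, 1.0f - f};   /* edge_v1: u = 1, v decreases */
      default: return {1.0f - f, 0.0f};  /* edge_u0: v = 0, u decreases */
    }
  }
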
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index eee54e01861..ee63403d40c 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -25,6 +25,8 @@
#include "util/util_types.h"
#include "util/util_vector.h"
+#include "subd/subd_subpatch.h"
+
CCL_NAMESPACE_BEGIN
class Camera;
@@ -67,78 +69,33 @@ class EdgeDice {
explicit EdgeDice(const SubdParams &params);
- void reserve(int num_verts);
+ void reserve(int num_verts, int num_triangles);
- int add_vert(Patch *patch, float2 uv);
+ void set_vert(Patch *patch, int index, float2 uv);
void add_triangle(Patch *patch, int v0, int v1, int v2);
- void stitch_triangles(Patch *patch, vector<int> &outer, vector<int> &inner);
+ void stitch_triangles(Subpatch &sub, int edge);
};
-/* Quad EdgeDice
- *
- * Edge tessellation factors and subpatch coordinates are as follows:
- *
- * tu1
- * P01 --------- P11
- * | |
- * tv0 | | tv1
- * | |
- * P00 --------- P10
- * tu0
- */
+/* Quad EdgeDice */
class QuadDice : public EdgeDice {
public:
- struct SubPatch {
- Patch *patch;
-
- float2 P00;
- float2 P10;
- float2 P01;
- float2 P11;
- };
-
- struct EdgeFactors {
- int tu0;
- int tu1;
- int tv0;
- int tv1;
- };
-
explicit QuadDice(const SubdParams &params);
- void reserve(EdgeFactors &ef, int Mu, int Mv);
- float3 eval_projected(SubPatch &sub, float u, float v);
-
- float2 map_uv(SubPatch &sub, float u, float v);
- int add_vert(SubPatch &sub, float u, float v);
-
- void add_corners(SubPatch &sub);
- void add_grid(SubPatch &sub, int Mu, int Mv, int offset);
-
- void add_side_u(SubPatch &sub,
- vector<int> &outer,
- vector<int> &inner,
- int Mu,
- int Mv,
- int tu,
- int side,
- int offset);
-
- void add_side_v(SubPatch &sub,
- vector<int> &outer,
- vector<int> &inner,
- int Mu,
- int Mv,
- int tv,
- int side,
- int offset);
+ float3 eval_projected(Subpatch &sub, float u, float v);
+
+ float2 map_uv(Subpatch &sub, float u, float v);
+ void set_vert(Subpatch &sub, int index, float u, float v);
+
+ void add_grid(Subpatch &sub, int Mu, int Mv, int offset);
+
+ void set_side(Subpatch &sub, int edge);
float quad_area(const float3 &a, const float3 &b, const float3 &c, const float3 &d);
- float scale_factor(SubPatch &sub, EdgeFactors &ef, int Mu, int Mv);
+ float scale_factor(Subpatch &sub, int Mu, int Mv);
- void dice(SubPatch &sub, EdgeFactors &ef);
+ void dice(Subpatch &sub);
};
CCL_NAMESPACE_END
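
With the old SubPatch/EdgeFactors structs gone, vertex slots are pre-assigned rather than appended: the inner (Mu - 1) x (Mv - 1) grid of a subpatch is stored row-major starting at inner_grid_vert_offset, which is what both QuadDice::add_grid() and Subpatch::get_vert_along_grid_edge() index into. A small illustrative sketch of that indexing (not part of the patch):

  /* Index of inner grid vertex (i, j), with 1 <= i <= Mu - 1 and 1 <= j <= Mv - 1,
   * matching the expression used in QuadDice::add_grid(). */
  inline int inner_grid_vert_index(int offset, int i, int j, int Mu)
  {
    return offset + (i - 1) + (j - 1) * (Mu - 1); /* row-major, row stride Mu - 1 */
  }
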
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index 5209d4d0b07..8fe423bc94d 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -24,18 +24,17 @@ CCL_NAMESPACE_BEGIN
class Patch {
public:
- virtual ~Patch()
+ Patch() : patch_index(0), shader(0), from_ngon(false)
{
}
+
+ virtual ~Patch() = default;
+
virtual void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v) = 0;
- virtual BoundBox bound() = 0;
- virtual int ptex_face_id()
- {
- return -1;
- }
int patch_index;
int shader;
+ bool from_ngon;
};
/* Linear Quad Patch */
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index 803363bc240..1a8c182510c 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -21,6 +21,9 @@
#include "subd/subd_patch.h"
#include "subd/subd_split.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
#include "util/util_math.h"
#include "util/util_types.h"
@@ -28,14 +31,12 @@ CCL_NAMESPACE_BEGIN
/* DiagSplit */
-DiagSplit::DiagSplit(const SubdParams &params_) : params(params_)
-{
-}
+#define DSPLIT_NON_UNIFORM -1
+#define STITCH_NGON_CENTER_VERT_INDEX_OFFSET 0x60000000
+#define STITCH_NGON_SPLIT_EDGE_CENTER_VERT_TAG (0x60000000 - 1)
-void DiagSplit::dispatch(QuadDice::SubPatch &sub, QuadDice::EdgeFactors &ef)
+DiagSplit::DiagSplit(const SubdParams &params_) : params(params_)
{
- subpatches_quad.push_back(sub);
- edgefactors_quad.push_back(ef);
}
float3 DiagSplit::to_world(Patch *patch, float2 uv)
@@ -49,45 +50,62 @@ float3 DiagSplit::to_world(Patch *patch, float2 uv)
return P;
}
-int DiagSplit::T(Patch *patch, float2 Pstart, float2 Pend)
+static void order_float2(float2 &a, float2 &b)
{
- float3 Plast = make_float3(0.0f, 0.0f, 0.0f);
+ if (b.x < a.x || b.y < a.y) {
+ swap(a, b);
+ }
+}
+
+int DiagSplit::T(Patch *patch, float2 Pstart, float2 Pend, bool recursive_resolve)
+{
+ order_float2(Pstart, Pend); /* May not be necessary, but better to be safe. */
+
float Lsum = 0.0f;
float Lmax = 0.0f;
- for (int i = 0; i < params.test_steps; i++) {
+ float3 Plast = to_world(patch, Pstart);
+
+ for (int i = 1; i < params.test_steps; i++) {
float t = i / (float)(params.test_steps - 1);
float3 P = to_world(patch, Pstart + t * (Pend - Pstart));
- if (i > 0) {
- float L;
+ float L;
- if (!params.camera) {
- L = len(P - Plast);
- }
- else {
- Camera *cam = params.camera;
-
- float pixel_width = cam->world_to_raster_size((P + Plast) * 0.5f);
- L = len(P - Plast) / pixel_width;
- }
+ if (!params.camera) {
+ L = len(P - Plast);
+ }
+ else {
+ Camera *cam = params.camera;
- Lsum += L;
- Lmax = max(L, Lmax);
+ float pixel_width = cam->world_to_raster_size((P + Plast) * 0.5f);
+ L = len(P - Plast) / pixel_width;
}
+ Lsum += L;
+ Lmax = max(L, Lmax);
+
Plast = P;
}
- int tmin = (int)ceil(Lsum / params.dicing_rate);
- int tmax = (int)ceil((params.test_steps - 1) * Lmax /
- params.dicing_rate); // XXX paper says N instead of N-1, seems wrong?
+ int tmin = (int)ceilf(Lsum / params.dicing_rate);
+ int tmax = (int)ceilf((params.test_steps - 1) * Lmax /
+ params.dicing_rate); // XXX paper says N instead of N-1, seems wrong?
+ int res = max(tmax, 1);
- if (tmax - tmin > params.split_threshold)
- return DSPLIT_NON_UNIFORM;
+ if (tmax - tmin > params.split_threshold) {
+ if (!recursive_resolve) {
+ res = DSPLIT_NON_UNIFORM;
+ }
+ else {
+ float2 P = (Pstart + Pend) * 0.5f;
+ res = T(patch, Pstart, P, true) + T(patch, P, Pend, true);
+ }
+ }
- return tmax;
+ limit_edge_factor(res, patch, Pstart, Pend);
+ return res;
}
void DiagSplit::partition_edge(
@@ -99,159 +117,632 @@ void DiagSplit::partition_edge(
*t1 = T(patch, *P, Pend);
}
else {
- int I = (int)floor((float)t * 0.5f);
- *P = interp(Pstart, Pend, (t == 0) ? 0 : I / (float)t); /* XXX is t faces or verts */
+ assert(t >= 2); /* Need at least two segments to partition into. */
+
+ int I = (int)floorf((float)t * 0.5f);
+ *P = interp(Pstart, Pend, I / (float)t);
*t0 = I;
*t1 = t - I;
}
}
-static void limit_edge_factors(const QuadDice::SubPatch &sub, QuadDice::EdgeFactors &ef, int max_t)
+void DiagSplit::limit_edge_factor(int &T, Patch *patch, float2 Pstart, float2 Pend)
+{
+ int max_t = 1 << params.max_level;
+ int max_t_for_edge = int(max_t * len(Pstart - Pend));
+
+ if (patch->from_ngon) {
+ max_t_for_edge >>= 1; /* Initial split of ngon causes edges to extend half the distance. */
+ }
+
+ T = (max_t_for_edge <= 1) ? 1 : min(T, max_t_for_edge);
+
+ assert(T >= 1 || T == DSPLIT_NON_UNIFORM);
+}
+
+void DiagSplit::resolve_edge_factors(Subpatch &sub)
{
- float2 P00 = sub.P00;
- float2 P01 = sub.P01;
- float2 P10 = sub.P10;
- float2 P11 = sub.P11;
-
- int tu0 = int(max_t * len(P10 - P00));
- int tu1 = int(max_t * len(P11 - P01));
- int tv0 = int(max_t * len(P01 - P00));
- int tv1 = int(max_t * len(P11 - P10));
-
- ef.tu0 = tu0 <= 1 ? 1 : min(ef.tu0, tu0);
- ef.tu1 = tu1 <= 1 ? 1 : min(ef.tu1, tu1);
- ef.tv0 = tv0 <= 1 ? 1 : min(ef.tv0, tv0);
- ef.tv1 = tv1 <= 1 ? 1 : min(ef.tv1, tv1);
+ /* Resolve DSPLIT_NON_UNIFORM to actual T value if splitting is no longer possible. */
+ if (sub.edge_u0.T == 1 && sub.edge_u1.T == DSPLIT_NON_UNIFORM) {
+ sub.edge_u1.T = T(sub.patch, sub.c01, sub.c11, true);
+ }
+ if (sub.edge_u1.T == 1 && sub.edge_u0.T == DSPLIT_NON_UNIFORM) {
+ sub.edge_u0.T = T(sub.patch, sub.c00, sub.c10, true);
+ }
+ if (sub.edge_v0.T == 1 && sub.edge_v1.T == DSPLIT_NON_UNIFORM) {
+ sub.edge_v1.T = T(sub.patch, sub.c11, sub.c10, true);
+ }
+ if (sub.edge_v1.T == 1 && sub.edge_v0.T == DSPLIT_NON_UNIFORM) {
+ sub.edge_v0.T = T(sub.patch, sub.c01, sub.c00, true);
+ }
}
-void DiagSplit::split(QuadDice::SubPatch &sub, QuadDice::EdgeFactors &ef, int depth)
+void DiagSplit::split(Subpatch &sub, int depth)
{
if (depth > 32) {
/* We should never get here, but just in case end recursion safely. */
- ef.tu0 = 1;
- ef.tu1 = 1;
- ef.tv0 = 1;
- ef.tv1 = 1;
+ assert(!"diagsplit recursion limit reached");
+
+ sub.edge_u0.T = 1;
+ sub.edge_u1.T = 1;
+ sub.edge_v0.T = 1;
+ sub.edge_v1.T = 1;
- dispatch(sub, ef);
+ subpatches.push_back(sub);
return;
}
- bool split_u = (ef.tu0 == DSPLIT_NON_UNIFORM || ef.tu1 == DSPLIT_NON_UNIFORM);
- bool split_v = (ef.tv0 == DSPLIT_NON_UNIFORM || ef.tv1 == DSPLIT_NON_UNIFORM);
+ bool split_u = (sub.edge_u0.T == DSPLIT_NON_UNIFORM || sub.edge_u1.T == DSPLIT_NON_UNIFORM);
+ bool split_v = (sub.edge_v0.T == DSPLIT_NON_UNIFORM || sub.edge_v1.T == DSPLIT_NON_UNIFORM);
/* Split subpatches such that the ratio of T for opposite edges doesn't
- * exceed 1.5, this reduces over tessellation for some patches
+ * exceed 1.5, this reduces over tessellation for some patches
*/
- bool tmp_split_v = split_v;
- if (!split_u && min(ef.tu0, ef.tu1) > 8 && min(ef.tu0, ef.tu1) * 1.5f < max(ef.tu0, ef.tu1))
+ /* clang-format off */
+ if (min(sub.edge_u0.T, sub.edge_u1.T) > 8 && /* Must be uniform and preferably greater than 8 to split. */
+ min(sub.edge_v0.T, sub.edge_v1.T) >= 2 && /* Must be uniform and at least 2 to split. */
+ max(sub.edge_u0.T, sub.edge_u1.T) / min(sub.edge_u0.T, sub.edge_u1.T) > 1.5f)
+ {
split_v = true;
- if (!tmp_split_v && min(ef.tu0, ef.tu1) > 8 && min(ef.tv0, ef.tv1) * 1.5f < max(ef.tv0, ef.tv1))
+ }
+ if (min(sub.edge_v0.T, sub.edge_v1.T) > 8 &&
+ min(sub.edge_u0.T, sub.edge_u1.T) >= 2 &&
+ max(sub.edge_v0.T, sub.edge_v1.T) / min(sub.edge_v0.T, sub.edge_v1.T) > 1.5f)
+ {
split_u = true;
+ }
+ /* clang-format on */
- /* alternate axis */
+ /* Alternate axis. */
if (split_u && split_v) {
split_u = depth % 2;
}
- if (split_u) {
- /* partition edges */
- QuadDice::EdgeFactors ef0, ef1;
- float2 Pu0, Pu1;
+ if (!split_u && !split_v) {
+ /* Add the unsplit subpatch. */
+ subpatches.push_back(sub);
+ Subpatch &subpatch = subpatches[subpatches.size() - 1];
+
+ /* Update T values and offsets. */
+ for (int i = 0; i < 4; i++) {
+ Subpatch::edge_t &edge = subpatch.edges[i];
+
+ edge.offset = edge.edge->T;
+ edge.edge->T += edge.T;
+ }
+ }
+ else {
+ /* Copy into new subpatches. */
+ Subpatch sub_a = sub;
+ Subpatch sub_b = sub;
+
+ /* Pointers to various subpatch elements. */
+ Subpatch::edge_t *sub_across_0, *sub_across_1;
+ Subpatch::edge_t *sub_a_across_0, *sub_a_across_1;
+ Subpatch::edge_t *sub_b_across_0, *sub_b_across_1;
+
+ Subpatch::edge_t *sub_a_split, *sub_b_split;
+
+ float2 *Pa, *Pb, *Pc, *Pd;
+
+ /* Set pointers based on split axis. */
+ if (split_u) {
+ sub_across_0 = &sub.edge_u0;
+ sub_across_1 = &sub.edge_u1;
+ sub_a_across_0 = &sub_a.edge_u0;
+ sub_a_across_1 = &sub_a.edge_u1;
+ sub_b_across_0 = &sub_b.edge_u0;
+ sub_b_across_1 = &sub_b.edge_u1;
+
+ sub_a_split = &sub_a.edge_v1;
+ sub_b_split = &sub_b.edge_v0;
+
+ Pa = &sub_a.c11;
+ Pb = &sub_a.c10;
+ Pc = &sub_b.c01;
+ Pd = &sub_b.c00;
+ }
+ else {
+ sub_across_0 = &sub.edge_v0;
+ sub_across_1 = &sub.edge_v1;
+ sub_a_across_0 = &sub_a.edge_v0;
+ sub_a_across_1 = &sub_a.edge_v1;
+ sub_b_across_0 = &sub_b.edge_v0;
+ sub_b_across_1 = &sub_b.edge_v1;
+
+ sub_a_split = &sub_a.edge_u0;
+ sub_b_split = &sub_b.edge_u1;
+
+ Pa = &sub_a.c10;
+ Pb = &sub_a.c00;
+ Pc = &sub_b.c11;
+ Pd = &sub_b.c01;
+ }
- partition_edge(sub.patch, &Pu0, &ef0.tu0, &ef1.tu0, sub.P00, sub.P10, ef.tu0);
- partition_edge(sub.patch, &Pu1, &ef0.tu1, &ef1.tu1, sub.P01, sub.P11, ef.tu1);
+ /* Partition edges */
+ float2 P0, P1;
- /* split */
- int tsplit = T(sub.patch, Pu0, Pu1);
- ef0.tv0 = ef.tv0;
- ef0.tv1 = tsplit;
+ partition_edge(
+ sub.patch, &P0, &sub_a_across_0->T, &sub_b_across_0->T, *Pd, *Pb, sub_across_0->T);
+ partition_edge(
+ sub.patch, &P1, &sub_a_across_1->T, &sub_b_across_1->T, *Pc, *Pa, sub_across_1->T);
- ef1.tv0 = tsplit;
- ef1.tv1 = ef.tv1;
+ /* Split */
+ *Pa = P1;
+ *Pb = P0;
- /* create subpatches */
- QuadDice::SubPatch sub0 = {sub.patch, sub.P00, Pu0, sub.P01, Pu1};
- QuadDice::SubPatch sub1 = {sub.patch, Pu0, sub.P10, Pu1, sub.P11};
+ *Pc = P1;
+ *Pd = P0;
- limit_edge_factors(sub0, ef0, 1 << params.max_level);
- limit_edge_factors(sub1, ef1, 1 << params.max_level);
+ int tsplit = T(sub.patch, P0, P1);
- split(sub0, ef0, depth + 1);
- split(sub1, ef1, depth + 1);
+ if (depth == -2 && tsplit == 1) {
+ tsplit = 2; /* Ensure we can always split at depth -1. */
+ }
+
+ sub_a_split->T = tsplit;
+ sub_b_split->T = tsplit;
+
+ resolve_edge_factors(sub_a);
+ resolve_edge_factors(sub_b);
+
+ /* Create new edge */
+ Edge &edge = *alloc_edge();
+
+ sub_a_split->edge = &edge;
+ sub_b_split->edge = &edge;
+
+ sub_a_split->offset = 0;
+ sub_b_split->offset = 0;
+
+ sub_a_split->indices_decrease_along_edge = false;
+ sub_b_split->indices_decrease_along_edge = true;
+
+ sub_a_split->sub_edges_created_in_reverse_order = !split_u;
+ sub_b_split->sub_edges_created_in_reverse_order = !split_u;
+
+ edge.top_indices_decrease = sub_across_1->sub_edges_created_in_reverse_order;
+ edge.bottom_indices_decrease = sub_across_0->sub_edges_created_in_reverse_order;
+
+ /* Recurse */
+ edge.T = 0;
+ split(sub_a, depth + 1);
+
+ int edge_t = edge.T;
+ (void)edge_t;
+
+ edge.top_offset = sub_across_1->edge->T;
+ edge.bottom_offset = sub_across_0->edge->T;
+
+ edge.T = 0; /* We calculate T twice along each edge. :/ */
+ split(sub_b, depth + 1);
+
+ assert(edge.T == edge_t); /* If this fails we will crash at some later point! */
+
+ edge.top = sub_across_1->edge;
+ edge.bottom = sub_across_0->edge;
}
- else if (split_v) {
- /* partition edges */
- QuadDice::EdgeFactors ef0, ef1;
- float2 Pv0, Pv1;
+}
+
+int DiagSplit::alloc_verts(int n)
+{
+ int a = num_alloced_verts;
+ num_alloced_verts += n;
+ return a;
+}
- partition_edge(sub.patch, &Pv0, &ef0.tv0, &ef1.tv0, sub.P00, sub.P01, ef.tv0);
- partition_edge(sub.patch, &Pv1, &ef0.tv1, &ef1.tv1, sub.P10, sub.P11, ef.tv1);
+Edge *DiagSplit::alloc_edge()
+{
+ edges.emplace_back();
+ return &edges.back();
+}
- /* split */
- int tsplit = T(sub.patch, Pv0, Pv1);
- ef0.tu0 = ef.tu0;
- ef0.tu1 = tsplit;
+void DiagSplit::split_patches(Patch *patches, size_t patches_byte_stride)
+{
+ int patch_index = 0;
- ef1.tu0 = tsplit;
- ef1.tu1 = ef.tu1;
+ for (int f = 0; f < params.mesh->subd_faces.size(); f++) {
+ Mesh::SubdFace &face = params.mesh->subd_faces[f];
- /* create subpatches */
- QuadDice::SubPatch sub0 = {sub.patch, sub.P00, sub.P10, Pv0, Pv1};
- QuadDice::SubPatch sub1 = {sub.patch, Pv0, Pv1, sub.P01, sub.P11};
+ Patch *patch = (Patch *)(((char *)patches) + patch_index * patches_byte_stride);
- limit_edge_factors(sub0, ef0, 1 << params.max_level);
- limit_edge_factors(sub1, ef1, 1 << params.max_level);
+ if (face.is_quad()) {
+ patch_index++;
- split(sub0, ef0, depth + 1);
- split(sub1, ef1, depth + 1);
+ split_quad(face, patch);
+ }
+ else {
+ patch_index += face.num_corners;
+
+ split_ngon(face, patch, patches_byte_stride);
+ }
}
- else {
- dispatch(sub, ef);
+
+ params.mesh->vert_to_stitching_key_map.clear();
+ params.mesh->vert_stitching_map.clear();
+
+ post_split();
+}
+
+static Edge *create_edge_from_corner(DiagSplit *split,
+ const Mesh *mesh,
+ const Mesh::SubdFace &face,
+ int corner,
+ bool &reversed,
+ int v0,
+ int v1)
+{
+ int a = mesh->subd_face_corners[face.start_corner + mod(corner + 0, face.num_corners)];
+ int b = mesh->subd_face_corners[face.start_corner + mod(corner + 1, face.num_corners)];
+
+ reversed = !(b < a);
+
+ if (b < a) {
+ swap(a, b);
+ swap(v0, v1);
}
+
+ Edge *edge = split->alloc_edge();
+
+ edge->is_stitch_edge = true;
+ edge->stitch_start_vert_index = a;
+ edge->stitch_end_vert_index = b;
+
+ edge->start_vert_index = v0;
+ edge->end_vert_index = v1;
+
+ edge->stitch_edge_key = {a, b};
+
+ return edge;
}
-void DiagSplit::split_quad(Patch *patch, QuadDice::SubPatch *subpatch)
+void DiagSplit::split_quad(const Mesh::SubdFace &face, Patch *patch)
{
- QuadDice::SubPatch sub_split;
- QuadDice::EdgeFactors ef_split;
+ Subpatch subpatch(patch);
+
+ int v = alloc_verts(4);
+
+ bool v0_reversed, u1_reversed, v1_reversed, u0_reversed;
+ subpatch.edge_v0.edge = create_edge_from_corner(
+ this, params.mesh, face, 3, v0_reversed, v + 3, v + 0);
+ subpatch.edge_u1.edge = create_edge_from_corner(
+ this, params.mesh, face, 2, u1_reversed, v + 2, v + 3);
+ subpatch.edge_v1.edge = create_edge_from_corner(
+ this, params.mesh, face, 1, v1_reversed, v + 1, v + 2);
+ subpatch.edge_u0.edge = create_edge_from_corner(
+ this, params.mesh, face, 0, u0_reversed, v + 0, v + 1);
+
+ subpatch.edge_v0.sub_edges_created_in_reverse_order = !v0_reversed;
+ subpatch.edge_u1.sub_edges_created_in_reverse_order = u1_reversed;
+ subpatch.edge_v1.sub_edges_created_in_reverse_order = v1_reversed;
+ subpatch.edge_u0.sub_edges_created_in_reverse_order = !u0_reversed;
+
+ subpatch.edge_v0.indices_decrease_along_edge = v0_reversed;
+ subpatch.edge_u1.indices_decrease_along_edge = u1_reversed;
+ subpatch.edge_v1.indices_decrease_along_edge = v1_reversed;
+ subpatch.edge_u0.indices_decrease_along_edge = u0_reversed;
+
+ /* Forces a split in both axis for quads, needed to match split of ngons into quads. */
+ subpatch.edge_u0.T = DSPLIT_NON_UNIFORM;
+ subpatch.edge_u1.T = DSPLIT_NON_UNIFORM;
+ subpatch.edge_v0.T = DSPLIT_NON_UNIFORM;
+ subpatch.edge_v1.T = DSPLIT_NON_UNIFORM;
+
+ split(subpatch, -2);
+}
+
+static Edge *create_split_edge_from_corner(DiagSplit *split,
+ const Mesh *mesh,
+ const Mesh::SubdFace &face,
+ int corner,
+ int side,
+ bool &reversed,
+ int v0,
+ int v1,
+ int vc)
+{
+ Edge *edge = split->alloc_edge();
+
+ int a = mesh->subd_face_corners[face.start_corner + mod(corner + 0, face.num_corners)];
+ int b = mesh->subd_face_corners[face.start_corner + mod(corner + 1, face.num_corners)];
- if (subpatch) {
- sub_split = *subpatch;
+ if (b < a) {
+ edge->stitch_edge_key = {b, a};
}
else {
- sub_split.patch = patch;
- sub_split.P00 = make_float2(0.0f, 0.0f);
- sub_split.P10 = make_float2(1.0f, 0.0f);
- sub_split.P01 = make_float2(0.0f, 1.0f);
- sub_split.P11 = make_float2(1.0f, 1.0f);
+ edge->stitch_edge_key = {a, b};
}
- ef_split.tu0 = T(patch, sub_split.P00, sub_split.P10);
- ef_split.tu1 = T(patch, sub_split.P01, sub_split.P11);
- ef_split.tv0 = T(patch, sub_split.P00, sub_split.P01);
- ef_split.tv1 = T(patch, sub_split.P10, sub_split.P11);
+ reversed = !(b < a);
- limit_edge_factors(sub_split, ef_split, 1 << params.max_level);
+ if (side == 0) {
+ a = vc;
+ }
+ else {
+ b = vc;
+ }
+
+ if (!reversed) {
+ swap(a, b);
+ swap(v0, v1);
+ }
+
+ edge->is_stitch_edge = true;
+ edge->stitch_start_vert_index = a;
+ edge->stitch_end_vert_index = b;
+
+ edge->start_vert_index = v0;
+ edge->end_vert_index = v1;
+
+ return edge;
+}
+
+void DiagSplit::split_ngon(const Mesh::SubdFace &face, Patch *patches, size_t patches_byte_stride)
+{
+ Edge *prev_edge_u0 = nullptr;
+ Edge *first_edge_v0 = nullptr;
+
+ for (int corner = 0; corner < face.num_corners; corner++) {
+ Patch *patch = (Patch *)(((char *)patches) + corner * patches_byte_stride);
+
+ Subpatch subpatch(patch);
+
+ int v = alloc_verts(4);
+
+ /* Setup edges. */
+ Edge *edge_u1 = alloc_edge();
+ Edge *edge_v1 = alloc_edge();
+
+ edge_v1->is_stitch_edge = true;
+ edge_u1->is_stitch_edge = true;
+
+ edge_u1->stitch_start_vert_index = -(face.start_corner + mod(corner + 0, face.num_corners)) -
+ 1;
+ edge_u1->stitch_end_vert_index = STITCH_NGON_CENTER_VERT_INDEX_OFFSET + face.ptex_offset;
+
+ edge_u1->start_vert_index = v + 3;
+ edge_u1->end_vert_index = v + 2;
+
+ edge_u1->stitch_edge_key = {edge_u1->stitch_start_vert_index, edge_u1->stitch_end_vert_index};
+
+ edge_v1->stitch_start_vert_index = -(face.start_corner + mod(corner + 1, face.num_corners)) -
+ 1;
+ edge_v1->stitch_end_vert_index = STITCH_NGON_CENTER_VERT_INDEX_OFFSET + face.ptex_offset;
+
+ edge_v1->start_vert_index = v + 1;
+ edge_v1->end_vert_index = v + 2;
+
+ edge_v1->stitch_edge_key = {edge_v1->stitch_start_vert_index, edge_v1->stitch_end_vert_index};
+
+ bool v0_reversed, u0_reversed;
+
+ subpatch.edge_v0.edge = create_split_edge_from_corner(this,
+ params.mesh,
+ face,
+ corner - 1,
+ 0,
+ v0_reversed,
+ v + 3,
+ v + 0,
+ STITCH_NGON_SPLIT_EDGE_CENTER_VERT_TAG);
+
+ subpatch.edge_u1.edge = edge_u1;
+ subpatch.edge_v1.edge = edge_v1;
+
+ subpatch.edge_u0.edge = create_split_edge_from_corner(this,
+ params.mesh,
+ face,
+ corner + 0,
+ 1,
+ u0_reversed,
+ v + 0,
+ v + 1,
+ STITCH_NGON_SPLIT_EDGE_CENTER_VERT_TAG);
+
+ subpatch.edge_v0.sub_edges_created_in_reverse_order = !v0_reversed;
+ subpatch.edge_u1.sub_edges_created_in_reverse_order = false;
+ subpatch.edge_v1.sub_edges_created_in_reverse_order = true;
+ subpatch.edge_u0.sub_edges_created_in_reverse_order = !u0_reversed;
+
+ subpatch.edge_v0.indices_decrease_along_edge = v0_reversed;
+ subpatch.edge_u1.indices_decrease_along_edge = false;
+ subpatch.edge_v1.indices_decrease_along_edge = true;
+ subpatch.edge_u0.indices_decrease_along_edge = u0_reversed;
+
+ /* Perform split. */
+ {
+ subpatch.edge_u0.T = T(subpatch.patch, subpatch.c00, subpatch.c10);
+ subpatch.edge_u1.T = T(subpatch.patch, subpatch.c01, subpatch.c11);
+ subpatch.edge_v0.T = T(subpatch.patch, subpatch.c00, subpatch.c01);
+ subpatch.edge_v1.T = T(subpatch.patch, subpatch.c10, subpatch.c11);
+
+ resolve_edge_factors(subpatch);
+
+ split(subpatch, 0);
+ }
+
+ /* Update offsets after T is known from split. */
+ edge_u1->top = subpatch.edge_v0.edge;
+ edge_u1->stitch_top_offset = edge_u1->top->T * (v0_reversed ? -1 : 1);
+ edge_v1->top = subpatch.edge_u0.edge;
+ edge_v1->stitch_top_offset = edge_v1->top->T * (!u0_reversed ? -1 : 1);
+
+ if (corner == 0) {
+ first_edge_v0 = subpatch.edge_v0.edge;
+ }
+
+ if (prev_edge_u0) {
+ if (v0_reversed) {
+ subpatch.edge_v0.edge->stitch_offset = prev_edge_u0->T;
+ }
+ else {
+ prev_edge_u0->stitch_offset = subpatch.edge_v0.edge->T;
+ }
+
+ int T = subpatch.edge_v0.edge->T + prev_edge_u0->T;
+ subpatch.edge_v0.edge->stitch_edge_T = T;
+ prev_edge_u0->stitch_edge_T = T;
+ }
- split(sub_split, ef_split);
+ if (corner == face.num_corners - 1) {
+ if (v0_reversed) {
+ subpatch.edge_u0.edge->stitch_offset = first_edge_v0->T;
+ }
+ else {
+ first_edge_v0->stitch_offset = subpatch.edge_u0.edge->T;
+ }
+
+ int T = first_edge_v0->T + subpatch.edge_u0.edge->T;
+ first_edge_v0->stitch_edge_T = T;
+ subpatch.edge_u0.edge->stitch_edge_T = T;
+ }
+
+ prev_edge_u0 = subpatch.edge_u0.edge;
+ }
+}
+
+void DiagSplit::post_split()
+{
+ int num_stitch_verts = 0;
+
+ /* All patches are now split, and all T values known. */
+ foreach (Edge &edge, edges) {
+ if (edge.second_vert_index < 0) {
+ edge.second_vert_index = alloc_verts(edge.T - 1);
+ }
+
+ if (edge.is_stitch_edge) {
+ num_stitch_verts = max(num_stitch_verts,
+ max(edge.stitch_start_vert_index, edge.stitch_end_vert_index));
+ }
+ }
+
+ num_stitch_verts += 1;
+
+ /* Map of edge key to edge stitching vert offset. */
+ struct pair_hasher {
+ size_t operator()(const pair<int, int> &k) const
+ {
+ return hash_uint2(k.first, k.second);
+ }
+ };
+ typedef unordered_map<pair<int, int>, int, pair_hasher> edge_stitch_verts_map_t;
+ edge_stitch_verts_map_t edge_stitch_verts_map;
+
+ foreach (Edge &edge, edges) {
+ if (edge.is_stitch_edge) {
+ if (edge.stitch_edge_T == 0) {
+ edge.stitch_edge_T = edge.T;
+ }
+
+ if (edge_stitch_verts_map.find(edge.stitch_edge_key) == edge_stitch_verts_map.end()) {
+ edge_stitch_verts_map[edge.stitch_edge_key] = num_stitch_verts;
+ num_stitch_verts += edge.stitch_edge_T - 1;
+ }
+ }
+ }
+
+ /* Set start and end indices for edges generated from a split. */
+ foreach (Edge &edge, edges) {
+ if (edge.start_vert_index < 0) {
+ /* Fixup offsets. */
+ if (edge.top_indices_decrease) {
+ edge.top_offset = edge.top->T - edge.top_offset;
+ }
+
+ edge.start_vert_index = edge.top->get_vert_along_edge(edge.top_offset);
+ }
+
+ if (edge.end_vert_index < 0) {
+ if (edge.bottom_indices_decrease) {
+ edge.bottom_offset = edge.bottom->T - edge.bottom_offset;
+ }
+
+ edge.end_vert_index = edge.bottom->get_vert_along_edge(edge.bottom_offset);
+ }
+ }
+
+ int vert_offset = params.mesh->verts.size();
+
+ /* Add verts to stitching map. */
+ foreach (const Edge &edge, edges) {
+ if (edge.is_stitch_edge) {
+ int second_stitch_vert_index = edge_stitch_verts_map[edge.stitch_edge_key];
+
+ for (int i = 0; i <= edge.T; i++) {
+ /* Get proper stitching key. */
+ int key;
+
+ if (i == 0) {
+ key = edge.stitch_start_vert_index;
+ }
+ else if (i == edge.T) {
+ key = edge.stitch_end_vert_index;
+ }
+ else {
+ key = second_stitch_vert_index + i - 1 + edge.stitch_offset;
+ }
+
+ if (key == STITCH_NGON_SPLIT_EDGE_CENTER_VERT_TAG) {
+ if (i == 0) {
+ key = second_stitch_vert_index - 1 + edge.stitch_offset;
+ }
+ else if (i == edge.T) {
+ key = second_stitch_vert_index - 1 + edge.T;
+ }
+ }
+ else if (key < 0 && edge.top) { /* ngon spoke edge */
+ int s = edge_stitch_verts_map[edge.top->stitch_edge_key];
+ if (edge.stitch_top_offset >= 0) {
+ key = s - 1 + edge.stitch_top_offset;
+ }
+ else {
+ key = s - 1 + edge.top->stitch_edge_T + edge.stitch_top_offset;
+ }
+ }
+
+ /* Get real vert index. */
+ int vert = edge.get_vert_along_edge(i) + vert_offset;
+
+ /* Add to map */
+ if (params.mesh->vert_to_stitching_key_map.find(vert) ==
+ params.mesh->vert_to_stitching_key_map.end()) {
+ params.mesh->vert_to_stitching_key_map[vert] = key;
+ params.mesh->vert_stitching_map.insert({key, vert});
+ }
+ }
+ }
+ }
+
+ /* Dice; TODO(mai): Move this out of split. */
QuadDice dice(params);
- for (size_t i = 0; i < subpatches_quad.size(); i++) {
- QuadDice::SubPatch &sub = subpatches_quad[i];
- QuadDice::EdgeFactors &ef = edgefactors_quad[i];
+ int num_verts = num_alloced_verts;
+ int num_triangles = 0;
+
+ for (size_t i = 0; i < subpatches.size(); i++) {
+ subpatches[i].inner_grid_vert_offset = num_verts;
+ num_verts += subpatches[i].calc_num_inner_verts();
+ num_triangles += subpatches[i].calc_num_triangles();
+ }
+
+ dice.reserve(num_verts, num_triangles);
+
+ for (size_t i = 0; i < subpatches.size(); i++) {
+ Subpatch &sub = subpatches[i];
- ef.tu0 = max(ef.tu0, 1);
- ef.tu1 = max(ef.tu1, 1);
- ef.tv0 = max(ef.tv0, 1);
- ef.tv1 = max(ef.tv1, 1);
+ sub.edge_u0.T = max(sub.edge_u0.T, 1);
+ sub.edge_u1.T = max(sub.edge_u1.T, 1);
+ sub.edge_v0.T = max(sub.edge_v0.T, 1);
+ sub.edge_v1.T = max(sub.edge_v1.T, 1);
- dice.dice(sub, ef);
+ dice.dice(sub);
}
- subpatches_quad.clear();
- edgefactors_quad.clear();
+ /* Cleanup */
+ subpatches.clear();
+ edges.clear();
}
CCL_NAMESPACE_END
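
For reference, the edge metric computed by DiagSplit::T() above can be summarized as follows (a restatement of the code, not part of the patch). With N = params.test_steps samples P_0 .. P_{N-1} along the edge, per-segment lengths L_i = |P_i - P_{i-1}| (divided by the raster pixel width at the segment midpoint when a camera is set), and dicing rate r = params.dicing_rate:

  T_{\min} = \left\lceil \frac{\sum_{i=1}^{N-1} L_i}{r} \right\rceil,
  \qquad
  T_{\max} = \left\lceil \frac{(N-1)\,\max_i L_i}{r} \right\rceil

The edge receives T = max(T_max, 1), unless T_max - T_min > params.split_threshold, in which case it is either tagged DSPLIT_NON_UNIFORM (forcing a split) or, with recursive_resolve, resolved as the sum of T over the two halves of the edge. limit_edge_factor() then clamps the result to roughly (1 << max_level) * len(Pstart - Pend), halved for patches that came from an ngon.
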
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index 6e68d8ee598..773f4ddf120 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -23,35 +23,51 @@
* for more details. */
#include "subd/subd_dice.h"
+#include "subd/subd_subpatch.h"
+#include "util/util_deque.h"
#include "util/util_types.h"
#include "util/util_vector.h"
+#include <deque>
+
CCL_NAMESPACE_BEGIN
class Mesh;
class Patch;
-#define DSPLIT_NON_UNIFORM -1
-
class DiagSplit {
- public:
- vector<QuadDice::SubPatch> subpatches_quad;
- vector<QuadDice::EdgeFactors> edgefactors_quad;
-
SubdParams params;
- explicit DiagSplit(const SubdParams &params);
+ vector<Subpatch> subpatches;
+ /* deque is used so that element pointers remain valid when size is changed. */
+ deque<Edge> edges;
float3 to_world(Patch *patch, float2 uv);
- int T(Patch *patch, float2 Pstart, float2 Pend);
+ int T(Patch *patch, float2 Pstart, float2 Pend, bool recursive_resolve = false);
+
+ void limit_edge_factor(int &T, Patch *patch, float2 Pstart, float2 Pend);
+ void resolve_edge_factors(Subpatch &sub);
+
void partition_edge(
Patch *patch, float2 *P, int *t0, int *t1, float2 Pstart, float2 Pend, int t);
- void dispatch(QuadDice::SubPatch &sub, QuadDice::EdgeFactors &ef);
- void split(QuadDice::SubPatch &sub, QuadDice::EdgeFactors &ef, int depth = 0);
+ void split(Subpatch &sub, int depth = 0);
+
+ int num_alloced_verts = 0;
+ int alloc_verts(int n); /* Returns start index of new verts. */
+
+ public:
+ Edge *alloc_edge();
+
+ explicit DiagSplit(const SubdParams &params);
+
+ void split_patches(Patch *patches, size_t patches_byte_stride);
+
+ void split_quad(const Mesh::SubdFace &face, Patch *patch);
+ void split_ngon(const Mesh::SubdFace &face, Patch *patches, size_t patches_byte_stride);
- void split_quad(Patch *patch, QuadDice::SubPatch *subpatch = NULL);
+ void post_split();
};
CCL_NAMESPACE_END
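
A hypothetical caller sketch for the reshaped public interface above; the real call site lives in the mesh tessellation code outside this diff, and `subd_params`, `build_patches()` and the use of a packed array of linear quad patches (the type declared under the "Linear Quad Patch" comment in subd_patch.h) are assumptions of the example:

  /* All per-face patches live in one contiguous buffer; split_patches() steps through
   * it with the given byte stride, splitting quads and ngons, then dices everything
   * in post_split(). */
  vector<LinearQuadPatch> patches = build_patches(mesh); /* hypothetical helper */

  DiagSplit dsplit(subd_params);
  dsplit.split_patches(patches.data(), sizeof(LinearQuadPatch));
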
diff --git a/intern/cycles/subd/subd_subpatch.h b/intern/cycles/subd/subd_subpatch.h
new file mode 100644
index 00000000000..1a32b763cb8
--- /dev/null
+++ b/intern/cycles/subd/subd_subpatch.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SUBD_SUBPATCH_H__
+#define __SUBD_SUBPATCH_H__
+
+#include "util/util_map.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Subpatch */
+
+class Subpatch {
+ public:
+ class Patch *patch; /* Patch this is a subpatch of. */
+ int inner_grid_vert_offset;
+
+ struct edge_t {
+ int T;
+ int offset; /* Offset along main edge, interpretation depends on the two flags below. */
+
+ bool indices_decrease_along_edge;
+ bool sub_edges_created_in_reverse_order;
+
+ struct Edge *edge;
+
+ int get_vert_along_edge(int n) const;
+ };
+
+ /*
+ * eu1
+ * c01 --------- c11
+ * | |
+ * ev0 | | ev1
+ * | |
+ * c00 --------- c10
+ * eu0
+ */
+
+ union {
+ float2 corners[4]; /* UV within patch, clockwise starting from uv (0, 0) towards (0, 1) etc. */
+ struct {
+ float2 c00, c01, c11, c10;
+ };
+ };
+
+ union {
+ edge_t
+ edges[4]; /* Edges of this subpatch, each edge starts at the corner of the same index. */
+ struct {
+ edge_t edge_v0, edge_u1, edge_v1, edge_u0;
+ };
+ };
+
+ explicit Subpatch(Patch *patch = nullptr)
+ : patch(patch),
+ c00(make_float2(0.0f, 0.0f)),
+ c01(make_float2(0.0f, 1.0f)),
+ c11(make_float2(1.0f, 1.0f)),
+ c10(make_float2(1.0f, 0.0f))
+ {
+ }
+
+ Subpatch(Patch *patch, float2 c00, float2 c01, float2 c11, float2 c10)
+ : patch(patch), c00(c00), c01(c01), c11(c11), c10(c10)
+ {
+ }
+
+ int calc_num_inner_verts() const
+ {
+ int Mu = max(edge_u0.T, edge_u1.T);
+ int Mv = max(edge_v0.T, edge_v1.T);
+ Mu = max(Mu, 2);
+ Mv = max(Mv, 2);
+ return (Mu - 1) * (Mv - 1);
+ }
+
+ int calc_num_triangles() const
+ {
+ int Mu = max(edge_u0.T, edge_u1.T);
+ int Mv = max(edge_v0.T, edge_v1.T);
+ Mu = max(Mu, 2);
+ Mv = max(Mv, 2);
+
+ int inner_triangles = (Mu - 2) * (Mv - 2) * 2;
+ int edge_triangles = edge_u0.T + edge_u1.T + edge_v0.T + edge_v1.T + (Mu - 2) * 2 +
+ (Mv - 2) * 2;
+
+ return inner_triangles + edge_triangles;
+ }
+
+ int get_vert_along_edge(int e, int n) const;
+
+ int get_vert_along_grid_edge(int edge, int n) const
+ {
+ int Mu = max(edge_u0.T, edge_u1.T);
+ int Mv = max(edge_v0.T, edge_v1.T);
+ Mu = max(Mu, 2);
+ Mv = max(Mv, 2);
+
+ switch (edge) {
+ case 0:
+ return inner_grid_vert_offset + n * (Mu - 1);
+ case 1:
+ return inner_grid_vert_offset + (Mu - 1) * (Mv - 2) + n;
+ case 2:
+ return inner_grid_vert_offset + ((Mu - 1) * (Mv - 1) - 1) - n * (Mu - 1);
+ case 3:
+ return inner_grid_vert_offset + (Mu - 2) - n;
+ }
+
+ return -1;
+ }
+};
+
+struct Edge {
+ /* Number of segments the edge will be diced into, see DiagSplit paper. */
+ int T;
+
+ /* top is edge adjacent to start, bottom is adjacent to end. */
+ Edge *top, *bottom;
+
+ int top_offset, bottom_offset;
+ bool top_indices_decrease, bottom_indices_decrease;
+
+ int start_vert_index;
+ int end_vert_index;
+
+ /* Index of the second vert from this edge's corner along the edge towards the next corner. */
+ int second_vert_index;
+
+ /* Vertices on edge are to be stitched. */
+ bool is_stitch_edge;
+
+ /* Key to match this edge with others to be stitched with.
+ * The ints in the pair are ordered stitching indices */
+ pair<int, int> stitch_edge_key;
+
+ /* Full T along edge (may be larger than T for edges split from ngon edges) */
+ int stitch_edge_T;
+ int stitch_offset;
+ int stitch_top_offset;
+ int stitch_start_vert_index;
+ int stitch_end_vert_index;
+
+ Edge()
+ : T(0),
+ top(nullptr),
+ bottom(nullptr),
+ top_offset(-1),
+ bottom_offset(-1),
+ top_indices_decrease(false),
+ bottom_indices_decrease(false),
+ start_vert_index(-1),
+ end_vert_index(-1),
+ second_vert_index(-1),
+ is_stitch_edge(false),
+ stitch_edge_T(0),
+ stitch_offset(0)
+ {
+ }
+
+ int get_vert_along_edge(int n) const
+ {
+ assert(n >= 0 && n <= T);
+
+ if (n == 0) {
+ return start_vert_index;
+ }
+ else if (n == T) {
+ return end_vert_index;
+ }
+
+ return second_vert_index + n - 1;
+ }
+};
+
+inline int Subpatch::edge_t::get_vert_along_edge(int n) const
+{
+ assert(n >= 0 && n <= T);
+
+ if (!indices_decrease_along_edge && !sub_edges_created_in_reverse_order) {
+ n = offset + n;
+ }
+ else if (!indices_decrease_along_edge && sub_edges_created_in_reverse_order) {
+ n = edge->T - offset - T + n;
+ }
+ else if (indices_decrease_along_edge && !sub_edges_created_in_reverse_order) {
+ n = offset + T - n;
+ }
+ else if (indices_decrease_along_edge && sub_edges_created_in_reverse_order) {
+ n = edge->T - offset - n;
+ }
+
+ return edge->get_vert_along_edge(n);
+}
+
+inline int Subpatch::get_vert_along_edge(int edge, int n) const
+{
+ return edges[edge].get_vert_along_edge(n);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __SUBD_SUBPATCH_H__ */
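
A quick standalone check of the counting formulas above, for a subpatch whose four edge factors are all T = 4 (so Mu = Mv = 4); DiagSplit::post_split() sums exactly these per-subpatch quantities before calling QuadDice::reserve():

  #include <algorithm>
  #include <cassert>

  int main()
  {
    const int T = 4;
    const int Mu = std::max(T, 2), Mv = std::max(T, 2);

    const int inner_verts = (Mu - 1) * (Mv - 1);               /* calc_num_inner_verts(): 9 */
    const int inner_tris = (Mu - 2) * (Mv - 2) * 2;            /* 8 */
    const int edge_tris = 4 * T + (Mu - 2) * 2 + (Mv - 2) * 2; /* 24 */

    assert(inner_verts == 9 && inner_tris + edge_tris == 32);  /* calc_num_triangles(): 32 */
    return 0;
  }
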
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 98fcc8cd15e..6dcc7f7b3dd 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -82,25 +82,33 @@ list(APPEND ALL_CYCLES_LIBRARIES
${TIFF_LIBRARY}
${OPENIMAGEIO_LIBRARIES}
${OPENEXR_LIBRARIES}
+ ${OPENVDB_LIBRARIES}
)
include_directories(${INC})
-link_directories(${OPENIMAGEIO_LIBPATH}
- ${BOOST_LIBPATH}
- ${PNG_LIBPATH}
- ${JPEG_LIBPATH}
- ${ZLIB_LIBPATH}
- ${TIFF_LIBPATH}
- ${OPENEXR_LIBPATH}
- ${OPENCOLORIO_LIBPATH})
+link_directories(
+ ${OPENIMAGEIO_LIBPATH}
+ ${BOOST_LIBPATH}
+ ${PNG_LIBPATH}
+ ${JPEG_LIBPATH}
+ ${ZLIB_LIBPATH}
+ ${TIFF_LIBPATH}
+ ${OPENEXR_LIBPATH}
+ ${OPENCOLORIO_LIBPATH}
+ ${OPENVDB_LIBPATH}
+)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PLATFORM_LINKFLAGS}")
set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} ${PLATFORM_LINKFLAGS_DEBUG}")
CYCLES_TEST(render_graph_finalize "${ALL_CYCLES_LIBRARIES};bf_intern_numaapi")
CYCLES_TEST(util_aligned_malloc "cycles_util")
-CYCLES_TEST(util_path "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
-CYCLES_TEST(util_string "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
-CYCLES_TEST(util_task "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES};bf_intern_numaapi")
-CYCLES_TEST(util_time "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
+CYCLES_TEST(util_path "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+CYCLES_TEST(util_string "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+CYCLES_TEST(util_task "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES};bf_intern_numaapi")
+CYCLES_TEST(util_time "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+CYCLES_TEST(util_avxf_avx "cycles_util;bf_intern_numaapi;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+CYCLES_TEST(util_avxf_avx2 "cycles_util;bf_intern_numaapi;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index 7fb92bfb862..4ea3470cda8 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -14,14 +14,18 @@
* limitations under the License.
*/
-#include "testing/testing.h"
#include "testing/mock_log.h"
+#include "testing/testing.h"
+
+#include "device/device.h"
#include "render/graph.h"
-#include "render/scene.h"
#include "render/nodes.h"
+#include "render/scene.h"
+
#include "util/util_array.h"
#include "util/util_logging.h"
+#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_vector.h"
@@ -960,6 +964,13 @@ TEST_F(RenderGraph, constant_fold_blackbody)
graph.finalize(scene);
}
+/* A Note About The Math Node
+ *
+ * The clamp option is implemented using graph expansion, where a
+ * Clamp node named "clamp" is added and connected to the output.
+ * So the final result is actually from the node "clamp".
+ */
+
/*
* Tests: Math with all constant inputs (clamp false).
*/
@@ -985,7 +996,7 @@ TEST_F(RenderGraph, constant_fold_math)
TEST_F(RenderGraph, constant_fold_math_clamp)
{
EXPECT_ANY_MESSAGE(log);
- CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1).");
+ CORRECT_INFO_MESSAGE(log, "Folding clamp::Result to constant (1).");
builder
.add_node(ShaderNodeBuilder<MathNode>("Math")
@@ -1003,7 +1014,7 @@ TEST_F(RenderGraph, constant_fold_math_clamp)
* Includes 2 tests: constant on each side.
*/
static void build_math_partial_test_graph(ShaderGraphBuilder &builder,
- NodeMath type,
+ NodeMathType type,
float constval)
{
builder
@@ -1038,7 +1049,7 @@ TEST_F(RenderGraph, constant_fold_part_math_add_0)
/* X + 0 == 0 + X == X */
CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac.");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_ADD, 0.0f);
graph.finalize(scene);
@@ -1053,7 +1064,7 @@ TEST_F(RenderGraph, constant_fold_part_math_sub_0)
/* X - 0 == X */
INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_SUBTRACT, 0.0f);
graph.finalize(scene);
@@ -1068,7 +1079,7 @@ TEST_F(RenderGraph, constant_fold_part_math_mul_1)
/* X * 1 == 1 * X == X */
CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac.");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 1.0f);
graph.finalize(scene);
@@ -1083,7 +1094,7 @@ TEST_F(RenderGraph, constant_fold_part_math_div_1)
/* X / 1 == X */
INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 1.0f);
graph.finalize(scene);
@@ -1098,7 +1109,7 @@ TEST_F(RenderGraph, constant_fold_part_math_mul_0)
/* X * 0 == 0 * X == 0 */
CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0).");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to constant (0).");
- CORRECT_INFO_MESSAGE(log, "Folding Out::Value to constant (0)");
+ CORRECT_INFO_MESSAGE(log, "Folding clamp::Result to constant (0)");
CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 0.0f);
@@ -1114,7 +1125,7 @@ TEST_F(RenderGraph, constant_fold_part_math_div_0)
/* 0 / X == 0 */
CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0).");
INVALID_INFO_MESSAGE(log, "Folding Math_xC::");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 0.0f);
graph.finalize(scene);
@@ -1129,7 +1140,7 @@ TEST_F(RenderGraph, constant_fold_part_math_pow_0)
/* X ^ 0 == 1 */
INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to constant (1).");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_POWER, 0.0f);
graph.finalize(scene);
@@ -1144,7 +1155,7 @@ TEST_F(RenderGraph, constant_fold_part_math_pow_1)
/* 1 ^ X == 1; X ^ 1 == X */
CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (1)");
CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
- INVALID_INFO_MESSAGE(log, "Folding Out::");
+ INVALID_INFO_MESSAGE(log, "Folding clamp::");
build_math_partial_test_graph(builder, NODE_MATH_POWER, 1.0f);
graph.finalize(scene);
@@ -1156,21 +1167,14 @@ TEST_F(RenderGraph, constant_fold_part_math_pow_1)
TEST_F(RenderGraph, constant_fold_vector_math)
{
EXPECT_ANY_MESSAGE(log);
- CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Value to constant (1).");
CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Vector to constant (3, 0, 0).");
- CORRECT_INFO_MESSAGE(log, "Folding convert_vector_to_float::value_float to constant (1).");
- CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (2).");
- CORRECT_INFO_MESSAGE(log, "Folding convert_float_to_color::value_color to constant (2, 2, 2).");
builder
.add_node(ShaderNodeBuilder<VectorMathNode>("VectorMath")
.set(&VectorMathNode::type, NODE_VECTOR_MATH_SUBTRACT)
.set("Vector1", make_float3(1.3f, 0.5f, 0.7f))
.set("Vector2", make_float3(-1.7f, 0.5f, 0.7f)))
- .add_node(ShaderNodeBuilder<MathNode>("Math").set(&MathNode::type, NODE_MATH_ADD))
- .add_connection("VectorMath::Vector", "Math::Value1")
- .add_connection("VectorMath::Value", "Math::Value2")
- .output_color("Math::Value");
+ .output_color("VectorMath::Vector");
graph.finalize(scene);
}
@@ -1180,7 +1184,7 @@ TEST_F(RenderGraph, constant_fold_vector_math)
* Includes 2 tests: constant on each side.
*/
static void build_vecmath_partial_test_graph(ShaderGraphBuilder &builder,
- NodeVectorMath type,
+ NodeVectorMathType type,
float3 constval)
{
builder
@@ -1234,22 +1238,6 @@ TEST_F(RenderGraph, constant_fold_part_vecmath_sub_0)
}
/*
- * Tests: partial folding for Vector Math Dot Product with known 0.
- */
-TEST_F(RenderGraph, constant_fold_part_vecmath_dot_0)
-{
- EXPECT_ANY_MESSAGE(log);
- /* X * 0 == 0 * X == X */
- CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0).");
- CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Vector to constant (0, 0, 0).");
- CORRECT_INFO_MESSAGE(log, "Folding Out::Vector to constant (0, 0, 0).");
- CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode.");
-
- build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_DOT_PRODUCT, make_float3(0, 0, 0));
- graph.finalize(scene);
-}
-
-/*
* Tests: partial folding for Vector Math Cross Product with known 0.
*/
TEST_F(RenderGraph, constant_fold_part_vecmath_cross_0)
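
The renamed expectations in this file all follow from the note added above: with use_clamp enabled, the Math node is expanded at graph finalize time roughly as

  Math("Math") --> Clamp("clamp") --> output,

so constant folding now reports results on "clamp::Result" instead of "Math::Value", and "Folding Out::" becomes "Folding clamp::" in the partial-folding tests.
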
diff --git a/intern/cycles/test/util_avxf_avx2_test.cpp b/intern/cycles/test/util_avxf_avx2_test.cpp
new file mode 100644
index 00000000000..9b466ddd3a0
--- /dev/null
+++ b/intern/cycles/test/util_avxf_avx2_test.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define __KERNEL_AVX2__
+#define __KERNEL_CPU__
+
+#if defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# include "util_avxf_test.h"
+#endif
diff --git a/intern/cycles/test/util_avxf_avx_test.cpp b/intern/cycles/test/util_avxf_avx_test.cpp
new file mode 100644
index 00000000000..cea67649b80
--- /dev/null
+++ b/intern/cycles/test/util_avxf_avx_test.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define __KERNEL_AVX__
+#define __KERNEL_CPU__
+
+#if defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# include "util_avxf_test.h"
+#endif
diff --git a/intern/cycles/test/util_avxf_test.h b/intern/cycles/test/util_avxf_test.h
new file mode 100644
index 00000000000..d93563fdb3f
--- /dev/null
+++ b/intern/cycles/test/util_avxf_test.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+bool validate_cpu_capabilities()
+{
+
+#ifdef __KERNEL_AVX2__
+ return system_cpu_support_avx2();
+#else
+# ifdef __KERNEL_AVX__
+ return system_cpu_support_avx();
+# endif
+#endif
+}
+
+#define VALIDATECPU \
+ if (!validate_cpu_capabilities()) \
+ return;
+
+#define compare_vector_scalar(a, b) \
+ for (size_t index = 0; index < a.size; index++) \
+ EXPECT_FLOAT_EQ(a[index], b);
+
+#define compare_vector_vector(a, b) \
+ for (size_t index = 0; index < a.size; index++) \
+ EXPECT_FLOAT_EQ(a[index], b[index]);
+
+#define compare_vector_vector_near(a, b, abserror) \
+ for (size_t index = 0; index < a.size; index++) \
+ EXPECT_NEAR(a[index], b[index], abserror);
+
+#define basic_test_vv(a, b, op) \
+ VALIDATECPU \
+ avxf c = a op b; \
+ for (size_t i = 0; i < a.size; i++) \
+ EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
+
+/* vector op float tests */
+#define basic_test_vf(a, b, op) \
+ VALIDATECPU \
+ avxf c = a op b; \
+ for (size_t i = 0; i < a.size; i++) \
+ EXPECT_FLOAT_EQ(c[i], a[i] op b);
+
+const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
+const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
+const float float_b = 1.5f;
+
+TEST(util_avx, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(util_avx, avxf_sub_vv){
+ basic_test_vv(avxf_a, avxf_b, -)} TEST(util_avx, avxf_mul_vv){
+ basic_test_vv(avxf_a, avxf_b, *)} TEST(util_avx, avxf_div_vv){
+ basic_test_vv(avxf_a, avxf_b, /)} TEST(util_avx, avxf_add_vf){
+ basic_test_vf(avxf_a, float_b, +)} TEST(util_avx, avxf_sub_vf){
+ basic_test_vf(avxf_a, float_b, -)} TEST(util_avx, avxf_mul_vf){
+ basic_test_vf(avxf_a, float_b, *)} TEST(util_avx,
+ avxf_div_vf){basic_test_vf(avxf_a, float_b, /)}
+
+TEST(util_avx, avxf_ctor)
+{
+ VALIDATECPU
+ compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
+ static_cast<float>(index));
+ compare_vector_scalar(avxf(1.0f), 1.0f);
+ compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+ compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f),
+ avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f));
+ compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)),
+ avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f));
+}
+
+TEST(util_avx, avxf_sqrt)
+{
+ VALIDATECPU
+ compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
+ avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
+}
+
+TEST(util_avx, avxf_min_max)
+{
+ VALIDATECPU
+ compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
+ compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
+}
+
+TEST(util_avx, avxf_set_sign)
+{
+ VALIDATECPU
+ avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
+ compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
+}
+
+TEST(util_avx, avxf_msub)
+{
+ VALIDATECPU
+ avxf res = msub(avxf_a, avxf_b, avxf_c);
+ avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
+ (avxf_a[6] * avxf_b[6]) - avxf_c[6],
+ (avxf_a[5] * avxf_b[5]) - avxf_c[5],
+ (avxf_a[4] * avxf_b[4]) - avxf_c[4],
+ (avxf_a[3] * avxf_b[3]) - avxf_c[3],
+ (avxf_a[2] * avxf_b[2]) - avxf_c[2],
+ (avxf_a[1] * avxf_b[1]) - avxf_c[1],
+ (avxf_a[0] * avxf_b[0]) - avxf_c[0]);
+ compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_madd)
+{
+ VALIDATECPU
+ avxf res = madd(avxf_a, avxf_b, avxf_c);
+ avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
+ (avxf_a[6] * avxf_b[6]) + avxf_c[6],
+ (avxf_a[5] * avxf_b[5]) + avxf_c[5],
+ (avxf_a[4] * avxf_b[4]) + avxf_c[4],
+ (avxf_a[3] * avxf_b[3]) + avxf_c[3],
+ (avxf_a[2] * avxf_b[2]) + avxf_c[2],
+ (avxf_a[1] * avxf_b[1]) + avxf_c[1],
+ (avxf_a[0] * avxf_b[0]) + avxf_c[0]);
+ compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_nmadd)
+{
+ VALIDATECPU
+ avxf res = nmadd(avxf_a, avxf_b, avxf_c);
+ avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
+ avxf_c[6] - (avxf_a[6] * avxf_b[6]),
+ avxf_c[5] - (avxf_a[5] * avxf_b[5]),
+ avxf_c[4] - (avxf_a[4] * avxf_b[4]),
+ avxf_c[3] - (avxf_a[3] * avxf_b[3]),
+ avxf_c[2] - (avxf_a[2] * avxf_b[2]),
+ avxf_c[1] - (avxf_a[1] * avxf_b[1]),
+ avxf_c[0] - (avxf_a[0] * avxf_b[0]));
+ compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_compare)
+{
+ VALIDATECPU
+ avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
+ avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+ avxb res = a <= b;
+ int exp[8] = {
+ a[0] <= b[0] ? -1 : 0,
+ a[1] <= b[1] ? -1 : 0,
+ a[2] <= b[2] ? -1 : 0,
+ a[3] <= b[3] ? -1 : 0,
+ a[4] <= b[4] ? -1 : 0,
+ a[5] <= b[5] ? -1 : 0,
+ a[6] <= b[6] ? -1 : 0,
+ a[7] <= b[7] ? -1 : 0,
+ };
+ compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_permute)
+{
+ VALIDATECPU
+ avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
+ compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
+}
+
+TEST(util_avx, avxf_blend)
+{
+ VALIDATECPU
+ avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
+ compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
+}
+
+TEST(util_avx, avxf_shuffle)
+{
+ VALIDATECPU
+ avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
+ compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
+}
+
+TEST(util_avx, avxf_cross)
+{
+ VALIDATECPU
+ avxf res = cross(avxf_b, avxf_c);
+ compare_vector_vector_near(res,
+ avxf(0.0f,
+ -9.5367432e-07f,
+ 0.0f,
+ 4.7683716e-07f,
+ 0.0f,
+ -3.8146973e-06f,
+ 3.8146973e-06f,
+ 3.8146973e-06f),
+ 0.000002000f);
+}
+
+TEST(util_avx, avxf_dot3)
+{
+ VALIDATECPU
+ float den, den2;
+ dot3(avxf_a, avxf_b, den, den2);
+ EXPECT_FLOAT_EQ(den, 14.9f);
+ EXPECT_FLOAT_EQ(den2, 2.9f);
+}
+
+CCL_NAMESPACE_END
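
As a sanity check of the avxf_dot3 expectations above (reading avxf constructor arguments from the high lane down to lane 0, which is what the msub/madd expectation ordering implies), the two results appear to be 3-component dot products of the low and high 128-bit halves:

  const float den = 0.8f * 8.0f + 0.7f * 7.0f + 0.6f * 6.0f;   /* lanes 0..2 -> 14.9 */
  const float den2 = 0.4f * 4.0f + 0.3f * 3.0f + 0.2f * 2.0f;  /* lanes 4..6 -> 2.9  */
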
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 1c7a6549253..f5e488d1bd2 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -25,14 +25,15 @@ set(SRC
util_thread.cpp
util_time.cpp
util_transform.cpp
+ util_windows.cpp
)
set(LIB
-
+ ${TBB_LIBRARIES}
)
if(WITH_CYCLES_STANDALONE)
- if (WITH_CYCLES_STANDALONE_GUI)
+ if(WITH_CYCLES_STANDALONE_GUI)
list(APPEND SRC
util_view.cpp
)
@@ -40,9 +41,13 @@ if(WITH_CYCLES_STANDALONE)
endif()
if(CYCLES_STANDALONE_REPOSITORY)
- list(APPEND INC_SYS ../../third_party/numaapi/include)
+ list(APPEND INC_SYS
+ ../../third_party/numaapi/include
+ )
else()
- list(APPEND INC_SYS ../../numaapi/include)
+ list(APPEND INC_SYS
+ ../../numaapi/include
+ )
endif()
set(SRC_HEADERS
@@ -54,6 +59,8 @@ set(SRC_HEADERS
util_boundbox.h
util_debug.h
util_defines.h
+ util_deque.h
+ util_disjoint_set.h
util_guarded_allocator.cpp
util_foreach.h
util_function.h
@@ -79,6 +86,7 @@ set(SRC_HEADERS
util_math_matrix.h
util_md5.h
util_murmurhash.h
+ util_openimagedenoise.h
util_opengl.h
util_optimization.h
util_param.h
@@ -90,11 +98,10 @@ set(SRC_HEADERS
util_rect.h
util_set.h
util_simd.h
- util_sky_model.cpp
- util_sky_model.h
- util_sky_model_data.h
util_avxf.h
util_avxb.h
+ util_avxi.h
+ util_semaphore.h
util_sseb.h
util_ssef.h
util_ssei.h
@@ -104,6 +111,7 @@ set(SRC_HEADERS
util_string.h
util_system.h
util_task.h
+ util_tbb.h
util_texture.h
util_thread.h
util_time.h
@@ -117,7 +125,7 @@ set(SRC_HEADERS
util_types_float4_impl.h
util_types_float8.h
util_types_float8_impl.h
- util_types_int2.h
+ util_types_int2.h
util_types_int2_impl.h
util_types_int3.h
util_types_int3_impl.h
@@ -138,6 +146,7 @@ set(SRC_HEADERS
util_types_ushort4.h
util_types_vector3.h
util_types_vector3_impl.h
+ util_unique_ptr.h
util_vector.h
util_version.h
util_view.h
diff --git a/intern/cycles/util/util_algorithm.h b/intern/cycles/util/util_algorithm.h
index 62093039625..63abd4e92a3 100644
--- a/intern/cycles/util/util_algorithm.h
+++ b/intern/cycles/util/util_algorithm.h
@@ -25,6 +25,7 @@ using std::max;
using std::min;
using std::remove;
using std::sort;
+using std::stable_sort;
using std::swap;
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp
index 104e6c5e3f4..9b729cd4fc4 100644
--- a/intern/cycles/util/util_aligned_malloc.cpp
+++ b/intern/cycles/util/util_aligned_malloc.cpp
@@ -46,13 +46,7 @@ void *util_aligned_malloc(size_t size, int alignment)
return MEM_mallocN_aligned(size, alignment, "Cycles Aligned Alloc");
#elif defined(_WIN32)
return _aligned_malloc(size, alignment);
-#elif defined(__APPLE__)
- /* On Mac OS X, both the heap and the stack are guaranteed 16-byte aligned so
- * they work natively with SSE types with no further work.
- */
- assert(alignment == 16);
- return malloc(size);
-#elif defined(__FreeBSD__) || defined(__NetBSD__)
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__APPLE__)
void *result;
if (posix_memalign(&result, alignment, size)) {
/* Non-zero means allocation error
diff --git a/intern/cycles/util/util_aligned_malloc.h b/intern/cycles/util/util_aligned_malloc.h
index 0f006e95f6a..df7d93c056d 100644
--- a/intern/cycles/util/util_aligned_malloc.h
+++ b/intern/cycles/util/util_aligned_malloc.h
@@ -30,6 +30,21 @@ void *util_aligned_malloc(size_t size, int alignment);
/* Free memory allocated by util_aligned_malloc. */
void util_aligned_free(void *ptr);
+/* Aligned new operator. */
+template<typename T, typename... Args> T *util_aligned_new(Args... args)
+{
+ void *mem = util_aligned_malloc(sizeof(T), alignof(T));
+ return new (mem) T(args...);
+}
+
+template<typename T> void util_aligned_delete(T *t)
+{
+ if (t) {
+ t->~T();
+ util_aligned_free(t);
+ }
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_ALIGNED_MALLOC_H__ */
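
The util_aligned_new()/util_aligned_delete() pair added above combines the aligned allocator with placement new and an explicit destructor call. A minimal stand-alone sketch of the same pattern, assuming std::aligned_alloc/std::free as stand-ins for the Cycles allocator (MSVC would need _aligned_malloc/_aligned_free instead, as the surrounding code shows):

  #include <cstdlib>
  #include <new>

  /* Placement-new over aligned storage, mirroring util_aligned_new/_delete.
   * sizeof(T) is always a multiple of alignof(T), so the aligned_alloc size
   * requirement is met. */
  template<typename T, typename... Args> T *aligned_new_sketch(Args... args)
  {
    void *mem = std::aligned_alloc(alignof(T), sizeof(T));
    return mem ? new (mem) T(args...) : nullptr;
  }

  template<typename T> void aligned_delete_sketch(T *t)
  {
    if (t) {
      t->~T();      /* run the destructor explicitly, */
      std::free(t); /* then release the aligned storage. */
    }
  }
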
diff --git a/intern/cycles/util/util_array.h b/intern/cycles/util/util_array.h
index 1d7e39344f6..db80ab474e0 100644
--- a/intern/cycles/util/util_array.h
+++ b/intern/cycles/util/util_array.h
@@ -63,7 +63,9 @@ template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> class arra
}
else {
data_ = mem_allocate(from.datasize_);
- memcpy(data_, from.data_, from.datasize_ * sizeof(T));
+ if (from.datasize_ > 0) {
+ memcpy(data_, from.data_, from.datasize_ * sizeof(T));
+ }
datasize_ = from.datasize_;
capacity_ = datasize_;
}
@@ -73,7 +75,9 @@ template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> class arra
{
if (this != &from) {
resize(from.size());
- memcpy((void *)data_, from.data_, datasize_ * sizeof(T));
+ if (datasize_ > 0) {
+ memcpy((void *)data_, from.data_, datasize_ * sizeof(T));
+ }
}
return *this;
@@ -83,7 +87,7 @@ template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> class arra
{
resize(from.size());
- if (from.size() > 0) {
+ if (from.size() > 0 && datasize_ > 0) {
memcpy(data_, &from[0], datasize_ * sizeof(T));
}
@@ -100,6 +104,9 @@ template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> class arra
if (datasize_ != other.datasize_) {
return false;
}
+ if (datasize_ == 0) {
+ return true;
+ }
return memcmp(data_, other.data_, datasize_ * sizeof(T)) == 0;
}
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index a8ea1dc925e..13d177d2b25 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
# define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
# define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
# define ccl_barrier(flags) barrier(flags)
@@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
# define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
# define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+# define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
const float old_val,
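
The new atomic_fetch_and_or_uint32() macro maps to atomic_or() on OpenCL and atomicOr() on CUDA. A host-side analogue of the same semantics, using std::atomic purely for illustration (not part of the patch):

  #include <atomic>
  #include <cstdint>

  /* Atomically OR bits into a shared word and return the previous value,
   * matching the fetch-and-or behaviour of the device-side macros. */
  static uint32_t fetch_and_or_sketch(std::atomic<uint32_t> &flags, uint32_t bits)
  {
    return flags.fetch_or(bits);
  }
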
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 615dceaf9d5..5c03b1d88d7 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -16,7 +16,7 @@
*/
#ifndef __UTIL_AVXB_H__
-# define __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
CCL_NAMESPACE_BEGIN
@@ -53,6 +53,10 @@ struct avxb {
__forceinline avxb(const __m256 input) : m256(input)
{
}
+ __forceinline avxb(const __m128 &a, const __m128 &b)
+ : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
+ {
+ }
__forceinline operator const __m256 &(void) const
{
return m256;
@@ -146,9 +150,9 @@ __forceinline const avxb operator!=(const avxb &a, const avxb &b)
}
__forceinline const avxb operator==(const avxb &a, const avxb &b)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-# else
+#else
__m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
__m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
__m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
@@ -157,16 +161,16 @@ __forceinline const avxb operator==(const avxb &a, const avxb &b)
__m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
__m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
return _mm256_castsi256_ps(result);
-# endif
+#endif
}
__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
{
-# if defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE41__)
return _mm256_blendv_ps(f, t, m);
-# else
+#else
return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-# endif
+#endif
}
////////////////////////////////////////////////////////////////////////////////
@@ -186,18 +190,18 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////
-# if defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE41__)
__forceinline size_t popcnt(const avxb &a)
{
return __popcnt(_mm256_movemask_ps(a));
}
-# else
+#else
__forceinline size_t popcnt(const avxb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
bool(a[7]);
}
-# endif
+#endif
__forceinline bool reduce_and(const avxb &a)
{
@@ -234,8 +238,6 @@ ccl_device_inline void print_avxb(const char *label, const avxb &a)
printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}
-#endif
-
CCL_NAMESPACE_END
-//#endif
+#endif
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 156607e65fb..1fb3ded422f 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -15,7 +15,7 @@
*/
#ifndef __UTIL_AVXF_H__
-# define __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
CCL_NAMESPACE_BEGIN
@@ -140,6 +140,11 @@ __forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf cast(const __m256i &a)
+{
+ return _mm256_castsi256_ps(a);
+}
+
__forceinline const avxf mm256_sqrt(const avxf &a)
{
return _mm256_sqrt_ps(a.m256);
@@ -259,16 +264,35 @@ template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
return shuffle<i0>(a, a);
}
+template<size_t i> __forceinline float extract(const avxf &a)
+{
+ __m256 b = shuffle<i, i, i, i>(a).m256;
+ return _mm256_cvtss_f32(b);
+}
+template<> __forceinline float extract<0>(const avxf &a)
+{
+ return _mm256_cvtss_f32(a.m256);
+}
+
+__forceinline ssef low(const avxf &a)
+{
+ return _mm256_extractf128_ps(a.m256, 0);
+}
+__forceinline ssef high(const avxf &a)
+{
+ return _mm256_extractf128_ps(a.m256, 1);
+}
+
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf permute(const avxf &a)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-# else
+#else
float temp[8];
_mm256_storeu_ps((float *)&temp, a);
return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-# endif
+#endif
}
template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
@@ -309,39 +333,51 @@ __forceinline avxf mini(const avxf &a, const avxf &b)
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fmadd_ps(a, b, c);
-# else
+#else
return c + (a * b);
-# endif
+#endif
}
__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fnmadd_ps(a, b, c);
-# else
+#else
return c - (a * b);
-# endif
+#endif
}
__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fmsub_ps(a, b, c);
-# else
+#else
return (a * b) - c;
-# endif
+#endif
}
////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators
+/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator<=(const avxf &a, const avxf &b)
{
return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
-#endif
+__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
+{
+ return _mm256_blendv_ps(f, t, m);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Common Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
+{
+ return madd(t, b, (avxf(1.0f) - t) * a);
+}
#ifndef _mm256_set_m128
# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
@@ -352,3 +388,5 @@ __forceinline const avxb operator<=(const avxf &a, const avxf &b)
_mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
CCL_NAMESPACE_END
+
+#endif
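
Of the avxf additions, select() and mix() are easiest to read as per-lane scalar operations. A scalar model of what each lane computes (illustrative only; the real code operates on eight lanes at once):

  /* select(m, t, f) takes t where the mask lane is set and f otherwise,
   * which is what _mm256_blendv_ps does; mix(a, b, t) is a linear
   * interpolation, t = 0 giving a and t = 1 giving b. */
  inline float select_lane(bool m, float t, float f)
  {
    return m ? t : f;
  }

  inline float mix_lane(float a, float b, float t)
  {
    return (1.0f - t) * a + t * b;
  }
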
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
new file mode 100644
index 00000000000..e658a4f848f
--- /dev/null
+++ b/intern/cycles/util/util_avxi.h
@@ -0,0 +1,745 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXI_H__
+#define __UTIL_AVXI_H__
+
+CCL_NAMESPACE_BEGIN
+
+struct avxb;
+
+struct avxi {
+ typedef avxb Mask; // mask type for us
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i m256;
+#if !defined(__KERNEL_AVX2__)
+ struct {
+ __m128i l, h;
+ };
+#endif
+ int32_t v[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxi()
+ {
+ }
+ __forceinline avxi(const avxi &a)
+ {
+ m256 = a.m256;
+ }
+ __forceinline avxi &operator=(const avxi &a)
+ {
+ m256 = a.m256;
+ return *this;
+ }
+
+ __forceinline avxi(const __m256i a) : m256(a)
+ {
+ }
+ __forceinline operator const __m256i &(void)const
+ {
+ return m256;
+ }
+ __forceinline operator __m256i &(void)
+ {
+ return m256;
+ }
+
+ __forceinline explicit avxi(const ssei &a)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
+ {
+ }
+ __forceinline avxi(const ssei &a, const ssei &b)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
+ {
+ }
+#if defined(__KERNEL_AVX2__)
+ __forceinline avxi(const __m128i &a, const __m128i &b)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
+ {
+ }
+#else
+ __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
+ {
+ }
+#endif
+ __forceinline explicit avxi(const int32_t *const a)
+ : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
+ {
+ }
+ __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
+ {
+ }
+ __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
+ {
+ }
+ __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
+ : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
+ {
+ }
+ __forceinline avxi(
+ int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
+ : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
+ {
+ }
+
+ __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
+ {
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
+ {
+ }
+#if defined(__KERNEL_AVX2__)
+ __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
+ {
+ }
+ __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
+ {
+ }
+ __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
+ {
+ }
+#else
+ __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
+ {
+ }
+ __forceinline avxi(PosInfTy)
+ : m256(_mm256_set_epi32(
+ pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
+ {
+ }
+ __forceinline avxi(NegInfTy)
+ : m256(_mm256_set_epi32(
+ neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
+ {
+ }
+#endif
+ __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
+ {
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int32_t &operator[](const size_t i) const
+ {
+ assert(i < 8);
+ return v[i];
+ }
+ __forceinline int32_t &operator[](const size_t i)
+ {
+ assert(i < 8);
+ return v[i];
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxi cast(const __m256 &a)
+{
+ return _mm256_castps_si256(a);
+}
+__forceinline const avxi operator+(const avxi &a)
+{
+ return a;
+}
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator-(const avxi &a)
+{
+ return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
+}
+__forceinline const avxi abs(const avxi &a)
+{
+ return _mm256_abs_epi32(a.m256);
+}
+#else
+__forceinline const avxi operator-(const avxi &a)
+{
+ return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
+}
+__forceinline const avxi abs(const avxi &a)
+{
+ return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator+(const avxi &a, const avxi &b)
+{
+ return _mm256_add_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator+(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator+(const avxi &a, const int32_t b)
+{
+ return a + avxi(b);
+}
+__forceinline const avxi operator+(const int32_t a, const avxi &b)
+{
+ return avxi(a) + b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator-(const avxi &a, const avxi &b)
+{
+ return _mm256_sub_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator-(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator-(const avxi &a, const int32_t b)
+{
+ return a - avxi(b);
+}
+__forceinline const avxi operator-(const int32_t a, const avxi &b)
+{
+ return avxi(a) - b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator*(const avxi &a, const avxi &b)
+{
+ return _mm256_mullo_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator*(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator*(const avxi &a, const int32_t b)
+{
+ return a * avxi(b);
+}
+__forceinline const avxi operator*(const int32_t a, const avxi &b)
+{
+ return avxi(a) * b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator&(const avxi &a, const avxi &b)
+{
+ return _mm256_and_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator&(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator&(const avxi &a, const int32_t b)
+{
+ return a & avxi(b);
+}
+__forceinline const avxi operator&(const int32_t a, const avxi &b)
+{
+ return avxi(a) & b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator|(const avxi &a, const avxi &b)
+{
+ return _mm256_or_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator|(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator|(const avxi &a, const int32_t b)
+{
+ return a | avxi(b);
+}
+__forceinline const avxi operator|(const int32_t a, const avxi &b)
+{
+ return avxi(a) | b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator^(const avxi &a, const avxi &b)
+{
+ return _mm256_xor_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator^(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator^(const avxi &a, const int32_t b)
+{
+ return a ^ avxi(b);
+}
+__forceinline const avxi operator^(const int32_t a, const avxi &b)
+{
+ return avxi(a) ^ b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator<<(const avxi &a, const int32_t n)
+{
+ return _mm256_slli_epi32(a.m256, n);
+}
+__forceinline const avxi operator>>(const avxi &a, const int32_t n)
+{
+ return _mm256_srai_epi32(a.m256, n);
+}
+
+__forceinline const avxi sra(const avxi &a, const int32_t b)
+{
+ return _mm256_srai_epi32(a.m256, b);
+}
+__forceinline const avxi srl(const avxi &a, const int32_t b)
+{
+ return _mm256_srli_epi32(a.m256, b);
+}
+#else
+__forceinline const avxi operator<<(const avxi &a, const int32_t n)
+{
+ return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
+}
+__forceinline const avxi operator>>(const avxi &a, const int32_t n)
+{
+ return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
+}
+
+__forceinline const avxi sra(const avxi &a, const int32_t b)
+{
+ return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
+}
+__forceinline const avxi srl(const avxi &a, const int32_t b)
+{
+ return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
+}
+#endif
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi min(const avxi &a, const avxi &b)
+{
+ return _mm256_min_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi min(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi min(const avxi &a, const int32_t b)
+{
+ return min(a, avxi(b));
+}
+__forceinline const avxi min(const int32_t a, const avxi &b)
+{
+ return min(avxi(a), b);
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi max(const avxi &a, const avxi &b)
+{
+ return _mm256_max_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi max(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi max(const avxi &a, const int32_t b)
+{
+ return max(a, avxi(b));
+}
+__forceinline const avxi max(const int32_t a, const avxi &b)
+{
+ return max(avxi(a), b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxi &operator+=(avxi &a, const avxi &b)
+{
+ return a = a + b;
+}
+__forceinline avxi &operator+=(avxi &a, const int32_t b)
+{
+ return a = a + b;
+}
+
+__forceinline avxi &operator-=(avxi &a, const avxi &b)
+{
+ return a = a - b;
+}
+__forceinline avxi &operator-=(avxi &a, const int32_t b)
+{
+ return a = a - b;
+}
+
+__forceinline avxi &operator*=(avxi &a, const avxi &b)
+{
+ return a = a * b;
+}
+__forceinline avxi &operator*=(avxi &a, const int32_t b)
+{
+ return a = a * b;
+}
+
+__forceinline avxi &operator&=(avxi &a, const avxi &b)
+{
+ return a = a & b;
+}
+__forceinline avxi &operator&=(avxi &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+__forceinline avxi &operator|=(avxi &a, const avxi &b)
+{
+ return a = a | b;
+}
+__forceinline avxi &operator|=(avxi &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+__forceinline avxi &operator^=(avxi &a, const avxi &b)
+{
+ return a = a ^ b;
+}
+__forceinline avxi &operator^=(avxi &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+__forceinline avxi &operator<<=(avxi &a, const int32_t b)
+{
+ return a = a << b;
+}
+__forceinline avxi &operator>>=(avxi &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator==(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
+}
+#else
+__forceinline const avxb operator==(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator==(const avxi &a, const int32_t b)
+{
+ return a == avxi(b);
+}
+__forceinline const avxb operator==(const int32_t a, const avxi &b)
+{
+ return avxi(a) == b;
+}
+
+__forceinline const avxb operator!=(const avxi &a, const avxi &b)
+{
+ return !(a == b);
+}
+__forceinline const avxb operator!=(const avxi &a, const int32_t b)
+{
+ return a != avxi(b);
+}
+__forceinline const avxb operator!=(const int32_t a, const avxi &b)
+{
+ return avxi(a) != b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator<(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
+}
+#else
+__forceinline const avxb operator<(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator<(const avxi &a, const int32_t b)
+{
+ return a < avxi(b);
+}
+__forceinline const avxb operator<(const int32_t a, const avxi &b)
+{
+ return avxi(a) < b;
+}
+
+__forceinline const avxb operator>=(const avxi &a, const avxi &b)
+{
+ return !(a < b);
+}
+__forceinline const avxb operator>=(const avxi &a, const int32_t b)
+{
+ return a >= avxi(b);
+}
+__forceinline const avxb operator>=(const int32_t a, const avxi &b)
+{
+ return avxi(a) >= b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator>(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
+}
+#else
+__forceinline const avxb operator>(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator>(const avxi &a, const int32_t b)
+{
+ return a > avxi(b);
+}
+__forceinline const avxb operator>(const int32_t a, const avxi &b)
+{
+ return avxi(a) > b;
+}
+
+__forceinline const avxb operator<=(const avxi &a, const avxi &b)
+{
+ return !(a > b);
+}
+__forceinline const avxb operator<=(const avxi &a, const int32_t b)
+{
+ return a <= avxi(b);
+}
+__forceinline const avxb operator<=(const int32_t a, const avxi &b)
+{
+ return avxi(a) <= b;
+}
+
+__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
+{
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline avxi unpacklo(const avxi &a, const avxi &b)
+{
+ return _mm256_unpacklo_epi32(a.m256, b.m256);
+}
+__forceinline avxi unpackhi(const avxi &a, const avxi &b)
+{
+ return _mm256_unpackhi_epi32(a.m256, b.m256);
+}
+#else
+__forceinline avxi unpacklo(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+__forceinline avxi unpackhi(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+
+template<size_t i> __forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
+{
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_castps_si256(
+ _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxi shuffle(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
+}
+template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
+}
+template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
+{
+ return _mm256_castps_si256(
+ _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
+}
+
+__forceinline const avxi broadcast(const int *ptr)
+{
+ return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
+}
+template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
+{
+ return _mm256_insertf128_si256(a, b, i);
+}
+template<size_t i> __forceinline const ssei extract(const avxi &a)
+{
+ return _mm256_extractf128_si256(a, i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxi vreduce_min2(const avxi &v)
+{
+ return min(v, shuffle<1, 0, 3, 2>(v));
+}
+__forceinline const avxi vreduce_min4(const avxi &v)
+{
+ avxi v1 = vreduce_min2(v);
+ return min(v1, shuffle<2, 3, 0, 1>(v1));
+}
+__forceinline const avxi vreduce_min(const avxi &v)
+{
+ avxi v1 = vreduce_min4(v);
+ return min(v1, shuffle<1, 0>(v1));
+}
+
+__forceinline const avxi vreduce_max2(const avxi &v)
+{
+ return max(v, shuffle<1, 0, 3, 2>(v));
+}
+__forceinline const avxi vreduce_max4(const avxi &v)
+{
+ avxi v1 = vreduce_max2(v);
+ return max(v1, shuffle<2, 3, 0, 1>(v1));
+}
+__forceinline const avxi vreduce_max(const avxi &v)
+{
+ avxi v1 = vreduce_max4(v);
+ return max(v1, shuffle<1, 0>(v1));
+}
+
+__forceinline const avxi vreduce_add2(const avxi &v)
+{
+ return v + shuffle<1, 0, 3, 2>(v);
+}
+__forceinline const avxi vreduce_add4(const avxi &v)
+{
+ avxi v1 = vreduce_add2(v);
+ return v1 + shuffle<2, 3, 0, 1>(v1);
+}
+__forceinline const avxi vreduce_add(const avxi &v)
+{
+ avxi v1 = vreduce_add4(v);
+ return v1 + shuffle<1, 0>(v1);
+}
+
+__forceinline int reduce_min(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_min(v)));
+}
+__forceinline int reduce_max(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_max(v)));
+}
+__forceinline int reduce_add(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_add(v)));
+}
+
+__forceinline size_t select_min(const avxi &v)
+{
+ return __bsf(movemask(v == vreduce_min(v)));
+}
+__forceinline size_t select_max(const avxi &v)
+{
+ return __bsf(movemask(v == vreduce_max(v)));
+}
+
+__forceinline size_t select_min(const avxb &valid, const avxi &v)
+{
+ const avxi a = select(valid, v, avxi(pos_inf));
+ return __bsf(movemask(valid & (a == vreduce_min(a))));
+}
+__forceinline size_t select_max(const avxb &valid, const avxi &v)
+{
+ const avxi a = select(valid, v, avxi(neg_inf));
+ return __bsf(movemask(valid & (a == vreduce_max(a))));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Output Operators
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxi(const char *label, const avxi &a)
+{
+ printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+CCL_NAMESPACE_END
+
+#endif
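
The reduction helpers at the end of this new header follow the usual logarithmic SIMD pattern: each vreduce_* round combines the vector with a shuffled copy of itself, so after three rounds every lane holds the result and reduce_* only has to read lane 0. A scalar model of reduce_min() (a sketch, not part of the patch):

  /* Three shuffle-and-min rounds spread the minimum to every lane. */
  inline int reduce_min_sketch(const int v[8])
  {
    int m[8];
    for (int i = 0; i < 8; i++) {
      m[i] = v[i];
    }
    /* Round 1: neighbour within each pair (shuffle<1, 0, 3, 2>). */
    for (int i = 0; i < 8; i += 2) {
      m[i] = m[i + 1] = (m[i] < m[i + 1]) ? m[i] : m[i + 1];
    }
    /* Round 2: across pairs within each 128-bit half (shuffle<2, 3, 0, 1>). */
    for (int i = 0; i < 8; i += 4) {
      int lo = (m[i] < m[i + 2]) ? m[i] : m[i + 2];
      m[i] = m[i + 1] = m[i + 2] = m[i + 3] = lo;
    }
    /* Round 3: across the two 128-bit halves (shuffle<1, 0>). */
    return (m[0] < m[4]) ? m[0] : m[4]; /* reduce_min() extracts lane 0 */
  }
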
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index b5c3f1a8954..7fab7bd5a15 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -17,8 +17,8 @@
#ifndef __UTIL_BOUNDBOX_H__
#define __UTIL_BOUNDBOX_H__
-#include <math.h>
#include <float.h>
+#include <math.h>
#include "util/util_math.h"
#include "util/util_string.h"
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index ca4c393f66e..c6937ca78fe 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -43,11 +43,29 @@ ccl_device uchar4 color_float_to_byte(float3 c)
return make_uchar4(r, g, b, 0);
}
+ccl_device uchar4 color_float4_to_uchar4(float4 c)
+{
+ uchar r, g, b, a;
+
+ r = float_to_byte(c.x);
+ g = float_to_byte(c.y);
+ b = float_to_byte(c.z);
+ a = float_to_byte(c.w);
+
+ return make_uchar4(r, g, b, a);
+}
+
ccl_device_inline float3 color_byte_to_float(uchar4 c)
{
return make_float3(c.x * (1.0f / 255.0f), c.y * (1.0f / 255.0f), c.z * (1.0f / 255.0f));
}
+ccl_device_inline float4 color_uchar4_to_float4(uchar4 c)
+{
+ return make_float4(
+ c.x * (1.0f / 255.0f), c.y * (1.0f / 255.0f), c.z * (1.0f / 255.0f), c.w * (1.0f / 255.0f));
+}
+
ccl_device float color_srgb_to_linear(float c)
{
if (c < 0.04045f)
@@ -167,7 +185,8 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
#ifdef __KERNEL_SSE2__
/*
* Calculate initial guess for arg^exp based on float representation
- * This method gives a constant bias, which can be easily compensated by multiplication with bias_coeff.
+ * This method gives a constant bias,
+ * which can be easily compensated by multiplication with bias_coeff.
* Gives better results for exponents near 1 (e. g. 4/5).
* exp = exponent, encoded as uint32_t
* e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
@@ -235,6 +254,12 @@ ccl_device float3 color_linear_to_srgb_v3(float3 c)
color_linear_to_srgb(c.x), color_linear_to_srgb(c.y), color_linear_to_srgb(c.z));
}
+ccl_device float4 color_linear_to_srgb_v4(float4 c)
+{
+ return make_float4(
+ color_linear_to_srgb(c.x), color_linear_to_srgb(c.y), color_linear_to_srgb(c.z), c.w);
+}
+
ccl_device float4 color_srgb_to_linear_v4(float4 c)
{
#ifdef __KERNEL_SSE2__
@@ -250,6 +275,20 @@ ccl_device float4 color_srgb_to_linear_v4(float4 c)
#endif
}
+ccl_device float3 color_highlight_compress(float3 color, float3 *variance)
+{
+ color += make_float3(1.0f, 1.0f, 1.0f);
+ if (variance) {
+ *variance *= sqr3(make_float3(1.0f, 1.0f, 1.0f) / color);
+ }
+ return log3(color);
+}
+
+ccl_device float3 color_highlight_uncompress(float3 color)
+{
+ return exp3(color) - make_float3(1.0f, 1.0f, 1.0f);
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_COLOR_H__ */
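
color_highlight_compress() maps each channel through log(1 + x) and scales the variance by the squared derivative of that mapping, i.e. first-order error propagation; color_highlight_uncompress() inverts it with exp(x) - 1. A single-channel sketch of the same math (illustrative only):

  #include <cmath>

  inline float highlight_compress_sketch(float c, float *variance)
  {
    c += 1.0f;
    if (variance) {
      *variance *= 1.0f / (c * c); /* var(log(c)) ~= var(c) / c^2 */
    }
    return std::log(c);
  }

  inline float highlight_uncompress_sketch(float c)
  {
    return std::exp(c) - 1.0f;
  }
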
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index aabfea7fc49..74ecefa1917 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -31,7 +31,7 @@ DebugFlags::CPU::CPU()
sse41(true),
sse3(true),
sse2(true),
- bvh_layout(BVH_LAYOUT_DEFAULT),
+ bvh_layout(BVH_LAYOUT_AUTO),
split_kernel(false)
{
reset();
@@ -57,18 +57,7 @@ void DebugFlags::CPU::reset()
#undef STRINGIFY
#undef CHECK_CPU_FLAGS
- if (getenv("CYCLES_BVH2") != NULL) {
- bvh_layout = BVH_LAYOUT_BVH2;
- }
- else if (getenv("CYCLES_BVH4") != NULL) {
- bvh_layout = BVH_LAYOUT_BVH4;
- }
- else if (getenv("CYCLES_BVH8") != NULL) {
- bvh_layout = BVH_LAYOUT_BVH8;
- }
- else {
- bvh_layout = BVH_LAYOUT_DEFAULT;
- }
+ bvh_layout = BVH_LAYOUT_AUTO;
split_kernel = false;
}
@@ -86,6 +75,17 @@ void DebugFlags::CUDA::reset()
split_kernel = false;
}
+DebugFlags::OptiX::OptiX()
+{
+ reset();
+}
+
+void DebugFlags::OptiX::reset()
+{
+ cuda_streams = 1;
+ curves_api = false;
+}
+
DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
{
reset();
@@ -120,7 +120,7 @@ void DebugFlags::OpenCL::reset()
debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
}
-DebugFlags::DebugFlags() : viewport_static_bvh(false)
+DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false)
{
/* Nothing for now. */
}
@@ -130,6 +130,7 @@ void DebugFlags::reset()
viewport_static_bvh = false;
cpu.reset();
cuda.reset();
+ optix.reset();
opencl.reset();
}
@@ -145,7 +146,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
<< " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
os << "CUDA flags:\n"
- << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
+ << " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
+
+ os << "OptiX flags:\n"
+ << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n";
const char *opencl_device_type;
switch (debug_flags.opencl.device_type) {
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index d668ddc6d6c..6ac4beb55b8 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -33,6 +33,8 @@ class DebugFlags {
/* Use static BVH in viewport, to match final render exactly. */
bool viewport_static_bvh;
+ bool running_inside_blender;
+
/* Descriptor of CPU feature-set to be used. */
struct CPU {
CPU();
@@ -71,10 +73,10 @@ class DebugFlags {
return sse2;
}
- /* Requested BVH size.
+ /* Requested BVH layout.
*
- * Rendering will use widest possible BVH which is below or equal
- * this one.
+ * By default the fastest will be used. For debugging, the BVH layout used by
+ * other CPUs and GPUs can be selected here instead.
*/
BVHLayout bvh_layout;
@@ -97,6 +99,20 @@ class DebugFlags {
bool split_kernel;
};
+ /* Descriptor of OptiX feature-set to be used. */
+ struct OptiX {
+ OptiX();
+
+ /* Reset flags to their defaults. */
+ void reset();
+
+ /* Number of CUDA streams to launch kernels concurrently from. */
+ int cuda_streams;
+
+ /* Use OptiX curves API for hair instead of custom implementation. */
+ bool curves_api;
+ };
+
/* Descriptor of OpenCL feature-set to be used. */
struct OpenCL {
OpenCL();
@@ -141,7 +157,8 @@ class DebugFlags {
/* Use debug version of the kernel. */
bool debug;
- /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */
+ /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all
+ * devices. */
/* Artificial memory limit in bytes (0 if disabled). */
size_t mem_limit;
};
@@ -162,6 +179,9 @@ class DebugFlags {
/* Requested CUDA flags. */
CUDA cuda;
+ /* Requested OptiX flags. */
+ OptiX optix;
+
/* Requested OpenCL flags. */
OpenCL opencl;
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index 391d20c04a4..e8e414587fb 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -15,6 +15,11 @@
* limitations under the License.
*/
+/* clang-format off */
+
+/* #define __forceinline triggers a bug in some clang-format versions, disable
+ * format for entire file to keep results consistent. */
+
#ifndef __UTIL_DEFINES_H__
#define __UTIL_DEFINES_H__
@@ -30,6 +35,7 @@
#ifndef __KERNEL_GPU__
# define ccl_device static inline
# define ccl_device_noinline static
+# define ccl_device_noinline_cpu ccl_device_noinline
# define ccl_global
# define ccl_static_constant static const
# define ccl_constant const
@@ -38,6 +44,8 @@
# define ccl_private
# define ccl_restrict __restrict
# define ccl_ref &
+# define ccl_optional_struct_init
+# define ccl_loop_no_unroll
# define __KERNEL_WITH_SSE_ALIGN__
# if defined(_WIN32) && !defined(FREE_WINDOWS)
diff --git a/intern/cycles/util/util_deque.h b/intern/cycles/util/util_deque.h
new file mode 100644
index 00000000000..ccac961aa7d
--- /dev/null
+++ b/intern/cycles/util/util_deque.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DEQUE_H__
+#define __UTIL_DEQUE_H__
+
+#include <deque>
+
+CCL_NAMESPACE_BEGIN
+
+using std::deque;
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_DEQUE_H__ */
diff --git a/intern/cycles/util/util_disjoint_set.h b/intern/cycles/util/util_disjoint_set.h
new file mode 100644
index 00000000000..946632371d2
--- /dev/null
+++ b/intern/cycles/util/util_disjoint_set.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DISJOINT_SET_H__
+#define __UTIL_DISJOINT_SET_H__
+
+#include "util_array.h"
+#include <utility>
+
+CCL_NAMESPACE_BEGIN
+
+class DisjointSet {
+ private:
+ array<size_t> parents;
+ array<size_t> ranks;
+
+ public:
+ DisjointSet(size_t size) : parents(size), ranks(size)
+ {
+ for (size_t i = 0; i < size; i++) {
+ parents[i] = i;
+ ranks[i] = 0;
+ }
+ }
+
+ size_t find(size_t x)
+ {
+ size_t root = x;
+ while (parents[root] != root) {
+ root = parents[root];
+ }
+ while (parents[x] != root) {
+ size_t parent = parents[x];
+ parents[x] = root;
+ x = parent;
+ }
+ return root;
+ }
+
+ void join(size_t x, size_t y)
+ {
+ size_t x_root = find(x);
+ size_t y_root = find(y);
+
+ if (x_root == y_root) {
+ return;
+ }
+
+ if (ranks[x_root] < ranks[y_root]) {
+ std::swap(x_root, y_root);
+ }
+ parents[y_root] = x_root;
+
+ if (ranks[x_root] == ranks[y_root]) {
+ ranks[x_root]++;
+ }
+ }
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_DISJOINT_SET_H__ */
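
The new DisjointSet is a standard union-find: find() does path compression and join() unions by rank, keeping both operations near-constant amortized time. A small usage sketch against the class as defined above:

  /* Group six elements into {0, 1, 2}, {3, 4} and {5}, then query membership. */
  static bool disjoint_set_demo()
  {
    DisjointSet ds(6);
    ds.join(0, 1);
    ds.join(1, 2); /* {0, 1, 2} */
    ds.join(3, 4); /* {3, 4}; element 5 stays a singleton */

    return ds.find(0) == ds.find(2) && /* same set */
           ds.find(2) != ds.find(3);   /* different sets */
  }
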
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 2d09326d2ca..f78cc5f5da9 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -18,6 +18,7 @@
#define __UTIL_GUARDED_ALLOCATOR_H__
#include <cstddef>
+#include <cstdlib>
#include <memory>
#ifdef WITH_BLENDER_GUARDEDALLOC
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 9c40f5310c2..8de62893ba8 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -17,8 +17,8 @@
#ifndef __UTIL_HALF_H__
#define __UTIL_HALF_H__
-#include "util/util_types.h"
#include "util/util_math.h"
+#include "util/util_types.h"
#ifdef __KERNEL_SSE2__
# include "util/util_simd.h"
@@ -36,7 +36,8 @@ CCL_NAMESPACE_BEGIN
/* CUDA has its own half data type, no need to define then */
# ifndef __KERNEL_CUDA__
-/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from unsigned shorts. */
+/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
+ * unsigned shorts. */
class half {
public:
half() : v(0)
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index 785482967db..0021eec169b 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -21,41 +21,357 @@
CCL_NAMESPACE_BEGIN
-ccl_device_inline uint hash_int_2d(uint kx, uint ky)
-{
+/* ***** Jenkins Lookup3 Hash Functions ***** */
+
+/* Source: http://burtleburtle.net/bob/c/lookup3.c */
+
#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
+#define mix(a, b, c) \
+ { \
+ a -= c; \
+ a ^= rot(c, 4); \
+ c += b; \
+ b -= a; \
+ b ^= rot(a, 6); \
+ a += c; \
+ c -= b; \
+ c ^= rot(b, 8); \
+ b += a; \
+ a -= c; \
+ a ^= rot(c, 16); \
+ c += b; \
+ b -= a; \
+ b ^= rot(a, 19); \
+ a += c; \
+ c -= b; \
+ c ^= rot(b, 4); \
+ b += a; \
+ } \
+ ((void)0)
+
+#define final(a, b, c) \
+ { \
+ c ^= b; \
+ c -= rot(b, 14); \
+ a ^= c; \
+ a -= rot(c, 11); \
+ b ^= a; \
+ b -= rot(a, 25); \
+ c ^= b; \
+ c -= rot(b, 16); \
+ a ^= c; \
+ a -= rot(c, 4); \
+ b ^= a; \
+ b -= rot(a, 14); \
+ c ^= b; \
+ c -= rot(b, 24); \
+ } \
+ ((void)0)
+
+ccl_device_inline uint hash_uint(uint kx)
+{
uint a, b, c;
+ a = b = c = 0xdeadbeef + (1 << 2) + 13;
+
+ a += kx;
+ final(a, b, c);
+ return c;
+}
+
+ccl_device_inline uint hash_uint2(uint kx, uint ky)
+{
+ uint a, b, c;
a = b = c = 0xdeadbeef + (2 << 2) + 13;
+
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline uint hash_uint3(uint kx, uint ky, uint kz)
+{
+ uint a, b, c;
+ a = b = c = 0xdeadbeef + (3 << 2) + 13;
+
+ c += kz;
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline uint hash_uint4(uint kx, uint ky, uint kz, uint kw)
+{
+ uint a, b, c;
+ a = b = c = 0xdeadbeef + (4 << 2) + 13;
+
a += kx;
b += ky;
+ c += kz;
+ mix(a, b, c);
- c ^= b;
- c -= rot(b, 14);
- a ^= c;
- a -= rot(c, 11);
- b ^= a;
- b -= rot(a, 25);
- c ^= b;
- c -= rot(b, 16);
- a ^= c;
- a -= rot(c, 4);
- b ^= a;
- b -= rot(a, 14);
- c ^= b;
- c -= rot(b, 24);
+ a += kw;
+ final(a, b, c);
return c;
+}
#undef rot
+#undef final
+#undef mix
+
+/* Hashing uint or uint[234] into a float in the range [0, 1]. */
+
+ccl_device_inline float hash_uint_to_float(uint kx)
+{
+ return (float)hash_uint(kx) / (float)0xFFFFFFFFu;
+}
+
+ccl_device_inline float hash_uint2_to_float(uint kx, uint ky)
+{
+ return (float)hash_uint2(kx, ky) / (float)0xFFFFFFFFu;
+}
+
+ccl_device_inline float hash_uint3_to_float(uint kx, uint ky, uint kz)
+{
+ return (float)hash_uint3(kx, ky, kz) / (float)0xFFFFFFFFu;
+}
+
+ccl_device_inline float hash_uint4_to_float(uint kx, uint ky, uint kz, uint kw)
+{
+ return (float)hash_uint4(kx, ky, kz, kw) / (float)0xFFFFFFFFu;
+}
+
+/* Hashing float or float[234] into a float in the range [0, 1]. */
+
+ccl_device_inline float hash_float_to_float(float k)
+{
+ return hash_uint_to_float(__float_as_uint(k));
+}
+
+ccl_device_inline float hash_float2_to_float(float2 k)
+{
+ return hash_uint2_to_float(__float_as_uint(k.x), __float_as_uint(k.y));
+}
+
+ccl_device_inline float hash_float3_to_float(float3 k)
+{
+ return hash_uint3_to_float(__float_as_uint(k.x), __float_as_uint(k.y), __float_as_uint(k.z));
+}
+
+ccl_device_inline float hash_float4_to_float(float4 k)
+{
+ return hash_uint4_to_float(
+ __float_as_uint(k.x), __float_as_uint(k.y), __float_as_uint(k.z), __float_as_uint(k.w));
+}
+
+/* Hashing float[234] into float[234] of components in the range [0, 1]. */
+
+ccl_device_inline float2 hash_float2_to_float2(float2 k)
+{
+ return make_float2(hash_float2_to_float(k), hash_float3_to_float(make_float3(k.x, k.y, 1.0f)));
+}
+
+ccl_device_inline float3 hash_float3_to_float3(float3 k)
+{
+ return make_float3(hash_float3_to_float(k),
+ hash_float4_to_float(make_float4(k.x, k.y, k.z, 1.0f)),
+ hash_float4_to_float(make_float4(k.x, k.y, k.z, 2.0f)));
+}
+
+ccl_device_inline float4 hash_float4_to_float4(float4 k)
+{
+ return make_float4(hash_float4_to_float(k),
+ hash_float4_to_float(make_float4(k.w, k.x, k.y, k.z)),
+ hash_float4_to_float(make_float4(k.z, k.w, k.x, k.y)),
+ hash_float4_to_float(make_float4(k.y, k.z, k.w, k.x)));
+}
+
+/* Hashing float or float[234] into float3 of components in range [0, 1]. */
+
+ccl_device_inline float3 hash_float_to_float3(float k)
+{
+ return make_float3(hash_float_to_float(k),
+ hash_float2_to_float(make_float2(k, 1.0f)),
+ hash_float2_to_float(make_float2(k, 2.0f)));
+}
+
+ccl_device_inline float3 hash_float2_to_float3(float2 k)
+{
+ return make_float3(hash_float2_to_float(k),
+ hash_float3_to_float(make_float3(k.x, k.y, 1.0f)),
+ hash_float3_to_float(make_float3(k.x, k.y, 2.0f)));
}
-ccl_device_inline uint hash_int(uint k)
+ccl_device_inline float3 hash_float4_to_float3(float4 k)
{
- return hash_int_2d(k, 0);
+ return make_float3(hash_float4_to_float(k),
+ hash_float4_to_float(make_float4(k.z, k.x, k.w, k.y)),
+ hash_float4_to_float(make_float4(k.w, k.z, k.y, k.x)));
}
+/* SSE Versions Of Jenkins Lookup3 Hash Functions */
+
+#ifdef __KERNEL_SSE2__
+# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
+
+# define mix(a, b, c) \
+ { \
+ a -= c; \
+ a ^= rot(c, 4); \
+ c += b; \
+ b -= a; \
+ b ^= rot(a, 6); \
+ a += c; \
+ c -= b; \
+ c ^= rot(b, 8); \
+ b += a; \
+ a -= c; \
+ a ^= rot(c, 16); \
+ c += b; \
+ b -= a; \
+ b ^= rot(a, 19); \
+ a += c; \
+ c -= b; \
+ c ^= rot(b, 4); \
+ b += a; \
+ }
+
+# define final(a, b, c) \
+ { \
+ c ^= b; \
+ c -= rot(b, 14); \
+ a ^= c; \
+ a -= rot(c, 11); \
+ b ^= a; \
+ b -= rot(a, 25); \
+ c ^= b; \
+ c -= rot(b, 16); \
+ a ^= c; \
+ a -= rot(c, 4); \
+ b ^= a; \
+ b -= rot(a, 14); \
+ c ^= b; \
+ c -= rot(b, 24); \
+ }
+
+ccl_device_inline ssei hash_ssei(ssei kx)
+{
+ ssei a, b, c;
+ a = b = c = ssei(0xdeadbeef + (1 << 2) + 13);
+
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
+{
+ ssei a, b, c;
+ a = b = c = ssei(0xdeadbeef + (2 << 2) + 13);
+
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
+{
+ ssei a, b, c;
+ a = b = c = ssei(0xdeadbeef + (3 << 2) + 13);
+
+ c += kz;
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
+{
+ ssei a, b, c;
+ a = b = c = ssei(0xdeadbeef + (4 << 2) + 13);
+
+ a += kx;
+ b += ky;
+ c += kz;
+ mix(a, b, c);
+
+ a += kw;
+ final(a, b, c);
+
+ return c;
+}
+
+# if defined(__KERNEL_AVX__)
+ccl_device_inline avxi hash_avxi(avxi kx)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+
+ c += kz;
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+
+ a += kx;
+ b += ky;
+ c += kz;
+ mix(a, b, c);
+
+ a += kw;
+ final(a, b, c);
+
+ return c;
+}
+# endif
+
+# undef rot
+# undef final
+# undef mix
+
+#endif
+
#ifndef __KERNEL_GPU__
static inline uint hash_string(const char *str)
{
@@ -68,11 +384,6 @@ static inline uint hash_string(const char *str)
}
#endif
-ccl_device_inline float hash_int_01(uint k)
-{
- return (float)hash_int(k) * (1.0f / (float)0xFFFFFFFF);
-}
-
CCL_NAMESPACE_END
#endif /* __UTIL_HASH_H__ */
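
Two conventions in the new hashing API are worth spelling out: float inputs are hashed by reinterpreting their bits as uint32 (__float_as_uint), and the resulting 32-bit hash is normalized to [0, 1] by dividing by 0xFFFFFFFF. A host-side sketch of those two steps, with a trivial placeholder mixer standing in for the Jenkins lookup3 rounds (the mixing constants are arbitrary and not from the patch):

  #include <cstdint>
  #include <cstring>

  static inline uint32_t float_as_uint_sketch(float f)
  {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u)); /* bit-preserving reinterpretation */
    return u;
  }

  static inline float hash_float_to_float_sketch(float k)
  {
    uint32_t h = float_as_uint_sketch(k);
    /* Placeholder integer mixer; the real code runs the Jenkins final() macro. */
    h ^= h >> 16;
    h *= 0x7feb352du;
    h ^= h >> 15;
    h *= 0x846ca68bu;
    h ^= h >> 16;
    return (float)h / (float)0xFFFFFFFFu; /* normalize to [0, 1] */
  }
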
diff --git a/intern/cycles/util/util_ies.cpp b/intern/cycles/util/util_ies.cpp
index ff5c709b406..62d3d42186d 100644
--- a/intern/cycles/util/util_ies.cpp
+++ b/intern/cycles/util/util_ies.cpp
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#include <algorithm>
+
#include "util/util_foreach.h"
#include "util/util_ies.h"
#include "util/util_math.h"
@@ -28,7 +30,7 @@ CCL_NAMESPACE_BEGIN
// issue.
template class GuardedAllocator<char>;
-bool IESFile::load(ustring ies)
+bool IESFile::load(const string &ies)
{
clear();
if (!parse(ies) || !process()) {
@@ -76,7 +78,7 @@ class IESTextParser {
vector<char> text;
char *data;
- IESTextParser(ustring str) : text(str.begin(), str.end())
+ IESTextParser(const string &str) : text(str.begin(), str.end())
{
std::replace(text.begin(), text.end(), ',', ' ');
data = strstr(&text[0], "\nTILT=");
@@ -116,7 +118,7 @@ class IESTextParser {
}
};
-bool IESFile::parse(ustring ies)
+bool IESFile::parse(const string &ies)
{
if (ies.empty()) {
return false;
@@ -155,7 +157,8 @@ bool IESFile::parse(ustring ies)
type = (IESType)parser.get_long(); /* Photometric type */
/* TODO(lukas): Test whether the current type B processing can also deal with type A files.
- * In theory the only difference should be orientation which we ignore anyways, but with IES you never know...
+ * In theory the only difference should be orientation which we ignore anyways, but with IES you
+ * never know...
*/
if (type != TYPE_B && type != TYPE_C) {
return false;
@@ -173,12 +176,13 @@ bool IESFile::parse(ustring ies)
* Cycles expects radiometric quantities, though, which requires a conversion.
* However, the Luminous efficacy (ratio of lumens per Watt) depends on the spectral distribution
* of the light source since lumens take human perception into account.
- * Since this spectral distribution is not known from the IES file, a typical one must be assumed.
- * The D65 standard illuminant has a Luminous efficacy of 177.83, which is used here to convert to Watt/sr.
- * A more advanced approach would be to add a Blackbody Temperature input to the node and numerically
- * integrate the Luminous efficacy from the resulting spectral distribution.
- * Also, the Watt/sr value must be multiplied by 4*pi to get the Watt value that Cycles expects
- * for lamp strength. Therefore, the conversion here uses 4*pi/177.83 as a Candela to Watt factor.
+ * Since this spectral distribution is not known from the IES file, a typical one must be
+ * assumed. The D65 standard illuminant has a Luminous efficacy of 177.83, which is used here to
+ * convert to Watt/sr. A more advanced approach would be to add a Blackbody Temperature input to
+ * the node and numerically integrate the Luminous efficacy from the resulting spectral
+ * distribution. Also, the Watt/sr value must be multiplied by 4*pi to get the Watt value that
+ * Cycles expects for lamp strength. Therefore, the conversion here uses 4*pi/177.83 as a Candela
+ * to Watt factor.
*/
factor *= 0.0706650768394;
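
The literal in this hunk follows directly from the comment above it: 4π divided by the assumed D65 luminous efficacy of 177.83 lm/W. A quick numeric check (illustrative only):

  /* 4 * pi / 177.83 ~= 0.0706650768, matching the factor used above. */
  static double ies_candela_to_watt_factor()
  {
    const double pi = 3.14159265358979323846;
    return 4.0 * pi / 177.83;
  }
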
@@ -294,7 +298,8 @@ bool IESFile::process_type_b()
bool IESFile::process_type_c()
{
if (h_angles[0] == 90.0f) {
- /* Some files are stored from 90° to 270°, so we just rotate them to the regular 0°-180° range here. */
+ /* Some files are stored from 90° to 270°, so we just rotate them to the regular 0°-180° range
+ * here. */
for (int i = 0; i < h_angles.size(); i++) {
h_angles[i] -= 90.0f;
}
@@ -311,8 +316,9 @@ bool IESFile::process_type_c()
if (h_angles[h_angles.size() - 1] == 90.0f) {
/* Only one quadrant is defined, so we need to mirror twice (from one to two, then to four).
- * Since the two->four mirroring step might also be required if we get an input of two quadrants,
- * we only do the first mirror here and later do the second mirror in either case. */
+ * Since the two->four mirroring step might also be required if we get an input of two
+ * quadrants, we only do the first mirror here and later do the second mirror in either case.
+ */
int hnum = h_angles.size();
for (int i = hnum - 2; i >= 0; i--) {
h_angles.push_back(180.0f - h_angles[i]);
@@ -329,8 +335,8 @@ bool IESFile::process_type_c()
}
}
- /* Some files skip the 360° entry (contrary to standard) because it's supposed to be identical to the 0° entry.
- * If the file has a discernible order in its spacing, just fix this. */
+ /* Some files skip the 360° entry (contrary to standard) because it's supposed to be identical to
+ * the 0° entry. If the file has a discernible order in its spacing, just fix this. */
if (h_angles[h_angles.size() - 1] != 360.0f) {
int hnum = h_angles.size();
float last_step = h_angles[hnum - 1] - h_angles[hnum - 2];
diff --git a/intern/cycles/util/util_ies.h b/intern/cycles/util/util_ies.h
index ab1b9ea57cf..95473103614 100644
--- a/intern/cycles/util/util_ies.h
+++ b/intern/cycles/util/util_ies.h
@@ -17,7 +17,7 @@
#ifndef __UTIL_IES_H__
#define __UTIL_IES_H__
-#include "util/util_param.h"
+#include "util/util_string.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -32,11 +32,11 @@ class IESFile {
int packed_size();
void pack(float *data);
- bool load(ustring ies);
+ bool load(const string &ies);
void clear();
protected:
- bool parse(ustring ies);
+ bool parse(const string &ies);
bool process();
bool process_type_b();
bool process_type_c();
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index 8962c09d098..27ec7ffb423 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -21,6 +21,7 @@
# include <OpenImageIO/imageio.h>
+# include "util/util_half.h"
# include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index 4a5e7e6a9ea..e41250ab1b9 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -17,6 +17,7 @@
#include "util/util_logging.h"
#include "util/util_math.h"
+#include "util/util_string.h"
#include <stdio.h>
#ifdef _MSC_VER
@@ -25,6 +26,21 @@
CCL_NAMESPACE_BEGIN
+static bool is_verbosity_set()
+{
+#ifdef WITH_CYCLES_LOGGING
+ using CYCLES_GFLAGS_NAMESPACE::GetCommandLineOption;
+
+ std::string verbosity;
+ if (!GetCommandLineOption("v", &verbosity)) {
+ return false;
+ }
+ return verbosity != "0";
+#else
+ return false;
+#endif
+}
+
void util_logging_init(const char *argv0)
{
#ifdef WITH_CYCLES_LOGGING
@@ -36,7 +52,9 @@ void util_logging_init(const char *argv0)
google::InitGoogleLogging(argv0);
SetCommandLineOption("logtostderr", "1");
- SetCommandLineOption("v", "0");
+ if (!is_verbosity_set()) {
+ SetCommandLineOption("v", "0");
+ }
SetCommandLineOption("stderrthreshold", severity_fatal);
SetCommandLineOption("minloglevel", severity_fatal);
#else
@@ -49,7 +67,9 @@ void util_logging_start()
#ifdef WITH_CYCLES_LOGGING
using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption;
SetCommandLineOption("logtostderr", "1");
- SetCommandLineOption("v", "2");
+ if (!is_verbosity_set()) {
+ SetCommandLineOption("v", "2");
+ }
SetCommandLineOption("stderrthreshold", "1");
SetCommandLineOption("minloglevel", "0");
#endif
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 1a5e6666b32..3e56f0a0193 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -48,6 +48,7 @@ class LogMessageVoidify {
# define LOG_SUPPRESS() (true) ? ((void)0) : LogMessageVoidify() & StubStream()
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
+# define VLOG_IF(severity, condition) LOG_SUPPRESS()
#endif
#define VLOG_ONCE(level, flag) \
diff --git a/intern/cycles/util/util_map.h b/intern/cycles/util/util_map.h
index 3c9288417cf..f1b2522362f 100644
--- a/intern/cycles/util/util_map.h
+++ b/intern/cycles/util/util_map.h
@@ -25,6 +25,14 @@ CCL_NAMESPACE_BEGIN
using std::map;
using std::pair;
using std::unordered_map;
+using std::unordered_multimap;
+
+template<typename T> static void map_free_memory(T &data)
+{
+ /* Use swap() trick to actually free all internal memory. */
+ T empty_data;
+ data.swap(empty_data);
+}
CCL_NAMESPACE_END
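
map_free_memory() uses the swap-with-empty idiom: clear() may keep a container's internal storage allocated, while swapping with a default-constructed temporary releases it when the temporary is destroyed. A usage sketch with a standard unordered_map (names are illustrative):

  #include <string>
  #include <unordered_map>

  static void shrink_cache(std::unordered_map<int, std::string> &cache)
  {
    std::unordered_map<int, std::string> empty;
    cache.swap(empty);
  } /* "empty" now owns the old buckets and frees them here */
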
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 2c7f826db93..8caabf6eac3 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -294,6 +294,21 @@ ccl_device_inline float mix(float a, float b, float t)
{
return a + t * (b - a);
}
+
+ccl_device_inline float smoothstep(float edge0, float edge1, float x)
+{
+ float result;
+ if (x < edge0)
+ result = 0.0f;
+ else if (x >= edge1)
+ result = 1.0f;
+ else {
+ float t = (x - edge0) / (edge1 - edge0);
+ result = (3.0f - 2.0f * t) * (t * t);
+ }
+ return result;
+}
+
#endif /* __KERNEL_OPENCL__ */
#ifndef __KERNEL_CUDA__
@@ -318,11 +333,45 @@ ccl_device_inline int quick_floor_to_int(float x)
return float_to_int(x) - ((x < 0) ? 1 : 0);
}
+ccl_device_inline float floorfrac(float x, int *i)
+{
+ *i = quick_floor_to_int(x);
+ return x - *i;
+}
+
ccl_device_inline int ceil_to_int(float f)
{
return float_to_int(ceilf(f));
}
+ccl_device_inline float fractf(float x)
+{
+ return x - floorf(x);
+}
+
+/* Adapted from godotengine math_funcs.h. */
+ccl_device_inline float wrapf(float value, float max, float min)
+{
+ float range = max - min;
+ return (range != 0.0f) ? value - (range * floorf((value - min) / range)) : min;
+}
+
+ccl_device_inline float pingpongf(float a, float b)
+{
+ return (b != 0.0f) ? fabsf(fractf((a - b) / (b * 2.0f)) * b * 2.0f - b) : 0.0f;
+}
+
+ccl_device_inline float smoothminf(float a, float b, float k)
+{
+ if (k != 0.0f) {
+ float h = fmaxf(k - fabsf(a - b), 0.0f) / k;
+ return fminf(a, b) - h * h * h * k * (1.0f / 6.0f);
+ }
+ else {
+ return fminf(a, b);
+ }
+}
+
ccl_device_inline float signf(float f)
{
return (f < 0.0f) ? -1.0f : 1.0f;
@@ -336,6 +385,17 @@ ccl_device_inline float nonzerof(float f, float eps)
return f;
}
+/* Signum function testing for zero. Matches GLSL and OSL functions. */
+ccl_device_inline float compatible_signf(float f)
+{
+ if (f == 0.0f) {
+ return 0.0f;
+ }
+ else {
+ return signf(f);
+ }
+}
+
ccl_device_inline float smoothstepf(float f)
{
float ff = f * f;
@@ -417,12 +477,12 @@ ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const fl
ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
{
#if 0
- if(fabsf(N.y) >= 0.999f) {
+ if (fabsf(N.y) >= 0.999f) {
*a = make_float3(1, 0, 0);
*b = make_float3(0, 0, 1);
return;
}
- if(fabsf(N.z) >= 0.999f) {
+ if (fabsf(N.z) >= 0.999f) {
*a = make_float3(1, 0, 0);
*b = make_float3(0, 1, 0);
return;
@@ -528,6 +588,11 @@ ccl_device_inline float safe_sqrtf(float f)
return sqrtf(max(f, 0.0f));
}
+ccl_device_inline float inversesqrtf(float f)
+{
+ return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
+}
+
ccl_device float safe_asinf(float a)
{
return asinf(clamp(a, -1.0f, 1.0f));
@@ -617,6 +682,57 @@ ccl_device float bits_to_01(uint bits)
return bits * (1.0f / (float)0xFFFFFFFF);
}
+ccl_device_inline uint count_leading_zeros(uint x)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+ return __clz(x);
+#elif defined(__KERNEL_OPENCL__)
+ return clz(x);
+#else
+ assert(x != 0);
+# ifdef _MSC_VER
+ unsigned long leading_zero = 0;
+ _BitScanReverse(&leading_zero, x);
+ return (31 - leading_zero);
+# else
+ return __builtin_clz(x);
+# endif
+#endif
+}
+
+ccl_device_inline uint count_trailing_zeros(uint x)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+ return (__ffs(x) - 1);
+#elif defined(__KERNEL_OPENCL__)
+ return (31 - count_leading_zeros(x & -x));
+#else
+ assert(x != 0);
+# ifdef _MSC_VER
+ unsigned long ctz = 0;
+ _BitScanForward(&ctz, x);
+ return ctz;
+# else
+ return __builtin_ctz(x);
+# endif
+#endif
+}
+
+ccl_device_inline uint find_first_set(uint x)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+ return __ffs(x);
+#elif defined(__KERNEL_OPENCL__)
+ return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
+#else
+# ifdef _MSC_VER
+ return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
+# else
+ return __builtin_ffs(x);
+# endif
+#endif
+}
+
/* projections */
ccl_device_inline float2 map_to_tube(const float3 co)
{
@@ -651,6 +767,36 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
return make_float2(u, v);
}
+/* Compares two floats.
+ * Returns true if their absolute difference is smaller than abs_diff (for numbers near zero)
+ * or their relative difference is less than ulp_diff ULPs.
+ * Based on
+ * https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+ */
+
+ccl_device_inline float compare_floats(float a, float b, float abs_diff, int ulp_diff)
+{
+ if (fabsf(a - b) < abs_diff) {
+ return true;
+ }
+
+ if ((a < 0.0f) != (b < 0.0f)) {
+ return false;
+ }
+
+ return (abs(__float_as_int(a) - __float_as_int(b)) < ulp_diff);
+}
+
+/* Calculate the angle between the two vectors a and b.
+ * The usual approach acos(dot(a, b)) has severe precision issues for small angles,
+ * which are avoided by this method.
+ * Based on "Mangled Angles" from https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf
+ */
+ccl_device_inline float precise_angle(float3 a, float3 b)
+{
+ return 2.0f * atan2f(len(a - b), len(a + b));
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
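
To illustrate the combined test in compare_floats() added above (absolute tolerance for values near zero, ULP distance otherwise), here is a standalone sketch; the tolerances are made up for the example and std::memcpy stands in for the kernel's __float_as_int():

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>

static bool almost_equal(float a, float b, float abs_diff, int ulp_diff)
{
  /* The absolute test handles values straddling zero, where ULP distance explodes. */
  if (std::fabs(a - b) < abs_diff) {
    return true;
  }
  /* ULP distance is only meaningful for same-signed values. */
  if ((a < 0.0f) != (b < 0.0f)) {
    return false;
  }
  int32_t ia, ib;
  std::memcpy(&ia, &a, sizeof(float)); /* portable stand-in for __float_as_int */
  std::memcpy(&ib, &b, sizeof(float));
  return std::abs(ia - ib) < ulp_diff;
}

int main()
{
  assert(almost_equal(0.0f, 1e-9f, 1e-7f, 4)); /* caught by the absolute test */
  assert(almost_equal(1000.0f, std::nextafterf(1000.0f, 2000.0f), 1e-7f, 4)); /* 1 ULP apart */
  assert(!almost_equal(1000.0f, 1000.1f, 1e-7f, 4)); /* thousands of ULPs apart */
  return 0;
}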
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index 872271666aa..e979bd9e0c0 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -282,8 +282,10 @@ ccl_device float fast_acosf(float x)
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
/* Based on http://www.pouet.net/topic.php?which=9132&page=2
* 85% accurate (ulp 0)
- * Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
- * Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
+ * Examined 2130706434 values of acos:
+ * 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
+ * Examined 2130706434 values of acos:
+ * 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
*/
const float a = sqrtf(1.0f - m) *
(1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
@@ -312,8 +314,10 @@ ccl_device float fast_atanf(float x)
const float s = 1.0f - (1.0f - k); /* Crush denormals. */
const float t = s * s;
/* http://mathforum.org/library/drmath/view/62672.html
- * Examined 4278190080 values of atan: 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error // (with denormals)
- * Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
+ * Examined 4278190080 values of atan:
+ * 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error // (with denormals)
+ * Examined 4278190080 values of atan:
+ * 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
*/
float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
if (a > 1.0f) {
@@ -442,6 +446,11 @@ ccl_device_inline float fast_expf(float x)
}
#ifndef __KERNEL_GPU__
+/* MSVC seems to have a code-gen bug here in at least SSE41/AVX
+ * see T78047 for details. */
+# ifdef _MSC_VER
+# pragma optimize("", off)
+# endif
ccl_device float4 fast_exp2f4(float4 x)
{
const float4 one = make_float4(1.0f);
@@ -457,6 +466,9 @@ ccl_device float4 fast_exp2f4(float4 x)
r = madd4(x, r, make_float4(1.0f));
return __int4_as_float4(__float4_as_int4(r) + (m << 23));
}
+# ifdef _MSC_VER
+# pragma optimize("", on)
+# endif
ccl_device_inline float4 fast_expf4(float4 x)
{
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
index 9feaf042d19..bf21430af3c 100644
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -35,7 +35,9 @@ ccl_device_inline float2 operator*(float f, const float2 &a);
ccl_device_inline float2 operator/(float f, const float2 &a);
ccl_device_inline float2 operator/(const float2 &a, float f);
ccl_device_inline float2 operator/(const float2 &a, const float2 &b);
+ccl_device_inline float2 operator+(const float2 &a, const float f);
ccl_device_inline float2 operator+(const float2 &a, const float2 &b);
+ccl_device_inline float2 operator-(const float2 &a, const float f);
ccl_device_inline float2 operator-(const float2 &a, const float2 &b);
ccl_device_inline float2 operator+=(float2 &a, const float2 &b);
ccl_device_inline float2 operator*=(float2 &a, const float2 &b);
@@ -48,6 +50,7 @@ ccl_device_inline bool operator!=(const float2 &a, const float2 &b);
ccl_device_inline bool is_zero(const float2 &a);
ccl_device_inline float average(const float2 &a);
+ccl_device_inline float distance(const float2 &a, const float2 &b);
ccl_device_inline float dot(const float2 &a, const float2 &b);
ccl_device_inline float cross(const float2 &a, const float2 &b);
ccl_device_inline float len(const float2 &a);
@@ -60,8 +63,11 @@ ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &
ccl_device_inline float2 fabs(const float2 &a);
ccl_device_inline float2 as_float2(const float4 &a);
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
+ccl_device_inline float2 floor(const float2 &a);
#endif /* !__KERNEL_OPENCL__ */
+ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
+
/*******************************************************************************
* Definition.
*/
@@ -103,11 +109,21 @@ ccl_device_inline float2 operator/(const float2 &a, const float2 &b)
return make_float2(a.x / b.x, a.y / b.y);
}
+ccl_device_inline float2 operator+(const float2 &a, const float f)
+{
+ return a + make_float2(f, f);
+}
+
ccl_device_inline float2 operator+(const float2 &a, const float2 &b)
{
return make_float2(a.x + b.x, a.y + b.y);
}
+ccl_device_inline float2 operator-(const float2 &a, const float f)
+{
+ return a - make_float2(f, f);
+}
+
ccl_device_inline float2 operator-(const float2 &a, const float2 &b)
{
return make_float2(a.x - b.x, a.y - b.y);
@@ -159,6 +175,11 @@ ccl_device_inline float average(const float2 &a)
return (a.x + a.y) * (1.0f / 2.0f);
}
+ccl_device_inline float distance(const float2 &a, const float2 &b)
+{
+ return len(a - b);
+}
+
ccl_device_inline float dot(const float2 &a, const float2 &b)
{
return a.x * b.x + a.y * b.y;
@@ -226,8 +247,18 @@ ccl_device_inline float2 mix(const float2 &a, const float2 &b, float t)
return a + t * (b - a);
}
+ccl_device_inline float2 floor(const float2 &a)
+{
+ return make_float2(floorf(a.x), floorf(a.y));
+}
+
#endif /* !__KERNEL_OPENCL__ */
+ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
+{
+ return (b != 0.0f) ? a / b : make_float2(0.0f, 0.0f);
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT2_H__ */
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 85e9b8114ff..dd2010715ba 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -35,7 +35,9 @@ ccl_device_inline float3 operator*(const float f, const float3 &a);
ccl_device_inline float3 operator/(const float f, const float3 &a);
ccl_device_inline float3 operator/(const float3 &a, const float f);
ccl_device_inline float3 operator/(const float3 &a, const float3 &b);
+ccl_device_inline float3 operator+(const float3 &a, const float f);
ccl_device_inline float3 operator+(const float3 &a, const float3 &b);
+ccl_device_inline float3 operator-(const float3 &a, const float f);
ccl_device_inline float3 operator-(const float3 &a, const float3 &b);
ccl_device_inline float3 operator+=(float3 &a, const float3 &b);
ccl_device_inline float3 operator-=(float3 &a, const float3 &b);
@@ -47,6 +49,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f);
ccl_device_inline bool operator==(const float3 &a, const float3 &b);
ccl_device_inline bool operator!=(const float3 &a, const float3 &b);
+ccl_device_inline float distance(const float3 &a, const float3 &b);
ccl_device_inline float dot(const float3 &a, const float3 &b);
ccl_device_inline float dot_xy(const float3 &a, const float3 &b);
ccl_device_inline float3 cross(const float3 &a, const float3 &b);
@@ -58,6 +61,8 @@ ccl_device_inline float3 fabs(const float3 &a);
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t);
ccl_device_inline float3 rcp(const float3 &a);
ccl_device_inline float3 sqrt(const float3 &a);
+ccl_device_inline float3 floor(const float3 &a);
+ccl_device_inline float3 ceil(const float3 &a);
#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a);
@@ -65,11 +70,17 @@ ccl_device_inline float max3(float3 a);
ccl_device_inline float len(const float3 a);
ccl_device_inline float len_squared(const float3 a);
+ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
+ccl_device_inline float3 project(const float3 v, const float3 v_proj);
+
ccl_device_inline float3 saturate3(float3 a);
ccl_device_inline float3 safe_normalize(const float3 a);
ccl_device_inline float3 normalize_len(const float3 a, float *t);
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
+ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
ccl_device_inline float3 interp(float3 a, float3 b, float t);
+ccl_device_inline float3 sqr3(float3 a);
ccl_device_inline bool is_zero(const float3 a);
ccl_device_inline float reduce_add(const float3 a);
@@ -141,6 +152,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
# endif
}
+ccl_device_inline float3 operator+(const float3 &a, const float f)
+{
+ return a + make_float3(f, f, f);
+}
+
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
{
# ifdef __KERNEL_SSE__
@@ -150,6 +166,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
# endif
}
+ccl_device_inline float3 operator-(const float3 &a, const float f)
+{
+ return a - make_float3(f, f, f);
+}
+
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
{
# ifdef __KERNEL_SSE__
@@ -204,6 +225,11 @@ ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
return !(a == b);
}
+ccl_device_inline float distance(const float3 &a, const float3 &b)
+{
+ return len(a - b);
+}
+
ccl_device_inline float dot(const float3 &a, const float3 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -280,6 +306,24 @@ ccl_device_inline float3 sqrt(const float3 &a)
# endif
}
+ccl_device_inline float3 floor(const float3 &a)
+{
+# ifdef __KERNEL_SSE__
+ return float3(_mm_floor_ps(a));
+# else
+ return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
+# endif
+}
+
+ccl_device_inline float3 ceil(const float3 &a)
+{
+# ifdef __KERNEL_SSE__
+ return float3(_mm_ceil_ps(a));
+# else
+ return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
+# endif
+}
+
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
{
return a + t * (b - a);
@@ -320,6 +364,19 @@ ccl_device_inline float len_squared(const float3 a)
return dot(a, a);
}
+ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
+{
+ float3 unit_normal = normalize(normal);
+ return incident - 2.0f * unit_normal * dot(incident, unit_normal);
+}
+
+ccl_device_inline float3 project(const float3 v, const float3 v_proj)
+{
+ float len_squared = dot(v_proj, v_proj);
+ return (len_squared != 0.0f) ? (dot(v, v_proj) / len_squared) * v_proj :
+ make_float3(0.0f, 0.0f, 0.0f);
+}
+
ccl_device_inline float3 saturate3(float3 a)
{
return make_float3(saturate(a.x), saturate(a.y), saturate(a.z));
@@ -344,11 +401,28 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
return (*t != 0.0f) ? a / (*t) : a;
}
+ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b)
+{
+ return make_float3((b.x != 0.0f) ? a.x / b.x : 0.0f,
+ (b.y != 0.0f) ? a.y / b.y : 0.0f,
+ (b.z != 0.0f) ? a.z / b.z : 0.0f);
+}
+
+ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b)
+{
+ return (b != 0.0f) ? a / b : make_float3(0.0f, 0.0f, 0.0f);
+}
+
ccl_device_inline float3 interp(float3 a, float3 b, float t)
{
return a + t * (b - a);
}
+ccl_device_inline float3 sqr3(float3 a)
+{
+ return a * a;
+}
+
ccl_device_inline bool is_zero(const float3 a)
{
#ifdef __KERNEL_SSE__
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 1fb886572e3..cd4b3e3b74c 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -34,7 +34,9 @@ ccl_device_inline float4 operator*(const float4 &a, float f);
ccl_device_inline float4 operator*(float f, const float4 &a);
ccl_device_inline float4 operator/(const float4 &a, float f);
ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
+ccl_device_inline float4 operator+(const float4 &a, const float f);
ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
+ccl_device_inline float4 operator-(const float4 &a, const float f);
ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
@@ -46,6 +48,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
ccl_device_inline bool operator==(const float4 &a, const float4 &b);
+ccl_device_inline float distance(const float4 &a, const float4 &b);
ccl_device_inline float dot(const float4 &a, const float4 &b);
ccl_device_inline float len_squared(const float4 &a);
ccl_device_inline float4 rcp(const float4 &a);
@@ -61,8 +64,12 @@ ccl_device_inline float4 min(const float4 &a, const float4 &b);
ccl_device_inline float4 max(const float4 &a, const float4 &b);
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
ccl_device_inline float4 fabs(const float4 &a);
+ccl_device_inline float4 floor(const float4 &a);
+ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
#endif /* !__KERNEL_OPENCL__*/
+ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
+
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b);
@@ -139,6 +146,11 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
# endif
}
+ccl_device_inline float4 operator+(const float4 &a, const float f)
+{
+ return a + make_float4(f, f, f, f);
+}
+
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
{
# ifdef __KERNEL_SSE__
@@ -148,6 +160,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
# endif
}
+ccl_device_inline float4 operator-(const float4 &a, const float f)
+{
+ return a - make_float4(f, f, f, f);
+}
+
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
{
# ifdef __KERNEL_SSE__
@@ -162,6 +179,11 @@ ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
return a = a + b;
}
+ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
+{
+ return a = a - b;
+}
+
ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
{
return a = a * b;
@@ -213,6 +235,11 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
# endif
}
+ccl_device_inline float distance(const float4 &a, const float4 &b)
+{
+ return len(a - b);
+}
+
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -338,6 +365,21 @@ ccl_device_inline float4 fabs(const float4 &a)
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
# endif
}
+
+ccl_device_inline float4 floor(const float4 &a)
+{
+# ifdef __KERNEL_SSE__
+ return float4(_mm_floor_ps(a));
+# else
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+# endif
+}
+
+ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
+{
+ return a + t * (b - a);
+}
+
#endif /* !__KERNEL_OPENCL__*/
#ifdef __KERNEL_SSE__
@@ -430,6 +472,11 @@ ccl_device_inline float4 load_float4(const float *v)
#endif /* !__KERNEL_GPU__ */
+ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b)
+{
+ return (b != 0.0f) ? a / b : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
index 95ac231c611..fa3a541eea9 100644
--- a/intern/cycles/util/util_math_intersect.h
+++ b/intern/cycles/util/util_math_intersect.h
@@ -163,7 +163,7 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
/* Calculate geometry normal and denominator. */
const float3 Ng1 = cross(e1, e0);
- //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
+ // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
const float3 Ng = Ng1 + Ng1;
const float den = dot3(Ng, dir);
/* Avoid division by 0. */
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index fe80fab6ebd..1dc661a7aa7 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -110,7 +110,8 @@ ccl_device_inline void math_vec3_add_strided(
}
/* Elementary matrix operations.
- * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */
+ * Note: TriMatrix refers to a square matrix that is symmetric,
+ * and therefore its upper-triangular part isn't stored. */
ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A,
int n,
@@ -196,7 +197,8 @@ ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
}
}
-/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process.
+/* Solve A*S=y for S given A and y,
+ * where A is symmetrical positive-semi-definite and both inputs are destroyed in the process.
*
* We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
* With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
@@ -204,15 +206,16 @@ ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
* Then, the remaining problem is Lt*S = b, which again can be solved easily.
*
* This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
- * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
+ * symmetrical positive-semidefinite by construction,
+ * so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A,
ccl_global float3 *y,
int n,
int stride)
{
/* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good
- * heuristic for the amount of pixels considered (with weighting), therefore the amount of correction
- * is scaled based on it. */
+ * heuristic for the amount of pixels considered (with weighting),
+ * therefore the amount of correction is scaled based on it. */
math_trimatrix_add_diagonal(A, n, 3e-7f * A[0], stride); /* Improve the numerical stability. */
math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
@@ -233,9 +236,9 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A,
}
}
-/* Perform the Jacobi Eigenvalue Methon on matrix A.
- * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed.
- * The algorithm overwrites the contents of A.
+/* Perform the Jacobi Eigenvalue Method on matrix A.
+ * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever
+ * accessed. The algorithm overwrites the contents of A.
*
* After returning, A will be overwritten with D, which is (almost) diagonal,
* and V will contain the eigenvectors of the original A in its rows (!),
@@ -263,7 +266,8 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A,
}
if (off_diagonal < 1e-7f) {
/* The matrix has nearly reached diagonal form.
- * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
+ * Since the eigenvalues are only used to determine truncation, their exact values aren't
+ * required - a relative error of a few ULPs won't matter at all. */
break;
}
@@ -277,7 +281,8 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A,
float element = MAT(A, n, row, col);
float abs_element = fabsf(element);
- /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
+ /* If we're in a later sweep and the element already is very small,
+ * just set it to zero and skip the rotation. */
if (sweep > 3 && abs_element <= singular_epsilon * fabsf(MAT(A, n, row, row)) &&
abs_element <= singular_epsilon * fabsf(MAT(A, n, col, col))) {
MAT(A, n, row, col) = 0.0f;
@@ -288,13 +293,16 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A,
continue;
}
- /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */
+ /* If we're in one of the first sweeps and the element is smaller than the threshold,
+ * skip it. */
if (sweep < 3 && (abs_element < threshold)) {
continue;
}
- /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi).
- * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case.
+ /* Determine rotation: The rotation is characterized by its angle phi - or,
+ * in the actual implementation, sin(phi) and cos(phi).
+ * To find those, we first compute their ratio - that might be unstable if the angle
+ * approaches 90°, so there's a fallback for that case.
* Then, we compute sin(phi) and cos(phi) themselves. */
float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
float ratio;
@@ -310,7 +318,8 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A,
float c = 1.0f / sqrtf(1.0f + ratio * ratio);
float s = ratio * c;
- /* To improve numerical stability by avoiding cancellation, the update equations are reformulized to use sin(phi) and tan(phi/2) instead. */
+ /* To improve numerical stability by avoiding cancellation, the update equations are
+ * reformulized to use sin(phi) and tan(phi/2) instead. */
float tan_phi_2 = s / (1.0f + c);
/* Update the singular values in the diagonal. */
@@ -330,7 +339,8 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A,
MATS(M, n, r2, c2, stride) += s * (M1 - tan_phi_2 * M2); \
}
- /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */
+ /* Split into three parts to ensure correct accesses since we only store the
+ * lower-triangular part of A. */
for (int i = 0; i < col; i++)
ROT(A, col, i, row, i, 1);
for (int i = col + 1; i < row; i++)
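
The math_trimatrix_vec3_solve() comment above outlines the scheme: factor the symmetric positive-semidefinite A into L*Lt by Cholesky decomposition, then solve L*b = y by forward substitution and Lt*S = b by backward substitution. A tiny standalone 2x2 example with made-up numbers, using full matrices instead of the packed lower-triangular "TriMatrix" storage the kernels operate on:

#include <cmath>
#include <cstdio>

int main()
{
  /* A is symmetric positive-definite, y is the right-hand side. */
  const float A[2][2] = {{4.0f, 2.0f}, {2.0f, 3.0f}};
  const float y[2] = {10.0f, 8.0f};

  /* Cholesky: lower-triangular L with L*Lt = A. */
  float L[2][2] = {{0.0f, 0.0f}, {0.0f, 0.0f}};
  L[0][0] = sqrtf(A[0][0]);
  L[1][0] = A[1][0] / L[0][0];
  L[1][1] = sqrtf(A[1][1] - L[1][0] * L[1][0]);

  /* Forward substitution: L*b = y. */
  float b[2];
  b[0] = y[0] / L[0][0];
  b[1] = (y[1] - L[1][0] * b[0]) / L[1][1];

  /* Backward substitution: Lt*S = b. */
  float S[2];
  S[1] = b[1] / L[1][1];
  S[0] = (b[0] - L[1][0] * S[1]) / L[0][0];

  printf("S = (%.3f, %.3f)\n", S[0], S[1]); /* Expected (1.750, 1.500), since A*S = y. */
  return 0;
}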
diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp
index c11f495f785..0df521c2b58 100644
--- a/intern/cycles/util/util_md5.cpp
+++ b/intern/cycles/util/util_md5.cpp
@@ -26,8 +26,8 @@
#include "util_md5.h"
#include "util_path.h"
-#include <string.h>
#include <stdio.h>
+#include <string.h>
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_openimagedenoise.h b/intern/cycles/util/util_openimagedenoise.h
new file mode 100644
index 00000000000..aafa69cb530
--- /dev/null
+++ b/intern/cycles/util/util_openimagedenoise.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_OPENIMAGEDENOISE_H__
+#define __UTIL_OPENIMAGEDENOISE_H__
+
+#ifdef WITH_OPENIMAGEDENOISE
+# include <OpenImageDenoise/oidn.hpp>
+#endif
+
+#include "util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+static inline bool openimagedenoise_supported()
+{
+#ifdef WITH_OPENIMAGEDENOISE
+ return system_cpu_support_sse41();
+#else
+ return false;
+#endif
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_OPENIMAGEDENOISE_H__ */
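
openimagedenoise_supported() in the new header above only reports whether the OIDN CPU backend can run at all (it requires SSE 4.1). A hedged sketch of a hypothetical caller that gates denoising on this check, assuming the OpenImageDenoise 1.x C++ API and a caller-owned width*height RGB float buffer; maybe_denoise() is not part of Cycles:

#include "util/util_openimagedenoise.h"

/* Hypothetical helper: denoise 'color' in place when the runtime check passes,
 * otherwise leave it untouched (a caller could fall back to another denoiser). */
static void maybe_denoise(float *color, int width, int height)
{
#ifdef WITH_OPENIMAGEDENOISE
  if (!ccl::openimagedenoise_supported()) {
    return;
  }
  oidn::DeviceRef device = oidn::newDevice();
  device.commit();

  oidn::FilterRef filter = device.newFilter("RT"); /* generic ray-tracing denoiser */
  filter.setImage("color", color, oidn::Format::Float3, width, height);
  filter.setImage("output", color, oidn::Format::Float3, width, height);
  filter.set("hdr", true);
  filter.commit();
  filter.execute();
#else
  (void)color;
  (void)width;
  (void)height;
#endif
}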
diff --git a/intern/cycles/util/util_param.h b/intern/cycles/util/util_param.h
index cfbe416aba1..3f8e2d6d700 100644
--- a/intern/cycles/util/util_param.h
+++ b/intern/cycles/util/util_param.h
@@ -29,6 +29,11 @@ CCL_NAMESPACE_BEGIN
OIIO_NAMESPACE_USING
static constexpr TypeDesc TypeFloat2(TypeDesc::FLOAT, TypeDesc::VEC2);
+static constexpr TypeDesc TypeRGBA(TypeDesc::FLOAT, TypeDesc::VEC4, TypeDesc::COLOR);
+static constexpr TypeDesc TypeFloatArray4(TypeDesc::FLOAT,
+ TypeDesc::SCALAR,
+ TypeDesc::NOSEMANTICS,
+ 4);
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 77293c45f6b..8905c8bc7f0 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#include "util/util_md5.h"
#include "util/util_path.h"
+#include "util/util_md5.h"
#include "util/util_string.h"
#include <OpenImageIO/filesystem.h>
@@ -36,8 +36,8 @@ OIIO_NAMESPACE_USING
# define DIR_SEP '/'
# include <dirent.h>
# include <pwd.h>
-# include <unistd.h>
# include <sys/types.h>
+# include <unistd.h>
#endif
#ifdef HAVE_SHLWAPI_H
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
index e3edf219435..073b09f719f 100644
--- a/intern/cycles/util/util_profiling.cpp
+++ b/intern/cycles/util/util_profiling.cpp
@@ -14,8 +14,9 @@
* limitations under the License.
*/
-#include "util/util_algorithm.h"
#include "util/util_profiling.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
#include "util/util_set.h"
CCL_NAMESPACE_BEGIN
@@ -47,7 +48,8 @@ void Profiler::run()
}
if (cur_shader >= 0 && cur_shader < shader_samples.size()) {
- /* Only consider the active shader during events whose runtime significantly depends on it. */
+ /* Only consider the active shader during events whose runtime significantly depends on it.
+ */
if (((cur_event >= PROFILING_SHADER_EVAL) && (cur_event <= PROFILING_SUBSURFACE)) ||
((cur_event >= PROFILING_CLOSURE_EVAL) &&
(cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) {
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
index f5f500239f2..ceec08ed894 100644
--- a/intern/cycles/util/util_profiling.h
+++ b/intern/cycles/util/util_profiling.h
@@ -19,7 +19,6 @@
#include <atomic>
-#include "util/util_foreach.h"
#include "util/util_map.h"
#include "util/util_thread.h"
#include "util/util_vector.h"
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index f05e5b918f3..26534a29dfe 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -25,8 +25,8 @@
#include "util/util_function.h"
#include "util/util_string.h"
-#include "util/util_time.h"
#include "util/util_thread.h"
+#include "util/util_time.h"
CCL_NAMESPACE_BEGIN
@@ -204,6 +204,8 @@ class Progress {
float get_progress()
{
+ thread_scoped_lock lock(progress_mutex);
+
if (total_pixel_samples > 0) {
return ((float)pixel_samples) / total_pixel_samples;
}
@@ -362,7 +364,8 @@ class Progress {
* It's used to display the sample count if only one tile is active. */
int current_tile_sample;
/* Stores the number of tiles that's already finished.
- * Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */
+ * Used to determine whether all but the last tile are finished rendering,
+ * in which case the current_tile_sample is displayed. */
int rendered_tiles, denoised_tiles;
double start_time, render_start_time;
diff --git a/intern/cycles/util/util_semaphore.h b/intern/cycles/util/util_semaphore.h
new file mode 100644
index 00000000000..d995b0732b8
--- /dev/null
+++ b/intern/cycles/util/util_semaphore.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_SEMAPHORE_H__
+#define __UTIL_SEMAPHORE_H__
+
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Counting Semaphore
+ *
+ * To restrict concurrent access to a resource to a specified number
+ * of threads. Similar to std::counting_semaphore from C++20. */
+
+class thread_counting_semaphore {
+ public:
+ explicit thread_counting_semaphore(const int count) : count(count)
+ {
+ }
+
+ thread_counting_semaphore(const thread_counting_semaphore &) = delete;
+
+ void acquire()
+ {
+ thread_scoped_lock lock(mutex);
+ while (count == 0) {
+ condition.wait(lock);
+ }
+ count--;
+ }
+
+ void release()
+ {
+ thread_scoped_lock lock(mutex);
+ count++;
+ condition.notify_one();
+ }
+
+ protected:
+ thread_mutex mutex;
+ thread_condition_variable condition;
+ int count;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_SEMAPHORE_H__ */
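
As a usage illustration for the new thread_counting_semaphore (a sketch, not code from Cycles): the semaphore below lets at most two of eight workers into the guarded section at a time; plain std::thread is used only to keep the example self-contained.

#include <cstdio>
#include <thread>
#include <vector>

#include "util/util_semaphore.h"

int main()
{
  ccl::thread_counting_semaphore semaphore(2); /* at most 2 concurrent holders */

  std::vector<std::thread> workers;
  for (int i = 0; i < 8; i++) {
    workers.emplace_back([&semaphore, i] {
      semaphore.acquire(); /* blocks while two other workers hold the semaphore */
      printf("worker %d inside the limited section\n", i);
      semaphore.release(); /* wakes one waiting worker, if any */
    });
  }
  for (std::thread &worker : workers) {
    worker.join();
  }
  return 0;
}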
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 8fcaadc5f53..de0e3c39f30 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -45,7 +45,7 @@
# endif
-# if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
+# if defined(__x86_64__) || defined(_M_X64)
# define SIMD_SET_FLUSH_TO_ZERO \
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
@@ -75,6 +75,28 @@ static struct FalseTy {
}
} False ccl_maybe_unused;
+static struct ZeroTy {
+ __forceinline operator float() const
+ {
+ return 0;
+ }
+ __forceinline operator int() const
+ {
+ return 0;
+ }
+} zero ccl_maybe_unused;
+
+static struct OneTy {
+ __forceinline operator float() const
+ {
+ return 1;
+ }
+ __forceinline operator int() const
+ {
+ return 1;
+ }
+} one ccl_maybe_unused;
+
static struct NegInfTy {
__forceinline operator float() const
{
@@ -97,6 +119,9 @@ static struct PosInfTy {
}
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+static struct StepTy {
+} step ccl_maybe_unused;
+
/* Intrinsics Functions */
# if defined(__BMI__) && defined(__GNUC__)
@@ -563,6 +588,13 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
# endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
+ * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+# undef _mm256_cvtss_f32
+# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
+# endif
+
# else /* __KERNEL_SSE2__ */
/* This section is for utility functions which operates on non-register data
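
The _mm256_cvtss_f32 fallback defined above relies on _mm256_castps256_ps128 being a pure reinterpretation of the register, so extracting the lowest lane costs no extra instructions. A tiny standalone check of the equivalent expression (compile with AVX enabled, e.g. -mavx):

#include <cstdio>
#include <immintrin.h>

int main()
{
  /* _mm256_set_ps takes elements from the highest lane down to the lowest. */
  const __m256 v = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);

  /* Same as the macro above: cast to the low 128 bits, then move the scalar
   * out; the cast itself generates no instructions. */
  const float lowest = _mm_cvtss_f32(_mm256_castps256_ps128(v));

  printf("%f\n", lowest); /* Prints 1.000000 */
  return 0;
}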
diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp
deleted file mode 100644
index 4a6a9f32607..00000000000
--- a/intern/cycles/util/util_sky_model.cpp
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
-This source is published under the following 3-clause BSD license.
-
-Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * None of the names of the contributors may be used to endorse or promote
- products derived from this software without specific prior written
- permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* ============================================================================
-
-This file is part of a sample implementation of the analytical skylight and
-solar radiance models presented in the SIGGRAPH 2012 paper
-
-
- "An Analytic Model for Full Spectral Sky-Dome Radiance"
-
-and the 2013 IEEE CG&A paper
-
- "Adding a Solar Radiance Function to the Hosek Skylight Model"
-
- both by
-
- Lukas Hosek and Alexander Wilkie
- Charles University in Prague, Czech Republic
-
-
- Version: 1.4a, February 22nd, 2013
-
-Version history:
-
-1.4a February 22nd, 2013
- Removed unnecessary and counter-intuitive solar radius parameters
- from the interface of the colourspace sky dome initialisation functions.
-
-1.4 February 11th, 2013
- Fixed a bug which caused the relative brightness of the solar disc
- and the sky dome to be off by a factor of about 6. The sun was too
- bright: this affected both normal and alien sun scenarios. The
- coefficients of the solar radiance function were changed to fix this.
-
-1.3 January 21st, 2013 (not released to the public)
- Added support for solar discs that are not exactly the same size as
- the terrestrial sun. Also added support for suns with a different
- emission spectrum ("Alien World" functionality).
-
-1.2a December 18th, 2012
- Fixed a mistake and some inaccuracies in the solar radiance function
- explanations found in ArHosekSkyModel.h. The actual source code is
- unchanged compared to version 1.2.
-
-1.2 December 17th, 2012
- Native RGB data and a solar radiance function that matches the turbidity
- conditions were added.
-
-1.1 September 2012
- The coefficients of the spectral model are now scaled so that the output
- is given in physical units: W / (m^-2 * sr * nm). Also, the output of the
- XYZ model is now no longer scaled to the range [0...1]. Instead, it is
- the result of a simple conversion from spectral data via the CIE 2 degree
- standard observer matching functions. Therefore, after multiplication
- with 683 lm / W, the Y channel now corresponds to luminance in lm.
-
-1.0 May 11th, 2012
- Initial release.
-
-
-Please visit http://cgg.mff.cuni.cz/projects/SkylightModelling/ to check if
-an updated version of this code has been published!
-
-============================================================================ */
-
-/*
-
-All instructions on how to use this code are in the accompanying header file.
-
-*/
-
-#include "util/util_sky_model.h"
-#include "util/util_sky_model_data.h"
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-
-CCL_NAMESPACE_BEGIN
-
-// Some macro definitions that occur elsewhere in ART, and that have to be
-// replicated to make this a stand-alone module.
-
-#ifndef MATH_PI
-# define MATH_PI 3.141592653589793
-#endif
-
-#ifndef MATH_DEG_TO_RAD
-# define MATH_DEG_TO_RAD (MATH_PI / 180.0)
-#endif
-
-#ifndef DEGREES
-# define DEGREES *MATH_DEG_TO_RAD
-#endif
-
-#ifndef TERRESTRIAL_SOLAR_RADIUS
-# define TERRESTRIAL_SOLAR_RADIUS ((0.51 DEGREES) / 2.0)
-#endif
-
-#ifndef ALLOC
-# define ALLOC(_struct) ((_struct *)malloc(sizeof(_struct)))
-#endif
-
-// internal definitions
-
-typedef const double *ArHosekSkyModel_Dataset;
-typedef const double *ArHosekSkyModel_Radiance_Dataset;
-
-// internal functions
-
-static void ArHosekSkyModel_CookConfiguration(ArHosekSkyModel_Dataset dataset,
- ArHosekSkyModelConfiguration config,
- double turbidity,
- double albedo,
- double solar_elevation)
-{
- const double *elev_matrix;
-
- int int_turbidity = (int)turbidity;
- double turbidity_rem = turbidity - (double)int_turbidity;
-
- solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
-
- // alb 0 low turb
-
- elev_matrix = dataset + (9 * 6 * (int_turbidity - 1));
-
- for (unsigned int i = 0; i < 9; ++i) {
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- config[i] =
- (1.0 - albedo) * (1.0 - turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[i] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[i + 9] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[i + 18] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[i + 27] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[i + 36] +
- pow(solar_elevation, 5.0) * elev_matrix[i + 45]);
- }
-
- // alb 1 low turb
- elev_matrix = dataset + (9 * 6 * 10 + 9 * 6 * (int_turbidity - 1));
- for (unsigned int i = 0; i < 9; ++i) {
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- config[i] +=
- (albedo) * (1.0 - turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[i] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[i + 9] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[i + 18] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[i + 27] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[i + 36] +
- pow(solar_elevation, 5.0) * elev_matrix[i + 45]);
- }
-
- if (int_turbidity == 10)
- return;
-
- // alb 0 high turb
- elev_matrix = dataset + (9 * 6 * (int_turbidity));
- for (unsigned int i = 0; i < 9; ++i) {
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- config[i] +=
- (1.0 - albedo) * (turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[i] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[i + 9] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[i + 18] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[i + 27] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[i + 36] +
- pow(solar_elevation, 5.0) * elev_matrix[i + 45]);
- }
-
- // alb 1 high turb
- elev_matrix = dataset + (9 * 6 * 10 + 9 * 6 * (int_turbidity));
- for (unsigned int i = 0; i < 9; ++i) {
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- config[i] +=
- (albedo) * (turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[i] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[i + 9] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[i + 18] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[i + 27] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[i + 36] +
- pow(solar_elevation, 5.0) * elev_matrix[i + 45]);
- }
-}
-
-static double ArHosekSkyModel_CookRadianceConfiguration(ArHosekSkyModel_Radiance_Dataset dataset,
- double turbidity,
- double albedo,
- double solar_elevation)
-{
- const double *elev_matrix;
-
- int int_turbidity = (int)turbidity;
- double turbidity_rem = turbidity - (double)int_turbidity;
- double res;
- solar_elevation = pow(solar_elevation / (MATH_PI / 2.0), (1.0 / 3.0));
-
- // alb 0 low turb
- elev_matrix = dataset + (6 * (int_turbidity - 1));
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- res = (1.0 - albedo) * (1.0 - turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[0] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[1] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[2] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[3] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[4] +
- pow(solar_elevation, 5.0) * elev_matrix[5]);
-
- // alb 1 low turb
- elev_matrix = dataset + (6 * 10 + 6 * (int_turbidity - 1));
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- res += (albedo) * (1.0 - turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[0] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[1] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[2] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[3] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[4] +
- pow(solar_elevation, 5.0) * elev_matrix[5]);
- if (int_turbidity == 10)
- return res;
-
- // alb 0 high turb
- elev_matrix = dataset + (6 * (int_turbidity));
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- res += (1.0 - albedo) * (turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[0] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[1] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[2] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[3] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[4] +
- pow(solar_elevation, 5.0) * elev_matrix[5]);
-
- // alb 1 high turb
- elev_matrix = dataset + (6 * 10 + 6 * (int_turbidity));
- //(1-t).^3* A1 + 3*(1-t).^2.*t * A2 + 3*(1-t) .* t .^ 2 * A3 + t.^3 * A4;
- res += (albedo) * (turbidity_rem) *
- (pow(1.0 - solar_elevation, 5.0) * elev_matrix[0] +
- 5.0 * pow(1.0 - solar_elevation, 4.0) * solar_elevation * elev_matrix[1] +
- 10.0 * pow(1.0 - solar_elevation, 3.0) * pow(solar_elevation, 2.0) * elev_matrix[2] +
- 10.0 * pow(1.0 - solar_elevation, 2.0) * pow(solar_elevation, 3.0) * elev_matrix[3] +
- 5.0 * (1.0 - solar_elevation) * pow(solar_elevation, 4.0) * elev_matrix[4] +
- pow(solar_elevation, 5.0) * elev_matrix[5]);
- return res;
-}
-
-static double ArHosekSkyModel_GetRadianceInternal(ArHosekSkyModelConfiguration configuration,
- double theta,
- double gamma)
-{
- const double expM = exp(configuration[4] * gamma);
- const double rayM = cos(gamma) * cos(gamma);
- const double mieM =
- (1.0 + cos(gamma) * cos(gamma)) /
- pow((1.0 + configuration[8] * configuration[8] - 2.0 * configuration[8] * cos(gamma)), 1.5);
- const double zenith = sqrt(cos(theta));
-
- return (1.0 + configuration[0] * exp(configuration[1] / (cos(theta) + 0.01))) *
- (configuration[2] + configuration[3] * expM + configuration[5] * rayM +
- configuration[6] * mieM + configuration[7] * zenith);
-}
-
-void arhosekskymodelstate_free(ArHosekSkyModelState *state)
-{
- free(state);
-}
-
-double arhosekskymodel_radiance(ArHosekSkyModelState *state,
- double theta,
- double gamma,
- double wavelength)
-{
- int low_wl = (int)((wavelength - 320.0) / 40.0);
-
- if (low_wl < 0 || low_wl >= 11)
- return 0.0;
-
- double interp = fmod((wavelength - 320.0) / 40.0, 1.0);
-
- double val_low = ArHosekSkyModel_GetRadianceInternal(state->configs[low_wl], theta, gamma) *
- state->radiances[low_wl] * state->emission_correction_factor_sky[low_wl];
-
- if (interp < 1e-6)
- return val_low;
-
- double result = (1.0 - interp) * val_low;
-
- if (low_wl + 1 < 11) {
- result += interp *
- ArHosekSkyModel_GetRadianceInternal(state->configs[low_wl + 1], theta, gamma) *
- state->radiances[low_wl + 1] * state->emission_correction_factor_sky[low_wl + 1];
- }
-
- return result;
-}
-
-// xyz and rgb versions
-
-ArHosekSkyModelState *arhosek_xyz_skymodelstate_alloc_init(const double turbidity,
- const double albedo,
- const double elevation)
-{
- ArHosekSkyModelState *state = ALLOC(ArHosekSkyModelState);
-
- state->solar_radius = TERRESTRIAL_SOLAR_RADIUS;
- state->turbidity = turbidity;
- state->albedo = albedo;
- state->elevation = elevation;
-
- for (unsigned int channel = 0; channel < 3; ++channel) {
- ArHosekSkyModel_CookConfiguration(
- datasetsXYZ[channel], state->configs[channel], turbidity, albedo, elevation);
-
- state->radiances[channel] = ArHosekSkyModel_CookRadianceConfiguration(
- datasetsXYZRad[channel], turbidity, albedo, elevation);
- }
-
- return state;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_sky_model.h b/intern/cycles/util/util_sky_model.h
deleted file mode 100644
index 84340614b2c..00000000000
--- a/intern/cycles/util/util_sky_model.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
-This source is published under the following 3-clause BSD license.
-
-Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * None of the names of the contributors may be used to endorse or promote
- products derived from this software without specific prior written
- permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* ============================================================================
-
-This file is part of a sample implementation of the analytical skylight and
-solar radiance models presented in the SIGGRAPH 2012 paper
-
-
- "An Analytic Model for Full Spectral Sky-Dome Radiance"
-
-and the 2013 IEEE CG&A paper
-
- "Adding a Solar Radiance Function to the Hosek Skylight Model"
-
- both by
-
- Lukas Hosek and Alexander Wilkie
- Charles University in Prague, Czech Republic
-
-
- Version: 1.4a, February 22nd, 2013
-
-Version history:
-
-1.4a February 22nd, 2013
- Removed unnecessary and counter-intuitive solar radius parameters
- from the interface of the colourspace sky dome initialisation functions.
-
-1.4 February 11th, 2013
- Fixed a bug which caused the relative brightness of the solar disc
- and the sky dome to be off by a factor of about 6. The sun was too
- bright: this affected both normal and alien sun scenarios. The
- coefficients of the solar radiance function were changed to fix this.
-
-1.3 January 21st, 2013 (not released to the public)
- Added support for solar discs that are not exactly the same size as
- the terrestrial sun. Also added support for suns with a different
- emission spectrum ("Alien World" functionality).
-
-1.2a December 18th, 2012
- Fixed a mistake and some inaccuracies in the solar radiance function
- explanations found in ArHosekSkyModel.h. The actual source code is
- unchanged compared to version 1.2.
-
-1.2 December 17th, 2012
- Native RGB data and a solar radiance function that matches the turbidity
- conditions were added.
-
-1.1 September 2012
- The coefficients of the spectral model are now scaled so that the output
- is given in physical units: W / (m^-2 * sr * nm). Also, the output of the
- XYZ model is now no longer scaled to the range [0...1]. Instead, it is
- the result of a simple conversion from spectral data via the CIE 2 degree
- standard observer matching functions. Therefore, after multiplication
- with 683 lm / W, the Y channel now corresponds to luminance in lm.
-
-1.0 May 11th, 2012
- Initial release.
-
-
-Please visit http://cgg.mff.cuni.cz/projects/SkylightModelling/ to check if
-an updated version of this code has been published!
-
-============================================================================ */
-
-/*
-
-This code is taken from ART, a rendering research system written in a
-mix of C99 / Objective C. Since ART is not a small system and is intended to
-be inter-operable with other libraries, and since C does not have namespaces,
-the structures and functions in ART all have to have somewhat wordy
-canonical names that begin with Ar.../ar..., like those seen in this example.
-
-Usage information:
-==================
-
-
-Model initialisation
---------------------
-
-A separate ArHosekSkyModelState has to be maintained for each spectral
-band you want to use the model for. So in a renderer with 'num_channels'
-bands, you would need something like
-
- ArHosekSkyModelState * skymodel_state[num_channels];
-
-You then have to allocate and initialise these states. In the following code
-snippet, we assume that 'albedo' is defined as
-
- double albedo[num_channels];
-
-with a ground albedo value between [0,1] for each channel. The solar elevation
-is given in radians.
-
- for ( unsigned int i = 0; i < num_channels; i++ )
- skymodel_state[i] =
- arhosekskymodelstate_alloc_init(
- turbidity,
- albedo[i],
- solarElevation
- );
-
-Note that starting with version 1.3, there is also a second initialisation
-function which generates skydome states for different solar emission spectra
-and solar radii: 'arhosekskymodelstate_alienworld_alloc_init()'.
-
-See the notes about the "Alien World" functionality provided further down for a
-discussion of the usefulness and limits of that second initalisation function.
-Sky model states that have been initialized with either function behave in a
-completely identical fashion during use and cleanup.
-
-Using the model to generate skydome samples
--------------------------------------------
-
-Generating a skydome radiance spectrum "skydome_result" for a given location
-on the skydome determined via the angles theta and gamma works as follows:
-
- double skydome_result[num_channels];
-
- for ( unsigned int i = 0; i < num_channels; i++ )
- skydome_result[i] =
- arhosekskymodel_radiance(
- skymodel_state[i],
- theta,
- gamma,
- channel_center[i]
- );
-
-The variable "channel_center" is assumed to hold the channel center wavelengths
-for each of the num_channels samples of the spectrum we are building.
-
-
-Cleanup after use
------------------
-
-After rendering is complete, the content of the sky model states should be
-disposed of via
-
- for ( unsigned int i = 0; i < num_channels; i++ )
- arhosekskymodelstate_free( skymodel_state[i] );
-
-
-CIE XYZ Version of the Model
-----------------------------
-
-Usage of the CIE XYZ version of the model is exactly the same, except that
-num_channels is of course always 3, and that ArHosekTristimSkyModelState and
-arhosek_tristim_skymodel_radiance() have to be used instead of their spectral
-counterparts.
-
-RGB Version of the Model
-------------------------
-
-The RGB version uses sRGB primaries with a linear gamma ramp. The same set of
-functions as with the XYZ data is used, except the model is initialized
-by calling arhosek_rgb_skymodelstate_alloc_init.
-
-Solar Radiance Function
------------------------
-
-For each position on the solar disc, this function returns the entire radiance
-one sees - direct emission, as well as in-scattered light in the area of the
-solar disc. The latter is important for low solar elevations - nice images of
-the setting sun would not be possible without this. This is also the reason why
-this function, just like the regular sky dome model evaluation function, needs
-access to the sky dome data structures, as these provide information on
-in-scattered radiance.
-
-CAVEAT #1: in this release, this function is only provided in spectral form!
- RGB/XYZ versions to follow at a later date.
-
-CAVEAT #2: (fixed from release 1.3 onwards)
-
-CAVEAT #3: limb darkening renders the brightness of the solar disc
- inhomogeneous even for high solar elevations - only taking a single
- sample at the centre of the sun will yield an incorrect power
- estimate for the solar disc! Always take multiple random samples
- across the entire solar disc to estimate its power!
-
-CAVEAT #4: in this version, the limb darkening calculations still use a fairly
- computationally expensive 5th order polynomial that was directly
- taken from astronomical literature. For the purposes of Computer
- Graphics, this is needlessly accurate, though, and will be replaced
- by a cheaper approximation in a future release.
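-
-An illustrative sketch of a single spectral evaluation (see CAVEAT #1);
-'sun_result', 'disc_theta' and 'disc_gamma' are just example names, where the
-two angles describe one random sample direction towards the solar disc, and
-several such samples should be averaged as per CAVEAT #3:
-
-    double sun_result[num_channels];
-
-    for ( unsigned int i = 0; i < num_channels; i++ )
-        sun_result[i] =
-            arhosekskymodel_solar_radiance(
-                skymodel_state[i],
-                disc_theta,
-                disc_gamma,
-                channel_center[i]
-                );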
-
-"Alien World" functionality
----------------------------
-
-The Hosek sky model can be used to roughly (!) predict the appearance of
-outdoor scenes on earth-like planets, i.e. planets of a similar size and
-atmospheric make-up. Since the spectral version of our model predicts sky dome
-luminance patterns and solar radiance independently for each waveband, and
-since the intensity of each waveband is solely dependent on the input radiance
-from the star that the world in question is orbiting, it is trivial to re-scale
-the wavebands to match a different star radiance.
-
-At least in theory, the spectral version of the model has always been capable
-of this sort of thing, and the actual sky dome and solar radiance models were
-not altered at all in this release. All we did was add some support
-functionality for doing this more easily with the existing data and functions,
-and add some explanations.
-
-Just use 'arhosekskymodelstate_alienworld_alloc_init()' to initialise the sky
-model states (you will have to provide values for star temperature and solar
-intensity compared to the terrestrial sun), and do everything else as you
-did before.
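-
-A minimal sketch, assuming the star surface temperature in Kelvin is held in
-'star_temperature' and the relative brightness in 'solar_intensity' (both just
-example names), with all other inputs as in the terrestrial case:
-
-    for ( unsigned int i = 0; i < num_channels; i++ )
-        skymodel_state[i] =
-            arhosekskymodelstate_alienworld_alloc_init(
-                solarElevation,
-                solar_intensity,
-                star_temperature,
-                turbidity,
-                albedo[i]
-                );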
-
-CAVEAT #1: we assume the emission of the star that illuminates the alien world
- to be a perfect blackbody emission spectrum. This is never entirely
- realistic - real star emission spectra are considerably more complex
- than this, mainly due to absorption effects in the outer layers of
- stars. However, blackbody spectra are a reasonable first assumption
- in a usage scenario like this, where 100% accuracy is simply not
- necessary: for rendering purposes, there are likely no visible
- differences between a highly accurate solution based on a more
- involved simulation, and this approximation.
-
-CAVEAT #2: we always use limb darkening data from our own sun to provide this
- "appearance feature", even for suns of strongly different
- temperature. Which is presumably not very realistic, but (as with
- the unaltered blackbody spectrum from caveat #1) probably not a bad
- first guess, either. If you need more accuracy than we provide here,
-           please make inquiries with a friendly astrophysicist of your choice.
-
-CAVEAT #3: you have to provide a value for the solar intensity of the star
- which illuminates the alien world. For this, please bear in mind
- that there is very likely a comparatively tight range of absolute
- solar irradiance values for which an earth-like planet with an
- atmosphere like the one we assume in our model can exist in the
- first place!
-
- Too much irradiance, and the atmosphere probably boils off into
-           space; too little, and it freezes. Which means that stars of
- considerably different emission colour than our sun will have to be
- fairly different in size from it, to still provide a reasonable and
-           habitable amount of irradiance. Red stars will need to be much
- larger than our sun, while white or blue stars will have to be
- comparatively tiny. The initialisation function handles this and
- computes a plausible solar radius for a given emission spectrum. In
- terms of absolute radiometric values, you should probably not stray
- all too far from a solar intensity value of 1.0.
-
-CAVEAT #4: although we now support different solar radii for the actual solar
- disc, the sky dome luminance patterns are *not* parameterised by
- this value - i.e. the patterns stay exactly the same for different
- solar radii! Which is of course not correct. But in our experience,
- solar discs up to several degrees in diameter (! - our own sun is
- half a degree across) do not cause the luminance patterns on the sky
- to change perceptibly. The reason we know this is that we initially
- used unrealistically large suns in our brute force path tracer, in
- order to improve convergence speeds (which in the beginning were
- abysmal). Later, we managed to do the reference renderings much
- faster even with realistically small suns, and found that there was
- no real difference in skydome appearance anyway.
- Conclusion: changing the solar radius should not be over-done, so
- close orbits around red supergiants are a no-no. But for the
- purposes of getting a fairly credible first impression of what an
- alien world with a reasonably sized sun would look like, what we are
- doing here is probably still o.k.
-
-HINT #1: if you want to model the sky of an earth-like planet that orbits
- a binary star, just super-impose two of these models with solar
- intensity of ~0.5 each, and closely spaced solar positions. Light is
- additive, after all. Tattooine, here we come... :-)
-
- P.S. according to Star Wars canon, Tattooine orbits a binary
-         that is made up of a G class and a K class star. So ~5500K and
-         ~4200K, respectively, should be good first guesses for their
-         temperatures. Just in case you were wondering, after reading the
- previous paragraph.
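-
-          As a rough sketch of such a superposition - 'state_star_a' and
-          'state_star_b' are two state sets initialised with ~0.5 solar
-          intensity each, and 'gamma_to_sun_a'/'gamma_to_sun_b' are the angles
-          towards the two closely spaced suns (all example names):
-
-              value[i] =
-                  arhosekskymodel_radiance(
-                      state_star_a[i], theta, gamma_to_sun_a, channel_center[i] )
-                + arhosekskymodel_radiance(
-                      state_star_b[i], theta, gamma_to_sun_b, channel_center[i] );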
-*/
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef _SKY_MODEL_H_
-# define _SKY_MODEL_H_
-
-typedef double ArHosekSkyModelConfiguration[9];
-
-// Spectral version of the model
-
-/* ----------------------------------------------------------------------------
-
- ArHosekSkyModelState struct
- ---------------------------
-
- This struct holds the pre-computation data for one particular albedo value.
- Most fields are self-explanatory, but users should never directly
- manipulate any of them anyway. The only consistent way to manipulate such
- structs is via the functions 'arhosekskymodelstate_alloc_init' and
- 'arhosekskymodelstate_free'.
-
- 'emission_correction_factor_sky'
- 'emission_correction_factor_sun'
-
- The original model coefficients were fitted against the emission of
- our local sun. If a different solar emission is desired (i.e. if the
- model is being used to predict skydome appearance for an earth-like
- planet that orbits a different star), these correction factors, which
- are determined during the alloc_init step, are applied to each waveband
- separately (they default to 1.0 in normal usage). This is the simplest
- way to retrofit this sort of capability to the existing model. The
- different factors for sky and sun are needed since the solar disc may
- be of a different size compared to the terrestrial sun.
-
----------------------------------------------------------------------------- */
-
-typedef struct ArHosekSkyModelState {
- ArHosekSkyModelConfiguration configs[11];
- double radiances[11];
- double turbidity;
- double solar_radius;
- double emission_correction_factor_sky[11];
- double emission_correction_factor_sun[11];
- double albedo;
- double elevation;
-} ArHosekSkyModelState;
-
-/* ----------------------------------------------------------------------------
-
- arhosekskymodelstate_alloc_init() function
- ------------------------------------------
-
- Initialises an ArHosekSkyModelState struct for a terrestrial setting.
-
----------------------------------------------------------------------------- */
-
-ArHosekSkyModelState *arhosekskymodelstate_alloc_init(const double solar_elevation,
- const double atmospheric_turbidity,
- const double ground_albedo);
-
-/* ----------------------------------------------------------------------------
-
- arhosekskymodelstate_alienworld_alloc_init() function
- -----------------------------------------------------
-
- Initialises an ArHosekSkyModelState struct for an "alien world" setting
- with a sun of a surface temperature given in 'kelvin'. The parameter
- 'solar_intensity' controls the overall brightness of the sky, relative
- to the solar irradiance on Earth. A value of 1.0 yields a sky dome that
-   is, on average over the wavelengths covered in the model (!), as bright
- as the terrestrial sky in radiometric terms.
-
- Which means that the solar radius has to be adjusted, since the
- emissivity of a solar surface with a given temperature is more or less
-   fixed. So hotter suns have to be smaller to be as bright as the
- terrestrial sun, while cooler suns have to be larger. Note that there are
- limits to the validity of the luminance patterns of the underlying model:
- see the discussion above for more on this. In particular, an alien sun with
- a surface temperature of only 2000 Kelvin has to be very large if it is
- to be as bright as the terrestrial sun - so large that the luminance
- patterns are no longer a really good fit in that case.
-
- If you need information about the solar radius that the model computes
- for a given temperature (say, for light source sampling purposes), you
- have to query the 'solar_radius' variable of the sky model state returned
- *after* running this function.
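-
-   Illustrative sketch, assuming 'state' was returned by this function:
-
-       double sun_radius = state->solar_radius;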
-
----------------------------------------------------------------------------- */
-
-ArHosekSkyModelState *arhosekskymodelstate_alienworld_alloc_init(
- const double solar_elevation,
- const double solar_intensity,
- const double solar_surface_temperature_kelvin,
- const double atmospheric_turbidity,
- const double ground_albedo);
-
-void arhosekskymodelstate_free(ArHosekSkyModelState *state);
-
-double arhosekskymodel_radiance(ArHosekSkyModelState *state,
- double theta,
- double gamma,
- double wavelength);
-
-// CIE XYZ and RGB versions
-
-ArHosekSkyModelState *arhosek_xyz_skymodelstate_alloc_init(const double turbidity,
- const double albedo,
- const double elevation);
-
-ArHosekSkyModelState *arhosek_rgb_skymodelstate_alloc_init(const double turbidity,
- const double albedo,
- const double elevation);
-
-double arhosek_tristim_skymodel_radiance(ArHosekSkyModelState *state,
- double theta,
- double gamma,
- int channel);
-
-// Delivers the complete function: sky + sun, including limb darkening.
-// Please read the above description before using this - there are several
-// caveats!
-
-double arhosekskymodel_solar_radiance(ArHosekSkyModelState *state,
- double theta,
- double gamma,
- double wavelength);
-
-#endif // _SKY_MODEL_H_
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_sky_model_data.h b/intern/cycles/util/util_sky_model_data.h
deleted file mode 100644
index a2a3935eb84..00000000000
--- a/intern/cycles/util/util_sky_model_data.h
+++ /dev/null
@@ -1,3847 +0,0 @@
-/*
-This source is published under the following 3-clause BSD license.
-
-Copyright (c) 2012 - 2013, Lukas Hosek and Alexander Wilkie
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * None of the names of the contributors may be used to endorse or promote
- products derived from this software without specific prior written
- permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* ============================================================================
-
-This file is part of a sample implementation of the analytical skylight and
-solar radiance models presented in the SIGGRAPH 2012 paper
-
-
- "An Analytic Model for Full Spectral Sky-Dome Radiance"
-
-and the 2013 IEEE CG&A paper
-
- "Adding a Solar Radiance Function to the Hosek Skylight Model"
-
- both by
-
- Lukas Hosek and Alexander Wilkie
- Charles University in Prague, Czech Republic
-
-
- Version: 1.4a, February 22nd, 2013
-
-Version history:
-
-1.4a February 22nd, 2013
- Removed unnecessary and counter-intuitive solar radius parameters
- from the interface of the colourspace sky dome initialisation functions.
-
-1.4 February 11th, 2013
- Fixed a bug which caused the relative brightness of the solar disc
- and the sky dome to be off by a factor of about 6. The sun was too
- bright: this affected both normal and alien sun scenarios. The
- coefficients of the solar radiance function were changed to fix this.
-
-1.3 January 21st, 2013 (not released to the public)
- Added support for solar discs that are not exactly the same size as
- the terrestrial sun. Also added support for suns with a different
- emission spectrum ("Alien World" functionality).
-
-1.2a December 18th, 2012
- Fixed a mistake and some inaccuracies in the solar radiance function
- explanations found in ArHosekSkyModel.h. The actual source code is
- unchanged compared to version 1.2.
-
-1.2 December 17th, 2012
- Native RGB data and a solar radiance function that matches the turbidity
- conditions were added.
-
-1.1 September 2012
- The coefficients of the spectral model are now scaled so that the output
-    is given in physical units: W / (m^2 * sr * nm). Also, the output of the
- XYZ model is now no longer scaled to the range [0...1]. Instead, it is
- the result of a simple conversion from spectral data via the CIE 2 degree
- standard observer matching functions. Therefore, after multiplication
-    with 683 lm / W, the Y channel now corresponds to luminance in cd/m^2.
-
-1.0 May 11th, 2012
- Initial release.
-
-
-Please visit http://cgg.mff.cuni.cz/projects/SkylightModelling/ to check if
-an updated version of this code has been published!
-
-============================================================================ */
-
-CCL_NAMESPACE_BEGIN
-
-/*
-
-This file contains the coefficient data for the XYZ colour space version of
-the model.
-
-*/
-
-// Uses Sep 9 pattern / Aug 23 mean dataset
-
-static const double datasetXYZ1[] = {
- // albedo 0, turbidity 1
- -1.117001e+000,
- -1.867262e-001,
- -1.113505e+001,
- 1.259865e+001,
- -3.937339e-002,
- 1.167571e+000,
- 7.100686e-003,
- 3.592678e+000,
- 6.083296e-001,
- -1.152006e+000,
- -1.926669e-001,
- 6.152049e+000,
- -4.770802e+000,
- -8.704701e-002,
- 7.483626e-001,
- 3.372718e-002,
- 4.464592e+000,
- 4.036546e-001,
- -1.072371e+000,
- -2.696632e-001,
- 2.816168e-001,
- 1.820571e+000,
- -3.742666e-001,
- 2.080607e+000,
- -7.675295e-002,
- -2.835366e+000,
- 1.129329e+000,
- -1.109935e+000,
- -1.532764e-001,
- 1.198787e+000,
- -9.015183e-001,
- 5.173015e-003,
- 5.749178e-001,
- 1.075633e-001,
- 4.387949e+000,
- 2.650413e-001,
- -1.052297e+000,
- -2.229452e-001,
- 1.952347e+000,
- 5.727205e-001,
- -4.885070e+000,
- 1.984016e+000,
- -1.106197e-001,
- -4.898361e-001,
- 8.907873e-001,
- -1.070108e+000,
- -1.600465e-001,
- 1.593886e+000,
- -4.479251e-005,
- -3.306541e+000,
- 9.390193e-001,
- 9.513168e-002,
- 2.343583e+000,
- 5.335404e-001,
- // albedo 0, turbidity 2
- -1.113253e+000,
- -1.699600e-001,
- -1.038822e+001,
- 1.137513e+001,
- -4.040911e-002,
- 1.037455e+000,
- 4.991792e-002,
- 4.801919e+000,
- 6.302710e-001,
- -1.135747e+000,
- -1.678594e-001,
- 4.970755e+000,
- -4.430230e+000,
- -6.657408e-002,
- 3.636161e-001,
- 1.558009e-001,
- 6.013370e+000,
- 3.959601e-001,
- -1.095892e+000,
- -2.732595e-001,
- 7.666496e-001,
- 1.350731e+000,
- -4.401401e-001,
- 2.470135e+000,
- -1.707929e-001,
- -3.260793e+000,
- 1.170337e+000,
- -1.073668e+000,
- -2.603929e-002,
- -1.944589e-001,
- 4.575207e-001,
- 6.878164e-001,
- -1.390770e-001,
- 3.690299e-001,
- 7.885781e+000,
- 1.877694e-001,
- -1.070091e+000,
- -2.798957e-001,
- 2.338478e+000,
- -2.647221e+000,
- -7.387808e+000,
- 2.329210e+000,
- -1.644639e-001,
- -2.003710e+000,
- 9.874527e-001,
- -1.067120e+000,
- -1.418866e-001,
- 1.254090e+000,
- 6.053048e+000,
- -2.918892e+000,
- 5.322812e-001,
- 1.613053e-001,
- 3.018161e+000,
- 5.274090e-001,
- // albedo 0, turbidity 3
- -1.129483e+000,
- -1.890619e-001,
- -9.065101e+000,
- 9.659923e+000,
- -3.607819e-002,
- 8.314359e-001,
- 8.181661e-002,
- 4.768868e+000,
- 6.339777e-001,
- -1.146420e+000,
- -1.883579e-001,
- 3.309173e+000,
- -3.127882e+000,
- -6.938176e-002,
- 3.987113e-001,
- 1.400581e-001,
- 6.283042e+000,
- 5.267076e-001,
- -1.128348e+000,
- -2.641305e-001,
- 1.223176e+000,
- 5.514952e-002,
- -3.490649e-001,
- 1.997784e+000,
- -4.123709e-002,
- -2.251251e+000,
- 9.483466e-001,
- -1.025820e+000,
- 1.404690e-002,
- -1.187406e+000,
- 2.729900e+000,
- 5.877588e-001,
- -2.761140e-001,
- 4.602633e-001,
- 8.305125e+000,
- 3.945001e-001,
- -1.083957e+000,
- -2.606679e-001,
- 2.207108e+000,
- -7.202803e+000,
- -5.968103e+000,
- 2.129455e+000,
- -7.789512e-002,
- -1.137688e+000,
- 8.871769e-001,
- -1.062465e+000,
- -1.512189e-001,
- 1.042881e+000,
- 1.427839e+001,
- -4.242214e+000,
- 4.038100e-001,
- 1.997780e-001,
- 2.814449e+000,
- 5.803196e-001,
- // albedo 0, turbidity 4
- -1.175099e+000,
- -2.410789e-001,
- -1.108587e+001,
- 1.133404e+001,
- -1.819300e-002,
- 6.772942e-001,
- 9.605043e-002,
- 4.231166e+000,
- 6.239972e-001,
- -1.224207e+000,
- -2.883527e-001,
- 3.002206e+000,
- -2.649612e+000,
- -4.795418e-002,
- 4.984398e-001,
- 3.251434e-002,
- 4.851611e+000,
- 6.551019e-001,
- -1.136955e+000,
- -2.423048e-001,
- 1.058823e+000,
- -2.489236e-001,
- -2.462179e-001,
- 1.933140e+000,
- 9.106828e-002,
- -1.905869e-001,
- 8.171065e-001,
- -1.014535e+000,
- -8.262500e-003,
- -1.448017e+000,
- 2.295788e+000,
- 3.510334e-001,
- -1.477418e+000,
- 5.432449e-001,
- 5.762796e+000,
- 4.908751e-001,
- -1.070666e+000,
- -2.379780e-001,
- 1.844589e+000,
- -5.442448e+000,
- -4.012768e+000,
- 2.945275e+000,
- 9.854725e-003,
- 8.455959e-002,
- 8.145030e-001,
- -1.071525e+000,
- -1.777132e-001,
- 8.076590e-001,
- 9.925865e+000,
- -3.324623e+000,
- -6.367437e-001,
- 2.844581e-001,
- 2.248384e+000,
- 6.544022e-001,
- // albedo 0, turbidity 5
- -1.218818e+000,
- -2.952382e-001,
- -1.345975e+001,
- 1.347153e+001,
- -6.814585e-003,
- 5.079068e-001,
- 1.197230e-001,
- 3.776949e+000,
- 5.836961e-001,
- -1.409868e+000,
- -5.114330e-001,
- 2.776539e+000,
- -2.039001e+000,
- -2.673769e-002,
- 4.145288e-001,
- 7.829342e-004,
- 2.275883e+000,
- 6.629691e-001,
- -1.069151e+000,
- -9.434247e-002,
- 7.293972e-001,
- -1.222473e+000,
- -1.533461e-001,
- 2.160357e+000,
- 4.626837e-002,
- 3.852415e+000,
- 8.593570e-001,
- -1.021306e+000,
- -1.149551e-001,
- -1.108414e+000,
- 4.178343e+000,
- 4.013665e-001,
- -2.222814e+000,
- 6.929462e-001,
- 1.392652e+000,
- 4.401662e-001,
- -1.074251e+000,
- -2.224002e-001,
- 1.372356e+000,
- -8.858704e+000,
- -3.922660e+000,
- 3.020018e+000,
- -1.458724e-002,
- 1.511186e+000,
- 8.288064e-001,
- -1.062048e+000,
- -1.526582e-001,
- 4.921067e-001,
- 1.485522e+001,
- -3.229936e+000,
- -8.426604e-001,
- 3.916243e-001,
- 2.678994e+000,
- 6.689264e-001,
- // albedo 0, turbidity 6
- -1.257023e+000,
- -3.364700e-001,
- -1.527795e+001,
- 1.504223e+001,
- 2.717715e-003,
- 3.029910e-001,
- 1.636851e-001,
- 3.561663e+000,
- 5.283161e-001,
- -1.635124e+000,
- -7.329993e-001,
- 3.523939e+000,
- -2.566337e+000,
- -1.902543e-002,
- 5.505483e-001,
- -6.242176e-002,
- 1.065992e+000,
- 6.654236e-001,
- -9.295823e-001,
- 4.845834e-002,
- -2.992990e-001,
- -2.001327e-001,
- -8.019339e-002,
- 1.807806e+000,
- 9.020277e-002,
- 5.095372e+000,
- 8.639936e-001,
- -1.093740e+000,
- -2.148608e-001,
- -5.216240e-001,
- 2.119777e+000,
- 9.506454e-002,
- -1.831439e+000,
- 6.961204e-001,
- 1.102084e-001,
- 4.384319e-001,
- -1.044181e+000,
- -1.849257e-001,
- 9.071246e-001,
- -4.648901e+000,
- -2.279385e+000,
- 2.356502e+000,
- -4.169147e-002,
- 1.932557e+000,
- 8.296550e-001,
- -1.061451e+000,
- -1.458745e-001,
- 2.952267e-001,
- 8.967214e+000,
- -3.726228e+000,
- -5.022316e-001,
- 5.684877e-001,
- 3.102347e+000,
- 6.658443e-001,
- // albedo 0, turbidity 7
- -1.332391e+000,
- -4.127769e-001,
- -9.328643e+000,
- 9.046194e+000,
- 3.457775e-003,
- 3.377425e-001,
- 1.530909e-001,
- 3.301209e+000,
- 4.997917e-001,
- -1.932002e+000,
- -9.947777e-001,
- -2.042329e+000,
- 3.586940e+000,
- -5.642182e-002,
- 8.130478e-001,
- -8.195988e-002,
- 1.118294e-001,
- 5.617231e-001,
- -8.707374e-001,
- 1.286999e-001,
- 1.820054e+000,
- -4.674706e+000,
- 3.317471e-003,
- 5.919018e-001,
- 1.975278e-001,
- 6.686519e+000,
- 9.631727e-001,
- -1.070378e+000,
- -3.030579e-001,
- -9.041938e-001,
- 6.200201e+000,
- 1.232207e-001,
- -3.650628e-001,
- 5.029403e-001,
- -2.903162e+000,
- 3.811408e-001,
- -1.063035e+000,
- -1.637545e-001,
- 5.853072e-001,
- -7.889906e+000,
- -1.200641e+000,
- 1.035018e+000,
- 1.192093e-001,
- 3.267054e+000,
- 8.416151e-001,
- -1.053655e+000,
- -1.562286e-001,
- 2.423683e-001,
- 1.128575e+001,
- -4.363262e+000,
- -7.314160e-002,
- 5.642088e-001,
- 2.514023e+000,
- 6.670457e-001,
- // albedo 0, turbidity 8
- -1.366112e+000,
- -4.718287e-001,
- -7.876222e+000,
- 7.746900e+000,
- -9.182309e-003,
- 4.716076e-001,
- 8.320252e-002,
- 3.165603e+000,
- 5.392334e-001,
- -2.468204e+000,
- -1.336340e+000,
- -5.386723e+000,
- 7.072672e+000,
- -8.329266e-002,
- 8.636876e-001,
- -1.978177e-002,
- -1.326218e-001,
- 2.979222e-001,
- -9.653522e-001,
- -2.373416e-002,
- 1.810250e+000,
- -6.467262e+000,
- 1.410706e-001,
- -4.753717e-001,
- 3.003095e-001,
- 6.551163e+000,
- 1.151083e+000,
- -8.943186e-001,
- -2.487152e-001,
- -2.308960e-001,
- 8.512648e+000,
- 1.298402e-001,
- 1.034705e+000,
- 2.303509e-001,
- -3.924095e+000,
- 2.982717e-001,
- -1.146999e+000,
- -2.318784e-001,
- 8.992419e-002,
- -9.933614e+000,
- -8.860920e-001,
- -3.071656e-002,
- 2.852012e-001,
- 3.046199e+000,
- 8.599001e-001,
- -1.032399e+000,
- -1.645145e-001,
- 2.683599e-001,
- 1.327701e+001,
- -4.407670e+000,
- 7.709869e-002,
- 4.951727e-001,
- 1.957277e+000,
- 6.630943e-001,
- // albedo 0, turbidity 9
- -1.469070e+000,
- -6.135092e-001,
- -6.506263e+000,
- 6.661315e+000,
- -3.835383e-002,
- 7.150413e-001,
- 7.784318e-003,
- 2.820577e+000,
- 6.756784e-001,
- -2.501583e+000,
- -1.247404e+000,
- -1.523462e+001,
- 1.633191e+001,
- -1.204803e-002,
- 5.896471e-001,
- -2.002023e-002,
- 1.144647e+000,
- 6.177874e-002,
- -2.438672e+000,
- -1.127291e+000,
- 5.731172e+000,
- -1.021350e+001,
- 6.165610e-002,
- -7.752641e-001,
- 4.708254e-001,
- 4.176847e+000,
- 1.200881e+000,
- -1.513427e-001,
- 9.792731e-002,
- -1.612349e+000,
- 9.814289e+000,
- 5.188921e-002,
- 1.716403e+000,
- -7.039255e-002,
- -2.815115e+000,
- 3.291874e-001,
- -1.318511e+000,
- -3.650554e-001,
- 4.221268e-001,
- -9.294529e+000,
- -4.397520e-002,
- -8.100625e-001,
- 3.742719e-001,
- 1.834166e+000,
- 8.223450e-001,
- -1.016009e+000,
- -1.820264e-001,
- 1.278426e-001,
- 1.182696e+001,
- -4.801528e+000,
- 4.947899e-001,
- 4.660378e-001,
- 1.601254e+000,
- 6.702359e-001,
- // albedo 0, turbidity 10
- -1.841310e+000,
- -9.781779e-001,
- -4.610903e+000,
- 4.824662e+000,
- -5.100806e-002,
- 6.463776e-001,
- -6.377724e-006,
- 2.216875e+000,
- 8.618530e-001,
- -2.376373e+000,
- -1.108657e+000,
- -1.489799e+001,
- 1.546458e+001,
- 4.091025e-002,
- 9.761780e-002,
- -1.048958e-002,
- 2.165834e+000,
- -1.609171e-001,
- -4.710318e+000,
- -2.261963e+000,
- 6.947327e+000,
- -1.034828e+001,
- -1.325542e-001,
- 7.508674e-001,
- 2.247553e-001,
- 2.873142e+000,
- 1.297100e+000,
- 2.163750e-001,
- -1.944345e-001,
- -2.437860e+000,
- 1.011314e+001,
- 4.450500e-001,
- 3.111492e-001,
- 2.751323e-001,
- -1.627906e+000,
- 2.531213e-001,
- -1.258794e+000,
- -3.524641e-001,
- 8.425444e-001,
- -1.085313e+001,
- -1.154381e+000,
- -4.638014e-001,
- -2.781115e-003,
- 4.344498e-001,
- 8.507091e-001,
- -1.018938e+000,
- -1.804153e-001,
- -6.354054e-002,
- 1.573150e+001,
- -4.386999e+000,
- 6.211115e-001,
- 5.294648e-001,
- 1.580749e+000,
- 6.586655e-001,
- // albedo 1, turbidity 1
- -1.116416e+000,
- -1.917524e-001,
- -1.068233e+001,
- 1.222221e+001,
- -3.668978e-002,
- 1.054022e+000,
- 1.592132e-002,
- 3.180583e+000,
- 5.627370e-001,
- -1.132341e+000,
- -1.671286e-001,
- 5.976499e+000,
- -4.227366e+000,
- -9.542489e-002,
- 8.664938e-001,
- 8.351793e-003,
- 4.876068e+000,
- 4.492779e-001,
- -1.087635e+000,
- -3.173679e-001,
- 4.314407e-001,
- 1.100555e+000,
- -4.410057e-001,
- 1.677253e+000,
- -3.005925e-002,
- -4.201249e+000,
- 1.070902e+000,
- -1.083031e+000,
- -8.847705e-002,
- 1.291773e+000,
- 4.546776e-001,
- 3.091894e-001,
- 7.261760e-001,
- 4.203659e-002,
- 5.990615e+000,
- 3.704756e-001,
- -1.057899e+000,
- -2.246706e-001,
- 2.329563e+000,
- -1.219656e+000,
- -5.335260e+000,
- 8.545378e-001,
- -3.906209e-002,
- -9.025499e-001,
- 7.797348e-001,
- -1.073305e+000,
- -1.522553e-001,
- 1.767063e+000,
- 1.904280e+000,
- -3.101673e+000,
- 3.995856e-001,
- 2.905192e-002,
- 2.563977e+000,
- 5.753067e-001,
- // albedo 1, turbidity 2
- -1.113674e+000,
- -1.759694e-001,
- -9.754125e+000,
- 1.087391e+001,
- -3.841093e-002,
- 9.524272e-001,
- 5.680219e-002,
- 4.227034e+000,
- 6.029571e-001,
- -1.126496e+000,
- -1.680281e-001,
- 5.332352e+000,
- -4.575579e+000,
- -6.761755e-002,
- 3.295335e-001,
- 1.194896e-001,
- 5.570901e+000,
- 4.536185e-001,
- -1.103074e+000,
- -2.681801e-001,
- 6.571479e-002,
- 2.396522e+000,
- -4.551280e-001,
- 2.466331e+000,
- -1.232022e-001,
- -3.023201e+000,
- 1.086379e+000,
- -1.053299e+000,
- -2.697173e-002,
- 8.379121e-001,
- -9.681458e-001,
- 5.890692e-001,
- -4.872027e-001,
- 2.936929e-001,
- 7.510139e+000,
- 3.079122e-001,
- -1.079553e+000,
- -2.710448e-001,
- 2.462379e+000,
- -3.713554e-001,
- -8.534512e+000,
- 1.828242e+000,
- -1.686398e-001,
- -1.961340e+000,
- 8.941077e-001,
- -1.069741e+000,
- -1.396394e-001,
- 1.657868e+000,
- 3.236313e+000,
- -2.706344e+000,
- -2.948122e-001,
- 1.314816e-001,
- 2.868457e+000,
- 5.413403e-001,
- // albedo 1, turbidity 3
- -1.131649e+000,
- -1.954455e-001,
- -7.751595e+000,
- 8.685861e+000,
- -4.910871e-002,
- 8.992952e-001,
- 4.710143e-002,
- 4.254818e+000,
- 6.821116e-001,
- -1.156689e+000,
- -1.884324e-001,
- 3.163519e+000,
- -3.091522e+000,
- -6.613927e-002,
- -2.575883e-002,
- 1.640065e-001,
- 6.073643e+000,
- 4.453468e-001,
- -1.079224e+000,
- -2.621389e-001,
- 9.446437e-001,
- 1.448479e+000,
- -3.969384e-001,
- 2.626638e+000,
- -8.101186e-002,
- -3.016355e+000,
- 1.076295e+000,
- -1.080832e+000,
- 1.033057e-002,
- -3.500156e-001,
- -3.281419e-002,
- 5.655512e-001,
- -1.156742e+000,
- 4.534710e-001,
- 8.774122e+000,
- 2.772869e-001,
- -1.051202e+000,
- -2.679975e-001,
- 2.719109e+000,
- -2.190316e+000,
- -6.878798e+000,
- 2.250481e+000,
- -2.030252e-001,
- -2.026527e+000,
- 9.701096e-001,
- -1.089849e+000,
- -1.598589e-001,
- 1.564748e+000,
- 6.869187e+000,
- -3.053670e+000,
- -6.110435e-001,
- 1.644472e-001,
- 2.370452e+000,
- 5.511770e-001,
- // albedo 1, turbidity 4
- -1.171419e+000,
- -2.429746e-001,
- -8.991334e+000,
- 9.571216e+000,
- -2.772861e-002,
- 6.688262e-001,
- 7.683478e-002,
- 3.785611e+000,
- 6.347635e-001,
- -1.228554e+000,
- -2.917562e-001,
- 2.753986e+000,
- -2.491780e+000,
- -4.663434e-002,
- 3.118303e-001,
- 7.546506e-002,
- 4.463096e+000,
- 5.955071e-001,
- -1.093124e+000,
- -2.447767e-001,
- 9.097406e-001,
- 5.448296e-001,
- -2.957824e-001,
- 2.024167e+000,
- -5.152333e-004,
- -1.069081e+000,
- 9.369565e-001,
- -1.056994e+000,
- 1.569507e-002,
- -8.217491e-001,
- 1.870818e+000,
- 7.061930e-001,
- -1.483928e+000,
- 5.978206e-001,
- 6.864902e+000,
- 3.673332e-001,
- -1.054871e+000,
- -2.758129e-001,
- 2.712807e+000,
- -5.950110e+000,
- -6.554039e+000,
- 2.447523e+000,
- -1.895171e-001,
- -1.454292e+000,
- 9.131738e-001,
- -1.100218e+000,
- -1.746241e-001,
- 1.438505e+000,
- 1.115481e+001,
- -3.266076e+000,
- -8.837357e-001,
- 1.970100e-001,
- 1.991595e+000,
- 5.907821e-001,
- // albedo 1, turbidity 5
- -1.207267e+000,
- -2.913610e-001,
- -1.103767e+001,
- 1.140724e+001,
- -1.416800e-002,
- 5.564047e-001,
- 8.476262e-002,
- 3.371255e+000,
- 6.221335e-001,
- -1.429698e+000,
- -5.374218e-001,
- 2.837524e+000,
- -2.221936e+000,
- -2.422337e-002,
- 9.313758e-002,
- 7.190250e-002,
- 1.869022e+000,
- 5.609035e-001,
- -1.002274e+000,
- -6.972810e-002,
- 4.031308e-001,
- -3.932997e-001,
- -1.521923e-001,
- 2.390646e+000,
- -6.893990e-002,
- 2.999661e+000,
- 1.017843e+000,
- -1.081168e+000,
- -1.178666e-001,
- -4.968080e-001,
- 3.919299e+000,
- 6.046866e-001,
- -2.440615e+000,
- 7.891538e-001,
- 2.140835e+000,
- 2.740470e-001,
- -1.050727e+000,
- -2.307688e-001,
- 2.276396e+000,
- -9.454407e+000,
- -5.505176e+000,
- 2.992620e+000,
- -2.450942e-001,
- 6.078372e-001,
- 9.606765e-001,
- -1.103752e+000,
- -1.810202e-001,
- 1.375044e+000,
- 1.589095e+001,
- -3.438954e+000,
- -1.265669e+000,
- 2.475172e-001,
- 1.680768e+000,
- 5.978056e-001,
- // albedo 1, turbidity 6
- -1.244324e+000,
- -3.378542e-001,
- -1.111001e+001,
- 1.137784e+001,
- -7.896794e-003,
- 4.808023e-001,
- 9.249904e-002,
- 3.025816e+000,
- 5.880239e-001,
- -1.593165e+000,
- -7.027621e-001,
- 2.220896e+000,
- -1.437709e+000,
- -1.534738e-002,
- 6.286958e-002,
- 6.644555e-002,
- 1.091727e+000,
- 5.470080e-001,
- -9.136506e-001,
- 1.344874e-002,
- 7.772636e-001,
- -1.209396e+000,
- -1.408978e-001,
- 2.433718e+000,
- -1.041938e-001,
- 3.791244e+000,
- 1.037916e+000,
- -1.134968e+000,
- -1.803315e-001,
- -9.267335e-001,
- 4.576670e+000,
- 6.851928e-001,
- -2.805000e+000,
- 8.687208e-001,
- 1.161483e+000,
- 2.571688e-001,
- -1.017037e+000,
- -2.053943e-001,
- 2.361640e+000,
- -9.887818e+000,
- -5.122889e+000,
- 3.287088e+000,
- -2.594102e-001,
- 8.578927e-001,
- 9.592340e-001,
- -1.118723e+000,
- -1.934942e-001,
- 1.226023e+000,
- 1.674140e+001,
- -3.277335e+000,
- -1.629809e+000,
- 2.765232e-001,
- 1.637713e+000,
- 6.113963e-001,
- // albedo 1, turbidity 7
- -1.314779e+000,
- -4.119915e-001,
- -1.241150e+001,
- 1.241578e+001,
- 2.344284e-003,
- 2.980837e-001,
- 1.414613e-001,
- 2.781731e+000,
- 4.998556e-001,
- -1.926199e+000,
- -1.020038e+000,
- 2.569200e+000,
- -1.081159e+000,
- -2.266833e-002,
- 3.588668e-001,
- 8.750078e-003,
- -2.452171e-001,
- 4.796758e-001,
- -7.780002e-001,
- 1.850647e-001,
- 4.445456e-002,
- -2.409297e+000,
- -7.816346e-002,
- 1.546790e+000,
- -2.807227e-002,
- 5.998176e+000,
- 1.132396e+000,
- -1.179326e+000,
- -3.578330e-001,
- -2.392933e-001,
- 6.467883e+000,
- 5.904596e-001,
- -1.869975e+000,
- 8.045839e-001,
- -2.498121e+000,
- 1.610633e-001,
- -1.009956e+000,
- -1.311896e-001,
- 1.726577e+000,
- -1.219356e+001,
- -3.466239e+000,
- 2.343602e+000,
- -2.252205e-001,
- 2.573681e+000,
- 1.027109e+000,
- -1.112460e+000,
- -2.063093e-001,
- 1.233051e+000,
- 2.058946e+001,
- -4.578074e+000,
- -1.145643e+000,
- 3.160192e-001,
- 1.420159e+000,
- 5.860212e-001,
- // albedo 1, turbidity 8
- -1.371689e+000,
- -4.914196e-001,
- -1.076610e+001,
- 1.107405e+001,
- -1.485077e-002,
- 5.936218e-001,
- 3.685482e-002,
- 2.599968e+000,
- 6.002204e-001,
- -2.436997e+000,
- -1.377939e+000,
- 2.130141e-002,
- 1.079593e+000,
- -1.796232e-002,
- -3.933248e-002,
- 1.610711e-001,
- -6.901181e-001,
- 1.206416e-001,
- -8.743368e-001,
- 7.331370e-002,
- 8.734259e-001,
- -3.743126e+000,
- -3.151167e-002,
- 1.297596e+000,
- -7.634926e-002,
- 6.532873e+000,
- 1.435737e+000,
- -9.810197e-001,
- -3.521634e-001,
- -2.855205e-001,
- 7.134674e+000,
- 6.839748e-001,
- -1.394841e+000,
- 6.952036e-001,
- -4.633104e+000,
- -2.173401e-002,
- -1.122958e+000,
- -1.691536e-001,
- 1.382360e+000,
- -1.102913e+001,
- -2.608171e+000,
- 1.865111e+000,
- -1.345154e-001,
- 3.112342e+000,
- 1.094134e+000,
- -1.075586e+000,
- -2.077415e-001,
- 1.171477e+000,
- 1.793270e+001,
- -4.656858e+000,
- -1.036839e+000,
- 3.338295e-001,
- 1.042793e+000,
- 5.739374e-001,
- // albedo 1, turbidity 9
- -1.465871e+000,
- -6.364486e-001,
- -8.833718e+000,
- 9.343650e+000,
- -3.223600e-002,
- 7.552848e-001,
- -3.121341e-006,
- 2.249164e+000,
- 8.094662e-001,
- -2.448924e+000,
- -1.270878e+000,
- -4.823703e+000,
- 5.853058e+000,
- -2.149127e-002,
- 3.581132e-002,
- -1.230276e-003,
- 4.892553e-001,
- -1.597657e-001,
- -2.419809e+000,
- -1.071337e+000,
- 1.575648e+000,
- -4.983580e+000,
- 9.545185e-003,
- 5.032615e-001,
- 4.186266e-001,
- 4.634147e+000,
- 1.433517e+000,
- -1.383278e-001,
- -2.797095e-002,
- -1.943067e-001,
- 6.679623e+000,
- 4.118280e-001,
- -2.744289e-001,
- -2.118722e-002,
- -4.337025e+000,
- 1.505072e-001,
- -1.341872e+000,
- -2.518572e-001,
- 1.027009e+000,
- -6.527103e+000,
- -1.081271e+000,
- 1.015465e+000,
- 2.845789e-001,
- 2.470371e+000,
- 9.278120e-001,
- -1.040640e+000,
- -2.367454e-001,
- 1.100744e+000,
- 8.827253e+000,
- -4.560794e+000,
- -7.287017e-001,
- 2.842503e-001,
- 6.336593e-001,
- 6.327335e-001,
- // albedo 1, turbidity 10
- -1.877993e+000,
- -1.025135e+000,
- -4.311037e+000,
- 4.715016e+000,
- -4.711631e-002,
- 6.335844e-001,
- -7.665398e-006,
- 1.788017e+000,
- 9.001409e-001,
- -2.281540e+000,
- -1.137668e+000,
- -1.036869e+001,
- 1.136254e+001,
- 1.961739e-002,
- -9.836174e-002,
- -6.734567e-003,
- 1.320918e+000,
- -2.400807e-001,
- -4.904054e+000,
- -2.315781e+000,
- 5.735999e+000,
- -8.626257e+000,
- -1.255643e-001,
- 1.545446e+000,
- 1.396860e-001,
- 2.972897e+000,
- 1.429934e+000,
- 4.077067e-001,
- -1.833688e-001,
- -2.450939e+000,
- 9.119433e+000,
- 4.505361e-001,
- -1.340828e+000,
- 3.973690e-001,
- -1.785370e+000,
- 9.628711e-002,
- -1.296052e+000,
- -3.250526e-001,
- 1.813294e+000,
- -1.031485e+001,
- -1.388690e+000,
- 1.239733e+000,
- -8.989196e-002,
- -3.389637e-001,
- 9.639560e-001,
- -1.062181e+000,
- -2.423444e-001,
- 7.577592e-001,
- 1.566938e+001,
- -4.462264e+000,
- -5.742810e-001,
- 3.262259e-001,
- 9.461672e-001,
- 6.232887e-001,
-};
-
-static const double datasetXYZRad1[] = {
- // albedo 0, turbidity 1
- 1.560219e+000,
- 1.417388e+000,
- 1.206927e+000,
- 1.091949e+001,
- 5.931416e+000,
- 7.304788e+000,
- // albedo 0, turbidity 2
- 1.533049e+000,
- 1.560532e+000,
- 3.685059e-001,
- 1.355040e+001,
- 5.543711e+000,
- 7.792189e+000,
- // albedo 0, turbidity 3
- 1.471043e+000,
- 1.746088e+000,
- -9.299697e-001,
- 1.720362e+001,
- 5.473384e+000,
- 8.336416e+000,
- // albedo 0, turbidity 4
- 1.355991e+000,
- 2.109348e+000,
- -3.295855e+000,
- 2.264843e+001,
- 5.454607e+000,
- 9.304656e+000,
- // albedo 0, turbidity 5
- 1.244963e+000,
- 2.547533e+000,
- -5.841485e+000,
- 2.756879e+001,
- 5.576104e+000,
- 1.043287e+001,
- // albedo 0, turbidity 6
- 1.175532e+000,
- 2.784634e+000,
- -7.212225e+000,
- 2.975347e+001,
- 6.472980e+000,
- 1.092331e+001,
- // albedo 0, turbidity 7
- 1.082973e+000,
- 3.118094e+000,
- -8.934293e+000,
- 3.186879e+001,
- 8.473885e+000,
- 1.174019e+001,
- // albedo 0, turbidity 8
- 9.692500e-001,
- 3.349574e+000,
- -1.003810e+001,
- 3.147654e+001,
- 1.338931e+001,
- 1.272547e+001,
- // albedo 0, turbidity 9
- 8.547044e-001,
- 3.151538e+000,
- -9.095567e+000,
- 2.554995e+001,
- 2.273219e+001,
- 1.410398e+001,
- // albedo 0, turbidity 10
- 7.580340e-001,
- 2.311153e+000,
- -5.170814e+000,
- 1.229669e+001,
- 3.686529e+001,
- 1.598882e+001,
- // albedo 1, turbidity 1
- 1.664273e+000,
- 1.574468e+000,
- 1.422078e+000,
- 9.768247e+000,
- 1.447338e+001,
- 1.644988e+001,
- // albedo 1, turbidity 2
- 1.638295e+000,
- 1.719586e+000,
- 5.786675e-001,
- 1.239846e+001,
- 1.415419e+001,
- 1.728605e+001,
- // albedo 1, turbidity 3
- 1.572623e+000,
- 1.921559e+000,
- -7.714802e-001,
- 1.609246e+001,
- 1.420954e+001,
- 1.825908e+001,
- // albedo 1, turbidity 4
- 1.468395e+000,
- 2.211970e+000,
- -2.845869e+000,
- 2.075027e+001,
- 1.524822e+001,
- 1.937622e+001,
- // albedo 1, turbidity 5
- 1.355047e+000,
- 2.556469e+000,
- -4.960920e+000,
- 2.460237e+001,
- 1.648360e+001,
- 2.065648e+001,
- // albedo 1, turbidity 6
- 1.291642e+000,
- 2.742036e+000,
- -6.061967e+000,
- 2.602002e+001,
- 1.819144e+001,
- 2.116712e+001,
- // albedo 1, turbidity 7
- 1.194565e+000,
- 2.972120e+000,
- -7.295779e+000,
- 2.691805e+001,
- 2.124880e+001,
- 2.201819e+001,
- // albedo 1, turbidity 8
- 1.083631e+000,
- 3.047021e+000,
- -7.766096e+000,
- 2.496261e+001,
- 2.744264e+001,
- 2.291875e+001,
- // albedo 1, turbidity 9
- 9.707994e-001,
- 2.736459e+000,
- -6.308284e+000,
- 1.760860e+001,
- 3.776291e+001,
- 2.392150e+001,
- // albedo 1, turbidity 10
- 8.574294e-001,
- 1.865155e+000,
- -2.364707e+000,
- 4.337793e+000,
- 5.092831e+001,
- 2.523432e+001,
-};
-
-static const double datasetXYZ2[] = {
- // albedo 0, turbidity 1
- -1.127942e+000,
- -1.905548e-001,
- -1.252356e+001,
- 1.375799e+001,
- -3.624732e-002,
- 1.055453e+000,
- 1.385036e-002,
- 4.176970e+000,
- 5.928345e-001,
- -1.155260e+000,
- -1.778135e-001,
- 6.216056e+000,
- -5.254116e+000,
- -8.787445e-002,
- 8.434621e-001,
- 4.025734e-002,
- 6.195322e+000,
- 3.111856e-001,
- -1.125624e+000,
- -3.217593e-001,
- 5.043919e-001,
- 1.686284e+000,
- -3.536071e-001,
- 1.476321e+000,
- -7.899019e-002,
- -4.522531e+000,
- 1.271691e+000,
- -1.081801e+000,
- -1.033234e-001,
- 9.995550e-001,
- 7.482946e-003,
- -6.776018e-002,
- 1.463141e+000,
- 9.492021e-002,
- 5.612723e+000,
- 1.298846e-001,
- -1.075320e+000,
- -2.402711e-001,
- 2.141284e+000,
- -1.203359e+000,
- -4.945188e+000,
- 1.437221e+000,
- -8.096750e-002,
- -1.028378e+000,
- 1.004164e+000,
- -1.073337e+000,
- -1.516517e-001,
- 1.639379e+000,
- 2.304669e+000,
- -3.214244e+000,
- 1.286245e+000,
- 5.613957e-002,
- 2.480902e+000,
- 4.999363e-001,
- // albedo 0, turbidity 2
- -1.128399e+000,
- -1.857793e-001,
- -1.089863e+001,
- 1.172984e+001,
- -3.768099e-002,
- 9.439285e-001,
- 4.869335e-002,
- 4.845114e+000,
- 6.119211e-001,
- -1.114002e+000,
- -1.399280e-001,
- 4.963800e+000,
- -4.685500e+000,
- -7.780879e-002,
- 4.049736e-001,
- 1.586297e-001,
- 7.770264e+000,
- 3.449006e-001,
- -1.185472e+000,
- -3.403543e-001,
- 6.588322e-001,
- 1.133713e+000,
- -4.118674e-001,
- 2.061191e+000,
- -1.882768e-001,
- -4.372586e+000,
- 1.223530e+000,
- -1.002272e+000,
- 2.000703e-002,
- 7.073269e-002,
- 1.485075e+000,
- 5.005589e-001,
- 4.301494e-001,
- 3.626541e-001,
- 7.921098e+000,
- 1.574766e-001,
- -1.121006e+000,
- -3.007777e-001,
- 2.242051e+000,
- -4.571561e+000,
- -7.761071e+000,
- 2.053404e+000,
- -1.524018e-001,
- -1.886162e+000,
- 1.018208e+000,
- -1.058864e+000,
- -1.358673e-001,
- 1.389667e+000,
- 8.633409e+000,
- -3.437249e+000,
- 7.295429e-001,
- 1.514700e-001,
- 2.842513e+000,
- 5.014325e-001,
- // albedo 0, turbidity 3
- -1.144464e+000,
- -2.043799e-001,
- -1.020188e+001,
- 1.071247e+001,
- -3.256693e-002,
- 7.860205e-001,
- 6.872719e-002,
- 4.824771e+000,
- 6.259836e-001,
- -1.170104e+000,
- -2.118626e-001,
- 4.391405e+000,
- -4.198900e+000,
- -7.111559e-002,
- 3.890442e-001,
- 1.024831e-001,
- 6.282535e+000,
- 5.365688e-001,
- -1.129171e+000,
- -2.552880e-001,
- 2.238298e-001,
- 7.314295e-001,
- -3.562730e-001,
- 1.881931e+000,
- -3.078716e-002,
- -1.039120e+000,
- 9.096301e-001,
- -1.042294e+000,
- 4.450203e-003,
- -5.116033e-001,
- 2.627589e+000,
- 6.098996e-001,
- -1.264638e-001,
- 4.325281e-001,
- 7.080503e+000,
- 4.583646e-001,
- -1.082293e+000,
- -2.723056e-001,
- 2.065076e+000,
- -8.143133e+000,
- -7.892212e+000,
- 2.142231e+000,
- -7.106240e-002,
- -1.122398e+000,
- 8.338505e-001,
- -1.071715e+000,
- -1.426568e-001,
- 1.095351e+000,
- 1.729783e+001,
- -3.851931e+000,
- 4.360514e-001,
- 2.114440e-001,
- 2.970832e+000,
- 5.944389e-001,
- // albedo 0, turbidity 4
- -1.195909e+000,
- -2.590449e-001,
- -1.191037e+001,
- 1.207947e+001,
- -1.589842e-002,
- 6.297846e-001,
- 9.054772e-002,
- 4.285959e+000,
- 5.933752e-001,
- -1.245763e+000,
- -3.316637e-001,
- 4.293660e+000,
- -3.694011e+000,
- -4.699947e-002,
- 4.843684e-001,
- 2.130425e-002,
- 4.097549e+000,
- 6.530809e-001,
- -1.148742e+000,
- -1.902509e-001,
- -2.393233e-001,
- -2.441254e-001,
- -2.610918e-001,
- 1.846988e+000,
- 3.532866e-002,
- 2.660106e+000,
- 8.358294e-001,
- -1.016080e+000,
- -7.444960e-002,
- -5.053436e-001,
- 4.388855e+000,
- 6.054987e-001,
- -1.208300e+000,
- 5.817215e-001,
- 2.543570e+000,
- 4.726568e-001,
- -1.072027e+000,
- -2.101440e-001,
- 1.518378e+000,
- -1.060119e+001,
- -6.016546e+000,
- 2.649475e+000,
- -5.166992e-002,
- 1.571269e+000,
- 8.344622e-001,
- -1.072365e+000,
- -1.511201e-001,
- 7.478010e-001,
- 1.900732e+001,
- -3.950387e+000,
- -3.473907e-001,
- 3.797211e-001,
- 2.782949e+000,
- 6.296808e-001,
- // albedo 0, turbidity 5
- -1.239423e+000,
- -3.136289e-001,
- -1.351100e+001,
- 1.349468e+001,
- -7.070423e-003,
- 5.012315e-001,
- 1.106008e-001,
- 3.803619e+000,
- 5.577948e-001,
- -1.452524e+000,
- -5.676944e-001,
- 2.993153e+000,
- -2.277288e+000,
- -2.168954e-002,
- 3.056720e-001,
- 1.152338e-002,
- 1.852697e+000,
- 6.427228e-001,
- -1.061421e+000,
- -4.590521e-002,
- 6.057022e-001,
- -1.096835e+000,
- -1.504952e-001,
- 2.344921e+000,
- -5.491832e-002,
- 5.268322e+000,
- 9.082253e-001,
- -1.042373e+000,
- -1.769498e-001,
- -1.075388e+000,
- 3.831712e+000,
- 3.154140e-001,
- -2.416458e+000,
- 7.909032e-001,
- -1.492892e-002,
- 3.854049e-001,
- -1.064159e+000,
- -1.892684e-001,
- 1.438685e+000,
- -8.166362e+000,
- -3.616364e+000,
- 3.275206e+000,
- -1.203825e-001,
- 2.039491e+000,
- 8.688057e-001,
- -1.070120e+000,
- -1.569508e-001,
- 4.124760e-001,
- 1.399683e+001,
- -3.547085e+000,
- -1.046326e+000,
- 4.973825e-001,
- 2.791231e+000,
- 6.503286e-001,
- // albedo 0, turbidity 6
- -1.283579e+000,
- -3.609518e-001,
- -1.335397e+001,
- 1.315248e+001,
- -4.431938e-004,
- 3.769526e-001,
- 1.429824e-001,
- 3.573613e+000,
- 4.998696e-001,
- -1.657952e+000,
- -7.627948e-001,
- 1.958222e+000,
- -7.949816e-001,
- -2.882837e-002,
- 5.356149e-001,
- -5.191946e-002,
- 8.869955e-001,
- 6.263320e-001,
- -9.527600e-001,
- 6.494189e-002,
- 5.361303e-001,
- -2.129590e+000,
- -9.258630e-002,
- 1.604776e+000,
- 5.067770e-002,
- 6.376055e+000,
- 9.138052e-001,
- -1.080827e+000,
- -2.523120e-001,
- -7.154262e-001,
- 4.120085e+000,
- 1.878228e-001,
- -1.492158e+000,
- 6.881655e-001,
- -1.446611e+000,
- 4.040631e-001,
- -1.054075e+000,
- -1.665498e-001,
- 9.191052e-001,
- -6.636943e+000,
- -1.894826e+000,
- 2.107810e+000,
- -3.680499e-002,
- 2.655452e+000,
- 8.413840e-001,
- -1.061127e+000,
- -1.448849e-001,
- 2.667493e-001,
- 1.034103e+001,
- -4.285769e+000,
- -3.874504e-001,
- 5.998752e-001,
- 3.132426e+000,
- 6.652753e-001,
- // albedo 0, turbidity 7
- -1.347345e+000,
- -4.287832e-001,
- -9.305553e+000,
- 9.133813e+000,
- -3.173527e-003,
- 3.977564e-001,
- 1.151420e-001,
- 3.320564e+000,
- 4.998134e-001,
- -1.927296e+000,
- -9.901372e-001,
- -2.593499e+000,
- 4.087421e+000,
- -5.833993e-002,
- 8.158929e-001,
- -4.681279e-002,
- 2.423716e-001,
- 4.938052e-001,
- -9.470092e-001,
- 7.325237e-002,
- 2.064735e+000,
- -5.167540e+000,
- -1.313751e-002,
- 4.832169e-001,
- 1.126295e-001,
- 6.970522e+000,
- 1.035022e+000,
- -1.022557e+000,
- -2.762616e-001,
- -9.375748e-001,
- 6.696739e+000,
- 2.200765e-001,
- -1.133253e-001,
- 5.492505e-001,
- -3.109391e+000,
- 3.321914e-001,
- -1.087444e+000,
- -1.836263e-001,
- 6.225024e-001,
- -8.576765e+000,
- -1.107637e+000,
- 7.859427e-001,
- 9.910909e-002,
- 3.112938e+000,
- 8.596261e-001,
- -1.051544e+000,
- -1.546262e-001,
- 2.371731e-001,
- 1.200502e+001,
- -4.527291e+000,
- 7.268862e-002,
- 5.571478e-001,
- 2.532873e+000,
- 6.662000e-001,
- // albedo 0, turbidity 8
- -1.375576e+000,
- -4.840019e-001,
- -8.121290e+000,
- 8.058140e+000,
- -1.445661e-002,
- 5.123314e-001,
- 5.813321e-002,
- 3.203219e+000,
- 5.442318e-001,
- -2.325221e+000,
- -1.241463e+000,
- -7.063430e+000,
- 8.741369e+000,
- -7.829950e-002,
- 8.844273e-001,
- -3.471106e-002,
- 1.740583e-001,
- 2.814079e-001,
- -1.228700e+000,
- -2.013412e-001,
- 2.949042e+000,
- -7.371945e+000,
- 1.071753e-001,
- -2.491970e-001,
- 2.265223e-001,
- 6.391504e+000,
- 1.172389e+000,
- -7.601786e-001,
- -1.680631e-001,
- -7.584444e-001,
- 8.541356e+000,
- 8.222291e-002,
- 6.729633e-001,
- 3.206615e-001,
- -3.700940e+000,
- 2.710054e-001,
- -1.191166e+000,
- -2.672347e-001,
- 2.927498e-001,
- -9.713613e+000,
- -4.783721e-001,
- 2.352803e-001,
- 2.161949e-001,
- 2.691481e+000,
- 8.745447e-001,
- -1.030135e+000,
- -1.653301e-001,
- 2.263443e-001,
- 1.296157e+001,
- -4.650644e+000,
- 7.055709e-003,
- 5.091975e-001,
- 2.000370e+000,
- 6.603839e-001,
- // albedo 0, turbidity 9
- -1.508018e+000,
- -6.460933e-001,
- -6.402745e+000,
- 6.545995e+000,
- -3.750320e-002,
- 6.921803e-001,
- 3.309819e-003,
- 2.797527e+000,
- 6.978446e-001,
- -2.333308e+000,
- -1.167837e+000,
- -1.746787e+001,
- 1.868630e+001,
- -8.948229e-003,
- 5.621946e-001,
- -3.402626e-002,
- 1.217943e+000,
- 1.149865e-002,
- -2.665953e+000,
- -1.226307e+000,
- 7.169725e+000,
- -1.159434e+001,
- 3.583420e-002,
- -3.074378e-001,
- 3.412248e-001,
- 4.422122e+000,
- 1.283791e+000,
- -9.705116e-002,
- 8.312991e-002,
- -2.160462e+000,
- 1.028235e+001,
- 3.543357e-002,
- 1.032049e+000,
- 1.058310e-001,
- -2.972898e+000,
- 2.418628e-001,
- -1.329617e+000,
- -3.699557e-001,
- 5.560117e-001,
- -9.730113e+000,
- 9.938865e-002,
- -3.071488e-001,
- 2.510691e-001,
- 1.777111e+000,
- 8.705142e-001,
- -1.019387e+000,
- -1.893247e-001,
- 1.194079e-001,
- 1.239436e+001,
- -4.799224e+000,
- 2.940213e-001,
- 4.841268e-001,
- 1.529724e+000,
- 6.582615e-001,
- // albedo 0, turbidity 10
- -1.896737e+000,
- -1.005442e+000,
- -6.411032e+000,
- 6.548220e+000,
- -3.227596e-002,
- 5.717262e-001,
- -8.115192e-006,
- 2.296704e+000,
- 9.000749e-001,
- -2.411116e+000,
- -1.225587e+000,
- -1.753629e+001,
- 1.829393e+001,
- 1.247555e-002,
- 2.364616e-001,
- -5.114637e-003,
- 1.603778e+000,
- -2.224156e-001,
- -4.707121e+000,
- -2.074977e+000,
- 7.942300e+000,
- -1.132407e+001,
- -5.415654e-002,
- 5.446811e-001,
- 1.032493e-001,
- 4.010235e+000,
- 1.369802e+000,
- 1.010482e-001,
- -4.013305e-001,
- -2.674579e+000,
- 9.779409e+000,
- 1.782506e-001,
- 7.053045e-001,
- 4.200002e-001,
- -2.400671e+000,
- 1.953165e-001,
- -1.243526e+000,
- -3.391255e-001,
- 8.848882e-001,
- -9.789025e+000,
- -3.997324e-001,
- -9.546227e-001,
- -1.044017e-001,
- 6.010593e-001,
- 8.714462e-001,
- -1.014633e+000,
- -1.730009e-001,
- -7.738934e-002,
- 1.390903e+001,
- -4.847307e+000,
- 1.076059e+000,
- 5.685743e-001,
- 1.572992e+000,
- 6.561432e-001,
- // albedo 1, turbidity 1
- -1.122998e+000,
- -1.881183e-001,
- -1.030709e+001,
- 1.158932e+001,
- -4.079495e-002,
- 9.603774e-001,
- 3.079436e-002,
- 4.009235e+000,
- 5.060745e-001,
- -1.134790e+000,
- -1.539688e-001,
- 5.478405e+000,
- -4.217270e+000,
- -1.043858e-001,
- 7.165008e-001,
- 1.524765e-002,
- 6.473623e+000,
- 4.207882e-001,
- -1.134957e+000,
- -3.513318e-001,
- 7.393837e-001,
- 1.354415e+000,
- -4.764078e-001,
- 1.690441e+000,
- -5.492640e-002,
- -5.563523e+000,
- 1.145743e+000,
- -1.058344e+000,
- -5.758503e-002,
- 1.168230e+000,
- 3.269824e-001,
- 1.795193e-001,
- 7.849011e-001,
- 7.441853e-002,
- 6.904804e+000,
- 2.818790e-001,
- -1.075194e+000,
- -2.355813e-001,
- 2.463685e+000,
- -1.536505e+000,
- -7.505771e+000,
- 9.619712e-001,
- -6.465851e-002,
- -1.355492e+000,
- 8.489847e-001,
- -1.079030e+000,
- -1.465328e-001,
- 1.773838e+000,
- 2.310131e+000,
- -3.136065e+000,
- 3.507952e-001,
- 4.435014e-002,
- 2.819225e+000,
- 5.689008e-001,
- // albedo 1, turbidity 2
- -1.125833e+000,
- -1.870849e-001,
- -9.555833e+000,
- 1.059713e+001,
- -4.225402e-002,
- 9.164663e-001,
- 4.338796e-002,
- 4.400980e+000,
- 6.056119e-001,
- -1.127440e+000,
- -1.551891e-001,
- 4.755621e+000,
- -4.408806e+000,
- -7.851763e-002,
- 2.268284e-001,
- 1.460070e-001,
- 7.048003e+000,
- 3.525997e-001,
- -1.143788e+000,
- -3.170178e-001,
- 5.480669e-001,
- 2.041830e+000,
- -4.532139e-001,
- 2.302233e+000,
- -1.887419e-001,
- -4.489221e+000,
- 1.250967e+000,
- -1.032849e+000,
- 7.376031e-003,
- 5.666073e-001,
- -2.312203e-001,
- 4.862894e-001,
- -1.748294e-001,
- 3.572870e-001,
- 8.380522e+000,
- 1.302333e-001,
- -1.093728e+000,
- -2.786977e-001,
- 2.641272e+000,
- -1.507494e+000,
- -8.731243e+000,
- 1.684055e+000,
- -2.023377e-001,
- -2.176398e+000,
- 1.013249e+000,
- -1.076578e+000,
- -1.456205e-001,
- 1.693935e+000,
- 2.945003e+000,
- -2.822673e+000,
- -2.520033e-001,
- 1.517034e-001,
- 2.649109e+000,
- 5.179094e-001,
- // albedo 1, turbidity 3
- -1.146417e+000,
- -2.119353e-001,
- -7.187525e+000,
- 8.058599e+000,
- -5.256438e-002,
- 8.375733e-001,
- 3.887093e-002,
- 4.222111e+000,
- 6.695347e-001,
- -1.173674e+000,
- -2.067025e-001,
- 2.899359e+000,
- -2.804918e+000,
- -8.473899e-002,
- 3.944225e-003,
- 1.340641e-001,
- 6.160887e+000,
- 4.527141e-001,
- -1.090098e+000,
- -2.599633e-001,
- 9.180856e-001,
- 1.092710e+000,
- -4.215019e-001,
- 2.427660e+000,
- -9.277667e-002,
- -2.123523e+000,
- 1.058159e+000,
- -1.084460e+000,
- 8.056181e-003,
- -2.453510e-001,
- 6.619567e-001,
- 4.668118e-001,
- -9.526719e-001,
- 4.648454e-001,
- 8.001572e+000,
- 3.054194e-001,
- -1.053728e+000,
- -2.765784e-001,
- 2.792388e+000,
- -3.489517e+000,
- -8.150535e+000,
- 2.195757e+000,
- -2.017234e-001,
- -2.128017e+000,
- 9.326589e-001,
- -1.099348e+000,
- -1.593939e-001,
- 1.568292e+000,
- 7.247853e+000,
- -2.933000e+000,
- -5.890481e-001,
- 1.724440e-001,
- 2.433484e+000,
- 5.736558e-001,
- // albedo 1, turbidity 4
- -1.185983e+000,
- -2.581184e-001,
- -7.761056e+000,
- 8.317053e+000,
- -3.351773e-002,
- 6.676667e-001,
- 5.941733e-002,
- 3.820727e+000,
- 6.324032e-001,
- -1.268591e+000,
- -3.398067e-001,
- 2.348503e+000,
- -2.023779e+000,
- -5.368458e-002,
- 1.083282e-001,
- 8.402858e-002,
- 3.910254e+000,
- 5.577481e-001,
- -1.071353e+000,
- -1.992459e-001,
- 7.878387e-001,
- 1.974702e-001,
- -3.033058e-001,
- 2.335298e+000,
- -8.205259e-002,
- 7.954454e-001,
- 9.972312e-001,
- -1.089513e+000,
- -3.104364e-002,
- -5.995746e-001,
- 2.330281e+000,
- 6.581939e-001,
- -1.821467e+000,
- 6.679973e-001,
- 5.090195e+000,
- 3.125161e-001,
- -1.040214e+000,
- -2.570934e-001,
- 2.660489e+000,
- -6.506045e+000,
- -7.053586e+000,
- 2.763153e+000,
- -2.433632e-001,
- -7.648176e-001,
- 9.452937e-001,
- -1.116052e+000,
- -1.831993e-001,
- 1.457694e+000,
- 1.163608e+001,
- -3.216426e+000,
- -1.045594e+000,
- 2.285002e-001,
- 1.817407e+000,
- 5.810396e-001,
- // albedo 1, turbidity 5
- -1.230134e+000,
- -3.136264e-001,
- -8.909301e+000,
- 9.145006e+000,
- -1.055387e-002,
- 4.467317e-001,
- 1.016826e-001,
- 3.342964e+000,
- 5.633840e-001,
- -1.442907e+000,
- -5.593147e-001,
- 2.156447e+000,
- -1.241657e+000,
- -3.512130e-002,
- 3.050274e-001,
- 1.797175e-002,
- 1.742358e+000,
- 5.977153e-001,
- -1.027627e+000,
- -6.481539e-002,
- 4.351975e-001,
- -1.051677e+000,
- -2.030672e-001,
- 1.942684e+000,
- -3.615993e-002,
- 4.050266e+000,
- 9.801624e-001,
- -1.082110e+000,
- -1.578209e-001,
- -3.397511e-001,
- 4.163851e+000,
- 6.650368e-001,
- -1.841730e+000,
- 7.062544e-001,
- 6.789881e-001,
- 3.172623e-001,
- -1.047447e+000,
- -1.977560e-001,
- 2.183364e+000,
- -8.805249e+000,
- -5.483962e+000,
- 2.551309e+000,
- -1.779640e-001,
- 1.519501e+000,
- 9.212536e-001,
- -1.111853e+000,
- -1.935736e-001,
- 1.394408e+000,
- 1.392405e+001,
- -3.465430e+000,
- -1.068432e+000,
- 2.388671e-001,
- 1.455336e+000,
- 6.233425e-001,
- // albedo 1, turbidity 6
- -1.262238e+000,
- -3.546341e-001,
- -1.008703e+001,
- 1.020084e+001,
- -1.852187e-003,
- 3.537580e-001,
- 1.239199e-001,
- 3.056093e+000,
- 5.132052e-001,
- -1.613810e+000,
- -7.355585e-001,
- 2.760123e+000,
- -1.685253e+000,
- -2.517552e-002,
- 2.914258e-001,
- 4.743448e-003,
- 8.689596e-001,
- 5.674192e-001,
- -9.462336e-001,
- 2.950767e-002,
- -2.613816e-001,
- -7.398653e-001,
- -1.315558e-001,
- 1.901042e+000,
- -6.447844e-002,
- 4.969341e+000,
- 1.027342e+000,
- -1.111481e+000,
- -2.194054e-001,
- -9.004538e-002,
- 3.983442e+000,
- 4.871278e-001,
- -1.965315e+000,
- 7.956121e-001,
- -2.363225e-001,
- 2.718037e-001,
- -1.036397e+000,
- -1.827106e-001,
- 1.964747e+000,
- -8.870759e+000,
- -4.208011e+000,
- 2.461215e+000,
- -2.158905e-001,
- 1.561676e+000,
- 9.436866e-001,
- -1.113769e+000,
- -1.947819e-001,
- 1.300720e+000,
- 1.516476e+001,
- -4.088732e+000,
- -1.069384e+000,
- 2.836434e-001,
- 1.671451e+000,
- 6.229612e-001,
- // albedo 1, turbidity 7
- -1.328069e+000,
- -4.244047e-001,
- -8.417040e+000,
- 8.552244e+000,
- -6.813504e-003,
- 4.127422e-001,
- 9.619897e-002,
- 2.854227e+000,
- 5.059880e-001,
- -1.927552e+000,
- -1.025290e+000,
- 9.529576e-001,
- 4.255950e-001,
- -3.738779e-002,
- 2.584586e-001,
- 4.911004e-002,
- -2.640913e-001,
- 4.138626e-001,
- -8.488094e-001,
- 1.435988e-001,
- 6.356807e-001,
- -2.895732e+000,
- -8.473961e-002,
- 1.701305e+000,
- -1.323908e-001,
- 6.499338e+000,
- 1.210928e+000,
- -1.128313e+000,
- -3.397048e-001,
- -4.043140e-001,
- 6.265097e+000,
- 5.482395e-001,
- -2.057614e+000,
- 8.884087e-001,
- -2.943879e+000,
- 9.760301e-002,
- -1.039764e+000,
- -1.494772e-001,
- 1.781915e+000,
- -1.153012e+001,
- -3.379232e+000,
- 2.517231e+000,
- -2.764393e-001,
- 2.588849e+000,
- 1.052120e+000,
- -1.108447e+000,
- -2.012251e-001,
- 1.198640e+000,
- 1.925331e+001,
- -4.423892e+000,
- -1.257122e+000,
- 3.395690e-001,
- 1.481220e+000,
- 5.880175e-001,
- // albedo 1, turbidity 8
- -1.374185e+000,
- -4.967434e-001,
- -7.401318e+000,
- 7.724021e+000,
- -2.345723e-002,
- 5.979653e-001,
- 2.436346e-002,
- 2.658970e+000,
- 6.014891e-001,
- -2.310933e+000,
- -1.290290e+000,
- -1.301909e+000,
- 2.557806e+000,
- -3.744449e-002,
- 8.982861e-002,
- 1.090613e-001,
- -4.398363e-001,
- 1.184329e-001,
- -1.124730e+000,
- -9.921830e-002,
- 1.366902e+000,
- -4.172489e+000,
- -5.078016e-002,
- 1.393597e+000,
- -9.323843e-002,
- 6.452721e+000,
- 1.435913e+000,
- -8.468477e-001,
- -2.744819e-001,
- -4.347200e-001,
- 6.713362e+000,
- 6.127133e-001,
- -1.685634e+000,
- 7.360941e-001,
- -4.535502e+000,
- -2.920866e-002,
- -1.165242e+000,
- -2.008697e-001,
- 1.438778e+000,
- -1.008936e+001,
- -2.214771e+000,
- 2.102909e+000,
- -1.763085e-001,
- 2.859075e+000,
- 1.093470e+000,
- -1.074614e+000,
- -2.066374e-001,
- 1.131891e+000,
- 1.630063e+001,
- -4.801441e+000,
- -1.112590e+000,
- 3.595785e-001,
- 1.122227e+000,
- 5.794610e-001,
- // albedo 1, turbidity 9
- -1.521515e+000,
- -6.835604e-001,
- -5.571044e+000,
- 6.028774e+000,
- -4.253715e-002,
- 6.875746e-001,
- -5.279456e-006,
- 2.180150e+000,
- 8.487705e-001,
- -2.240415e+000,
- -1.171166e+000,
- -7.182771e+000,
- 8.417068e+000,
- -1.932866e-002,
- 1.101887e-001,
- -1.098862e-002,
- 6.242195e-001,
- -2.393875e-001,
- -2.712354e+000,
- -1.198830e+000,
- 3.180200e+000,
- -6.768130e+000,
- -2.563386e-003,
- 7.984607e-001,
- 2.764376e-001,
- 4.695358e+000,
- 1.557045e+000,
- -3.655172e-002,
- -2.142321e-002,
- -9.138120e-001,
- 7.932786e+000,
- 3.516542e-001,
- -7.994343e-001,
- 1.786761e-001,
- -4.208399e+000,
- 1.820576e-002,
- -1.368610e+000,
- -2.656212e-001,
- 1.249397e+000,
- -8.317818e+000,
- -8.962772e-001,
- 1.423249e+000,
- 1.478381e-001,
- 2.191660e+000,
- 1.007748e+000,
- -1.041753e+000,
- -2.453366e-001,
- 1.061102e+000,
- 1.130172e+001,
- -4.739312e+000,
- -9.223334e-001,
- 2.982776e-001,
- 6.162931e-001,
- 6.080302e-001,
- // albedo 1, turbidity 10
- -1.989159e+000,
- -1.095160e+000,
- -2.915550e+000,
- 3.275339e+000,
- -5.735765e-002,
- 5.742174e-001,
- -7.683288e-006,
- 1.763400e+000,
- 9.001342e-001,
- -2.070020e+000,
- -1.086338e+000,
- -1.095898e+001,
- 1.206960e+001,
- 3.780123e-002,
- -1.774699e-002,
- -5.881348e-004,
- 1.333819e+000,
- -2.605423e-001,
- -5.249653e+000,
- -2.383040e+000,
- 6.160406e+000,
- -9.097138e+000,
- -1.955319e-001,
- 1.651785e+000,
- 6.016463e-004,
- 3.021824e+000,
- 1.493574e+000,
- 4.685432e-001,
- -2.358662e-001,
- -2.666433e+000,
- 9.685763e+000,
- 5.804928e-001,
- -1.521875e+000,
- 5.668989e-001,
- -1.548136e+000,
- 1.688642e-002,
- -1.296891e+000,
- -3.449031e-001,
- 1.928548e+000,
- -1.167560e+001,
- -1.627615e+000,
- 1.355603e+000,
- -1.929074e-001,
- -6.568952e-001,
- 1.009774e+000,
- -1.067288e+000,
- -2.410392e-001,
- 7.147961e-001,
- 1.783840e+001,
- -4.374399e+000,
- -6.588777e-001,
- 3.329831e-001,
- 1.012066e+000,
- 6.118645e-001,
-};
-
-static const double datasetXYZRad2[] = {
- // albedo 0, turbidity 1
- 1.632341e+000,
- 1.395230e+000,
- 1.375634e+000,
- 1.238193e+001,
- 5.921102e+000,
- 7.766508e+000,
- // albedo 0, turbidity 2
- 1.597115e+000,
- 1.554617e+000,
- 3.932382e-001,
- 1.505284e+001,
- 5.725234e+000,
- 8.158155e+000,
- // albedo 0, turbidity 3
- 1.522034e+000,
- 1.844545e+000,
- -1.322862e+000,
- 1.918382e+001,
- 5.440769e+000,
- 8.837119e+000,
- // albedo 0, turbidity 4
- 1.403048e+000,
- 2.290852e+000,
- -4.013792e+000,
- 2.485100e+001,
- 5.521888e+000,
- 9.845547e+000,
- // albedo 0, turbidity 5
- 1.286364e+000,
- 2.774498e+000,
- -6.648221e+000,
- 2.964151e+001,
- 5.923777e+000,
- 1.097075e+001,
- // albedo 0, turbidity 6
- 1.213544e+000,
- 3.040195e+000,
- -8.092676e+000,
- 3.186082e+001,
- 6.789782e+000,
- 1.158899e+001,
- // albedo 0, turbidity 7
- 1.122622e+000,
- 3.347465e+000,
- -9.649016e+000,
- 3.343824e+001,
- 9.347715e+000,
- 1.231374e+001,
- // albedo 0, turbidity 8
- 1.007356e+000,
- 3.543858e+000,
- -1.053520e+001,
- 3.239842e+001,
- 1.483962e+001,
- 1.331718e+001,
- // albedo 0, turbidity 9
- 8.956642e-001,
- 3.278700e+000,
- -9.254933e+000,
- 2.557923e+001,
- 2.489677e+001,
- 1.476166e+001,
- // albedo 0, turbidity 10
- 7.985143e-001,
- 2.340404e+000,
- -4.928274e+000,
- 1.141787e+001,
- 3.961501e+001,
- 1.682448e+001,
- // albedo 1, turbidity 1
- 1.745162e+000,
- 1.639467e+000,
- 1.342721e+000,
- 1.166033e+001,
- 1.490124e+001,
- 1.774031e+001,
- // albedo 1, turbidity 2
- 1.708439e+000,
- 1.819144e+000,
- 2.834399e-001,
- 1.448066e+001,
- 1.459214e+001,
- 1.858679e+001,
- // albedo 1, turbidity 3
- 1.631720e+000,
- 2.094799e+000,
- -1.378825e+000,
- 1.843198e+001,
- 1.463173e+001,
- 1.962881e+001,
- // albedo 1, turbidity 4
- 1.516536e+000,
- 2.438729e+000,
- -3.624121e+000,
- 2.298621e+001,
- 1.599782e+001,
- 2.070027e+001,
- // albedo 1, turbidity 5
- 1.405863e+000,
- 2.785191e+000,
- -5.705236e+000,
- 2.645121e+001,
- 1.768330e+001,
- 2.191903e+001,
- // albedo 1, turbidity 6
- 1.344052e+000,
- 2.951807e+000,
- -6.683851e+000,
- 2.744271e+001,
- 1.985706e+001,
- 2.229452e+001,
- // albedo 1, turbidity 7
- 1.245827e+000,
- 3.182923e+000,
- -7.822960e+000,
- 2.791395e+001,
- 2.327254e+001,
- 2.315910e+001,
- // albedo 1, turbidity 8
- 1.132305e+000,
- 3.202593e+000,
- -8.008429e+000,
- 2.521093e+001,
- 3.000014e+001,
- 2.405306e+001,
- // albedo 1, turbidity 9
- 1.020330e+000,
- 2.820556e+000,
- -6.238704e+000,
- 1.709276e+001,
- 4.077916e+001,
- 2.509949e+001,
- // albedo 1, turbidity 10
- 9.031570e-001,
- 1.863917e+000,
- -1.955738e+000,
- 3.032665e+000,
- 5.434290e+001,
- 2.641780e+001,
-};
-
-static const double datasetXYZ3[] = {
- // albedo 0, turbidity 1
- -1.310023e+000,
- -4.407658e-001,
- -3.640340e+001,
- 3.683292e+001,
- -8.124762e-003,
- 5.297961e-001,
- 1.188633e-002,
- 3.138320e+000,
- 5.134778e-001,
- -1.424100e+000,
- -5.501606e-001,
- -1.753510e+001,
- 1.822769e+001,
- -1.539272e-002,
- 6.366826e-001,
- 2.661996e-003,
- 2.659915e+000,
- 4.071138e-001,
- -1.103436e+000,
- -1.884105e-001,
- 6.425322e+000,
- -6.910579e+000,
- -2.019861e-002,
- 3.553271e-001,
- -1.589061e-002,
- 5.345985e+000,
- 8.790218e-001,
- -1.186200e+000,
- -4.307514e-001,
- -3.957947e+000,
- 5.979352e+000,
- -5.348869e-002,
- 1.736117e+000,
- 3.491346e-002,
- -2.692261e+000,
- 5.610506e-001,
- -1.006038e+000,
- -1.305995e-001,
- 4.473513e+000,
- -3.806719e+000,
- 1.419407e-001,
- -2.148238e-002,
- -5.081185e-002,
- 3.735362e+000,
- 5.358280e-001,
- -1.078507e+000,
- -1.633754e-001,
- -3.812368e+000,
- 4.381700e+000,
- 2.988122e-002,
- 1.754224e+000,
- 1.472376e-001,
- 3.722798e+000,
- 4.999157e-001,
- // albedo 0, turbidity 2
- -1.333582e+000,
- -4.649908e-001,
- -3.359528e+001,
- 3.404375e+001,
- -9.384242e-003,
- 5.587511e-001,
- 5.726310e-003,
- 3.073145e+000,
- 5.425529e-001,
- -1.562624e+000,
- -7.107068e-001,
- -1.478170e+001,
- 1.559839e+001,
- -1.462375e-002,
- 5.050133e-001,
- 2.516017e-002,
- 1.604696e+000,
- 2.902403e-001,
- -8.930158e-001,
- 4.068077e-002,
- 1.373481e+000,
- -2.342752e+000,
- -2.098058e-002,
- 6.248686e-001,
- -5.258363e-002,
- 7.058214e+000,
- 1.150373e+000,
- -1.262823e+000,
- -4.818353e-001,
- 8.892610e-004,
- 1.923120e+000,
- -4.979718e-002,
- 1.040693e+000,
- 1.558103e-001,
- -2.852480e+000,
- 2.420691e-001,
- -9.968383e-001,
- -1.200648e-001,
- 1.324342e+000,
- -9.430889e-001,
- 1.931098e-001,
- 4.436916e-001,
- -7.320456e-002,
- 4.215931e+000,
- 7.898019e-001,
- -1.078185e+000,
- -1.718192e-001,
- -1.720191e+000,
- 2.358918e+000,
- 2.765637e-002,
- 1.260245e+000,
- 2.021941e-001,
- 3.395483e+000,
- 5.173628e-001,
- // albedo 0, turbidity 3
- -1.353023e+000,
- -4.813523e-001,
- -3.104920e+001,
- 3.140156e+001,
- -9.510741e-003,
- 5.542030e-001,
- 8.135471e-003,
- 3.136646e+000,
- 5.215989e-001,
- -1.624704e+000,
- -7.990201e-001,
- -2.167125e+001,
- 2.246341e+001,
- -1.163533e-002,
- 5.415746e-001,
- 2.618378e-002,
- 1.139214e+000,
- 3.444357e-001,
- -7.983610e-001,
- 1.417476e-001,
- 9.914841e+000,
- -1.081503e+001,
- -1.218845e-002,
- 3.411392e-001,
- -6.137698e-002,
- 7.445848e+000,
- 1.180080e+000,
- -1.266679e+000,
- -4.288977e-001,
- -5.818701e+000,
- 6.986437e+000,
- -8.180711e-002,
- 1.397403e+000,
- 2.016916e-001,
- -1.275731e+000,
- 2.592773e-001,
- -1.009707e+000,
- -1.537754e-001,
- 3.496378e+000,
- -3.013726e+000,
- 2.421150e-001,
- -2.831925e-001,
- 3.003395e-002,
- 3.702862e+000,
- 7.746320e-001,
- -1.075646e+000,
- -1.768747e-001,
- -1.347762e+000,
- 1.989004e+000,
- 1.375836e-002,
- 1.764810e+000,
- 1.330018e-001,
- 3.230864e+000,
- 6.626210e-001,
- // albedo 0, turbidity 4
- -1.375269e+000,
- -5.103569e-001,
- -3.442661e+001,
- 3.478703e+001,
- -8.460009e-003,
- 5.408643e-001,
- 4.813323e-003,
- 3.016078e+000,
- 5.062069e-001,
- -1.821679e+000,
- -9.766461e-001,
- -1.926488e+001,
- 1.997912e+001,
- -9.822567e-003,
- 3.649556e-001,
- 4.316092e-002,
- 8.930190e-001,
- 4.166527e-001,
- -6.633542e-001,
- 1.997841e-001,
- 2.395592e+000,
- -3.117175e+000,
- -1.080884e-002,
- 8.983814e-001,
- -1.375825e-001,
- 6.673463e+000,
- 1.115663e+000,
- -1.303240e+000,
- -3.612712e-001,
- 8.292959e-002,
- 3.381364e-001,
- -6.078648e-002,
- 3.229247e-001,
- 3.680987e-001,
- 7.046755e-001,
- 3.144924e-001,
- -9.952598e-001,
- -2.039076e-001,
- 4.026851e-001,
- 2.686684e-001,
- 1.640712e-001,
- 5.186341e-001,
- -1.205520e-002,
- 2.659613e+000,
- 8.030394e-001,
- -1.098579e+000,
- -2.151992e-001,
- 6.558198e-001,
- -7.436900e-004,
- -1.421817e-003,
- 1.073701e+000,
- 1.886875e-001,
- 2.536857e+000,
- 6.673923e-001,
- // albedo 0, turbidity 5
- -1.457986e+000,
- -5.906842e-001,
- -3.812464e+001,
- 3.838539e+001,
- -6.024357e-003,
- 4.741484e-001,
- 1.209223e-002,
- 2.818432e+000,
- 5.012433e-001,
- -1.835728e+000,
- -1.003405e+000,
- -6.848129e+000,
- 7.601943e+000,
- -1.277375e-002,
- 4.785598e-001,
- 3.366853e-002,
- 1.097701e+000,
- 4.636635e-001,
- -8.491348e-001,
- 9.466365e-003,
- -2.685226e+000,
- 2.004060e+000,
- -1.168708e-002,
- 6.752316e-001,
- -1.543371e-001,
- 5.674759e+000,
- 1.039534e+000,
- -1.083379e+000,
- -1.506790e-001,
- 7.328236e-001,
- -5.095568e-001,
- -8.609153e-002,
- 4.448820e-001,
- 4.174662e-001,
- 1.481556e+000,
- 3.942551e-001,
- -1.117089e+000,
- -3.337605e-001,
- 2.502281e-001,
- 4.036323e-001,
- 2.673899e-001,
- 2.829817e-001,
- 2.242450e-002,
- 2.043207e+000,
- 7.706902e-001,
- -1.071648e+000,
- -2.126200e-001,
- 6.069466e-001,
- -1.456290e-003,
- -5.515960e-001,
- 1.046755e+000,
- 1.985021e-001,
- 2.290245e+000,
- 6.876058e-001,
- // albedo 0, turbidity 6
- -1.483903e+000,
- -6.309647e-001,
- -4.380213e+001,
- 4.410537e+001,
- -5.712161e-003,
- 5.195992e-001,
- 2.028428e-003,
- 2.687114e+000,
- 5.098321e-001,
- -2.053976e+000,
- -1.141473e+000,
- 5.109183e-001,
- 8.060391e-002,
- -1.033983e-002,
- 4.066532e-001,
- 4.869627e-002,
- 1.161722e+000,
- 4.039525e-001,
- -6.348185e-001,
- 7.651292e-002,
- -1.031327e+001,
- 1.007598e+001,
- -2.083688e-002,
- 7.359516e-001,
- -2.029459e-001,
- 5.013257e+000,
- 1.077649e+000,
- -1.228630e+000,
- -1.650496e-001,
- 4.077157e-002,
- -7.189167e-001,
- -5.092220e-002,
- 2.959814e-001,
- 5.111496e-001,
- 2.540433e+000,
- 3.615330e-001,
- -1.041883e+000,
- -3.278413e-001,
- -6.691911e-002,
- 1.307364e+000,
- 2.166663e-001,
- 3.000595e-001,
- -3.157136e-003,
- 1.389208e+000,
- 7.999026e-001,
- -1.103556e+000,
- -2.443602e-001,
- 4.705347e-001,
- -9.296482e-004,
- -5.309920e-001,
- 9.654511e-001,
- 2.142587e-001,
- 2.244723e+000,
- 6.839976e-001,
- // albedo 0, turbidity 7
- -1.555684e+000,
- -6.962113e-001,
- -4.647983e+001,
- 4.674270e+001,
- -5.034895e-003,
- 4.755090e-001,
- -9.502561e-007,
- 2.626569e+000,
- 5.056194e-001,
- -1.998288e+000,
- -1.124720e+000,
- -1.629586e+000,
- 2.187993e+000,
- -8.284384e-003,
- 3.845258e-001,
- 5.726240e-002,
- 1.185644e+000,
- 4.255812e-001,
- -1.032570e+000,
- -2.513850e-001,
- -3.721112e+000,
- 3.506967e+000,
- -2.186561e-002,
- 9.436049e-001,
- -2.451412e-001,
- 4.725724e+000,
- 1.039256e+000,
- -8.597532e-001,
- 9.073332e-002,
- -2.553741e+000,
- 1.993237e+000,
- -4.390891e-002,
- -2.046928e-001,
- 5.515623e-001,
- 1.909127e+000,
- 3.948212e-001,
- -1.210482e+000,
- -4.477622e-001,
- -2.267805e-001,
- 1.219488e+000,
- 1.336186e-001,
- 6.866897e-001,
- 2.808997e-002,
- 1.600403e+000,
- 7.816409e-001,
- -1.078168e+000,
- -2.699261e-001,
- 2.537282e-001,
- 3.820684e-001,
- -4.425103e-001,
- 5.298235e-001,
- 2.185217e-001,
- 1.728679e+000,
- 6.882743e-001,
- // albedo 0, turbidity 8
- -1.697968e+000,
- -8.391488e-001,
- -5.790105e+001,
- 5.814120e+001,
- -3.404760e-003,
- 4.265140e-001,
- -1.796301e-006,
- 2.368442e+000,
- 5.324429e-001,
- -2.141552e+000,
- -1.172230e+000,
- 1.677872e+001,
- -1.641470e+001,
- -5.732425e-003,
- 2.002199e-001,
- 6.841834e-002,
- 1.485338e+000,
- 3.215763e-001,
- -1.442946e+000,
- -7.264245e-001,
- -9.503706e+000,
- 9.650462e+000,
- -2.120995e-002,
- 1.419263e+000,
- -2.893098e-001,
- 3.860731e+000,
- 1.120857e+000,
- -5.696752e-001,
- 3.411279e-001,
- -2.931035e-001,
- -6.512552e-001,
- -1.068437e-001,
- -1.085661e+000,
- 6.107549e-001,
- 1.459503e+000,
- 3.210336e-001,
- -1.313839e+000,
- -5.921371e-001,
- -2.332222e-001,
- 1.648196e+000,
- 2.492787e-001,
- 1.381033e+000,
- -1.993392e-002,
- 9.812560e-001,
- 8.316329e-001,
- -1.087464e+000,
- -3.195534e-001,
- 2.902095e-001,
- 3.383709e-001,
- -8.798482e-001,
- 1.494668e-002,
- 2.529703e-001,
- 1.452644e+000,
- 6.693870e-001,
- // albedo 0, turbidity 9
- -2.068582e+000,
- -1.118605e+000,
- -5.081598e+001,
- 5.097486e+001,
- -3.280669e-003,
- 4.067371e-001,
- -2.544951e-006,
- 2.179497e+000,
- 5.778017e-001,
- -1.744693e+000,
- -8.537207e-001,
- 2.234361e+001,
- -2.208318e+001,
- -5.932616e-003,
- 1.035049e-001,
- 5.742772e-002,
- 1.977880e+000,
- 2.124846e-001,
- -3.287515e+000,
- -2.140268e+000,
- -1.249566e+001,
- 1.240091e+001,
- -2.409349e-002,
- 1.397821e+000,
- -2.371627e-001,
- 2.771192e+000,
- 1.170496e+000,
- 5.502311e-001,
- 1.046630e+000,
- 2.193517e+000,
- -2.220400e+000,
- -1.064394e-001,
- -1.017926e+000,
- 4.795457e-001,
- 1.030644e+000,
- 3.177516e-001,
- -1.719734e+000,
- -9.536198e-001,
- -6.586821e-001,
- 1.386361e+000,
- -2.513065e-002,
- 1.187011e+000,
- 6.542539e-002,
- 5.296055e-001,
- 8.082660e-001,
- -1.005700e+000,
- -3.028096e-001,
- 4.470957e-002,
- 1.007760e+000,
- -8.119016e-001,
- 3.153338e-002,
- 2.311321e-001,
- 1.182208e+000,
- 6.824758e-001,
- // albedo 0, turbidity 10
- -2.728867e+000,
- -1.580388e+000,
- -3.079627e+001,
- 3.092586e+001,
- -4.197673e-003,
- 3.154759e-001,
- -3.897675e-006,
- 1.920567e+000,
- 6.664791e-001,
- -1.322495e+000,
- -7.249275e-001,
- 1.477660e+001,
- -1.468154e+001,
- -9.044857e-003,
- 5.624314e-002,
- 6.498392e-002,
- 2.047389e+000,
- 6.367540e-002,
- -6.102376e+000,
- -3.473018e+000,
- -9.926071e+000,
- 9.637797e+000,
- -1.097909e-002,
- 1.103498e+000,
- -2.424521e-001,
- 2.520748e+000,
- 1.240260e+000,
- 1.351796e+000,
- 1.018588e+000,
- 2.009081e+000,
- -1.333394e+000,
- -1.979125e-001,
- -3.318292e-001,
- 4.476624e-001,
- 9.095235e-001,
- 2.955611e-001,
- -1.774467e+000,
- -1.079880e+000,
- -8.084680e-002,
- 2.577697e-001,
- -1.149295e-001,
- 4.975303e-001,
- 2.931611e-003,
- -3.803171e-001,
- 8.002794e-001,
- -9.898401e-001,
- -2.542513e-001,
- -7.530911e-002,
- 1.870355e+000,
- -1.521918e+000,
- 2.405164e-001,
- 2.964615e-001,
- 1.334800e+000,
- 6.789053e-001,
- // albedo 1, turbidity 1
- -1.279730e+000,
- -4.290674e-001,
- -4.277972e+001,
- 4.343305e+001,
- -6.541826e-003,
- 4.945086e-001,
- 1.425338e-002,
- 2.685244e+000,
- 5.011313e-001,
- -1.449506e+000,
- -5.766374e-001,
- -1.688496e+001,
- 1.781118e+001,
- -1.121649e-002,
- 3.545020e-001,
- 2.287338e-002,
- 1.904281e+000,
- 4.936998e-001,
- -1.021980e+000,
- -1.897574e-001,
- 2.482462e+000,
- -2.941725e+000,
- -1.570448e-002,
- 7.532578e-001,
- -4.256800e-002,
- 5.239660e+000,
- 4.983116e-001,
- -1.162608e+000,
- -3.428049e-001,
- 3.974358e+000,
- -1.527935e+000,
- -3.919201e-002,
- 8.758593e-001,
- 7.291363e-002,
- -3.455257e+000,
- 8.007426e-001,
- -9.929985e-001,
- -8.712006e-002,
- -7.397313e-001,
- 1.348372e+000,
- 9.511685e-002,
- 3.233584e-001,
- -7.549148e-002,
- 5.806452e+000,
- 4.990042e-001,
- -1.084996e+000,
- -1.739767e-001,
- 1.580475e-001,
- 9.088180e-001,
- 6.871433e-002,
- 5.933079e-001,
- 1.188921e-001,
- 3.074079e+000,
- 4.999327e-001,
- // albedo 1, turbidity 2
- -1.317009e+000,
- -4.661946e-001,
- -4.255347e+001,
- 4.312782e+001,
- -5.727235e-003,
- 4.285447e-001,
- 2.189854e-002,
- 2.608310e+000,
- 5.190700e-001,
- -1.469236e+000,
- -6.282139e-001,
- -1.241404e+001,
- 1.348765e+001,
- -1.204770e-002,
- 5.070285e-001,
- -7.280216e-004,
- 1.491533e+000,
- 3.635064e-001,
- -9.713808e-001,
- -8.138038e-002,
- 3.709854e-001,
- -1.041174e+000,
- -1.814075e-002,
- 5.060860e-001,
- -2.053756e-002,
- 6.161431e+000,
- 1.093736e+000,
- -1.159057e+000,
- -3.698074e-001,
- 2.711209e+000,
- -6.006479e-001,
- -4.896926e-002,
- 9.273957e-001,
- 1.137712e-001,
- -3.496828e+000,
- 2.867109e-001,
- -1.011601e+000,
- -8.201890e-002,
- 2.105725e-001,
- 4.597520e-001,
- 1.478925e-001,
- 2.138940e-001,
- -5.660670e-002,
- 6.057755e+000,
- 7.859121e-001,
- -1.078020e+000,
- -1.811580e-001,
- 1.646622e-001,
- 8.348426e-001,
- 1.149064e-001,
- 4.985738e-001,
- 1.376605e-001,
- 2.746607e+000,
- 4.999626e-001,
- // albedo 1, turbidity 3
- -1.325672e+000,
- -4.769313e-001,
- -4.111215e+001,
- 4.168293e+001,
- -6.274997e-003,
- 4.649469e-001,
- 1.119411e-002,
- 2.631267e+000,
- 5.234546e-001,
- -1.619391e+000,
- -8.000253e-001,
- -1.534098e+001,
- 1.632706e+001,
- -1.012023e-002,
- 4.242255e-001,
- 2.931597e-002,
- 8.925807e-001,
- 3.314765e-001,
- -7.356979e-001,
- 1.368406e-001,
- 2.972579e+000,
- -3.535359e+000,
- -1.318948e-002,
- 4.607620e-001,
- -7.182778e-002,
- 6.254100e+000,
- 1.236299e+000,
- -1.316217e+000,
- -4.194427e-001,
- 3.489902e-002,
- 1.289849e+000,
- -4.755960e-002,
- 1.138222e+000,
- 1.975992e-001,
- -8.991542e-001,
- 2.290572e-001,
- -9.502188e-001,
- -1.172703e-001,
- 1.405202e+000,
- -3.061919e-001,
- 1.058772e-001,
- -3.760592e-001,
- -1.983179e-002,
- 3.562353e+000,
- 7.895959e-001,
- -1.100117e+000,
- -1.900567e-001,
- 4.925030e-001,
- 5.250225e-001,
- 1.576804e-001,
- 1.042701e+000,
- 7.330743e-002,
- 2.796064e+000,
- 6.749783e-001,
- // albedo 1, turbidity 4
- -1.354183e+000,
- -5.130625e-001,
- -4.219268e+001,
- 4.271772e+001,
- -5.365373e-003,
- 4.136743e-001,
- 1.235172e-002,
- 2.520122e+000,
- 5.187269e-001,
- -1.741434e+000,
- -9.589761e-001,
- -8.230339e+000,
- 9.296799e+000,
- -9.600162e-003,
- 4.994969e-001,
- 2.955452e-002,
- 3.667099e-001,
- 3.526999e-001,
- -6.917347e-001,
- 2.154887e-001,
- -8.760264e-001,
- 2.334121e-001,
- -1.909621e-002,
- 4.748033e-001,
- -1.138514e-001,
- 6.515360e+000,
- 1.225097e+000,
- -1.293189e+000,
- -4.218700e-001,
- 1.620952e+000,
- -7.858597e-001,
- -3.769410e-002,
- 6.636786e-001,
- 3.364945e-001,
- -5.341017e-001,
- 2.128347e-001,
- -9.735521e-001,
- -1.325495e-001,
- 1.007517e+000,
- 2.598258e-001,
- 6.762169e-002,
- 1.421018e-003,
- -6.915987e-002,
- 3.185897e+000,
- 8.641956e-001,
- -1.094800e+000,
- -1.962062e-001,
- 5.755591e-001,
- 2.906259e-001,
- 2.625748e-001,
- 7.644049e-001,
- 1.347492e-001,
- 2.677126e+000,
- 6.465460e-001,
- // albedo 1, turbidity 5
- -1.393063e+000,
- -5.578338e-001,
- -4.185249e+001,
- 4.233504e+001,
- -5.435640e-003,
- 4.743765e-001,
- 7.422477e-003,
- 2.442801e+000,
- 5.211707e-001,
- -1.939487e+000,
- -1.128509e+000,
- -8.974257e+000,
- 9.978383e+000,
- -7.965597e-003,
- 2.948830e-001,
- 4.436763e-002,
- 2.839868e-001,
- 3.440424e-001,
- -6.011562e-001,
- 2.354877e-001,
- -3.079820e+000,
- 2.585094e+000,
- -2.002701e-002,
- 7.793909e-001,
- -1.598414e-001,
- 5.834678e+000,
- 1.202856e+000,
- -1.315676e+000,
- -3.903446e-001,
- 1.701900e+000,
- -1.304609e+000,
- -1.045121e-002,
- 2.747707e-001,
- 4.143967e-001,
- 3.197102e-001,
- 2.637580e-001,
- -9.618628e-001,
- -1.625841e-001,
- 1.187138e+000,
- 1.497802e-001,
- -5.590954e-006,
- 3.178475e-002,
- -4.153145e-002,
- 2.496096e+000,
- 8.195082e-001,
- -1.111554e+000,
- -2.365546e-001,
- 7.831875e-001,
- 2.018684e-001,
- 2.074369e-001,
- 7.395978e-001,
- 1.225730e-001,
- 1.876478e+000,
- 6.821167e-001,
- // albedo 1, turbidity 6
- -1.427879e+000,
- -5.994879e-001,
- -3.531016e+001,
- 3.581581e+001,
- -6.431497e-003,
- 4.554192e-001,
- 7.348731e-004,
- 2.334619e+000,
- 5.233377e-001,
- -1.998177e+000,
- -1.206633e+000,
- -2.146510e+001,
- 2.242237e+001,
- -5.857596e-003,
- 2.755663e-001,
- 6.384795e-002,
- 1.358244e-001,
- 3.328437e-001,
- -6.440630e-001,
- 2.058571e-001,
- 2.155499e+000,
- -2.587968e+000,
- -1.840023e-002,
- 8.826555e-001,
- -2.222452e-001,
- 5.847073e+000,
- 1.228387e+000,
- -1.229071e+000,
- -3.360441e-001,
- -3.429599e-001,
- 6.179469e-001,
- 2.029610e-003,
- 8.899319e-002,
- 5.041624e-001,
- 1.882964e-001,
- 2.252040e-001,
- -1.022905e+000,
- -2.101621e-001,
- 1.915689e+000,
- -6.498794e-001,
- -3.463651e-002,
- 8.954605e-002,
- -6.797854e-002,
- 2.417705e+000,
- 8.568618e-001,
- -1.082538e+000,
- -2.007723e-001,
- 4.731009e-001,
- 4.077267e-001,
- 1.324289e-001,
- 6.514880e-001,
- 1.702912e-001,
- 2.309383e+000,
- 6.600895e-001,
- // albedo 1, turbidity 7
- -1.472139e+000,
- -6.499815e-001,
- -3.428465e+001,
- 3.469659e+001,
- -5.747023e-003,
- 4.174167e-001,
- 1.688597e-003,
- 2.323046e+000,
- 5.395191e-001,
- -2.161176e+000,
- -1.353089e+000,
- -2.226827e+001,
- 2.329138e+001,
- -5.583808e-003,
- 2.364793e-001,
- 6.096656e-002,
- 1.944666e-003,
- 2.861624e-001,
- -6.593044e-001,
- 1.393558e-001,
- 4.698373e+000,
- -5.193883e+000,
- -1.998390e-002,
- 1.095635e+000,
- -2.391254e-001,
- 5.598103e+000,
- 1.236193e+000,
- -1.195717e+000,
- -2.972715e-001,
- 4.648953e-002,
- 3.024588e-001,
- 5.003313e-003,
- -3.754741e-001,
- 5.247265e-001,
- -1.381312e-001,
- 2.493896e-001,
- -1.020139e+000,
- -2.253524e-001,
- 3.548437e-001,
- 7.030485e-001,
- -2.107076e-002,
- 4.581395e-001,
- -3.243757e-002,
- 2.453259e+000,
- 8.323623e-001,
- -1.098770e+000,
- -2.435780e-001,
- 8.761614e-001,
- 1.941613e-001,
- -1.990692e-001,
- 3.761139e-001,
- 1.657412e-001,
- 1.590503e+000,
- 6.741417e-001,
- // albedo 1, turbidity 8
- -1.648007e+000,
- -8.205121e-001,
- -4.435106e+001,
- 4.479801e+001,
- -4.181353e-003,
- 3.854830e-001,
- -1.842385e-006,
- 2.000281e+000,
- 5.518363e-001,
- -2.140986e+000,
- -1.282239e+000,
- -3.979213e+000,
- 4.672459e+000,
- -5.008582e-003,
- 2.421920e-001,
- 6.253602e-002,
- 6.612713e-001,
- 2.555851e-001,
- -1.300502e+000,
- -5.137898e-001,
- 5.179821e-001,
- -4.032341e-001,
- -2.066785e-002,
- 1.087929e+000,
- -2.615309e-001,
- 4.225887e+000,
- 1.229237e+000,
- -6.963340e-001,
- 9.241060e-002,
- 6.936356e-002,
- -3.588571e-001,
- -5.461843e-002,
- -5.616643e-001,
- 5.484166e-001,
- -4.776267e-002,
- 2.414935e-001,
- -1.233179e+000,
- -4.325498e-001,
- 6.479813e-001,
- 8.368356e-001,
- 2.458875e-001,
- 6.464752e-001,
- -2.897097e-002,
- 1.561773e+000,
- 8.518598e-001,
- -1.051023e+000,
- -2.533690e-001,
- 1.004294e+000,
- 3.028083e-001,
- -1.520108e+000,
- 1.607013e-001,
- 1.619975e-001,
- 1.131094e+000,
- 6.706655e-001,
- // albedo 1, turbidity 9
- -1.948249e+000,
- -1.097383e+000,
- -4.453697e+001,
- 4.494902e+001,
- -3.579939e-003,
- 3.491605e-001,
- -2.500253e-006,
- 1.740442e+000,
- 6.188022e-001,
- -2.154253e+000,
- -1.209559e+000,
- 4.144894e+000,
- -3.562411e+000,
- -5.638843e-003,
- 1.067169e-001,
- 7.594858e-002,
- 1.005280e+000,
- 1.072543e-001,
- -2.513259e+000,
- -1.507208e+000,
- -1.602979e+000,
- 1.404154e+000,
- -5.560750e-003,
- 1.240490e+000,
- -2.852117e-001,
- 3.485252e+000,
- 1.349321e+000,
- -7.832214e-002,
- 3.655626e-001,
- 3.856288e-001,
- 6.867894e-001,
- -1.609523e-001,
- -6.704306e-001,
- 5.357301e-001,
- -6.457935e-001,
- 1.479503e-001,
- -1.354784e+000,
- -5.454375e-001,
- 8.797469e-001,
- -1.466514e+000,
- 7.134420e-001,
- 5.934903e-001,
- -2.911178e-002,
- 8.643737e-001,
- 9.030724e-001,
- -1.048324e+000,
- -2.738736e-001,
- 8.783074e-001,
- 3.246188e+000,
- -4.435369e+000,
- 1.251791e-001,
- 1.783486e-001,
- 1.064657e+000,
- 6.522878e-001,
- // albedo 1, turbidity 10
- -2.770408e+000,
- -1.618911e+000,
- -2.504031e+001,
- 2.531674e+001,
- -4.239279e-003,
- 3.241013e-001,
- -3.764484e-006,
- 1.586843e+000,
- 7.035906e-001,
- -1.913500e+000,
- -1.144014e+000,
- -1.080587e+001,
- 1.153677e+001,
- -1.003197e-002,
- 1.577515e-001,
- 5.217789e-002,
- 1.225278e+000,
- 5.172771e-003,
- -5.293208e+000,
- -2.876463e+000,
- 2.087053e+000,
- -3.201552e+000,
- 3.892964e-003,
- 5.323930e-001,
- -2.034512e-001,
- 2.617760e+000,
- 1.273597e+000,
- 9.060340e-001,
- 3.773409e-001,
- -6.399945e-001,
- 3.213979e+000,
- -9.112172e-002,
- 6.494055e-001,
- 3.953280e-001,
- 5.047796e-001,
- 2.998695e-001,
- -1.482179e+000,
- -6.778310e-001,
- 1.161775e+000,
- -3.004872e+000,
- 4.774797e-001,
- -4.969248e-001,
- -3.512074e-003,
- -1.307190e+000,
- 7.927378e-001,
- -9.863181e-001,
- -1.803364e-001,
- 5.810824e-001,
- 4.580570e+000,
- -3.863454e+000,
- 5.328174e-001,
- 2.272821e-001,
- 1.771114e+000,
- 6.791814e-001,
-};
-
-static const double datasetXYZRad3[] = {
- // albedo 0, turbidity 1
- 1.168084e+000,
- 2.156455e+000,
- -3.980314e+000,
- 1.989302e+001,
- 1.328335e+001,
- 1.435621e+001,
- // albedo 0, turbidity 2
- 1.135488e+000,
- 2.294701e+000,
- -4.585886e+000,
- 2.090208e+001,
- 1.347840e+001,
- 1.467658e+001,
- // albedo 0, turbidity 3
- 1.107408e+000,
- 2.382765e+000,
- -5.112357e+000,
- 2.147823e+001,
- 1.493128e+001,
- 1.460882e+001,
- // albedo 0, turbidity 4
- 1.054193e+000,
- 2.592891e+000,
- -6.115000e+000,
- 2.268967e+001,
- 1.635672e+001,
- 1.518999e+001,
- // albedo 0, turbidity 5
- 1.006946e+000,
- 2.705420e+000,
- -6.698930e+000,
- 2.291830e+001,
- 1.834324e+001,
- 1.570651e+001,
- // albedo 0, turbidity 6
- 9.794044e-001,
- 2.742440e+000,
- -6.805283e+000,
- 2.225271e+001,
- 2.050797e+001,
- 1.563130e+001,
- // albedo 0, turbidity 7
- 9.413577e-001,
- 2.722009e+000,
- -6.760707e+000,
- 2.098242e+001,
- 2.342588e+001,
- 1.605011e+001,
- // albedo 0, turbidity 8
- 8.917923e-001,
- 2.592780e+000,
- -6.152635e+000,
- 1.774141e+001,
- 2.858324e+001,
- 1.657910e+001,
- // albedo 0, turbidity 9
- 8.288391e-001,
- 2.153434e+000,
- -4.118327e+000,
- 1.078118e+001,
- 3.681710e+001,
- 1.738139e+001,
- // albedo 0, turbidity 10
- 7.623528e-001,
- 1.418187e+000,
- -8.845235e-001,
- 7.590129e-001,
- 4.629859e+001,
- 1.921657e+001,
- // albedo 1, turbidity 1
- 1.352858e+000,
- 2.048862e+000,
- -2.053393e+000,
- 1.405874e+001,
- 3.045344e+001,
- 3.044430e+001,
- // albedo 1, turbidity 2
- 1.330497e+000,
- 2.126497e+000,
- -2.466296e+000,
- 1.467559e+001,
- 3.090738e+001,
- 3.069707e+001,
- // albedo 1, turbidity 3
- 1.286344e+000,
- 2.200436e+000,
- -2.877228e+000,
- 1.492701e+001,
- 3.236288e+001,
- 3.077223e+001,
- // albedo 1, turbidity 4
- 1.234428e+000,
- 2.289628e+000,
- -3.404699e+000,
- 1.499436e+001,
- 3.468390e+001,
- 3.084842e+001,
- // albedo 1, turbidity 5
- 1.178660e+000,
- 2.306071e+000,
- -3.549159e+000,
- 1.411006e+001,
- 3.754188e+001,
- 3.079730e+001,
- // albedo 1, turbidity 6
- 1.151366e+000,
- 2.333005e+000,
- -3.728627e+000,
- 1.363374e+001,
- 3.905894e+001,
- 3.092599e+001,
- // albedo 1, turbidity 7
- 1.101593e+000,
- 2.299422e+000,
- -3.565787e+000,
- 1.196745e+001,
- 4.188472e+001,
- 3.102755e+001,
- // albedo 1, turbidity 8
- 1.038322e+000,
- 2.083539e+000,
- -2.649585e+000,
- 8.037389e+000,
- 4.700869e+001,
- 3.065948e+001,
- // albedo 1, turbidity 9
- 9.596146e-001,
- 1.671470e+000,
- -8.751538e-001,
- 1.679772e+000,
- 5.345784e+001,
- 3.054520e+001,
- // albedo 1, turbidity 10
- 8.640731e-001,
- 9.858301e-001,
- 1.854956e+000,
- -6.798097e+000,
- 5.936468e+001,
- 3.110255e+001,
-};
-
-static const double *datasetsXYZ[] = {datasetXYZ1, datasetXYZ2, datasetXYZ3};
-
-static const double *datasetsXYZRad[] = {datasetXYZRad1, datasetXYZRad2, datasetXYZRad3};
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index e6610dbb197..e9f0efb4efb 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -18,6 +18,8 @@
#ifndef __UTIL_SSEF_H__
#define __UTIL_SSEF_H__
+#include "util_ssei.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
@@ -523,13 +525,29 @@ __forceinline ssei truncatei(const ssef &a)
return _mm_cvttps_epi32(a.m128);
}
+/* This is about 25% faster than straightforward floor to integer conversion
+ * due to better pipelining.
+ *
+ * Unsaturated add of the all-ones mask 0xffffffff from (a < 0) is the same as subtracting 1.
+ */
__forceinline ssei floori(const ssef &a)
{
-# if defined(__KERNEL_SSE41__)
- return ssei(floor(a));
-# else
- return ssei(a - ssef(0.5f));
-# endif
+ return truncatei(a) + cast((a < 0.0f).m128);
+}
+
+__forceinline ssef floorfrac(const ssef &x, ssei *i)
+{
+ *i = floori(x);
+ return x - ssef(*i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Common Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline ssef mix(const ssef &a, const ssef &b, const ssef &t)
+{
+ return madd(t, b, (ssef(1.0f) - t) * a);
}
////////////////////////////////////////////////////////////////////////////////
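A minimal scalar sketch of the floori() trick above (illustrative only, not part of the patch): the SSE comparison (a < 0.0f) yields an all-ones mask, which reads as the integer -1, so adding the cast mask to the truncated value subtracts 1 for negative inputs and turns round-toward-zero into floor for non-integral values. Like the vector version, exact negative integers land one below (e.g. -2.0 maps to -3), a quirk callers are assumed to tolerate.

#include <cstdio>

/* Hypothetical scalar equivalent of the SSE floori() above. */
static inline int floori_scalar(float a)
{
  int t = (int)a;                 /* truncate toward zero */
  return t + (a < 0.0f ? -1 : 0); /* all-ones mask acts as -1 when a < 0 */
}

int main()
{
  /* prints: 1 -2 -1 */
  printf("%d %d %d\n", floori_scalar(1.7f), floori_scalar(-1.7f), floori_scalar(-0.2f));
  return 0;
}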
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index bfab2ee00f9..a4db9193206 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -310,6 +310,15 @@ __forceinline ssei &operator|=(ssei &a, const int32_t &b)
return a = a | b;
}
+__forceinline ssei &operator^=(ssei &a, const ssei &b)
+{
+ return a = a ^ b;
+}
+__forceinline ssei &operator^=(ssei &a, const int32_t &b)
+{
+ return a = a ^ b;
+}
+
__forceinline ssei &operator<<=(ssei &a, const int32_t &b)
{
return a = a << b;
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index ceb52830319..d809f2e06d7 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -14,34 +14,20 @@
* limitations under the License.
*/
+/* clang-format off */
+
+/* #define static_assert triggers a bug in some clang-format versions, so disable
+ * formatting for the entire file to keep results consistent. */
+
#ifndef __UTIL_STATIC_ASSERT_H__
#define __UTIL_STATIC_ASSERT_H__
CCL_NAMESPACE_BEGIN
-/* TODO(sergey): In theory CUDA might work with own static assert
- * implementation since it's just pure C++.
- */
-#ifdef __KERNEL_GPU__
-# ifndef static_assert
-# define static_assert(statement, message)
-# endif
-#endif /* __KERNEL_GPU__ */
-
-/* TODO(sergey): For until C++11 is a bare minimum for us,
- * we do a bit of a trickery to show meaningful message so
- * it's more or less clear what's wrong when building without
- * C++11.
- *
- * The thing here is: our non-C++11 implementation doesn't
- * have a way to print any message after preprocessor
- * substitution so we rely on the message which is passed to
- * static_assert() since that's the only message visible when
- * compilation fails.
- *
- * After C++11 bump it should be possible to glue structure
- * name to the error message,
- */
+#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC)
+# define static_assert(statement, message)
+#endif /* __KERNEL_OPENCL__ || CYCLES_CUBIN_CC */
+
#define static_assert_align(st, align) \
static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index f71145741c9..ce2d4acdde4 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -17,9 +17,9 @@
#ifndef __UTIL_STRING_H__
#define __UTIL_STRING_H__
+#include <sstream>
#include <string.h>
#include <string>
-#include <sstream>
#include "util/util_vector.h"
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 0cd991c6231..6d32153209a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -17,8 +17,8 @@
#include "util/util_system.h"
#include "util/util_logging.h"
-#include "util/util_types.h"
#include "util/util_string.h"
+#include "util/util_types.h"
#include <numaapi.h>
@@ -35,8 +35,8 @@ OIIO_NAMESPACE_USING
# include <sys/sysctl.h>
# include <sys/types.h>
#else
-# include <unistd.h>
# include <sys/ioctl.h>
+# include <unistd.h>
#endif
CCL_NAMESPACE_BEGIN
@@ -357,7 +357,7 @@ size_t system_physical_ram()
MEMORYSTATUSEX ram;
ram.dwLength = sizeof(ram);
GlobalMemoryStatusEx(&ram);
- return ram.ullTotalPhys * 1024;
+ return ram.ullTotalPhys;
#elif defined(__APPLE__)
uint64_t ram = 0;
size_t len = sizeof(ram);
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 4b11ce73ea9..4fb61392e92 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -14,106 +14,34 @@
* limitations under the License.
*/
+#include "util/util_task.h"
#include "util/util_foreach.h"
#include "util/util_logging.h"
#include "util/util_system.h"
-#include "util/util_task.h"
#include "util/util_time.h"
-//#define THREADING_DEBUG_ENABLED
-
-#ifdef THREADING_DEBUG_ENABLED
-# include <stdio.h>
-# define THREADING_DEBUG(...) \
- do { \
- printf(__VA_ARGS__); \
- fflush(stdout); \
- } while (0)
-#else
-# define THREADING_DEBUG(...)
-#endif
-
CCL_NAMESPACE_BEGIN
/* Task Pool */
-TaskPool::TaskPool()
+TaskPool::TaskPool() : start_time(time_dt()), num_tasks_handled(0)
{
- num_tasks_handled = 0;
- num = 0;
- do_cancel = false;
}
TaskPool::~TaskPool()
{
- stop();
+ cancel();
}
-void TaskPool::push(Task *task, bool front)
+void TaskPool::push(TaskRunFunction &&task)
{
- TaskScheduler::Entry entry;
-
- entry.task = task;
- entry.pool = this;
-
- TaskScheduler::push(entry, front);
-}
-
-void TaskPool::push(const TaskRunFunction &run, bool front)
-{
- push(new Task(run), front);
+ tbb_group.run(std::move(task));
+ num_tasks_handled++;
}
void TaskPool::wait_work(Summary *stats)
{
- thread_scoped_lock num_lock(num_mutex);
-
- while (num != 0) {
- num_lock.unlock();
-
- thread_scoped_lock queue_lock(TaskScheduler::queue_mutex);
-
- /* find task from this pool. if we get a task from another pool,
- * we can get into deadlock */
- TaskScheduler::Entry work_entry;
- bool found_entry = false;
- list<TaskScheduler::Entry>::iterator it;
-
- for (it = TaskScheduler::queue.begin(); it != TaskScheduler::queue.end(); it++) {
- TaskScheduler::Entry &entry = *it;
-
- if (entry.pool == this) {
- work_entry = entry;
- found_entry = true;
- TaskScheduler::queue.erase(it);
- break;
- }
- }
-
- queue_lock.unlock();
-
- /* if found task, do it, otherwise wait until other tasks are done */
- if (found_entry) {
- /* run task */
- work_entry.task->run(0);
-
- /* delete task */
- delete work_entry.task;
-
- /* notify pool task was done */
- num_decrease(1);
- }
-
- num_lock.lock();
- if (num == 0)
- break;
-
- if (!found_entry) {
- THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::wait_work !found_entry\n", num);
- num_cond.wait(num_lock);
- THREADING_DEBUG("num==%d, condition wait done in TaskPool::wait_work !found_entry\n", num);
- }
- }
+ tbb_group.wait();
if (stats != NULL) {
stats->time_total = time_dt() - start_time;
@@ -123,193 +51,21 @@ void TaskPool::wait_work(Summary *stats)
void TaskPool::cancel()
{
- do_cancel = true;
-
- TaskScheduler::clear(this);
-
- {
- thread_scoped_lock num_lock(num_mutex);
-
- while (num) {
- THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::cancel\n", num);
- num_cond.wait(num_lock);
- THREADING_DEBUG("num==%d condition wait done in TaskPool::cancel\n", num);
- }
- }
-
- do_cancel = false;
-}
-
-void TaskPool::stop()
-{
- TaskScheduler::clear(this);
-
- assert(num == 0);
+ tbb_group.cancel();
+ tbb_group.wait();
}
bool TaskPool::canceled()
{
- return do_cancel;
-}
-
-bool TaskPool::finished()
-{
- thread_scoped_lock num_lock(num_mutex);
- return num == 0;
-}
-
-void TaskPool::num_decrease(int done)
-{
- num_mutex.lock();
- num -= done;
-
- assert(num >= 0);
- if (num == 0) {
- THREADING_DEBUG("num==%d, notifying all in TaskPool::num_decrease\n", num);
- num_cond.notify_all();
- }
-
- num_mutex.unlock();
-}
-
-void TaskPool::num_increase()
-{
- thread_scoped_lock num_lock(num_mutex);
- if (num_tasks_handled == 0) {
- start_time = time_dt();
- }
- num++;
- num_tasks_handled++;
- THREADING_DEBUG("num==%d, notifying all in TaskPool::num_increase\n", num);
- num_cond.notify_all();
+ return tbb_group.is_canceling();
}
/* Task Scheduler */
thread_mutex TaskScheduler::mutex;
int TaskScheduler::users = 0;
-vector<thread *> TaskScheduler::threads;
-bool TaskScheduler::do_exit = false;
-
-list<TaskScheduler::Entry> TaskScheduler::queue;
-thread_mutex TaskScheduler::queue_mutex;
-thread_condition_variable TaskScheduler::queue_cond;
-
-namespace {
-
-/* Get number of processors on each of the available nodes. The result is sized
- * by the highest node index, and element corresponds to number of processors on
- * that node.
- * If node is not available, then the corresponding number of processors is
- * zero. */
-void get_per_node_num_processors(vector<int> *num_per_node_processors)
-{
- const int num_nodes = system_cpu_num_numa_nodes();
- if (num_nodes == 0) {
- LOG(ERROR) << "Zero available NUMA nodes, is not supposed to happen.";
- return;
- }
- num_per_node_processors->resize(num_nodes);
- for (int node = 0; node < num_nodes; ++node) {
- if (!system_cpu_is_numa_node_available(node)) {
- (*num_per_node_processors)[node] = 0;
- continue;
- }
- (*num_per_node_processors)[node] = system_cpu_num_numa_node_processors(node);
- }
-}
-
-/* Calculate total number of processors on all available nodes.
- * This is similar to system_cpu_thread_count(), but uses pre-calculated number
- * of processors on each of the node, avoiding extra system calls and checks for
- * the node availability. */
-int get_num_total_processors(const vector<int> &num_per_node_processors)
-{
- int num_total_processors = 0;
- foreach (int num_node_processors, num_per_node_processors) {
- num_total_processors += num_node_processors;
- }
- return num_total_processors;
-}
-
-/* Compute NUMA node for every thread to run on, for the best performance. */
-vector<int> distribute_threads_on_nodes(const int num_threads)
-{
- /* Start with all threads unassigned to any specific NUMA node. */
- vector<int> thread_nodes(num_threads, -1);
- const int num_active_group_processors = system_cpu_num_active_group_processors();
- VLOG(1) << "Detected " << num_active_group_processors << " processors "
- << "in active group.";
- if (num_active_group_processors >= num_threads) {
- /* If the current thread is set up in a way that its affinity allows to
- * use at least requested number of threads we do not explicitly set
- * affinity to the worker therads.
- * This way we allow users to manually edit affinity of the parent
- * thread, and here we follow that affinity. This way it's possible to
- * have two Cycles/Blender instances running manually set to a different
- * dies on a CPU. */
- VLOG(1) << "Not setting thread group affinity.";
- return thread_nodes;
- }
- vector<int> num_per_node_processors;
- get_per_node_num_processors(&num_per_node_processors);
- if (num_per_node_processors.size() == 0) {
- /* Error was already repported, here we can't do anything, so we simply
- * leave default affinity to all the worker threads. */
- return thread_nodes;
- }
- const int num_nodes = num_per_node_processors.size();
- int thread_index = 0;
- /* First pass: fill in all the nodes to their maximum.
- *
- * If there is less threads than the overall nodes capacity, some of the
- * nodes or parts of them will idle.
- *
- * TODO(sergey): Consider picking up fastest nodes if number of threads
- * fits on them. For example, on Threadripper2 we might consider using nodes
- * 0 and 2 if user requested 32 render threads. */
- const int num_total_node_processors = get_num_total_processors(num_per_node_processors);
- int current_node_index = 0;
- while (thread_index < num_total_node_processors && thread_index < num_threads) {
- const int num_node_processors = num_per_node_processors[current_node_index];
- for (int processor_index = 0; processor_index < num_node_processors; ++processor_index) {
- VLOG(1) << "Scheduling thread " << thread_index << " to node " << current_node_index << ".";
- thread_nodes[thread_index] = current_node_index;
- ++thread_index;
- if (thread_index == num_threads) {
- /* All threads are scheduled on their nodes. */
- return thread_nodes;
- }
- }
- ++current_node_index;
- }
- /* Second pass: keep scheduling threads to each node one by one, uniformly
- * fillign them in.
- * This is where things becomes tricky to predict for the maximum
- * performance: on the one hand this avoids too much threading overhead on
- * few nodes, but for the final performance having all the overhead on one
- * node might be better idea (since other nodes will have better chance of
- * rendering faster).
- * But more tricky is that nodes might have difference capacity, so we might
- * want to do some weighted scheduling. For example, if node 0 has 16
- * processors and node 1 has 32 processors, we'd better schedule 1 extra
- * thread on node 0 and 2 extra threads on node 1. */
- current_node_index = 0;
- while (thread_index < num_threads) {
- /* Skip unavailable nodes. */
- /* TODO(sergey): Add sanity check against deadlock. */
- while (num_per_node_processors[current_node_index] == 0) {
- current_node_index = (current_node_index + 1) % num_nodes;
- }
- VLOG(1) << "Scheduling thread " << thread_index << " to node " << current_node_index << ".";
- ++thread_index;
- current_node_index = (current_node_index + 1) % num_nodes;
- }
-
- return thread_nodes;
-}
-
-} // namespace
+int TaskScheduler::active_num_threads = 0;
+tbb::global_control *TaskScheduler::global_control = nullptr;
void TaskScheduler::init(int num_threads)
{
@@ -320,22 +76,15 @@ void TaskScheduler::init(int num_threads)
if (users != 1) {
return;
}
- do_exit = false;
- const bool use_auto_threads = (num_threads == 0);
- if (use_auto_threads) {
+ if (num_threads > 0) {
/* Automatic number of threads. */
- num_threads = system_cpu_thread_count();
+ VLOG(1) << "Overriding number of TBB threads to " << num_threads << ".";
+ global_control = new tbb::global_control(tbb::global_control::max_allowed_parallelism,
+ num_threads);
+ active_num_threads = num_threads;
}
- VLOG(1) << "Creating pool of " << num_threads << " threads.";
-
- /* Compute distribution on NUMA nodes. */
- vector<int> thread_nodes = distribute_threads_on_nodes(num_threads);
-
- /* Launch threads that will be waiting for work. */
- threads.resize(num_threads);
- for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
- threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run, thread_index + 1),
- thread_nodes[thread_index]);
+ else {
+ active_num_threads = system_cpu_thread_count();
}
}
@@ -344,105 +93,20 @@ void TaskScheduler::exit()
thread_scoped_lock lock(mutex);
users--;
if (users == 0) {
- VLOG(1) << "De-initializing thread pool of task scheduler.";
- /* stop all waiting threads */
- TaskScheduler::queue_mutex.lock();
- do_exit = true;
- TaskScheduler::queue_cond.notify_all();
- TaskScheduler::queue_mutex.unlock();
-
- /* delete threads */
- foreach (thread *t, threads) {
- t->join();
- delete t;
- }
- threads.clear();
+ delete global_control;
+ global_control = nullptr;
+ active_num_threads = 0;
}
}
void TaskScheduler::free_memory()
{
assert(users == 0);
- threads.free_memory();
}
-bool TaskScheduler::thread_wait_pop(Entry &entry)
+int TaskScheduler::num_threads()
{
- thread_scoped_lock queue_lock(queue_mutex);
-
- while (queue.empty() && !do_exit)
- queue_cond.wait(queue_lock);
-
- if (queue.empty()) {
- assert(do_exit);
- return false;
- }
-
- entry = queue.front();
- queue.pop_front();
-
- return true;
-}
-
-void TaskScheduler::thread_run(int thread_id)
-{
- Entry entry;
-
- /* todo: test affinity/denormal mask */
-
- /* keep popping off tasks */
- while (thread_wait_pop(entry)) {
- /* run task */
- entry.task->run(thread_id);
-
- /* delete task */
- delete entry.task;
-
- /* notify pool task was done */
- entry.pool->num_decrease(1);
- }
-}
-
-void TaskScheduler::push(Entry &entry, bool front)
-{
- entry.pool->num_increase();
-
- /* add entry to queue */
- TaskScheduler::queue_mutex.lock();
- if (front)
- TaskScheduler::queue.push_front(entry);
- else
- TaskScheduler::queue.push_back(entry);
-
- TaskScheduler::queue_cond.notify_one();
- TaskScheduler::queue_mutex.unlock();
-}
-
-void TaskScheduler::clear(TaskPool *pool)
-{
- thread_scoped_lock queue_lock(TaskScheduler::queue_mutex);
-
- /* erase all tasks from this pool from the queue */
- list<Entry>::iterator it = queue.begin();
- int done = 0;
-
- while (it != queue.end()) {
- Entry &entry = *it;
-
- if (entry.pool == pool) {
- done++;
- delete entry.task;
-
- it = queue.erase(it);
- }
- else
- it++;
- }
-
- queue_lock.unlock();
-
- /* notify done */
- pool->num_decrease(done);
+ return active_num_threads;
}
/* Dedicated Task Pool */
@@ -458,31 +122,30 @@ DedicatedTaskPool::DedicatedTaskPool()
DedicatedTaskPool::~DedicatedTaskPool()
{
- stop();
+ wait();
+
+ do_exit = true;
+ queue_cond.notify_all();
+
worker_thread->join();
delete worker_thread;
}
-void DedicatedTaskPool::push(Task *task, bool front)
+void DedicatedTaskPool::push(TaskRunFunction &&task, bool front)
{
num_increase();
/* add task to queue */
queue_mutex.lock();
if (front)
- queue.push_front(task);
+ queue.emplace_front(std::move(task));
else
- queue.push_back(task);
+ queue.emplace_back(std::move(task));
queue_cond.notify_one();
queue_mutex.unlock();
}
-void DedicatedTaskPool::push(const TaskRunFunction &run, bool front)
-{
- push(new Task(run), front);
-}
-
void DedicatedTaskPool::wait()
{
thread_scoped_lock num_lock(num_mutex);
@@ -501,18 +164,6 @@ void DedicatedTaskPool::cancel()
do_cancel = false;
}
-void DedicatedTaskPool::stop()
-{
- clear();
-
- do_exit = true;
- queue_cond.notify_all();
-
- wait();
-
- assert(num == 0);
-}
-
bool DedicatedTaskPool::canceled()
{
return do_cancel;
@@ -535,7 +186,7 @@ void DedicatedTaskPool::num_increase()
num_cond.notify_all();
}
-bool DedicatedTaskPool::thread_wait_pop(Task *&task)
+bool DedicatedTaskPool::thread_wait_pop(TaskRunFunction &task)
{
thread_scoped_lock queue_lock(queue_mutex);
@@ -555,15 +206,15 @@ bool DedicatedTaskPool::thread_wait_pop(Task *&task)
void DedicatedTaskPool::thread_run()
{
- Task *task;
+ TaskRunFunction task;
/* keep popping off tasks */
while (thread_wait_pop(task)) {
/* run task */
- task->run(0);
+ task();
/* delete task */
- delete task;
+ task = nullptr;
/* notify task was done */
num_decrease(1);
@@ -575,15 +226,8 @@ void DedicatedTaskPool::clear()
thread_scoped_lock queue_lock(queue_mutex);
/* erase all tasks from the queue */
- list<Task *>::iterator it = queue.begin();
- int done = 0;
-
- while (it != queue.end()) {
- done++;
- delete *it;
-
- it = queue.erase(it);
- }
+ int done = queue.size();
+ queue.clear();
queue_lock.unlock();
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index fd30a33d8ef..a56ca62f62c 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -19,48 +19,16 @@
#include "util/util_list.h"
#include "util/util_string.h"
+#include "util/util_tbb.h"
#include "util/util_thread.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
-class Task;
class TaskPool;
class TaskScheduler;
-/* Notes on Thread ID
- *
- * Thread ID argument reports the 0-based ID of a working thread from which
- * the run() callback is being invoked. Thread ID of 0 denotes the thread from
- * which wait_work() was called.
- *
- * DO NOT use this ID to control execution flaw, use it only for things like
- * emulating TLS which does not affect on scheduling. Don't use this ID to make
- * any decisions.
- *
- * It is to be noted here that dedicated task pool will always report thread ID
- * of 0.
- */
-
-typedef function<void(int thread_id)> TaskRunFunction;
-
-/* Task
- *
- * Base class for tasks to be executed in threads. */
-
-class Task {
- public:
- Task(){};
- explicit Task(const TaskRunFunction &run_) : run(run_)
- {
- }
-
- virtual ~Task()
- {
- }
-
- TaskRunFunction run;
-};
+typedef function<void(void)> TaskRunFunction;
/* Task Pool
*
@@ -68,8 +36,7 @@ class Task {
* pool, we can wait for all tasks to be done, or cancel them before they are
* done.
*
- * The run callback that actually executes the task may be created like this:
- * function_bind(&MyClass::task_execute, this, _1, _2) */
+ * TaskRunFunction may be created with std::bind or lambda expressions. */
class TaskPool {
public:
@@ -89,27 +56,15 @@ class TaskPool {
TaskPool();
~TaskPool();
- void push(Task *task, bool front = false);
- void push(const TaskRunFunction &run, bool front = false);
+ void push(TaskRunFunction &&task);
void wait_work(Summary *stats = NULL); /* work and wait until all tasks are done */
- void cancel(); /* cancel all tasks, keep worker threads running */
- void stop(); /* stop all worker threads */
- bool finished(); /* check if all work has been completed */
+ void cancel(); /* cancel all tasks and wait until they are no longer executing */
bool canceled(); /* for worker threads, test if canceled */
protected:
- friend class TaskScheduler;
-
- void num_decrease(int done);
- void num_increase();
-
- thread_mutex num_mutex;
- thread_condition_variable num_cond;
-
- int num;
- bool do_cancel;
+ tbb::task_group tbb_group;
/* ** Statistics ** */
@@ -131,40 +86,19 @@ class TaskScheduler {
static void exit();
static void free_memory();
- /* number of threads that can work on task */
- static int num_threads()
- {
- return threads.size();
- }
-
- /* test if any session is using the scheduler */
- static bool active()
- {
- return users != 0;
- }
+ /* Approximate number of threads that will work on tasks, which may be lower
+ * or higher than the actual number of threads. Use this as little as possible and
+ * leave splitting up tasks to the scheduler. */
+ static int num_threads();
protected:
- friend class TaskPool;
-
- struct Entry {
- Task *task;
- TaskPool *pool;
- };
-
static thread_mutex mutex;
static int users;
- static vector<thread *> threads;
- static bool do_exit;
+ static int active_num_threads;
- static list<Entry> queue;
- static thread_mutex queue_mutex;
- static thread_condition_variable queue_cond;
-
- static void thread_run(int thread_id);
- static bool thread_wait_pop(Entry &entry);
-
- static void push(Entry &entry, bool front);
- static void clear(TaskPool *pool);
+#ifdef WITH_TBB_GLOBAL_CONTROL
+ static tbb::global_control *global_control;
+#endif
};
/* Dedicated Task Pool
@@ -179,12 +113,10 @@ class DedicatedTaskPool {
DedicatedTaskPool();
~DedicatedTaskPool();
- void push(Task *task, bool front = false);
- void push(const TaskRunFunction &run, bool front = false);
+ void push(TaskRunFunction &&run, bool front = false);
void wait(); /* wait until all tasks are done */
void cancel(); /* cancel all tasks, keep worker thread running */
- void stop(); /* stop worker thread */
bool canceled(); /* for worker thread, test if canceled */
@@ -193,14 +125,14 @@ class DedicatedTaskPool {
void num_increase();
void thread_run();
- bool thread_wait_pop(Task *&entry);
+ bool thread_wait_pop(TaskRunFunction &task);
void clear();
thread_mutex num_mutex;
thread_condition_variable num_cond;
- list<Task *> queue;
+ list<TaskRunFunction> queue;
thread_mutex queue_mutex;
thread_condition_variable queue_cond;
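A hypothetical usage sketch of the reworked TaskPool declared above (render_tile and num_tiles are placeholder names, not taken from the patch): tasks are now plain void() callbacks, so lambdas capture whatever state they need instead of receiving a thread id, and splitting the work further is left to TBB.

#include "util/util_task.h"

CCL_NAMESPACE_BEGIN

/* Illustrative only: pushes one task per tile and waits for completion. */
static void example_push_and_wait(int num_tiles)
{
  TaskPool pool;
  for (int i = 0; i < num_tiles; i++) {
    /* push() takes a TaskRunFunction rvalue; the lambda captures the tile index. */
    pool.push([i] { /* render_tile(i); */ });
  }
  /* Work on and wait until all pushed tasks are done; cancel() would instead
   * stop the remaining tasks and wait for running ones to finish. */
  pool.wait_work();
}

CCL_NAMESPACE_END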
diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h
new file mode 100644
index 00000000000..206ba106ca6
--- /dev/null
+++ b/intern/cycles/util/util_tbb.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_TBB_H__
+#define __UTIL_TBB_H__
+
+/* TBB includes <windows.h>; include it ourselves first so we are sure
+ * WIN32_LEAN_AND_MEAN and similar are defined beforehand. */
+#include "util_windows.h"
+
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#include <tbb/tbb.h>
+
+#if TBB_INTERFACE_VERSION_MAJOR >= 10
+# define WITH_TBB_GLOBAL_CONTROL
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+using tbb::blocked_range;
+using tbb::enumerable_thread_specific;
+using tbb::parallel_for;
+
+static inline void parallel_for_cancel()
+{
+ tbb::task::self().cancel_group_execution();
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TBB_H__ */
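Because the new header re-exports tbb::parallel_for and tbb::blocked_range into the Cycles namespace, callers can parallelize loops directly; a minimal sketch (assuming an arbitrary float array, not taken from the patch):

#include "util/util_tbb.h"

CCL_NAMESPACE_BEGIN

/* Hypothetical example: scale an array in parallel using the re-exported names. */
static void scale_array(float *data, size_t size, float factor)
{
  parallel_for(blocked_range<size_t>(0, size), [&](const blocked_range<size_t> &r) {
    for (size_t i = r.begin(); i != r.end(); i++) {
      data[i] *= factor;
    }
  });
}

CCL_NAMESPACE_END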
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index 5ce16e0095a..863c2ea3124 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -17,6 +17,8 @@
#ifndef __UTIL_TEXTURE_H__
#define __UTIL_TEXTURE_H__
+#include "util_transform.h"
+
CCL_NAMESPACE_BEGIN
/* Texture limits on devices. */
@@ -59,6 +61,18 @@ typedef enum ImageDataType {
IMAGE_DATA_NUM_TYPES
} ImageDataType;
+/* Alpha types
+ * How to treat alpha in images. */
+typedef enum ImageAlphaType {
+ IMAGE_ALPHA_UNASSOCIATED = 0,
+ IMAGE_ALPHA_ASSOCIATED = 1,
+ IMAGE_ALPHA_CHANNEL_PACKED = 2,
+ IMAGE_ALPHA_IGNORE = 3,
+ IMAGE_ALPHA_AUTO = 4,
+
+ IMAGE_ALPHA_NUM_TYPES,
+} ImageAlphaType;
+
#define IMAGE_DATA_TYPE_SHIFT 3
#define IMAGE_DATA_TYPE_MASK 0x7
@@ -79,12 +93,17 @@ typedef enum ExtensionType {
typedef struct TextureInfo {
/* Pointer, offset or texture depending on device. */
uint64_t data;
+ /* Data Type */
+ uint data_type;
/* Buffer number for OpenCL. */
uint cl_buffer;
/* Interpolation and extension type. */
uint interpolation, extension;
/* Dimensions. */
uint width, height, depth;
+ /* Transform for 3D textures. */
+ uint use_transform_3d;
+ Transform transform_3d;
} TextureInfo;
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 18ec5b32144..29f9becbefe 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -17,11 +17,11 @@
#ifndef __UTIL_THREAD_H__
#define __UTIL_THREAD_H__
-#include <thread>
-#include <mutex>
#include <condition_variable>
#include <functional>
+#include <mutex>
#include <queue>
+#include <thread>
#ifdef _WIN32
# include "util_windows.h"
@@ -29,9 +29,9 @@
# include <pthread.h>
#endif
-#ifdef __APPLE__
-# include <libkern/OSAtomic.h>
-#endif
+/* NOTE: Use tbb/spin_mutex.h instead of util_tbb.h because some of the TBB
+ * functionality requires RTTI, which is disabled for the OSL kernel. */
+#include <tbb/spin_mutex.h>
#include "util/util_function.h"
@@ -65,76 +65,7 @@ class thread {
int node_;
};
-/* Own wrapper around pthread's spin lock to make it's use easier. */
-
-class thread_spin_lock {
- public:
-#ifdef __APPLE__
- inline thread_spin_lock()
- {
- spin_ = OS_SPINLOCK_INIT;
- }
-
- inline void lock()
- {
- OSSpinLockLock(&spin_);
- }
-
- inline void unlock()
- {
- OSSpinLockUnlock(&spin_);
- }
-#elif defined(_WIN32)
- inline thread_spin_lock()
- {
- const DWORD SPIN_COUNT = 50000;
- InitializeCriticalSectionAndSpinCount(&cs_, SPIN_COUNT);
- }
-
- inline ~thread_spin_lock()
- {
- DeleteCriticalSection(&cs_);
- }
-
- inline void lock()
- {
- EnterCriticalSection(&cs_);
- }
-
- inline void unlock()
- {
- LeaveCriticalSection(&cs_);
- }
-#else
- inline thread_spin_lock()
- {
- pthread_spin_init(&spin_, 0);
- }
-
- inline ~thread_spin_lock()
- {
- pthread_spin_destroy(&spin_);
- }
-
- inline void lock()
- {
- pthread_spin_lock(&spin_);
- }
-
- inline void unlock()
- {
- pthread_spin_unlock(&spin_);
- }
-#endif
- protected:
-#ifdef __APPLE__
- OSSpinLock spin_;
-#elif defined(_WIN32)
- CRITICAL_SECTION cs_;
-#else
- pthread_spinlock_t spin_;
-#endif
-};
+using thread_spin_lock = tbb::spin_mutex;
class thread_scoped_spin_lock {
public:
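The deleted platform-specific wrapper is replaced by tbb::spin_mutex, which already exposes the same lock()/unlock() pair; a minimal sketch of guarding a shared counter with the new alias (illustrative, not from the patch):

#include <tbb/spin_mutex.h>

/* Same alias as introduced above. */
using thread_spin_lock = tbb::spin_mutex;

static thread_spin_lock counter_lock;
static int shared_counter = 0;

/* Hypothetical helper: spin locks suit very short critical sections like this. */
static void increment_counter()
{
  counter_lock.lock();
  shared_counter++;
  counter_lock.unlock();
}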
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 6a9bfbea4ca..101122740d7 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -46,8 +46,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include "util/util_projection.h"
#include "util/util_transform.h"
+#include "util/util_projection.h"
#include "util/util_boundbox.h"
#include "util/util_math.h"
@@ -227,6 +227,7 @@ static void transform_decompose(DecomposedTransform *decomp, const Transform *tf
M.y.w = 0.0f;
M.z.w = 0.0f;
+#if 0
Transform R = M;
float norm;
int iteration = 0;
@@ -260,6 +261,41 @@ static void transform_decompose(DecomposedTransform *decomp, const Transform *tf
decomp->y.w = scale.x.x;
decomp->z = make_float4(scale.x.y, scale.x.z, scale.y.x, scale.y.y);
decomp->w = make_float4(scale.y.z, scale.z.x, scale.z.y, scale.z.z);
+#else
+ float3 colx = transform_get_column(&M, 0);
+ float3 coly = transform_get_column(&M, 1);
+ float3 colz = transform_get_column(&M, 2);
+
+ /* extract scale and shear first */
+ float3 scale, shear;
+ scale.x = len(colx);
+ colx /= scale.x;
+ shear.z = dot(colx, coly);
+ coly -= shear.z * colx;
+ scale.y = len(coly);
+ coly /= scale.y;
+ shear.y = dot(colx, colz);
+ colz -= shear.y * colx;
+ shear.x = dot(coly, colz);
+ colz -= shear.x * coly;
+ scale.z = len(colz);
+ colz /= scale.z;
+
+ transform_set_column(&M, 0, colx);
+ transform_set_column(&M, 1, coly);
+ transform_set_column(&M, 2, colz);
+
+ if (transform_negative_scale(M)) {
+ scale *= -1.0f;
+ M = M * transform_scale(-1.0f, -1.0f, -1.0f);
+ }
+
+ decomp->x = transform_to_quat(M);
+
+ decomp->y.w = scale.x;
+ decomp->z = make_float4(shear.z, shear.y, 0.0f, scale.y);
+ decomp->w = make_float4(shear.x, 0.0f, 0.0f, scale.z);
+#endif
}
void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size)
@@ -271,7 +307,7 @@ void transform_motion_decompose(DecomposedTransform *decomp, const Transform *mo
/* Ensure rotation around shortest angle, negated quaternions are the same
* but this means we don't have to do the check in quat_interpolate */
if (dot(decomp[i - 1].x, decomp[i].x) < 0.0f)
- decomp[i - 1].x = -decomp[i - 1].x;
+ decomp[i].x = -decomp[i].x;
}
}
}
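The replacement path in transform_decompose() is a classical Gram-Schmidt (QR-style) factorization of the upper-left 3x3 block: with $q_x, q_y, q_z$ the orthonormalized columns, the matrix satisfies, in the notation of the code above,

$$M = Q\,U, \qquad U = \begin{pmatrix} s_x & h_z & h_y \\ 0 & s_y & h_x \\ 0 & 0 & s_z \end{pmatrix},$$

where $s$ = (scale.x, scale.y, scale.z), $h$ = (shear.x, shear.y, shear.z) and $Q$ has columns $q_x, q_y, q_z$. A negative determinant is handled by negating the columns and scales before the quaternion conversion, and the results are packed into decomp->x (rotation), decomp->y.w, decomp->z and decomp->w (scale and shear) exactly as written above.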
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index 13ca27c2fed..d0a6264d5cf 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -148,6 +148,32 @@ ccl_device_inline Transform make_transform(float a,
return t;
}
+ccl_device_inline Transform euler_to_transform(const float3 euler)
+{
+ float cx = cosf(euler.x);
+ float cy = cosf(euler.y);
+ float cz = cosf(euler.z);
+ float sx = sinf(euler.x);
+ float sy = sinf(euler.y);
+ float sz = sinf(euler.z);
+
+ Transform t;
+ t.x.x = cy * cz;
+ t.y.x = cy * sz;
+ t.z.x = -sy;
+
+ t.x.y = sy * sx * cz - cx * sz;
+ t.y.y = sy * sx * sz + cx * cz;
+ t.z.y = cy * sx;
+
+ t.x.z = sy * cx * cz + sx * sz;
+ t.y.z = sy * cx * sz - sx * cz;
+ t.z.z = cy * cx;
+
+ t.x.w = t.y.w = t.z.w = 0.0f;
+ return t;
+}
+
/* Constructs a coordinate frame from a normalized normal. */
ccl_device_inline Transform make_transform_frame(float3 N)
{
@@ -318,28 +344,28 @@ ccl_device_inline Transform transform_empty()
ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
{
- /* use simpe nlerp instead of slerp. it's faster and almost the same */
+ /* OptiX uses lerp to interpolate motion transformations. */
+#ifdef __KERNEL_OPTIX__
return normalize((1.0f - t) * q1 + t * q2);
-
-#if 0
+#else /* __KERNEL_OPTIX__ */
/* note: this does not ensure rotation around shortest angle, q1 and q2
* are assumed to be matched already in transform_motion_decompose */
float costheta = dot(q1, q2);
/* possible optimization: it might be possible to precompute theta/qperp */
- if(costheta > 0.9995f) {
+ if (costheta > 0.9995f) {
/* linear interpolation in degenerate case */
- return normalize((1.0f - t)*q1 + t*q2);
+ return normalize((1.0f - t) * q1 + t * q2);
}
- else {
+ else {
/* slerp */
float theta = acosf(clamp(costheta, -1.0f, 1.0f));
float4 qperp = normalize(q2 - q1 * costheta);
float thetap = theta * t;
return q1 * cosf(thetap) + qperp * sinf(thetap);
}
-#endif
+#endif /* __KERNEL_OPTIX__ */
}
ccl_device_inline Transform transform_quick_inverse(Transform M)
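
The reworked quat_interpolate() now branches on __KERNEL_OPTIX__: the OptiX kernel keeps the cheap normalized lerp so Cycles' result matches how OptiX itself blends motion transforms, while all other devices re-enable the previously "#if 0"'d slerp path, falling back to nlerp only when the quaternions are nearly parallel (costheta > 0.9995f). The following standalone illustration of the two modes uses plain structs rather than the kernel's float4.

// Standalone illustration of the two interpolation modes (not Cycles types).
#include <algorithm>
#include <cmath>

struct Quat {
  float x, y, z, w;
};

static float qdot(const Quat &a, const Quat &b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
static Quat qscale_add(const Quat &a, float sa, const Quat &b, float sb) /* a*sa + b*sb */
{
  return {a.x * sa + b.x * sb, a.y * sa + b.y * sb, a.z * sa + b.z * sb, a.w * sa + b.w * sb};
}
static Quat qnormalize(const Quat &q)
{
  const float inv = 1.0f / std::sqrt(qdot(q, q));
  return {q.x * inv, q.y * inv, q.z * inv, q.w * inv};
}

/* nlerp: what the OptiX branch keeps. */
static Quat quat_nlerp(const Quat &q1, const Quat &q2, float t)
{
  return qnormalize(qscale_add(q1, 1.0f - t, q2, t));
}

/* slerp with a linear fallback near costheta == 1, like the non-OptiX branch.
 * Assumes q1 and q2 were already sign-matched (shortest angle). */
static Quat quat_slerp(const Quat &q1, const Quat &q2, float t)
{
  const float costheta = qdot(q1, q2);
  if (costheta > 0.9995f)
    return quat_nlerp(q1, q2, t);
  const float theta = std::acos(std::min(std::max(costheta, -1.0f), 1.0f));
  const Quat qperp = qnormalize(qscale_add(q2, 1.0f, q1, -costheta));
  const float thetap = theta * t;
  return qscale_add(q1, std::cos(thetap), qperp, std::sin(thetap));
}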
@@ -442,29 +468,6 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
#ifndef __KERNEL_GPU__
-# ifdef WITH_EMBREE
-ccl_device void transform_motion_array_interpolate_straight(
- Transform *tfm, const ccl_global DecomposedTransform *motion, uint numsteps, float time)
-{
- /* Figure out which steps we need to interpolate. */
- int maxstep = numsteps - 1;
- int step = min((int)(time * maxstep), maxstep - 1);
- float t = time * maxstep - step;
-
- const ccl_global DecomposedTransform *a = motion + step;
- const ccl_global DecomposedTransform *b = motion + step + 1;
- Transform step1, step2;
-
- transform_compose(&step1, a);
- transform_compose(&step2, b);
-
- /* matrix lerp */
- tfm->x = (1.0f - t) * step1.x + t * step2.x;
- tfm->y = (1.0f - t) * step1.y + t * step2.y;
- tfm->z = (1.0f - t) * step1.z + t * step2.z;
-}
-# endif
-
class BoundBox2D;
ccl_device_inline bool operator==(const DecomposedTransform &A, const DecomposedTransform &B)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 48e9983ac8f..a721595667d 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -101,6 +101,11 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
return (x / multiple) * multiple;
}
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+ return (x & (x - 1)) == 0;
+}
+
CCL_NAMESPACE_END
/* Vectorized types declaration. */
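
The is_power_of_two() helper added above relies on the standard bit trick: x & (x - 1) clears the lowest set bit, so the expression is zero exactly when at most one bit is set. Note that x == 0 also passes the test, so callers are expected to hand in non-zero sizes. A minimal standalone check:

// Standalone check of the bit trick used by is_power_of_two().
#include <cassert>
#include <cstddef>

static bool is_power_of_two_ref(size_t x)
{
  return (x & (x - 1)) == 0;
}

int main()
{
  assert(is_power_of_two_ref(1) && is_power_of_two_ref(64) && is_power_of_two_ref(4096));
  assert(!is_power_of_two_ref(3) && !is_power_of_two_ref(48));
  assert(is_power_of_two_ref(0)); /* 0 also passes; treat non-zero input as a caller precondition. */
  return 0;
}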
@@ -148,11 +153,12 @@ CCL_NAMESPACE_END
/* SSE types. */
#ifndef __KERNEL_GPU__
# include "util/util_sseb.h"
-# include "util/util_ssei.h"
# include "util/util_ssef.h"
+# include "util/util_ssei.h"
# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# include "util/util_avxb.h"
# include "util/util_avxf.h"
+# include "util/util_avxi.h"
# endif
#endif
diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h
index 7289e3298c3..27da120a4ba 100644
--- a/intern/cycles/util/util_types_float8.h
+++ b/intern/cycles/util/util_types_float8.h
@@ -1,30 +1,30 @@
/*
-* Original code Copyright 2017, Intel Corporation
-* Modifications Copyright 2018, Blender Foundation.
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions are met:
-*
-* * Redistributions of source code must retain the above copyright notice,
-* this list of conditions and the following disclaimer.
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the distribution.
-* * Neither the name of Intel Corporation nor the names of its contributors
-* may be used to endorse or promote products derived from this software
-* without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ * Original code Copyright 2017, Intel Corporation
+ * Modifications Copyright 2018, Blender Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
#ifndef __UTIL_TYPES_FLOAT8_H__
#define __UTIL_TYPES_FLOAT8_H__
diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h
index 8ce3d81b1bb..4e4ea28c6a4 100644
--- a/intern/cycles/util/util_types_float8_impl.h
+++ b/intern/cycles/util/util_types_float8_impl.h
@@ -1,30 +1,30 @@
/*
-* Original code Copyright 2017, Intel Corporation
-* Modifications Copyright 2018, Blender Foundation.
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions are met:
-*
-* * Redistributions of source code must retain the above copyright notice,
-* this list of conditions and the following disclaimer.
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the distribution.
-* * Neither the name of Intel Corporation nor the names of its contributors
-* may be used to endorse or promote products derived from this software
-* without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ * Original code Copyright 2017, Intel Corporation
+ * Modifications Copyright 2018, Blender Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
#define __UTIL_TYPES_FLOAT8_IMPL_H__
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 437478d64d3..04fb33368d9 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -27,7 +27,7 @@
CCL_NAMESPACE_BEGIN
-/* Own subclass-ed vestion of std::vector. Subclass is needed because:
+/* Own subclass-ed version of std::vector. Subclass is needed because:
*
* - Use own allocator which keeps track of used/peak memory.
* - Have method to ensure capacity is re-set to 0.
diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h
index 38829d3a29c..8bce5ff85aa 100644
--- a/intern/cycles/util/util_version.h
+++ b/intern/cycles/util/util_version.h
@@ -22,7 +22,7 @@
CCL_NAMESPACE_BEGIN
#define CYCLES_VERSION_MAJOR 1
-#define CYCLES_VERSION_MINOR 9
+#define CYCLES_VERSION_MINOR 13
#define CYCLES_VERSION_PATCH 0
#define CYCLES_MAKE_VERSION_STRING2(a, b, c) #a "." #b "." #c
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index f23174fd6dc..9d9ff451b3b 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -134,7 +134,7 @@ static void view_display()
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
- gluOrtho2D(0, V.width, 0, V.height);
+ glOrtho(0, V.width, 0, V.height, -1, 1);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
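
The substitution above is behavior-preserving: per the GLU specification, gluOrtho2D(l, r, b, t) is equivalent to glOrtho(l, r, b, t, -1, 1), so the projection is unchanged and the standalone viewer simply stops depending on GLU for this call. Shown side by side as a fragment (not a full program):

/* Equivalent calls; gluOrtho2D is defined as glOrtho with near = -1, far = 1. */
gluOrtho2D(0, V.width, 0, V.height);         /* old: requires GLU */
glOrtho(0, V.width, 0, V.height, -1, 1);     /* new: core GL only, same matrix */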
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
new file mode 100644
index 00000000000..807a5adc84a
--- /dev/null
+++ b/intern/cycles/util/util_windows.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2019-2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _WIN32
+# include <windows.h>
+#endif
+
+#include "util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+bool system_windows_version_at_least(int major, int build)
+{
+#ifdef _WIN32
+ HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
+ if (hMod == 0) {
+ return false;
+ }
+
+ typedef NTSTATUS(WINAPI * RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
+ RtlGetVersionPtr rtl_get_version = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
+ if (rtl_get_version == NULL) {
+ return false;
+ }
+
+ RTL_OSVERSIONINFOW rovi = {0};
+ rovi.dwOSVersionInfoSize = sizeof(rovi);
+ if (rtl_get_version(&rovi) != 0) {
+ return false;
+ }
+
+ return (rovi.dwMajorVersion > major ||
+ (rovi.dwMajorVersion == major && rovi.dwBuildNumber >= build));
+#else
+ (void)major;
+ (void)build;
+ return false;
+#endif
+}
+
+CCL_NAMESPACE_END
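
The new system_windows_version_at_least() resolves RtlGetVersion from ntdll.dll at runtime rather than calling GetVersionEx(), because GetVersionEx() reports a compatibility-capped version on newer Windows releases unless the executable carries a matching manifest, whereas RtlGetVersion() returns the real major version and build number. On non-Windows platforms the helper always returns false, so call sites need no #ifdef of their own. A hypothetical call site follows; the version and build numbers are only illustrative, not taken from the Cycles sources.

/* Hypothetical usage sketch: gate a Windows-only code path on
 * Windows 10 builds >= 15063 (numbers chosen for illustration). */
if (system_windows_version_at_least(10, 15063)) {
  /* Newer OS: take the optimized path. */
}
else {
  /* Older OS, or non-Windows where the helper returns false: fall back. */
}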
diff --git a/intern/cycles/util/util_windows.h b/intern/cycles/util/util_windows.h
index 0d85c5437f6..9cbf91a23a7 100644
--- a/intern/cycles/util/util_windows.h
+++ b/intern/cycles/util/util_windows.h
@@ -33,4 +33,10 @@
#endif /* _WIN32 */
+CCL_NAMESPACE_BEGIN
+
+bool system_windows_version_at_least(int major, int build);
+
+CCL_NAMESPACE_END
+
#endif /* __UTIL_WINDOWS_H__ */