355 files changed, 14704 insertions, 12189 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 6f6bd7ec2cc..121c8bdad6e 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -177,14 +177,11 @@ if(CXX_HAS_AVX2)
   add_definitions(-DWITH_KERNEL_AVX2)
 endif()
 
-if(WITH_CYCLES_OSL)
-  # LLVM and OSL need to build without RTTI
-  if(WIN32 AND MSVC)
-    set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-  elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
-    set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-  endif()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
+# LLVM and OSL need to build without RTTI
+if(WIN32 AND MSVC)
+  set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
+  set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
 endif()
 
 # Definitions and Includes
@@ -228,11 +225,8 @@ if(WITH_CYCLES_DEVICE_OPTIX)
       SYSTEM
       ${OPTIX_INCLUDE_DIR}
       )
-
-    # Need pre-compiled CUDA binaries in the OptiX device
-    set(WITH_CYCLES_CUDA_BINARIES ON)
   else()
-    message(STATUS "Optix not found, disabling it from Cycles")
+    message(STATUS "OptiX not found, disabling it from Cycles")
     set(WITH_CYCLES_DEVICE_OPTIX OFF)
   endif()
 endif()
@@ -319,9 +313,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
       set(MAX_MSVC 1910)
     elseif(${CUDA_VERSION} EQUAL "9.1")
       set(MAX_MSVC 1911)
-    elseif(${CUDA_VERSION} EQUAL "10.0")
-      set(MAX_MSVC 1999)
-    elseif(${CUDA_VERSION} EQUAL "10.1")
+    elseif(${CUDA_VERSION} LESS "11.0")
       set(MAX_MSVC 1999)
     endif()
     if(NOT MSVC_VERSION LESS ${MAX_MSVC} OR CMAKE_C_COMPILER_ID MATCHES "Clang")
@@ -338,7 +330,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
 endif()
 
 # NVRTC gives wrong rendering result in CUDA 10.0, so we must use NVCC.
-if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_CUBIN_COMPILER)
+if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_CUBIN_COMPILER AND NOT WITH_CYCLES_CUBIN_COMPILER_OVERRRIDE)
   if(NOT (${CUDA_VERSION} VERSION_LESS 10.0))
     message(STATUS "cycles_cubin_cc not supported for CUDA 10.0+, using nvcc instead.")
     set(WITH_CYCLES_CUBIN_COMPILER OFF)
@@ -356,17 +348,6 @@ if(WITH_CYCLES_NETWORK)
   add_definitions(-DWITH_NETWORK)
 endif()
 
-if(WITH_OPENCOLORIO)
-  add_definitions(-DWITH_OCIO)
-  include_directories(
-    SYSTEM
-    ${OPENCOLORIO_INCLUDE_DIRS}
-  )
-  if(WIN32)
-    add_definitions(-DOpenColorIO_STATIC)
-  endif()
-endif()
-
 if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
   add_subdirectory(app)
 endif()
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index d67a72ab7db..ef374f91a65 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -51,14 +51,17 @@ endif()
 
 # Common configuration.
 
-link_directories(${OPENIMAGEIO_LIBPATH}
-                 ${BOOST_LIBPATH}
-                 ${PNG_LIBPATH}
-                 ${JPEG_LIBPATH}
-                 ${ZLIB_LIBPATH}
-                 ${TIFF_LIBPATH}
-                 ${OPENEXR_LIBPATH}
-                 ${OPENJPEG_LIBPATH})
+link_directories(
+  ${OPENIMAGEIO_LIBPATH}
+  ${BOOST_LIBPATH}
+  ${PNG_LIBPATH}
+  ${JPEG_LIBPATH}
+  ${ZLIB_LIBPATH}
+  ${TIFF_LIBPATH}
+  ${OPENEXR_LIBPATH}
+  ${OPENJPEG_LIBPATH}
+  ${OPENVDB_LIBPATH}
+)
 
 if(WITH_OPENCOLORIO)
   link_directories(${OPENCOLORIO_LIBPATH})
diff --git a/intern/cycles/app/cycles_cubin_cc.cpp b/intern/cycles/app/cycles_cubin_cc.cpp
index 774c18f4219..7631cb9bed5 100644
--- a/intern/cycles/app/cycles_cubin_cc.cpp
+++ b/intern/cycles/app/cycles_cubin_cc.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include <stdio.h>
 #include <stdint.h>
+#include <stdio.h>
 
 #include <string>
 #include <vector>
@@ -43,7 +43,8 @@ template<typename T> std::string to_string(const T &n)
 
 class CompilationSettings {
  public:
-  CompilationSettings() : target_arch(0), bits(64), verbose(false), fast_math(false)
+  CompilationSettings()
+      : target_arch(0), bits(64), verbose(false), fast_math(false), ptx_only(false)
   {
   }
 
@@ -57,12 +58,13 @@ class CompilationSettings {
   int bits;
   bool verbose;
   bool fast_math;
+  bool ptx_only;
 };
 
 static bool compile_cuda(CompilationSettings &settings)
 {
-  const char *headers[] = {"stdlib.h", "float.h", "math.h", "stdio.h"};
-  const char *header_content[] = {"\n", "\n", "\n", "\n"};
+  const char *headers[] = {"stdlib.h", "float.h", "math.h", "stdio.h", "stddef.h"};
+  const char *header_content[] = {"\n", "\n", "\n", "\n", "\n"};
 
   printf("Building %s\n", settings.input_file.c_str());
 
@@ -83,6 +85,8 @@ static bool compile_cuda(CompilationSettings &settings)
   options.push_back("-D__KERNEL_CUDA_VERSION__=" + std::to_string(cuewNvrtcVersion()));
   options.push_back("-arch=compute_" + std::to_string(settings.target_arch));
   options.push_back("--device-as-default-execution-space");
+  options.push_back("-DCYCLES_CUBIN_CC");
+  options.push_back("--std=c++11");
   if (settings.fast_math)
     options.push_back("--use_fast_math");
 
@@ -134,10 +138,14 @@ static bool compile_cuda(CompilationSettings &settings)
     fprintf(stderr, "Error: nvrtcGetPTX failed (%d)\n\n", (int)result);
     return false;
   }
-
-  /* Write a file in the temp folder with the ptx code. */
-  settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" +
-                      OIIO::Filesystem::unique_path();
+  if (settings.ptx_only) {
+    settings.ptx_file = settings.output_file;
+  }
+  else {
+    /* Write a file in the temp folder with the ptx code. */
+    settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" +
+                        OIIO::Filesystem::unique_path();
+  }
   FILE *f = fopen(settings.ptx_file.c_str(), "wb");
   fwrite(&ptx_code[0], 1, ptx_size, f);
   fclose(f);
@@ -249,6 +257,9 @@ static bool parse_parameters(int argc, const char **argv, CompilationSettings &s
              "-D %L",
              &settings.defines,
              "Add additional defines",
+             "-ptx",
+             &settings.ptx_only,
+             "emit PTX code",
              "-v",
              &settings.verbose,
              "Use verbose logging",
@@ -303,8 +314,10 @@ int main(int argc, const char **argv)
     exit(EXIT_FAILURE);
   }
 
-  if (!link_ptxas(settings)) {
-    exit(EXIT_FAILURE);
+  if (!settings.ptx_only) {
+    if (!link_ptxas(settings)) {
+      exit(EXIT_FAILURE);
+    }
   }
 
   return 0;
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index c5a4c9b375b..1ad70a376ed 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -20,11 +20,11 @@
 
 #include "util/util_args.h"
 #include "util/util_foreach.h"
+#include "util/util_logging.h"
 #include "util/util_path.h"
 #include "util/util_stats.h"
 #include "util/util_string.h"
 #include "util/util_task.h"
-#include "util/util_logging.h"
 
 using namespace ccl;
 
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index d2d112e8d7e..e45c37be494 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -16,12 +16,12 @@
 
 #include <stdio.h>
 
+#include "device/device.h"
 #include "render/buffers.h"
 #include "render/camera.h"
-#include "device/device.h"
+#include "render/integrator.h"
 #include "render/scene.h"
 #include "render/session.h"
-#include "render/integrator.h"
 
 #include "util/util_args.h"
 #include "util/util_foreach.h"
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 1dbe8a30ff2..aec00f845f3 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -16,9 +16,9 @@
 
 #include <stdio.h>
 
-#include <sstream>
 #include <algorithm>
 #include <iterator>
+#include <sstream>
 
 #include "graph/node_xml.h"
 
@@ -32,8 +32,8 @@
 #include "render/nodes.h"
 #include "render/object.h"
 #include "render/osl.h"
-#include "render/shader.h"
 #include "render/scene.h"
+#include "render/shader.h"
 
 #include "subd/subd_patch.h"
 #include "subd/subd_split.h"
@@ -292,7 +292,7 @@ static void xml_read_shader_graph(XMLReadState &state, Shader *shader, xml_node
             filepath = path_join(state.base, filepath);
           }
 
-          snode = ((OSLShaderManager *)manager)->osl_node(filepath);
+          snode = OSLShaderManager::osl_node(manager, filepath);
 
           if (!snode) {
             fprintf(stderr, "Failed to create OSL node from \"%s\".\n", filepath.c_str());
@@ -326,6 +326,10 @@ static void xml_read_shader_graph(XMLReadState &state, Shader *shader, xml_node
         fprintf(stderr, "Node type \"%s\" is not a shader node.\n", node_type->name.c_str());
         continue;
       }
+      else if (node_type->create == NULL) {
+        fprintf(stderr, "Can't create abstract node type \"%s\".\n", node_type->name.c_str());
+        continue;
+      }
 
       snode = (ShaderNode *)node_type->create(node_type);
     }
@@ -376,11 +380,11 @@ static Mesh *xml_add_mesh(Scene *scene, const Transform &tfm)
 {
   /* create mesh */
   Mesh *mesh = new Mesh();
-  scene->meshes.push_back(mesh);
+  scene->geometry.push_back(mesh);
 
   /* create object*/
   Object *object = new Object();
-  object->mesh = mesh;
+  object->geometry = mesh;
   object->tfm = tfm;
   scene->objects.push_back(object);
 
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 0888eeb78bb..496e8e9310b 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -18,6 +18,9 @@ set(INC_SYS
 set(SRC
   blender_camera.cpp
   blender_device.cpp
+  blender_image.cpp
+  blender_geometry.cpp
+  blender_light.cpp
   blender_mesh.cpp
   blender_object.cpp
   blender_object_cull.cpp
@@ -30,9 +33,12 @@ set(SRC
   blender_sync.cpp
   blender_texture.cpp
   blender_viewport.cpp
+  blender_volume.cpp
 
   CCL_api.h
   blender_device.h
+  blender_id_map.h
+  blender_image.h
   blender_object_cull.h
   blender_sync.h
   blender_session.h
@@ -86,6 +92,20 @@ if(WITH_MOD_FLUID)
   add_definitions(-DWITH_FLUID)
 endif()
 
+if(WITH_NEW_OBJECT_TYPES)
+  add_definitions(-DWITH_NEW_OBJECT_TYPES)
+endif()
+
+if(WITH_OPENVDB)
+  add_definitions(-DWITH_OPENVDB ${OPENVDB_DEFINITIONS})
+  list(APPEND INC_SYS
+    ${OPENVDB_INCLUDE_DIRS}
+  )
+  list(APPEND LIB
+    ${OPENVDB_LIBRARIES}
+  )
+endif()
+
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
 # avoid link failure with clang 3.4 debug
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 6d6f89603fe..3d2a52d0cf6 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -22,7 +22,7 @@ bl_info = {
     "blender": (2, 80, 0),
     "description": "Cycles renderer integration",
     "warning": "",
-    "wiki_url": "https://docs.blender.org/manual/en/latest/render/cycles/",
+    "doc_url": "https://docs.blender.org/manual/en/latest/render/cycles/",
     "tracker_url": "",
     "support": 'OFFICIAL',
     "category": "Render"}
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index ee7ac7737c0..2b872bb5c39 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -33,7 +33,7 @@ def _is_using_buggy_driver():
             # in the version string, but those cards do not quite work and
             # causing crashes.
             return True
-        regex = re.compile(".*Compatibility Profile Context ([0-9]+(\.[0-9]+)+)$")
+        regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$")
         if not regex.match(version):
             # Skip cards like FireGL
             return False
@@ -245,9 +245,6 @@ def list_render_passes(srl):
     if srl.use_pass_transmission_direct:   yield ("TransDir",      "RGB",  'COLOR')
     if srl.use_pass_transmission_indirect: yield ("TransInd",      "RGB",  'COLOR')
     if srl.use_pass_transmission_color:    yield ("TransCol",      "RGB",  'COLOR')
-    if srl.use_pass_subsurface_direct:     yield ("SubsurfaceDir", "RGB",  'COLOR')
-    if srl.use_pass_subsurface_indirect:   yield ("SubsurfaceInd", "RGB",  'COLOR')
-    if srl.use_pass_subsurface_color:      yield ("SubsurfaceCol", "RGB",  'COLOR')
     if srl.use_pass_emit:                  yield ("Emit",          "RGB",  'COLOR')
     if srl.use_pass_environment:           yield ("Env",           "RGB",  'COLOR')
 
@@ -258,6 +255,7 @@ def list_render_passes(srl):
     if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X",   'VALUE')
     if crl.pass_debug_bvh_intersections:       yield ("Debug BVH Intersections",       "X",   'VALUE')
     if crl.pass_debug_ray_bounces:             yield ("Debug Ray Bounces",             "X",   'VALUE')
+    if crl.pass_debug_sample_count:            yield ("Debug Sample Count",            "X",   'VALUE')
     if crl.use_pass_volume_direct:             yield ("VolumeDir",                     "RGB", 'COLOR')
     if crl.use_pass_volume_indirect:           yield ("VolumeInd",                     "RGB", 'COLOR')
 
@@ -284,8 +282,7 @@ def list_render_passes(srl):
             yield ("Denoising Intensity",       "X",   'VALUE')
             clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
                              "denoising_glossy_direct", "denoising_glossy_indirect",
-                             "denoising_transmission_direct", "denoising_transmission_indirect",
-                             "denoising_subsurface_direct", "denoising_subsurface_indirect")
+                             "denoising_transmission_direct", "denoising_transmission_indirect")
             if any(getattr(crl, option) for option in clean_options):
                 yield ("Denoising Clean", "RGB", 'COLOR')
 
diff --git a/intern/cycles/blender/addon/operators.py b/intern/cycles/blender/addon/operators.py
index 80bb663330b..3c8e79eaba5 100644
--- a/intern/cycles/blender/addon/operators.py
+++ b/intern/cycles/blender/addon/operators.py
@@ -153,12 +153,12 @@ class CYCLES_OT_denoise_animation(Operator):
             self.report({'ERROR'}, str(e))
             return {'FINISHED'}
 
-        self.report({'INFO'}, "Denoising completed.")
+        self.report({'INFO'}, "Denoising completed")
         return {'FINISHED'}
 
 
 class CYCLES_OT_merge_images(Operator):
-    "Combine OpenEXR multilayer images rendered with different sample" \
+    "Combine OpenEXR multilayer images rendered with different sample " \
     "ranges into one image with reduced noise"
     bl_idname = "cycles.merge_images"
     bl_label = "Merge Images"
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5f163c2510b..c91e210bbd8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -112,6 +112,7 @@ enum_use_layer_samples = (
 enum_sampling_pattern = (
     ('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
     ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
+    ('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"),
 )
 
 enum_integrator = (
@@ -178,10 +179,6 @@ enum_view3d_shading_render_pass= (
     ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45),
     ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46),
 
-    ('SUBSURFACE_DIRECT', "Subsurface Direct", "Show the Subsurface Direct render pass", 47),
-    ('SUBSURFACE_INDIRECT', "Subsurface Indirect", "Show the Subsurface Indirect render pass", 48),
-    ('SUBSURFACE_COLOR', "Subsurface Color", "Show the Subsurface Color render pass", 49),
-
     ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50),
     ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51),
 
@@ -197,7 +194,12 @@ enum_aov_types = (
     ('COLOR', "Color", "Write a Color pass", 1),
 )
 
-enum_denoising_optix_input_passes= (
+enum_viewport_denoising = (
+    ('NONE', "None", "Disable viewport denoising", 0),
+    ('OPTIX', "OptiX AI-Accelerated", "Use the OptiX denoiser running on the GPU (requires at least one compatible OptiX device)", 1),
+)
+
+enum_denoising_optix_input_passes = (
     ('RGB', "Color", "Use only color as input", 1),
     ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2),
     ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3),
@@ -229,6 +231,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default='PATH',
     )
 
+    preview_pause: BoolProperty(
+        name="Pause Preview",
+        description="Pause all viewport preview renders",
+        default=False,
+    )
+    preview_denoising: EnumProperty(
+        name="Viewport Denoising",
+        description="Denoise the image after each preview update with the selected denoiser engine",
+        items=enum_viewport_denoising,
+        default='NONE',
+    )
+
     use_square_samples: BoolProperty(
         name="Square Samples",
         description="Square sampling values for easier artist control",
@@ -247,11 +261,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         min=0, max=(1 << 24),
         default=32,
     )
-    preview_pause: BoolProperty(
-        name="Pause Preview",
-        description="Pause all viewport preview renders",
-        default=False,
-    )
     aa_samples: IntProperty(
         name="AA Samples",
         description="Number of antialiasing samples to render for each pixel",
@@ -264,6 +273,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         min=0, max=2097151,
         default=32,
     )
+
     diffuse_samples: IntProperty(
         name="Diffuse Samples",
         description="Number of diffuse bounce samples to render for each AA sample",
@@ -294,14 +304,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         min=1, max=1024,
         default=1,
     )
-
     subsurface_samples: IntProperty(
         name="Subsurface Samples",
         description="Number of subsurface scattering samples to render for each AA sample",
         min=1, max=1024,
         default=1,
     )
-
     volume_samples: IntProperty(
         name="Volume Samples",
         description="Number of volume scattering samples to render for each AA sample",
@@ -342,6 +350,26 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=0.01,
     )
 
+    use_adaptive_sampling: BoolProperty(
+        name="Use Adaptive Sampling",
+        description="Automatically reduce the number of samples per pixel based on estimated noise level",
+        default=False,
+    )
+
+    adaptive_threshold: FloatProperty(
+        name="Adaptive Sampling Threshold",
+        description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples",
+        min=0.0, max=1.0,
+        default=0.0,
+        precision=4,
+    )
+    adaptive_min_samples: IntProperty(
+        name="Adaptive Min Samples",
+        description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples",
+        min=0, max=4096,
+        default=0,
+    )
+
     min_light_bounces: IntProperty(
             name="Min Light Bounces",
             description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
@@ -416,13 +444,20 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=8,
     )
 
-    volume_step_size: FloatProperty(
-        name="Step Size",
-        description="Distance between volume shader samples when rendering the volume "
-        "(lower values give more accurate and detailed results, but also increased render time)",
-        default=0.1,
-        min=0.0000001, max=100000.0, soft_min=0.01, soft_max=1.0, precision=4,
-        unit='LENGTH'
+    volume_step_rate: FloatProperty(
+        name="Step Rate",
+        description="Globally adjust detail for volume rendering, on top of automatically estimated step size. "
+                    "Higher values reduce render time, lower values render with more detail",
+        default=1.0,
+        min=0.01, max=100.0, soft_min=0.1, soft_max=10.0, precision=2
+    )
+
+    volume_preview_step_rate: FloatProperty(
+        name="Step Rate",
+        description="Globally adjust detail for volume rendering, on top of automatically estimated step size. "
+                    "Higher values reduce render time, lower values render with more detail",
+        default=1.0,
+        min=0.01, max=100.0, soft_min=0.1, soft_max=10.0, precision=2
     )
 
     volume_max_steps: IntProperty(
@@ -562,6 +597,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=64,
         subtype='PIXEL'
     )
+    preview_denoising_start_sample: IntProperty(
+        name="Start Denoising",
+        description="Sample to start denoising the preview at",
+        min=0, max=(1 << 24),
+        default=1,
+    )
 
     debug_reset_timeout: FloatProperty(
         name="Reset timeout",
@@ -641,7 +682,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             ('DIFFUSE', "Diffuse", ""),
             ('GLOSSY', "Glossy", ""),
             ('TRANSMISSION', "Transmission", ""),
-            ('SUBSURFACE', "Subsurface", ""),
         ),
     )
 
@@ -901,6 +941,14 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
         default='LINEAR',
     )
 
+    volume_step_rate: FloatProperty(
+        name="Step Rate",
+        description="Scale the distance between volume shader samples when rendering the volume "
+                    "(lower values give more accurate and detailed results, but also increased render time)",
+        default=1.0,
+        min=0.001, max=1000.0, soft_min=0.1, soft_max=10.0, precision=4
+    )
+
     displacement_method: EnumProperty(
         name="Displacement Method",
         description="Method to use for the displacement",
@@ -1011,6 +1059,13 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
         items=enum_volume_interpolation,
         default='LINEAR',
     )
+    volume_step_size: FloatProperty(
+        name="Step Size",
+        description="Distance between volume shader samples when rendering the volume "
+                    "(lower values give more accurate and detailed results, but also increased render time)",
+        default=1.0,
+        min=0.0000001, max=100000.0, soft_min=0.1, soft_max=100.0, precision=4
+    )
 
     @classmethod
     def register(cls):
@@ -1121,7 +1176,7 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
     motion_steps: IntProperty(
         name="Motion Steps",
         description="Control accuracy of motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))",
-        min=1, soft_max=8,
+        min=1, max=7,
         default=1,
     )
 
@@ -1285,7 +1340,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         default=False,
         update=update_render_passes,
     )
-
+    pass_debug_sample_count: BoolProperty(
+        name="Debug Sample Count",
+        description="Number of samples/camera rays per pixel",
+        default=False,
+        update=update_render_passes,
+    )
     use_pass_volume_direct: BoolProperty(
         name="Volume Direct",
         description="Deliver direct volumetric scattering pass",
@@ -1305,12 +1365,6 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         default=False,
         update=update_render_passes,
     )
-    use_optix_denoising: BoolProperty(
-        name="Use OptiX AI Denoising",
-        description="Denoise the rendered image with the OptiX AI denoiser",
-        default=False,
-        update=update_render_passes,
-    )
     denoising_diffuse_direct: BoolProperty(
         name="Diffuse Direct",
         description="Denoise the direct diffuse lighting",
@@ -1341,16 +1395,6 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         description="Denoise the indirect transmission lighting",
         default=True,
     )
-    denoising_subsurface_direct: BoolProperty(
-        name="Subsurface Direct",
-        description="Denoise the direct subsurface lighting",
-        default=True,
-    )
-    denoising_subsurface_indirect: BoolProperty(
-        name="Subsurface Indirect",
-        description="Denoise the indirect subsurface lighting",
-        default=True,
-    )
     denoising_strength: FloatProperty(
         name="Denoising Strength",
         description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
@@ -1387,11 +1431,18 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         min=0, max=7,
         default=0,
     )
+
+    use_optix_denoising: BoolProperty(
+        name="OptiX AI-Accelerated",
+        description="Use the OptiX denoiser to denoise the rendered image",
+        default=False,
+        update=update_render_passes,
+    )
     denoising_optix_input_passes: EnumProperty(
         name="Input Passes",
-        description="Controls which passes the OptiX AI denoiser should use as input, which can have different effects on the denoised image",
+        description="Passes handed over to the OptiX denoiser (this can have different effects on the denoised image)",
         items=enum_denoising_optix_input_passes,
-        default='RGB',
+        default='RGB_ALBEDO',
     )
 
     use_pass_crypto_object: BoolProperty(
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 35d5d3801d2..37675c5699d 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -112,6 +112,10 @@ def show_device_active(context):
         return True
     return context.preferences.addons[__package__].preferences.has_active_device()
 
+def show_optix_denoising(context):
+    # OptiX AI denoiser can be used when at least one device supports OptiX
+    return bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'))
+
 
 def draw_samples_info(layout, context):
     cscene = context.scene.cycles
@@ -177,17 +181,23 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
         if not use_optix(context):
             layout.prop(cscene, "progressive")
 
-        if cscene.progressive == 'PATH' or use_branched_path(context) is False:
+        if not use_branched_path(context):
             col = layout.column(align=True)
             col.prop(cscene, "samples", text="Render")
             col.prop(cscene, "preview_samples", text="Viewport")
-
-            draw_samples_info(layout, context)
         else:
             col = layout.column(align=True)
             col.prop(cscene, "aa_samples", text="Render")
             col.prop(cscene, "preview_aa_samples", text="Viewport")
 
+        # Viewport denoising is currently only supported with OptiX
+        if show_optix_denoising(context):
+            col = layout.column()
+            col.prop(cscene, "preview_denoising")
+
+        if not use_branched_path(context):
+            draw_samples_info(layout, context)
+
 
 class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
     bl_label = "Sub Samples"
@@ -195,9 +205,7 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
 
     @classmethod
     def poll(cls, context):
-        scene = context.scene
-        cscene = scene.cycles
-        return cscene.progressive != 'PATH' and use_branched_path(context)
+        return use_branched_path(context)
 
     def draw(self, context):
         layout = self.layout
@@ -222,6 +230,32 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
         draw_samples_info(layout, context)
 
 
+class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel):
+    bl_label = "Adaptive Sampling"
+    bl_parent_id = "CYCLES_RENDER_PT_sampling"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw_header(self, context):
+        layout = self.layout
+        scene = context.scene
+        cscene = scene.cycles
+
+        layout.prop(cscene, "use_adaptive_sampling", text="")
+
+    def draw(self, context):
+        layout = self.layout
+        layout.use_property_split = True
+        layout.use_property_decorate = False
+
+        scene = context.scene
+        cscene = scene.cycles
+
+        layout.active = cscene.use_adaptive_sampling
+
+        col = layout.column(align=True)
+        col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+        col.prop(cscene, "adaptive_threshold", text="Noise Threshold")
+
 class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
     bl_label = "Advanced"
     bl_parent_id = "CYCLES_RENDER_PT_sampling"
@@ -239,7 +273,9 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
         row.prop(cscene, "seed")
         row.prop(cscene, "use_animated_seed", text="", icon='TIME')
 
-        layout.prop(cscene, "sampling_pattern", text="Pattern")
+        col = layout.column(align=True)
+        col.active = not(cscene.use_adaptive_sampling)
+        col.prop(cscene, "sampling_pattern", text="Pattern")
 
         layout.prop(cscene, "use_square_samples")
 
@@ -337,7 +373,7 @@ class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
         col = layout.column()
         sub = col.column(align=True)
         sub.prop(cscene, "dicing_rate", text="Dicing Rate Render")
-        sub.prop(cscene, "preview_dicing_rate", text="Preview")
+        sub.prop(cscene, "preview_dicing_rate", text="Viewport")
 
         col.separator()
 
@@ -392,9 +428,11 @@ class CYCLES_RENDER_PT_volumes(CyclesButtonsPanel, Panel):
         scene = context.scene
         cscene = scene.cycles
 
-        col = layout.column()
-        col.prop(cscene, "volume_step_size", text="Step Size")
-        col.prop(cscene, "volume_max_steps", text="Max Steps")
+        col = layout.column(align=True)
+        col.prop(cscene, "volume_step_rate", text="Step Rate Render")
+        col.prop(cscene, "volume_preview_step_rate", text="Viewport")
+
+        layout.prop(cscene, "volume_max_steps", text="Max Steps")
 
 
 class CYCLES_RENDER_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -635,9 +673,6 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
 
         sub = col.column()
         sub.active = not rd.use_save_buffers
-        for view_layer in scene.view_layers:
-            if view_layer.cycles.use_denoising:
-                sub.active = False
         sub.prop(cscene, "use_progressive_refine")
 
 
@@ -705,6 +740,11 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
         col.prop(rd, "preview_pixel_size", text="Pixel Size")
         col.prop(cscene, "preview_start_resolution", text="Start Pixels")
 
+        if show_optix_denoising(context):
+            sub = col.row(align=True)
+            sub.active = cscene.preview_denoising != 'NONE'
+            sub.prop(cscene, "preview_denoising_start_sample", text="Denoising Start Sample")
+
 
 class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
     bl_label = "Filter"
@@ -732,6 +772,8 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
         col.prop(view_layer, "use_solid", text="Surfaces")
         col = flow.column()
         col.prop(view_layer, "use_strand", text="Hair")
+        col = flow.column()
+        col.prop(view_layer, "use_volumes", text="Volumes")
         if with_freestyle:
             col = flow.column()
             col.prop(view_layer, "use_freestyle", text="Freestyle")
@@ -803,6 +845,8 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
         col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data")
         col = flow.column()
         col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time")
+        col = flow.column()
+        col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
 
         layout.separator()
 
@@ -848,14 +892,6 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
 
         split = layout.split(factor=0.35)
         split.use_property_split = False
-        split.label(text="Subsurface")
-        row = split.row(align=True)
-        row.prop(view_layer, "use_pass_subsurface_direct", text="Direct", toggle=True)
-        row.prop(view_layer, "use_pass_subsurface_indirect", text="Indirect", toggle=True)
-        row.prop(view_layer, "use_pass_subsurface_color", text="Color", toggle=True)
-
-        split = layout.split(factor=0.35)
-        split.use_property_split = False
         split.label(text="Volume")
         row = split.row(align=True)
         row.prop(cycles_view_layer, "use_pass_volume_direct", text="Direct", toggle=True)
@@ -981,15 +1017,14 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
 
         col = split.column(align=True)
 
-        if use_optix(context):
-            col.prop(cycles_view_layer, "use_optix_denoising", text="OptiX AI Denoising")
+        if show_optix_denoising(context):
+            col.prop(cycles_view_layer, "use_optix_denoising")
+            col.separator(factor=2.0)
 
             if cycles_view_layer.use_optix_denoising:
                 col.prop(cycles_view_layer, "denoising_optix_input_passes")
                 return
 
-            col.separator(factor=2.0)
-
         col.prop(cycles_view_layer, "denoising_radius", text="Radius")
         col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
         col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
@@ -1036,15 +1071,6 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
         split = layout.split(factor=0.5)
         split.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
 
-        col = split.column()
-        col.alignment = 'RIGHT'
-        col.label(text="Subsurface")
-
-        row = split.row(align=True)
-        row.use_property_split = False
-        row.prop(cycles_view_layer, "denoising_subsurface_direct", text="Direct", toggle=True)
-        row.prop(cycles_view_layer, "denoising_subsurface_indirect", text="Indirect", toggle=True)
-
 
 class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
     bl_label = "Post Processing"
@@ -1391,8 +1417,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel):
         light = context.light
         clamp = light.cycles
 
-        layout.use_property_decorate = False
-
         if self.bl_space_type == 'PROPERTIES':
             layout.row().prop(light, "type", expand=True)
             layout.use_property_split = True
@@ -1674,6 +1698,9 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel):
         sub.prop(cworld, "volume_sampling", text="Sampling")
         col.prop(cworld, "volume_interpolation", text="Interpolation")
         col.prop(cworld, "homogeneous_volume", text="Homogeneous")
+        sub = col.column()
+        sub.active = not cworld.homogeneous_volume
+        sub.prop(cworld, "volume_step_size")
 
 
 class CYCLES_MATERIAL_PT_preview(CyclesButtonsPanel, Panel):
@@ -1805,6 +1832,9 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel):
         sub.prop(cmat, "volume_sampling", text="Sampling")
         col.prop(cmat, "volume_interpolation", text="Interpolation")
         col.prop(cmat, "homogeneous_volume", text="Homogeneous")
+        sub = col.column()
+        sub.active = not cmat.homogeneous_volume
+        sub.prop(cmat, "volume_step_rate")
 
     def draw(self, context):
         self.draw_shared(self, context, context.material)
@@ -1852,7 +1882,7 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
         cscene = scene.cycles
         rd = scene.render
         if rd.use_bake_multires == False and cscene.bake_type in {
-                'NORMAL', 'COMBINED', 'DIFFUSE', 'GLOSSY', 'TRANSMISSION', 'SUBSURFACE'}:
+                'NORMAL', 'COMBINED', 'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
             return True
 
     def draw(self, context):
@@ -1887,11 +1917,10 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
             flow.prop(cbk, "use_pass_diffuse")
             flow.prop(cbk, "use_pass_glossy")
             flow.prop(cbk, "use_pass_transmission")
-            flow.prop(cbk, "use_pass_subsurface")
             flow.prop(cbk, "use_pass_ambient_occlusion")
             flow.prop(cbk, "use_pass_emit")
 
-        elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION', 'SUBSURFACE'}:
+        elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
             row = col.row(align=True)
             row.use_property_split = False
             row.prop(cbk, "use_pass_direct", toggle=True)
@@ -2192,8 +2221,6 @@ def draw_device(self, context):
         col = layout.column()
         col.prop(cscene, "feature_set")
 
-        scene = context.scene
-
         col = layout.column()
         col.active = show_device_active(context)
         col.prop(cscene, "device")
@@ -2248,6 +2275,7 @@ classes = (
     CYCLES_PT_integrator_presets,
     CYCLES_RENDER_PT_sampling,
     CYCLES_RENDER_PT_sampling_sub_samples,
+    CYCLES_RENDER_PT_sampling_adaptive,
     CYCLES_RENDER_PT_sampling_advanced,
     CYCLES_RENDER_PT_light_paths,
     CYCLES_RENDER_PT_light_paths_max_bounces,
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 899245db03e..49f23f4ba30 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -42,10 +42,7 @@ def custom_bake_remap(scene):
         'GLOSSY_COLOR',
         'TRANSMISSION_DIRECT',
         'TRANSMISSION_INDIRECT',
-        'TRANSMISSION_COLOR',
-        'SUBSURFACE_DIRECT',
-        'SUBSURFACE_INDIRECT',
-        'SUBSURFACE_COLOR')
+        'TRANSMISSION_COLOR')
 
     diffuse_direct_idx = bake_lookup.index('DIFFUSE_DIRECT')
 
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index c84d6e1572b..40a1a2c2edc 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -725,22 +725,26 @@ static void blender_camera_view_subset(BL::RenderEngine &b_engine,
   BoundBox2D cam, view;
   float view_aspect, cam_aspect, sensor_size;
 
-  /* get viewport viewplane */
+  /* Get viewport viewplane. */
   BlenderCamera view_bcam;
   blender_camera_init(&view_bcam, b_render);
   blender_camera_from_view(&view_bcam, b_engine, b_scene, b_v3d, b_rv3d, width, height, true);
 
   blender_camera_viewplane(&view_bcam, width, height, &view, &view_aspect, &sensor_size);
 
-  /* get camera viewplane */
+  /* Get camera viewplane. */
   BlenderCamera cam_bcam;
   blender_camera_init(&cam_bcam, b_render);
   blender_camera_from_object(&cam_bcam, b_engine, b_ob, true);
 
+  /* Camera border is affected by pixel aspect, viewport is not. */
+  cam_bcam.pixelaspect.x = b_render.pixel_aspect_x();
+  cam_bcam.pixelaspect.y = b_render.pixel_aspect_y();
+
   blender_camera_viewplane(
       &cam_bcam, cam_bcam.full_width, cam_bcam.full_height, &cam, &cam_aspect, &sensor_size);
 
-  /* return */
+  /* Return */
   *view_box = view * (1.0f / view_aspect);
   *cam_box = cam * (1.0f / cam_aspect);
 }
@@ -863,7 +867,8 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d,
   }
 }
 
-BufferParams BlenderSync::get_buffer_params(BL::RenderSettings &b_render,
+BufferParams BlenderSync::get_buffer_params(BL::Scene &b_scene,
+                                            BL::RenderSettings &b_render,
                                             BL::SpaceView3D &b_v3d,
                                             BL::RegionView3D &b_rv3d,
                                             Camera *cam,
@@ -899,7 +904,11 @@ BufferParams BlenderSync::get_buffer_params(BL::RenderSettings &b_render,
     params.height = height;
   }
 
-  update_viewport_display_passes(b_v3d, params.passes);
+  PassType display_pass = update_viewport_display_passes(b_v3d, params.passes);
+
+  /* Can only denoise the combined image pass */
+  params.denoising_data_pass = display_pass == PASS_COMBINED &&
+                               update_viewport_display_denoising(b_v3d, b_scene);
 
   return params;
 }
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 4dba8ffbe0e..0c87808d880 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -17,6 +17,7 @@
 #include "render/attribute.h"
 #include "render/camera.h"
 #include "render/curves.h"
+#include "render/hair.h"
 #include "render/mesh.h"
 #include "render/object.h"
 #include "render/scene.h"
@@ -107,12 +108,12 @@ static void InterpolateKeySegments(
 }
 
 static bool ObtainCacheParticleData(
-    Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
+    Geometry *geom, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
 {
   int curvenum = 0;
   int keyno = 0;
 
-  if (!(mesh && b_mesh && b_ob && CData))
+  if (!(geom && b_mesh && b_ob && CData))
     return false;
 
   Transform tfm = get_transform(b_ob->matrix_world());
@@ -128,7 +129,7 @@ static bool ObtainCacheParticleData(
 
       if ((b_part.render_type() == BL::ParticleSettings::render_type_PATH) &&
           (b_part.type() == BL::ParticleSettings::type_HAIR)) {
-        int shader = clamp(b_part.material() - 1, 0, mesh->used_shaders.size() - 1);
+        int shader = clamp(b_part.material() - 1, 0, geom->used_shaders.size() - 1);
         int display_step = background ? b_part.render_step() : b_part.display_step();
         int totparts = b_psys.particles.length();
         int totchild = background ? b_psys.child_particles.length() :
@@ -173,19 +174,20 @@ static bool ObtainCacheParticleData(
           CData->curve_firstkey.push_back_slow(keyno);
 
           float curve_length = 0.0f;
-          float3 pcKey;
+          float3 prev_co_world = make_float3(0.0f, 0.0f, 0.0f);
+          float3 prev_co_object = make_float3(0.0f, 0.0f, 0.0f);
           for (int step_no = 0; step_no < ren_step; step_no++) {
-            float nco[3];
-            b_psys.co_hair(*b_ob, pa_no, step_no, nco);
-            float3 cKey = make_float3(nco[0], nco[1], nco[2]);
-            cKey = transform_point(&itfm, cKey);
+            float3 co_world = prev_co_world;
+            b_psys.co_hair(*b_ob, pa_no, step_no, &co_world.x);
+            float3 co_object = transform_point(&itfm, co_world);
             if (step_no > 0) {
-              const float step_length = len(cKey - pcKey);
+              const float step_length = len(co_object - prev_co_object);
               curve_length += step_length;
             }
-            CData->curvekey_co.push_back_slow(cKey);
+            CData->curvekey_co.push_back_slow(co_object);
             CData->curvekey_time.push_back_slow(curve_length);
-            pcKey = cKey;
+            prev_co_object = co_object;
+            prev_co_world = co_world;
             keynum++;
           }
           keyno += keynum;
@@ -201,14 +203,14 @@ static bool ObtainCacheParticleData(
   return true;
 }
 
-static bool ObtainCacheParticleUV(Mesh *mesh,
+static bool ObtainCacheParticleUV(Geometry *geom,
                                   BL::Mesh *b_mesh,
                                   BL::Object *b_ob,
                                   ParticleCurveData *CData,
                                   bool background,
                                   int uv_num)
 {
-  if (!(mesh && b_mesh && b_ob && CData))
+  if (!(geom && b_mesh && b_ob && CData))
     return false;
 
   CData->curve_uv.clear();
@@ -264,14 +266,14 @@ static bool ObtainCacheParticleUV(Mesh *mesh,
   return true;
 }
 
-static bool ObtainCacheParticleVcol(Mesh *mesh,
+static bool ObtainCacheParticleVcol(Geometry *geom,
                                     BL::Mesh *b_mesh,
                                     BL::Object *b_ob,
                                     ParticleCurveData *CData,
                                     bool background,
                                     int vcol_num)
 {
-  if (!(mesh && b_mesh && b_ob && CData))
+  if (!(geom && b_mesh && b_ob && CData))
     return false;
 
   CData->curve_vcol.clear();
@@ -593,21 +595,21 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, in
   /* texture coords still needed */
 }
 
-static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
+static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CData)
 {
   int num_keys = 0;
   int num_curves = 0;
 
-  if (mesh->num_curves())
+  if (hair->num_curves())
     return;
 
   Attribute *attr_intercept = NULL;
   Attribute *attr_random = NULL;
 
-  if (mesh->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
-    attr_intercept = mesh->curve_attributes.add(ATTR_STD_CURVE_INTERCEPT);
-  if (mesh->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
-    attr_random = mesh->curve_attributes.add(ATTR_STD_CURVE_RANDOM);
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
+    attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
+    attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
 
   /* compute and reserve size of arrays */
   for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
@@ -620,10 +622,10 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
   }
 
   if (num_curves > 0) {
-    VLOG(1) << "Exporting curve segments for mesh " << mesh->name;
+    VLOG(1) << "Exporting curve segments for mesh " << hair->name;
   }
 
-  mesh->reserve_curves(mesh->num_curves() + num_curves, mesh->curve_keys.size() + num_keys);
+  hair->reserve_curves(hair->num_curves() + num_curves, hair->curve_keys.size() + num_keys);
 
   num_keys = 0;
   num_curves = 0;
@@ -648,7 +650,7 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
             (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)) {
           radius = 0.0f;
         }
-        mesh->add_curve_key(ickey_loc, radius);
+        hair->add_curve_key(ickey_loc, radius);
         if (attr_intercept)
           attr_intercept->add(time);
 
@@ -659,16 +661,16 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
         attr_random->add(hash_uint2_to_float(num_curves, 0));
       }
 
-      mesh->add_curve(num_keys, CData->psys_shader[sys]);
+      hair->add_curve(num_keys, CData->psys_shader[sys]);
       num_keys += num_curve_keys;
       num_curves++;
     }
   }
 
   /* check allocation */
-  if ((mesh->curve_keys.size() != num_keys) || (mesh->num_curves() != num_curves)) {
+  if ((hair->curve_keys.size() != num_keys) || (hair->num_curves() != num_curves)) {
     VLOG(1) << "Allocation failed, clearing data";
-    mesh->clear();
+    hair->clear();
   }
 }
 
@@ -712,24 +714,58 @@ static float4 LerpCurveSegmentMotionCV(ParticleCurveData *CData, int sys, int cu
   return lerp(mP, mP2, remainder);
 }
 
-static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int motion_step)
+static void export_hair_motion_validate_attribute(Hair *hair,
+                                                  int motion_step,
+                                                  int num_motion_keys,
+                                                  bool have_motion)
 {
-  VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name << ", motion step "
+  Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  const int num_keys = hair->curve_keys.size();
+
+  if (num_motion_keys != num_keys || !have_motion) {
+    /* No motion or hair "topology" changed, remove attributes again. */
+    if (num_motion_keys != num_keys) {
+      VLOG(1) << "Hair topology changed, removing attribute.";
+    }
+    else {
+      VLOG(1) << "No motion, removing attribute.";
+    }
+    hair->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
+  }
+  else if (motion_step > 0) {
+    VLOG(1) << "Filling in new motion vertex position for motion_step " << motion_step;
+
+    /* Motion, fill up previous steps that we might have skipped because
+     * they had no motion, but we need them anyway now. */
+    for (int step = 0; step < motion_step; step++) {
+      float4 *mP = attr_mP->data_float4() + step * num_keys;
+
+      for (int key = 0; key < num_keys; key++) {
+        mP[key] = float3_to_float4(hair->curve_keys[key]);
+        mP[key].w = hair->curve_radius[key];
+      }
+    }
+  }
+}
+
+static void ExportCurveSegmentsMotion(Hair *hair, ParticleCurveData *CData, int motion_step)
+{
+  VLOG(1) << "Exporting curve motion segments for hair " << hair->name << ", motion step "
           << motion_step;
 
   /* find attribute */
-  Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
   bool new_attribute = false;
 
   /* add new attribute if it doesn't exist already */
   if (!attr_mP) {
     VLOG(1) << "Creating new motion vertex position attribute";
-    attr_mP = mesh->curve_attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+    attr_mP = hair->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
     new_attribute = true;
   }
 
   /* export motion vectors for curve keys */
-  size_t numkeys = mesh->curve_keys.size();
+  size_t numkeys = hair->curve_keys.size();
   float4 *mP = attr_mP->data_float4() + motion_step * numkeys;
   bool have_motion = false;
   int i = 0;
@@ -740,24 +776,24 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
          curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys];
          curve++) {
       /* Curve lengths may not match! Curves can be clipped. */
-      int curve_key_end = (num_curves + 1 < (int)mesh->curve_first_key.size() ?
-                               mesh->curve_first_key[num_curves + 1] :
-                               (int)mesh->curve_keys.size());
-      const int num_center_curve_keys = curve_key_end - mesh->curve_first_key[num_curves];
+      int curve_key_end = (num_curves + 1 < (int)hair->curve_first_key.size() ?
+                               hair->curve_first_key[num_curves + 1] :
+                               (int)hair->curve_keys.size());
+      const int num_center_curve_keys = curve_key_end - hair->curve_first_key[num_curves];
       const int is_num_keys_different = CData->curve_keynum[curve] - num_center_curve_keys;
 
       if (!is_num_keys_different) {
         for (int curvekey = CData->curve_firstkey[curve];
              curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve];
              curvekey++) {
-          if (i < mesh->curve_keys.size()) {
+          if (i < hair->curve_keys.size()) {
             mP[i] = CurveSegmentMotionCV(CData, sys, curve, curvekey);
             if (!have_motion) {
               /* unlike mesh coordinates, these tend to be slightly different
                * between frames due to particle transforms into/out of object
                * space, so we use an epsilon to detect actual changes */
-              float4 curve_key = float3_to_float4(mesh->curve_keys[i]);
-              curve_key.w = mesh->curve_radius[i];
+              float4 curve_key = float3_to_float4(hair->curve_keys[i]);
+              curve_key.w = hair->curve_radius[i];
               if (len_squared(mP[i] - curve_key) > 1e-5f * 1e-5f)
                 have_motion = true;
             }
@@ -781,42 +817,17 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
     }
   }
 
-  /* in case of new attribute, we verify if there really was any motion */
+  /* In case of new attribute, we verify if there really was any motion. */
   if (new_attribute) {
-    if (i != numkeys || !have_motion) {
-      /* No motion or hair "topology" changed, remove attributes again. */
-      if (i != numkeys) {
-        VLOG(1) << "Hair topology changed, removing attribute.";
-      }
-      else {
-        VLOG(1) << "No motion, removing attribute.";
-      }
-      mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
-    }
-    else if (motion_step > 0) {
-      VLOG(1) << "Filling in new motion vertex position for motion_step " << motion_step;
-      /* motion, fill up previous steps that we might have skipped because
-       * they had no motion, but we need them anyway now */
-      for (int step = 0; step < motion_step; step++) {
-        float4 *mP = attr_mP->data_float4() + step * numkeys;
-
-        for (int key = 0; key < numkeys; key++) {
-          mP[key] = float3_to_float4(mesh->curve_keys[key]);
-          mP[key].w = mesh->curve_radius[key];
-        }
-      }
-    }
+    export_hair_motion_validate_attribute(hair, motion_step, i, have_motion);
   }
 }
 
-static void ExportCurveTriangleUV(ParticleCurveData *CData,
-                                  int vert_offset,
-                                  int resol,
-                                  float2 *uvdata)
+static void ExportCurveTriangleUV(ParticleCurveData *CData, int resol, float2 *uvdata)
 {
   if (uvdata == NULL)
     return;
-  int vertexindex = vert_offset;
+  int vertexindex = 0;
 
   for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
     for (int curve = CData->psys_firstcurve[sys];
@@ -844,15 +855,12 @@ static void ExportCurveTriangleUV(ParticleCurveData *CData,
   }
 }
 
-static void ExportCurveTriangleVcol(ParticleCurveData *CData,
-                                    int vert_offset,
-                                    int resol,
-                                    uchar4 *cdata)
+static void ExportCurveTriangleVcol(ParticleCurveData *CData, int resol, uchar4 *cdata)
 {
   if (cdata == NULL)
     return;
 
-  int vertexindex = vert_offset;
+  int vertexindex = 0;
 
   for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
     for (int curve = CData->psys_firstcurve[sys];
@@ -951,7 +959,7 @@ void BlenderSync::sync_curve_settings()
           if ((b_psys->settings().render_type() == BL::ParticleSettings::render_type_PATH) &&
               (b_psys->settings().type() == BL::ParticleSettings::type_HAIR)) {
             BL::ID key = BKE_object_is_modified(*b_ob) ? *b_ob : b_ob->data();
-            mesh_map.set_recalc(key);
+            geometry_map.set_recalc(key);
             object_map.set_recalc(*b_ob);
           }
         }
@@ -963,42 +971,51 @@ void BlenderSync::sync_curve_settings()
     curve_system_manager->tag_update(scene);
 }
 
-void BlenderSync::sync_curves(
-    Mesh *mesh, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step)
+bool BlenderSync::object_has_particle_hair(BL::Object b_ob)
 {
-  if (!motion) {
-    /* Clear stored curve data */
-    mesh->curve_keys.clear();
-    mesh->curve_radius.clear();
-    mesh->curve_first_key.clear();
-    mesh->curve_shader.clear();
-    mesh->curve_attributes.clear();
+  /* Test if the object has a particle modifier with hair. */
+  BL::Object::modifiers_iterator b_mod;
+  for (b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
+    if ((b_mod->type() == b_mod->type_PARTICLE_SYSTEM) &&
+        (preview ? b_mod->show_viewport() : b_mod->show_render())) {
+      BL::ParticleSystemModifier psmd((const PointerRNA)b_mod->ptr);
+      BL::ParticleSystem b_psys((const PointerRNA)psmd.particle_system().ptr);
+      BL::ParticleSettings b_part((const PointerRNA)b_psys.settings().ptr);
+
+      if ((b_part.render_type() == BL::ParticleSettings::render_type_PATH) &&
+          (b_part.type() == BL::ParticleSettings::type_HAIR)) {
+        return true;
+      }
+    }
   }
 
-  /* obtain general settings */
-  const bool use_curves = scene->curve_system_manager->use_curves;
+  return false;
+}
 
-  if (!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT && b_ob.mode() != b_ob.mode_EDIT)) {
-    if (!motion)
-      mesh->compute_bounds();
+/* Old particle hair. */
+void BlenderSync::sync_particle_hair(
+    Geometry *geom, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step)
+{
+  Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL;
+  Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL;
+
+  /* Nothing to sync while the object is in particle edit or edit mode. */
+  if (b_ob.mode() == b_ob.mode_PARTICLE_EDIT || b_ob.mode() == b_ob.mode_EDIT) {
     return;
   }
 
-  const int primitive = scene->curve_system_manager->primitive;
   const int triangle_method = scene->curve_system_manager->triangle_method;
   const int resolution = scene->curve_system_manager->resolution;
-  const size_t vert_num = mesh->verts.size();
-  const size_t tri_num = mesh->num_triangles();
   int used_res = 1;
 
   /* extract particle hair data - should be combined with connecting to mesh later*/
 
   ParticleCurveData CData;
 
-  ObtainCacheParticleData(mesh, &b_mesh, &b_ob, &CData, !preview);
+  ObtainCacheParticleData(geom, &b_mesh, &b_ob, &CData, !preview);
 
   /* add hair geometry to mesh */
-  if (primitive == CURVE_TRIANGLES) {
+  if (mesh) {
     if (triangle_method == CURVE_CAMERA_TRIANGLES) {
       /* obtain camera parameters */
       float3 RotCam;
@@ -1022,31 +1039,31 @@ void BlenderSync::sync_curves(
   }
   else {
     if (motion)
-      ExportCurveSegmentsMotion(mesh, &CData, motion_step);
+      ExportCurveSegmentsMotion(hair, &CData, motion_step);
     else
-      ExportCurveSegments(scene, mesh, &CData);
+      ExportCurveSegments(scene, hair, &CData);
   }
 
   /* generated coordinates from first key. we should ideally get this from
    * blender to handle deforming objects */
   if (!motion) {
-    if (mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
+    if (geom->need_attribute(scene, ATTR_STD_GENERATED)) {
       float3 loc, size;
       mesh_texture_space(b_mesh, loc, size);
 
-      if (primitive == CURVE_TRIANGLES) {
+      if (mesh) {
         Attribute *attr_generated = mesh->attributes.add(ATTR_STD_GENERATED);
         float3 *generated = attr_generated->data_float3();
 
-        for (size_t i = vert_num; i < mesh->verts.size(); i++)
+        for (size_t i = 0; i < mesh->verts.size(); i++)
           generated[i] = mesh->verts[i] * size - loc;
       }
       else {
-        Attribute *attr_generated = mesh->curve_attributes.add(ATTR_STD_GENERATED);
+        Attribute *attr_generated = hair->attributes.add(ATTR_STD_GENERATED);
         float3 *generated = attr_generated->data_float3();
 
-        for (size_t i = 0; i < mesh->num_curves(); i++) {
-          float3 co = mesh->curve_keys[mesh->get_curve(i).first_key];
+        for (size_t i = 0; i < hair->num_curves(); i++) {
+          float3 co = hair->curve_keys[hair->get_curve(i).first_key];
           generated[i] = co * size - loc;
         }
       }
@@ -1059,21 +1076,21 @@ void BlenderSync::sync_curves(
     int vcol_num = 0;
 
     for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l, vcol_num++) {
-      if (!mesh->need_attribute(scene, ustring(l->name().c_str())))
+      if (!geom->need_attribute(scene, ustring(l->name().c_str())))
         continue;
 
-      ObtainCacheParticleVcol(mesh, &b_mesh, &b_ob, &CData, !preview, vcol_num);
+      ObtainCacheParticleVcol(geom, &b_mesh, &b_ob, &CData, !preview, vcol_num);
 
-      if (primitive == CURVE_TRIANGLES) {
+      if (mesh) {
         Attribute *attr_vcol = mesh->attributes.add(
             ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
 
         uchar4 *cdata = attr_vcol->data_uchar4();
 
-        ExportCurveTriangleVcol(&CData, tri_num * 3, used_res, cdata);
+        ExportCurveTriangleVcol(&CData, used_res, cdata);
       }
       else {
-        Attribute *attr_vcol = mesh->curve_attributes.add(
+        Attribute *attr_vcol = hair->attributes.add(
             ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CURVE);
 
         float3 *fdata = attr_vcol->data_float3();
@@ -1101,12 +1118,12 @@ void BlenderSync::sync_curves(
       ustring name = ustring(l->name().c_str());
 
       /* UV map */
-      if (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) {
+      if (geom->need_attribute(scene, name) || geom->need_attribute(scene, std)) {
         Attribute *attr_uv;
 
-        ObtainCacheParticleUV(mesh, &b_mesh, &b_ob, &CData, !preview, uv_num);
+        ObtainCacheParticleUV(geom, &b_mesh, &b_ob, &CData, !preview, uv_num);
 
-        if (primitive == CURVE_TRIANGLES) {
+        if (mesh) {
           if (active_render)
             attr_uv = mesh->attributes.add(std, name);
           else
@@ -1114,13 +1131,13 @@ void BlenderSync::sync_curves(
 
           float2 *uv = attr_uv->data_float2();
 
-          ExportCurveTriangleUV(&CData, tri_num * 3, used_res, uv);
+          ExportCurveTriangleUV(&CData, used_res, uv);
         }
         else {
           if (active_render)
-            attr_uv = mesh->curve_attributes.add(std, name);
+            attr_uv = hair->attributes.add(std, name);
           else
-            attr_uv = mesh->curve_attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
+            attr_uv = hair->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
 
           float2 *uv = attr_uv->data_float2();
 
@@ -1135,8 +1152,292 @@ void BlenderSync::sync_curves(
       }
     }
   }
+}
+
+#ifdef WITH_NEW_OBJECT_TYPES
+static float4 hair_point_as_float4(BL::HairPoint b_point)
+{
+  float4 mP = float3_to_float4(get_float3(b_point.co()));
+  mP.w = b_point.radius();
+  return mP;
+}
+
+static float4 interpolate_hair_points(BL::Hair b_hair,
+                                      const int first_point_index,
+                                      const int num_points,
+                                      const float step)
+{
+  const float curve_t = step * (num_points - 1);
+  const int point_a = clamp((int)curve_t, 0, num_points - 1);
+  const int point_b = min(point_a + 1, num_points - 1);
+  const float t = curve_t - (float)point_a;
+  return lerp(hair_point_as_float4(b_hair.points[first_point_index + point_a]),
+              hair_point_as_float4(b_hair.points[first_point_index + point_b]),
+              t);
+}
+
+static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
+{
+  /* TODO: optimize so we can straight memcpy arrays from Blender? */
+
+  /* Add requested attributes. */
+  Attribute *attr_intercept = NULL;
+  Attribute *attr_random = NULL;
+
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) {
+    attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
+  }
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) {
+    attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
+  }
+
+  /* Reserve memory. */
+  const int num_keys = b_hair.points.length();
+  const int num_curves = b_hair.curves.length();
+
+  if (num_curves > 0) {
+    VLOG(1) << "Exporting curve segments for hair " << hair->name;
+  }
+
+  hair->reserve_curves(num_curves, num_keys);
+
+  /* Export curves and points. */
+  vector<float> points_length; /* Accumulated length at each key of the current curve. */
+
+  BL::Hair::curves_iterator b_curve_iter;
+  for (b_hair.curves.begin(b_curve_iter); b_curve_iter != b_hair.curves.end(); ++b_curve_iter) {
+    BL::HairCurve b_curve = *b_curve_iter;
+    const int first_point_index = b_curve.first_point_index();
+    const int num_points = b_curve.num_points();
+
+    float3 prev_co = make_float3(0.0f, 0.0f, 0.0f);
+    float length = 0.0f;
+    if (attr_intercept) {
+      points_length.clear();
+      points_length.reserve(num_points);
+    }
+
+    /* Position and radius. */
+    for (int i = 0; i < num_points; i++) {
+      BL::HairPoint b_point = b_hair.points[first_point_index + i];
+
+      const float3 co = get_float3(b_point.co());
+      const float radius = b_point.radius();
+      hair->add_curve_key(co, radius);
+
+      if (attr_intercept) {
+        if (i > 0) {
+          length += len(co - prev_co);
+        }
+        points_length.push_back(length); /* One entry per key, 0 for the first key. */
+        prev_co = co;
+      }
+    }
+
+    /* Normalized 0..1 attribute along curve, one value per key (first key is 0). */
+    if (attr_intercept) {
+      for (int i = 0; i < num_points; i++) {
+        attr_intercept->add((length == 0.0f) ? 0.0f : points_length[i] / length);
+      }
+    }
+
+    /* Random number per curve. */
+    if (attr_random != NULL) {
+      attr_random->add(hash_uint2_to_float(b_curve.index(), 0));
+    }
+
+    /* Curve. */
+    const int shader_index = 0;
+    hair->add_curve(first_point_index, shader_index);
+  }
+}
+
+static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_step)
+{
+  VLOG(1) << "Exporting curve motion segments for hair " << hair->name << ", motion step "
+          << motion_step;
+
+  /* Find or add attribute. */
+  Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  bool new_attribute = false;
+
+  if (!attr_mP) {
+    VLOG(1) << "Creating new motion vertex position attribute";
+    attr_mP = hair->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+    new_attribute = true;
+  }
+
+  /* Export motion keys, written sequentially into this motion step's slice of the attribute. */
+  const int num_keys = hair->curve_keys.size();
+  float4 *mP = attr_mP->data_float4() + motion_step * num_keys;
+  bool have_motion = false;
+  int num_motion_keys = 0;
+  int curve_index = 0;
+
+  BL::Hair::curves_iterator b_curve_iter;
+  for (b_hair.curves.begin(b_curve_iter); b_curve_iter != b_hair.curves.end(); ++b_curve_iter) {
+    BL::HairCurve b_curve = *b_curve_iter;
+    const int first_point_index = b_curve.first_point_index();
+    const int num_points = b_curve.num_points();
+
+    Hair::Curve curve = hair->get_curve(curve_index);
+    curve_index++;
+
+    if (num_points == curve.num_keys) {
+      /* Number of keys matches. */
+      for (int i = 0; i < num_points; i++) {
+        int point_index = first_point_index + i;
+
+        if (point_index < num_keys) {
+          const int key_index = num_motion_keys++; /* Flat key index, not the per-curve `i`. */
+          mP[key_index] = hair_point_as_float4(b_hair.points[point_index]);
+
+          if (!have_motion) {
+            /* TODO: use epsilon for comparison? Was needed for particles due to
+             * transform, but ideally should not happen anymore. */
+            float4 curve_key = float3_to_float4(hair->curve_keys[key_index]);
+            curve_key.w = hair->curve_radius[key_index];
+            have_motion = !(mP[key_index] == curve_key);
+          }
+        }
+      }
+    }
+    else {
+      /* Number of keys has changed. Generate an interpolated version
+       * to preserve motion blur. */
+      const float step_size = curve.num_keys > 1 ? 1.0f / (curve.num_keys - 1) : 0.0f;
+      for (int i = 0; i < curve.num_keys; i++) {
+        const float step = i * step_size;
+        mP[num_motion_keys] = interpolate_hair_points(b_hair, first_point_index, num_points, step);
+        num_motion_keys++;
+      }
+      have_motion = true;
+    }
+  }
+
+  /* In case of new attribute, we verify if there really was any motion. */
+  if (new_attribute) {
+    export_hair_motion_validate_attribute(hair, motion_step, num_motion_keys, have_motion);
+  }
+}
+#endif /* WITH_NEW_OBJECT_TYPES */
 
-  mesh->compute_bounds();
+/* Hair object. */
+void BlenderSync::sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step)
+{
+#ifdef WITH_NEW_OBJECT_TYPES
+  /* Convert Blender hair to Cycles curves. */
+  BL::Hair b_hair(b_ob.data());
+  if (motion) {
+    export_hair_curves_motion(hair, b_hair, motion_step);
+  }
+  else {
+    export_hair_curves(scene, hair, b_hair);
+  }
+#else
+  (void)hair;
+  (void)b_ob;
+  (void)motion;
+  (void)motion_step;
+#endif /* WITH_NEW_OBJECT_TYPES */
+}
+
+void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph,
+                            BL::Object b_ob,
+                            Geometry *geom,
+                            const vector<Shader *> &used_shaders)
+{
+  Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL;
+  Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL;
+
+  /* Compares curve_keys rather than strands in order to handle quick hair
+   * adjustments in dynamic BVH - other methods could probably do this better. */
+  array<float3> oldcurve_keys;
+  array<float> oldcurve_radius;
+  array<int> oldtriangles;
+  if (hair) {
+    oldcurve_keys.steal_data(hair->curve_keys);
+    oldcurve_radius.steal_data(hair->curve_radius);
+  }
+  else {
+    oldtriangles.steal_data(mesh->triangles);
+  }
+
+  geom->clear();
+  geom->used_shaders = used_shaders;
+
+  if (view_layer.use_hair && scene->curve_system_manager->use_curves) {
+#ifdef WITH_NEW_OBJECT_TYPES
+    if (b_ob.type() == BL::Object::type_HAIR) {
+      /* Hair object. */
+      sync_hair(hair, b_ob, false);
+      assert(mesh == NULL);
+    }
+    else
+#endif
+    {
+      /* Particle hair. */
+      bool need_undeformed = geom->need_attribute(scene, ATTR_STD_GENERATED);
+      BL::Mesh b_mesh = object_to_mesh(
+          b_data, b_ob, b_depsgraph, need_undeformed, Mesh::SUBDIVISION_NONE);
+
+      if (b_mesh) {
+        sync_particle_hair(geom, b_mesh, b_ob, false);
+        free_object_to_mesh(b_data, b_ob, b_mesh);
+      }
+    }
+  }
+
+  /* tag update */
+  const bool rebuild = (hair && ((oldcurve_keys != hair->curve_keys) ||
+                                 (oldcurve_radius != hair->curve_radius))) ||
+                       (mesh && (oldtriangles != mesh->triangles));
+
+  geom->tag_update(scene, rebuild);
+}
+
+void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,
+                                   BL::Object b_ob,
+                                   Geometry *geom,
+                                   int motion_step)
+{
+  Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL;
+  Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL;
+
+  /* Skip if nothing exported. */
+  if ((hair && hair->num_keys() == 0) || (mesh && mesh->verts.size() == 0)) {
+    return;
+  }
+
+  /* Export deformed coordinates. */
+  if (ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
+#ifdef WITH_NEW_OBJECT_TYPES
+    if (b_ob.type() == BL::Object::type_HAIR) {
+      /* Hair object. */
+      sync_hair(hair, b_ob, true, motion_step);
+      assert(mesh == NULL);
+      return;
+    }
+    else
+#endif
+    {
+      /* Particle hair. */
+      BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_depsgraph, false, Mesh::SUBDIVISION_NONE);
+      if (b_mesh) {
+        sync_particle_hair(geom, b_mesh, b_ob, true, motion_step);
+        free_object_to_mesh(b_data, b_ob, b_mesh);
+        return;
+      }
+    }
+  }
+
+  /* No deformation on this frame, copy coordinates if other frames did have it. */
+  if (hair) {
+    hair->copy_center_to_motion_step(motion_step);
+  }
+  else {
+    mesh->copy_center_to_motion_step(motion_step);
+  }
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 111fc8d5192..5140f190f36 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -17,8 +17,26 @@
 #include "blender/blender_device.h"
 #include "blender/blender_util.h"
 
+#include "util/util_foreach.h"
+
 CCL_NAMESPACE_BEGIN
 
+enum DenoiserType {
+  DENOISER_NONE = 0,
+  DENOISER_OPTIX = 1,
+
+  DENOISER_NUM
+};
+
+enum ComputeDevice {
+  COMPUTE_DEVICE_CPU = 0,
+  COMPUTE_DEVICE_CUDA = 1,
+  COMPUTE_DEVICE_OPENCL = 2,
+  COMPUTE_DEVICE_OPTIX = 3,
+
+  COMPUTE_DEVICE_NUM
+};
+
 int blender_device_threads(BL::Scene &b_scene)
 {
   BL::RenderSettings b_r = b_scene.render();
@@ -40,7 +58,7 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
     /* Find network device. */
     vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK);
     if (!devices.empty()) {
-      device = devices.front();
+      return devices.front();
     }
   }
   else if (get_enum(cscene, "device") == 1) {
@@ -57,14 +75,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
     }
 
     /* Test if we are using GPU devices. */
-    enum ComputeDevice {
-      COMPUTE_DEVICE_CPU = 0,
-      COMPUTE_DEVICE_CUDA = 1,
-      COMPUTE_DEVICE_OPENCL = 2,
-      COMPUTE_DEVICE_OPTIX = 3,
-      COMPUTE_DEVICE_NUM = 4,
-    };
-
     ComputeDevice compute_device = (ComputeDevice)get_enum(
         cpreferences, "compute_device_type", COMPUTE_DEVICE_NUM, COMPUTE_DEVICE_CPU);
 
@@ -106,6 +116,34 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
     }
   }
 
+  /* Ensure there is an OptiX device when using the OptiX denoiser. */
+  bool use_optix_denoising = get_enum(cscene, "preview_denoising", DENOISER_NUM, DENOISER_NONE) ==
+                                 DENOISER_OPTIX &&
+                             !background;
+  BL::Scene::view_layers_iterator b_view_layer;
+  for (b_scene.view_layers.begin(b_view_layer); b_view_layer != b_scene.view_layers.end();
+       ++b_view_layer) {
+    PointerRNA crl = RNA_pointer_get(&b_view_layer->ptr, "cycles");
+    if (get_boolean(crl, "use_optix_denoising")) {
+      use_optix_denoising = true;
+    }
+  }
+
+  if (use_optix_denoising && device.type != DEVICE_OPTIX) {
+    vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
+    if (!optix_devices.empty()) {
+      /* Convert to a special multi device with separate denoising devices. */
+      if (device.multi_devices.empty()) {
+        device.multi_devices.push_back(device);
+      }
+
+      /* Simply use the first available OptiX device. */
+      const DeviceInfo optix_device = optix_devices.front();
+      device.id += optix_device.id; /* Uniquely identify this special multi device. */
+      device.denoising_devices.push_back(optix_device);
+    }
+  }
+
   return device;
 }
 
diff --git a/intern/cycles/blender/blender_device.h b/intern/cycles/blender/blender_device.h
index fd6c045c966..8d2ecac7483 100644
--- a/intern/cycles/blender/blender_device.h
+++ b/intern/cycles/blender/blender_device.h
@@ -18,9 +18,9 @@
 #define __BLENDER_DEVICE_H__
 
 #include "MEM_guardedalloc.h"
-#include "RNA_types.h"
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
+#include "RNA_types.h"
 
 #include "device/device.h"
 
diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp
new file mode 100644
index 00000000000..7ca35cff961
--- /dev/null
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -0,0 +1,192 @@
+
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/curves.h"
+#include "render/hair.h"
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_foreach.h"
+
+CCL_NAMESPACE_BEGIN
+
+Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
+                                     BL::Object &b_ob,
+                                     BL::Object &b_ob_instance,
+                                     bool object_updated,
+                                     bool use_particle_hair)
+{
+  /* Test if we can instance or if the object is modified. */
+  BL::ID b_ob_data = b_ob.data();
+  BL::ID b_key_id = (BKE_object_is_modified(b_ob)) ? b_ob_instance : b_ob_data;
+  GeometryKey key(b_key_id.ptr.data, use_particle_hair);
+  BL::Material material_override = view_layer.material_override;
+  Shader *default_shader = (b_ob.type() == BL::Object::type_VOLUME) ? scene->default_volume :
+                                                                      scene->default_surface;
+#ifdef WITH_NEW_OBJECT_TYPES
+  Geometry::Type geom_type = ((b_ob.type() == BL::Object::type_HAIR || use_particle_hair) &&
+                              (scene->curve_system_manager->primitive != CURVE_TRIANGLES)) ?
+                                 Geometry::HAIR :
+                                 Geometry::MESH;
+#else
+  Geometry::Type geom_type = ((use_particle_hair) &&
+                              (scene->curve_system_manager->primitive != CURVE_TRIANGLES)) ?
+                                 Geometry::HAIR :
+                                 Geometry::MESH;
+#endif
+
+  /* Find shader indices. */
+  vector<Shader *> used_shaders;
+
+  BL::Object::material_slots_iterator slot;
+  for (b_ob.material_slots.begin(slot); slot != b_ob.material_slots.end(); ++slot) {
+    if (material_override) {
+      find_shader(material_override, used_shaders, default_shader);
+    }
+    else {
+      BL::ID b_material(slot->material());
+      find_shader(b_material, used_shaders, default_shader);
+    }
+  }
+
+  if (used_shaders.size() == 0) {
+    if (material_override)
+      find_shader(material_override, used_shaders, default_shader);
+    else
+      used_shaders.push_back(default_shader);
+  }
+
+  /* Test if we need to sync. */
+  Geometry *geom = geometry_map.find(key);
+  bool sync = true;
+  if (geom == NULL) {
+    /* Add new geometry if it did not exist yet. */
+    if (geom_type == Geometry::HAIR) {
+      geom = new Hair();
+    }
+    else {
+      geom = new Mesh();
+    }
+    geometry_map.add(key, geom);
+  }
+  else {
+    /* Test if we need to update existing geometry. */
+    sync = geometry_map.update(geom, b_key_id);
+  }
+
+  if (!sync) {
+    /* If transform was applied to geometry, need full update. */
+    if (object_updated && geom->transform_applied) {
+      ;
+    }
+    /* Test if shaders changed, these can be object level so geometry
+     * does not get tagged for recalc. */
+    else if (geom->used_shaders != used_shaders) {
+      ;
+    }
+    else {
+      /* Even if not tagged for recalc, we may need to sync anyway
+       * because the shader needs different geometry attributes. */
+      bool attribute_recalc = false;
+
+      foreach (Shader *shader, geom->used_shaders) {
+        if (shader->need_update_geometry) {
+          attribute_recalc = true;
+        }
+      }
+
+      if (!attribute_recalc) {
+        return geom;
+      }
+    }
+  }
+
+  /* Ensure we only sync instanced geometry once. */
+  if (geometry_synced.find(geom) != geometry_synced.end()) {
+    return geom;
+  }
+
+  progress.set_sync_status("Synchronizing object", b_ob.name());
+
+  geometry_synced.insert(geom);
+
+  geom->name = ustring(b_ob_data.name().c_str());
+
+#ifdef WITH_NEW_OBJECT_TYPES
+  if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) {
+#else
+  if (use_particle_hair) {
+#endif
+    sync_hair(b_depsgraph, b_ob, geom, used_shaders);
+  }
+  else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) {
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    sync_volume(b_ob, mesh, used_shaders);
+  }
+  else {
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    sync_mesh(b_depsgraph, b_ob, mesh, used_shaders);
+  }
+
+  return geom;
+}
+
+void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
+                                       BL::Object &b_ob,
+                                       Object *object,
+                                       float motion_time,
+                                       bool use_particle_hair)
+{
+  /* Ensure we only sync instanced geometry once. */
+  Geometry *geom = object->geometry;
+
+  if (geometry_motion_synced.find(geom) != geometry_motion_synced.end())
+    return;
+
+  geometry_motion_synced.insert(geom);
+
+  /* Ensure we only motion sync geometry that also had geometry synced, to avoid
+   * unnecessary work and to ensure that its attributes were clear. */
+  if (geometry_synced.find(geom) == geometry_synced.end())
+    return;
+
+  /* Find time matching motion step required by geometry. */
+  int motion_step = geom->motion_step(motion_time);
+  if (motion_step < 0) {
+    return;
+  }
+
+#ifdef WITH_NEW_OBJECT_TYPES
+  if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) {
+#else
+  if (use_particle_hair) {
+#endif
+    sync_hair_motion(b_depsgraph, b_ob, geom, motion_step);
+  }
+  else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) {
+    /* No volume motion blur support yet. */
+  }
+  else {
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    sync_mesh_motion(b_depsgraph, b_ob, mesh, motion_step);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_id_map.h b/intern/cycles/blender/blender_id_map.h
new file mode 100644
index 00000000000..3bc42e349ae
--- /dev/null
+++ b/intern/cycles/blender/blender_id_map.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BLENDER_ID_MAP_H__
+#define __BLENDER_ID_MAP_H__
+
+#include <string.h>
+
+#include "util/util_map.h"
+#include "util/util_set.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* ID Map
+ *
+ * Utility class to map between Blender datablocks and Cycles data structures,
+ * and keep track of recalc tags from the dependency graph. */
+
+template<typename K, typename T> class id_map {
+ public:
+  id_map(vector<T *> *scene_data_)
+  {
+    scene_data = scene_data_;
+  }
+
+  T *find(const BL::ID &id)
+  {
+    return find(id.ptr.owner_id);
+  }
+
+  T *find(const K &key)
+  {
+    /* Single lookup: return the mapped data, or NULL when the key is unknown. */
+    typename map<K, T *>::const_iterator it = b_map.find(key);
+    if (it != b_map.end()) {
+      return it->second;
+    }
+    return NULL;
+  }
+
+  void set_recalc(const BL::ID &id)
+  {
+    b_recalc.insert(id.ptr.data);
+  }
+
+  void set_recalc(void *id_ptr)
+  {
+    b_recalc.insert(id_ptr);
+  }
+
+  bool has_recalc()
+  {
+    return !(b_recalc.empty());
+  }
+
+  void pre_sync()
+  {
+    used_set.clear();
+  }
+
+  /* Add new data. */
+  void add(const K &key, T *data)
+  {
+    assert(find(key) == NULL);
+    scene_data->push_back(data);
+    b_map[key] = data;
+    used(data);
+  }
+
+  /* Update existing data. */
+  bool update(T *data, const BL::ID &id)
+  {
+    return update(data, id, id);
+  }
+  bool update(T *data, const BL::ID &id, const BL::ID &parent)
+  {
+    bool recalc = (b_recalc.find(id.ptr.data) != b_recalc.end());
+    if (parent.ptr.data && parent.ptr.data != id.ptr.data) {
+      recalc = recalc || (b_recalc.find(parent.ptr.data) != b_recalc.end());
+    }
+    used(data);
+    return recalc;
+  }
+
+  /* Combined add and update as needed. */
+  bool add_or_update(T **r_data, const BL::ID &id)
+  {
+    return add_or_update(r_data, id, id, id.ptr.owner_id);
+  }
+  bool add_or_update(T **r_data, const BL::ID &id, const K &key)
+  {
+    return add_or_update(r_data, id, id, key);
+  }
+  bool add_or_update(T **r_data, const BL::ID &id, const BL::ID &parent, const K &key)
+  {
+    T *data = find(key);
+    bool recalc;
+
+    if (!data) {
+      /* Add data if it didn't exist yet. */
+      data = new T();
+      add(key, data);
+      recalc = true;
+    }
+    else {
+      /* Check if an update is needed. */
+      recalc = update(data, id, parent);
+    }
+
+    *r_data = data;
+    return recalc;
+  }
+
+  /* Track which data is still in use during the current sync. */
+
+  bool is_used(const K &key)
+  {
+    T *data = find(key);
+    return (data) ? used_set.find(data) != used_set.end() : false;
+  }
+
+  void used(T *data)
+  {
+    /* tag data as still in use */
+    used_set.insert(data);
+  }
+
+  void set_default(T *data)
+  {
+    b_map[NULL] = data;
+  }
+
+  bool post_sync(bool do_delete = true)
+  {
+    /* remove unused data */
+    vector<T *> new_scene_data;
+    typename vector<T *>::iterator it;
+    bool deleted = false;
+
+    for (it = scene_data->begin(); it != scene_data->end(); it++) {
+      T *data = *it;
+
+      if (do_delete && used_set.find(data) == used_set.end()) {
+        delete data;
+        deleted = true;
+      }
+      else
+        new_scene_data.push_back(data);
+    }
+
+    *scene_data = new_scene_data;
+
+    /* update mapping */
+    map<K, T *> new_map;
+    typedef pair<const K, T *> TMapPair;
+    typename map<K, T *>::iterator jt;
+
+    for (jt = b_map.begin(); jt != b_map.end(); jt++) {
+      TMapPair &pair = *jt;
+
+      if (used_set.find(pair.second) != used_set.end())
+        new_map[pair.first] = pair.second;
+    }
+
+    used_set.clear();
+    b_recalc.clear();
+    b_map = new_map;
+
+    return deleted;
+  }
+
+  const map<K, T *> &key_to_scene_data()
+  {
+    return b_map;
+  }
+
+ protected:
+  vector<T *> *scene_data;
+  map<K, T *> b_map;
+  set<T *> used_set;
+  set<void *> b_recalc;
+};
+
+/* Object Key
+ *
+ * To uniquely identify instances, we use the parent, object and persistent instance ID.
+ * We also export a separate object for a mesh and its particle hair. */
+
+enum { OBJECT_PERSISTENT_ID_SIZE = 16 };
+
+struct ObjectKey {
+  void *parent;
+  int id[OBJECT_PERSISTENT_ID_SIZE];
+  void *ob;
+  bool use_particle_hair;
+
+  ObjectKey(void *parent_, int id_[OBJECT_PERSISTENT_ID_SIZE], void *ob_, bool use_particle_hair_)
+      : parent(parent_), ob(ob_), use_particle_hair(use_particle_hair_)
+  {
+    if (id_)
+      memcpy(id, id_, sizeof(id));
+    else
+      memset(id, 0, sizeof(id));
+  }
+
+  bool operator<(const ObjectKey &k) const
+  {
+    if (ob < k.ob) {
+      return true;
+    }
+    else if (ob == k.ob) {
+      if (parent < k.parent) {
+        return true;
+      }
+      else if (parent == k.parent) {
+        if (use_particle_hair < k.use_particle_hair) {
+          return true;
+        }
+        else if (use_particle_hair == k.use_particle_hair) {
+          return memcmp(id, k.id, sizeof(id)) < 0;
+        }
+      }
+    }
+
+    return false;
+  }
+};
+
+/* Geometry Key
+ *
+ * We export separate geometry for a mesh and its particle hair, so key needs to
+ * distinguish between them. */
+
+struct GeometryKey {
+  void *id;
+  bool use_particle_hair;
+
+  GeometryKey(void *id, bool use_particle_hair) : id(id), use_particle_hair(use_particle_hair)
+  {
+  }
+
+  bool operator<(const GeometryKey &k) const
+  {
+    if (id < k.id) {
+      return true;
+    }
+    else if (id == k.id) {
+      if (use_particle_hair < k.use_particle_hair) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+/* Particle System Key */
+
+struct ParticleSystemKey {
+  void *ob;
+  int id[OBJECT_PERSISTENT_ID_SIZE];
+
+  ParticleSystemKey(void *ob_, int id_[OBJECT_PERSISTENT_ID_SIZE]) : ob(ob_)
+  {
+    if (id_)
+      memcpy(id, id_, sizeof(id));
+    else
+      memset(id, 0, sizeof(id));
+  }
+
+  bool operator<(const ParticleSystemKey &k) const
+  {
+    /* first id is particle index, we don't compare that */
+    if (ob < k.ob)
+      return true;
+    else if (ob == k.ob)
+      return memcmp(id + 1, k.id + 1, sizeof(int) * (OBJECT_PERSISTENT_ID_SIZE - 1)) < 0;
+
+    return false;
+  }
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BLENDER_ID_MAP_H__ */
diff --git a/intern/cycles/blender/blender_image.cpp b/intern/cycles/blender/blender_image.cpp
new file mode 100644
index 00000000000..459dc1779fb
--- /dev/null
+++ b/intern/cycles/blender/blender_image.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MEM_guardedalloc.h"
+
+#include "blender/blender_image.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Packed Images */
+
+BlenderImageLoader::BlenderImageLoader(BL::Image b_image, int frame)
+    : b_image(b_image), frame(frame), free_cache(!b_image.has_data())
+{
+}
+
+bool BlenderImageLoader::load_metadata(ImageMetaData &metadata)
+{
+  metadata.width = b_image.size()[0];
+  metadata.height = b_image.size()[1];
+  metadata.depth = 1;
+  metadata.channels = b_image.channels();
+
+  if (b_image.is_float()) {
+    if (metadata.channels == 1) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT;
+    }
+    else if (metadata.channels == 4) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+    }
+    else {
+      return false;
+    }
+
+    /* Float images are already converted on the Blender side,
+     * no need to do anything in Cycles. */
+    metadata.colorspace = u_colorspace_raw;
+  }
+  else {
+    if (metadata.channels == 1) {
+      metadata.type = IMAGE_DATA_TYPE_BYTE;
+    }
+    else if (metadata.channels == 4) {
+      metadata.type = IMAGE_DATA_TYPE_BYTE4;
+    }
+    else {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool BlenderImageLoader::load_pixels(const ImageMetaData &metadata,
+                                     void *pixels,
+                                     const size_t pixels_size,
+                                     const bool associate_alpha)
+{
+  const size_t num_pixels = ((size_t)metadata.width) * metadata.height;
+  const int channels = metadata.channels;
+  const int tile = 0; /* TODO(lukas): Support tiles here? */
+
+  if (b_image.is_float()) {
+    /* Float image data; pixels_size is the number of floats in the destination. */
+    float *image_pixels;
+    image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile);
+
+    if (image_pixels && num_pixels * channels == pixels_size) {
+      memcpy(pixels, image_pixels, pixels_size * sizeof(float));
+    }
+    else { /* Missing or mismatched data: fill with zero, or magenta for multi-channel. */
+      if (channels == 1) {
+        memset(pixels, 0, pixels_size * sizeof(float));
+      }
+      else {
+        const size_t num_pixels_safe = pixels_size / channels;
+        float *fp = (float *)pixels;
+        for (size_t i = 0; i < num_pixels_safe; i++, fp += channels) {
+          fp[0] = 1.0f;
+          fp[1] = 0.0f;
+          fp[2] = 1.0f;
+          if (channels == 4) {
+            fp[3] = 1.0f;
+          }
+        }
+      }
+    }
+
+    if (image_pixels) {
+      MEM_freeN(image_pixels);
+    }
+  }
+  else {
+    unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile);
+
+    if (image_pixels && num_pixels * channels == pixels_size) {
+      memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
+    }
+    else { /* Missing or mismatched data: fill with zero, or magenta for multi-channel. */
+      if (channels == 1) {
+        memset(pixels, 0, pixels_size * sizeof(unsigned char));
+      }
+      else {
+        const size_t num_pixels_safe = pixels_size / channels;
+        unsigned char *cp = (unsigned char *)pixels;
+        for (size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
+          cp[0] = 255;
+          cp[1] = 0;
+          cp[2] = 255;
+          if (channels == 4) {
+            cp[3] = 255;
+          }
+        }
+      }
+    }
+
+    if (image_pixels) {
+      MEM_freeN(image_pixels);
+    }
+
+    if (associate_alpha && channels == 4) {
+      /* Premultiply (needs an alpha channel), byte images are always straight for Blender. */
+      unsigned char *cp = (unsigned char *)pixels;
+      for (size_t i = 0, n = pixels_size / channels; i < n; i++, cp += channels) {
+        cp[0] = (cp[0] * cp[3]) / 255; /* Divide by 255 so opaque pixels stay unchanged. */
+        cp[1] = (cp[1] * cp[3]) / 255;
+        cp[2] = (cp[2] * cp[3]) / 255;
+      }
+    }
+  }
+
+  /* Free image buffers to save memory during render. */
+  if (free_cache) {
+    b_image.buffers_free();
+  }
+
+  return true;
+}
+
+string BlenderImageLoader::name() const
+{
+  return BL::Image(b_image).name();
+}
+
+bool BlenderImageLoader::equals(const ImageLoader &other) const
+{
+  const BlenderImageLoader &other_loader = (const BlenderImageLoader &)other;
+  return b_image == other_loader.b_image && frame == other_loader.frame;
+}
+
+/* Point Density */
+
+BlenderPointDensityLoader::BlenderPointDensityLoader(BL::Depsgraph b_depsgraph,
+                                                     BL::ShaderNodeTexPointDensity b_node)
+    : b_depsgraph(b_depsgraph), b_node(b_node)
+{
+}
+
+bool BlenderPointDensityLoader::load_metadata(ImageMetaData &metadata)
+{
+  metadata.channels = 4;
+  metadata.width = b_node.resolution();
+  metadata.height = metadata.width;
+  metadata.depth = metadata.width;
+  metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+  return true;
+}
+
+bool BlenderPointDensityLoader::load_pixels(const ImageMetaData &,
+                                            void *pixels,
+                                            const size_t,
+                                            const bool)
+{
+  int length;
+  b_node.calc_point_density(b_depsgraph, &length, (float **)&pixels);
+  return true;
+}
+
+void BlenderSession::builtin_images_load()
+{
+  /* Force builtin images to be loaded along with Blender data sync. This
+   * is needed because we may be reading from depsgraph evaluated data which
+   * can be freed by Blender before Cycles reads it.
+   *
+   * TODO: the assumption that no further access to builtin image data will
+   * happen is really weak, and likely to break in the future. We should find
+   * a better solution to hand over the data directly to the image manager
+   * instead of through callbacks whose timing is difficult to control. */
+  ImageManager *manager = session->scene->image_manager;
+  Device *device = session->device;
+  manager->device_load_builtin(device, session->scene, session->progress);
+}
+
+string BlenderPointDensityLoader::name() const
+{
+  return BL::ShaderNodeTexPointDensity(b_node).name();
+}
+
+bool BlenderPointDensityLoader::equals(const ImageLoader &other) const
+{
+  const BlenderPointDensityLoader &other_loader = (const BlenderPointDensityLoader &)other;
+  return b_node == other_loader.b_node && b_depsgraph == other_loader.b_depsgraph;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_image.h b/intern/cycles/blender/blender_image.h
new file mode 100644
index 00000000000..b58a159a6ba
--- /dev/null
+++ b/intern/cycles/blender/blender_image.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BLENDER_IMAGE_H__
+#define __BLENDER_IMAGE_H__
+
+#include "RNA_blender_cpp.h"
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BlenderImageLoader : public ImageLoader { /* ImageLoader holding a BL::Image and a frame. */
+ public:
+  BlenderImageLoader(BL::Image b_image, int frame);
+
+  bool load_metadata(ImageMetaData &metadata) override;
+  bool load_pixels(const ImageMetaData &metadata,
+                   void *pixels,
+                   const size_t pixels_size,
+                   const bool associate_alpha) override;
+  string name() const override;
+  bool equals(const ImageLoader &other) const override;
+
+  BL::Image b_image; /* Blender image handle; same name as the constructor argument. */
+  int frame;         /* Same name as the constructor argument; presumably the frame to load. */
+  bool free_cache;   /* NOTE(review): not a constructor argument; see the .cpp for its use. */
+};
+
+class BlenderPointDensityLoader : public ImageLoader { /* ImageLoader for a point density node. */
+ public:
+  BlenderPointDensityLoader(BL::Depsgraph depsgraph, BL::ShaderNodeTexPointDensity b_node);
+
+  bool load_metadata(ImageMetaData &metadata) override;
+  bool load_pixels(const ImageMetaData &metadata,
+                   void *pixels,
+                   const size_t pixels_size,
+                   const bool associate_alpha) override;
+  string name() const override;
+  bool equals(const ImageLoader &other) const override;
+
+  BL::Depsgraph b_depsgraph;            /* Handed to calc_point_density() in load_pixels(). */
+  BL::ShaderNodeTexPointDensity b_node; /* Node providing resolution, name and density values. */
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BLENDER_IMAGE_H__ */
diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp
new file mode 100644
index 00000000000..6f95821e31e
--- /dev/null
+++ b/intern/cycles/blender/blender_light.cpp
@@ -0,0 +1,212 @@
+
+
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/light.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_hash.h"
+
+CCL_NAMESPACE_BEGIN
+
+void BlenderSync::sync_light(BL::Object &b_parent,
+                             int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
+                             BL::Object &b_ob,
+                             BL::Object &b_ob_instance,
+                             int random_id,
+                             Transform &tfm,
+                             bool *use_portal) /* Set to true for portals; never cleared here. */
+{
+  /* Export one Blender light object to the Cycles Light that light_map returns for its key. */
+  Light *light;
+  ObjectKey key(b_parent, persistent_id, b_ob_instance, false);
+  BL::Light b_light(b_ob.data());
+
+  /* Return early only when neither light_map nor shader_map asks for an update. */
+  if (!light_map.add_or_update(&light, b_ob, b_parent, key)) {
+    Shader *shader;
+    if (!shader_map.add_or_update(&shader, b_light)) {
+      if (light->is_portal) /* Unchanged portals must still be reported to the caller. */
+        *use_portal = true;
+      return;
+    }
+  }
+
+  /* Light type and the parameters specific to it. */
+  switch (b_light.type()) {
+    case BL::Light::type_POINT: {
+      BL::PointLight b_point_light(b_light);
+      light->size = b_point_light.shadow_soft_size();
+      light->type = LIGHT_POINT;
+      break;
+    }
+    case BL::Light::type_SPOT: {
+      BL::SpotLight b_spot_light(b_light);
+      light->size = b_spot_light.shadow_soft_size();
+      light->type = LIGHT_SPOT;
+      light->spot_angle = b_spot_light.spot_size();
+      light->spot_smooth = b_spot_light.spot_blend();
+      break;
+    }
+    /* Hemi lights were removed in 2.8; the old mapping is kept below for reference. */
+    // case BL::Light::type_HEMI: {
+    //  light->type = LIGHT_DISTANT;
+    //  light->size = 0.0f;
+    //  break;
+    // }
+    case BL::Light::type_SUN: {
+      BL::SunLight b_sun_light(b_light);
+      light->angle = b_sun_light.angle();
+      light->type = LIGHT_DISTANT;
+      break;
+    }
+    case BL::Light::type_AREA: {
+      BL::AreaLight b_area_light(b_light);
+      light->size = 1.0f;
+      light->axisu = transform_get_column(&tfm, 0);
+      light->axisv = transform_get_column(&tfm, 1);
+      light->sizeu = b_area_light.size();
+      switch (b_area_light.shape()) { /* Shape decides sizev and whether the light is round. */
+        case BL::AreaLight::shape_SQUARE:
+          light->sizev = light->sizeu;
+          light->round = false;
+          break;
+        case BL::AreaLight::shape_RECTANGLE:
+          light->sizev = b_area_light.size_y();
+          light->round = false;
+          break;
+        case BL::AreaLight::shape_DISK:
+          light->sizev = light->sizeu;
+          light->round = true;
+          break;
+        case BL::AreaLight::shape_ELLIPSE:
+          light->sizev = b_area_light.size_y();
+          light->round = true;
+          break;
+      }
+      light->type = LIGHT_AREA;
+      break;
+    }
+  }
+
+  /* Strength: light color scaled by its energy. */
+  light->strength = get_float3(b_light.color());
+  light->strength *= BL::PointLight(b_light).energy(); /* Wrapper used for every type. */
+
+  /* Location is transform column 3; direction is the negated column 2. */
+  light->co = transform_get_column(&tfm, 3);
+  light->dir = -transform_get_column(&tfm, 2);
+  light->tfm = tfm;
+
+  /* Shader: first entry find_shader() collects; default_light is presumably the fallback. */
+  vector<Shader *> used_shaders;
+  find_shader(b_light, used_shaders, scene->default_light);
+  light->shader = used_shaders[0];
+
+  /* Shadow, MIS and sampling settings read from the "cycles" RNA property groups. */
+  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+  PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
+  light->cast_shadow = get_boolean(clight, "cast_shadow");
+  light->use_mis = get_boolean(clight, "use_multiple_importance_sampling");
+
+  int samples = get_int(clight, "samples");
+  if (get_boolean(cscene, "use_square_samples")) /* Scene option: square the sample count. */
+    light->samples = samples * samples;
+  else
+    light->samples = samples;
+
+  light->max_bounces = get_int(clight, "max_bounces");
+
+  if (b_ob != b_ob_instance) { /* Distinct instance object: use the caller-provided id. */
+    light->random_id = random_id;
+  }
+  else { /* Otherwise derive the id from a hash of the object name. */
+    light->random_id = hash_uint2(hash_string(b_ob.name().c_str()), 0);
+  }
+
+  if (light->type == LIGHT_AREA) /* Only area lights can be portals. */
+    light->is_portal = get_boolean(clight, "is_portal");
+  else
+    light->is_portal = false;
+
+  if (light->is_portal)
+    *use_portal = true;
+
+  /* Per ray-type visibility flags taken from object_ray_visibility(). */
+  uint visibility = object_ray_visibility(b_ob);
+  light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
+  light->use_glossy = (visibility & PATH_RAY_GLOSSY) != 0;
+  light->use_transmission = (visibility & PATH_RAY_TRANSMIT) != 0;
+  light->use_scatter = (visibility & PATH_RAY_VOLUME_SCATTER) != 0;
+
+  /* Tag the light as updated for the scene. */
+  light->tag_update(scene);
+}
+
+void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
+{
+  BL::World b_world = b_scene.world();
+
+  if (b_world) { /* Without a world only the bookkeeping at the end runs. */
+    PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+    PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
+
+    enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
+    int sampling_method = get_enum(cworld, "sampling_method", SAMPLING_NUM, SAMPLING_AUTOMATIC);
+    bool sample_as_light = (sampling_method != SAMPLING_NONE);
+
+    if (sample_as_light || use_portal) { /* Background light wanted for sampling or portals. */
+      /* Resync when light_map, world_recalc or a changed world pointer asks for it. */
+      Light *light;
+      ObjectKey key(b_world, 0, b_world, false);
+
+      if (light_map.add_or_update(&light, b_world, b_world, key) || world_recalc ||
+          b_world.ptr.data != world_map) {
+        light->type = LIGHT_BACKGROUND;
+        if (sampling_method == SAMPLING_MANUAL) { /* Manual: resolution from world settings. */
+          light->map_resolution = get_int(cworld, "sample_map_resolution");
+        }
+        else {
+          light->map_resolution = 0; /* NOTE(review): 0 presumably means automatic - confirm. */
+        }
+        light->shader = scene->default_background;
+        light->use_mis = sample_as_light; /* False when the light exists only for portals. */
+        light->max_bounces = get_int(cworld, "max_bounces");
+
+        /* Force enable light again when world is resynced. */
+        light->is_enabled = true;
+
+        int samples = get_int(cworld, "samples");
+        if (get_boolean(cscene, "use_square_samples")) /* Scene option: square the count. */
+          light->samples = samples * samples;
+        else
+          light->samples = samples;
+
+        light->tag_update(scene);
+        light_map.set_recalc(b_world);
+      }
+    }
+  }
+
+  world_map = b_world.ptr.data; /* Remembered for the pointer comparison on the next sync. */
+  world_recalc = false;
+  viewport_parameters = BlenderViewportParameters(b_v3d);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index b18f9a37948..a6f380a9ae7 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -14,25 +14,25 @@
  * limitations under the License.
  */
 
+#include "render/camera.h"
 #include "render/colorspace.h"
 #include "render/mesh.h"
 #include "render/object.h"
 #include "render/scene.h"
-#include "render/camera.h"
 
-#include "blender/blender_sync.h"
 #include "blender/blender_session.h"
+#include "blender/blender_sync.h"
 #include "blender/blender_util.h"
 
 #include "subd/subd_patch.h"
 #include "subd/subd_split.h"
 
 #include "util/util_algorithm.h"
+#include "util/util_disjoint_set.h"
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
 #include "util/util_logging.h"
 #include "util/util_math.h"
-#include "util/util_disjoint_set.h"
 
 #include "mikktspace.h"
 
@@ -278,54 +278,6 @@ static void mikk_compute_tangents(
   genTangSpaceDefault(&context);
 }
 
-/* Create Volume Attribute */
-
-static void create_mesh_volume_attribute(
-    BL::Object &b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std, float frame)
-{
-  BL::FluidDomainSettings b_domain = object_fluid_domain_find(b_ob);
-
-  if (!b_domain)
-    return;
-
-  mesh->volume_isovalue = b_domain.clipping();
-
-  Attribute *attr = mesh->attributes.add(std);
-  VoxelAttribute *volume_data = attr->data_voxel();
-  ImageMetaData metadata;
-  bool animated = false;
-
-  volume_data->manager = image_manager;
-  volume_data->slot = image_manager->add_image(Attribute::standard_name(std),
-                                               b_ob.ptr.data,
-                                               animated,
-                                               frame,
-                                               INTERPOLATION_LINEAR,
-                                               EXTENSION_CLIP,
-                                               IMAGE_ALPHA_AUTO,
-                                               u_colorspace_raw,
-                                               metadata);
-}
-
-static void create_mesh_volume_attributes(Scene *scene, BL::Object &b_ob, Mesh *mesh, float frame)
-{
-  /* for smoke volume rendering */
-  if (mesh->need_attribute(scene, ATTR_STD_VOLUME_DENSITY))
-    create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_DENSITY, frame);
-  if (mesh->need_attribute(scene, ATTR_STD_VOLUME_COLOR))
-    create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_COLOR, frame);
-  if (mesh->need_attribute(scene, ATTR_STD_VOLUME_FLAME))
-    create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME, frame);
-  if (mesh->need_attribute(scene, ATTR_STD_VOLUME_HEAT))
-    create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT, frame);
-  if (mesh->need_attribute(scene, ATTR_STD_VOLUME_TEMPERATURE))
-    create_mesh_volume_attribute(
-        b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_TEMPERATURE, frame);
-  if (mesh->need_attribute(scene, ATTR_STD_VOLUME_VELOCITY))
-    create_mesh_volume_attribute(
-        b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame);
-}
-
 /* Create vertex color attributes. */
 static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivision)
 {
@@ -333,14 +285,27 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh,
     BL::Mesh::vertex_colors_iterator l;
 
     for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
-      if (!mesh->need_attribute(scene, ustring(l->name().c_str())))
+      const bool active_render = l->active_render();
+      AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
+      ustring vcol_name = ustring(l->name().c_str());
+
+      const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
+                             mesh->need_attribute(scene, vcol_std);
+
+      if (!need_vcol) {
         continue;
+      }
 
-      Attribute *attr = mesh->subd_attributes.add(
-          ustring(l->name().c_str()), TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+      Attribute *vcol_attr = NULL;
+      if (active_render) {
+        vcol_attr = mesh->subd_attributes.add(vcol_std, vcol_name);
+      }
+      else {
+        vcol_attr = mesh->subd_attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+      }
 
       BL::Mesh::polygons_iterator p;
-      uchar4 *cdata = attr->data_uchar4();
+      uchar4 *cdata = vcol_attr->data_uchar4();
 
       for (b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
         int n = p->loop_total();
@@ -355,14 +320,27 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh,
   else {
     BL::Mesh::vertex_colors_iterator l;
     for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) {
-      if (!mesh->need_attribute(scene, ustring(l->name().c_str())))
+      const bool active_render = l->active_render();
+      AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
+      ustring vcol_name = ustring(l->name().c_str());
+
+      const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
+                             mesh->need_attribute(scene, vcol_std);
+
+      if (!need_vcol) {
         continue;
+      }
 
-      Attribute *attr = mesh->attributes.add(
-          ustring(l->name().c_str()), TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+      Attribute *vcol_attr = NULL;
+      if (active_render) {
+        vcol_attr = mesh->attributes.add(vcol_std, vcol_name);
+      }
+      else {
+        vcol_attr = mesh->attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+      }
 
       BL::Mesh::loop_triangles_iterator t;
-      uchar4 *cdata = attr->data_uchar4();
+      uchar4 *cdata = vcol_attr->data_uchar4();
 
       for (b_mesh.loop_triangles.begin(t); t != b_mesh.loop_triangles.end(); ++t) {
         int3 li = get_int3(t->loops());
@@ -859,9 +837,9 @@ static void create_mesh(Scene *scene,
     attr_create_uv_map(scene, mesh, b_mesh);
   }
 
-  /* for volume objects, create a matrix to transform from object space to
+  /* For volume objects, create a matrix to transform from object space to
    * mesh texture space. this does not work with deformations but that can
-   * probably only be done well with a volume grid mapping of coordinates */
+   * probably only be done well with a volume grid mapping of coordinates. */
   if (mesh->need_attribute(scene, ATTR_STD_GENERATED_TRANSFORM)) {
     Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED_TRANSFORM);
     Transform *tfm = attr->data_transform();
@@ -930,7 +908,7 @@ static void sync_mesh_fluid_motion(BL::Object &b_ob, Scene *scene, Mesh *mesh)
   if (scene->need_motion() == Scene::MOTION_NONE)
     return;
 
-  BL::FluidDomainSettings b_fluid_domain = object_fluid_domain_find(b_ob);
+  BL::FluidDomainSettings b_fluid_domain = object_fluid_liquid_domain_find(b_ob);
 
   if (!b_fluid_domain)
     return;
@@ -963,82 +941,11 @@ static void sync_mesh_fluid_motion(BL::Object &b_ob, Scene *scene, Mesh *mesh)
   }
 }
 
-Mesh *BlenderSync::sync_mesh(BL::Depsgraph &b_depsgraph,
-                             BL::Object &b_ob,
-                             BL::Object &b_ob_instance,
-                             bool object_updated,
-                             bool show_self,
-                             bool show_particles)
+void BlenderSync::sync_mesh(BL::Depsgraph b_depsgraph,
+                            BL::Object b_ob,
+                            Mesh *mesh,
+                            const vector<Shader *> &used_shaders)
 {
-  /* test if we can instance or if the object is modified */
-  BL::ID b_ob_data = b_ob.data();
-  BL::ID key = (BKE_object_is_modified(b_ob)) ? b_ob_instance : b_ob_data;
-  BL::Material material_override = view_layer.material_override;
-
-  /* find shader indices */
-  vector<Shader *> used_shaders;
-
-  BL::Object::material_slots_iterator slot;
-  for (b_ob.material_slots.begin(slot); slot != b_ob.material_slots.end(); ++slot) {
-    if (material_override) {
-      find_shader(material_override, used_shaders, scene->default_surface);
-    }
-    else {
-      BL::ID b_material(slot->material());
-      find_shader(b_material, used_shaders, scene->default_surface);
-    }
-  }
-
-  if (used_shaders.size() == 0) {
-    if (material_override)
-      find_shader(material_override, used_shaders, scene->default_surface);
-    else
-      used_shaders.push_back(scene->default_surface);
-  }
-
-  /* test if we need to sync */
-  int requested_geometry_flags = Mesh::GEOMETRY_NONE;
-  if (view_layer.use_surfaces) {
-    requested_geometry_flags |= Mesh::GEOMETRY_TRIANGLES;
-  }
-  if (view_layer.use_hair) {
-    requested_geometry_flags |= Mesh::GEOMETRY_CURVES;
-  }
-  Mesh *mesh;
-
-  if (!mesh_map.sync(&mesh, key)) {
-    /* if transform was applied to mesh, need full update */
-    if (object_updated && mesh->transform_applied)
-      ;
-    /* test if shaders changed, these can be object level so mesh
-     * does not get tagged for recalc */
-    else if (mesh->used_shaders != used_shaders)
-      ;
-    else if (requested_geometry_flags != mesh->geometry_flags)
-      ;
-    else {
-      /* even if not tagged for recalc, we may need to sync anyway
-       * because the shader needs different mesh attributes */
-      bool attribute_recalc = false;
-
-      foreach (Shader *shader, mesh->used_shaders)
-        if (shader->need_update_mesh)
-          attribute_recalc = true;
-
-      if (!attribute_recalc)
-        return mesh;
-    }
-  }
-
-  /* ensure we only sync instanced meshes once */
-  if (mesh_synced.find(mesh) != mesh_synced.end())
-    return mesh;
-
-  progress.set_sync_status("Synchronizing object", b_ob.name());
-
-  mesh_synced.insert(mesh);
-
-  /* create derived mesh */
   array<int> oldtriangles;
   array<Mesh::SubdFace> oldsubd_faces;
   array<int> oldsubd_face_corners;
@@ -1046,150 +953,73 @@ Mesh *BlenderSync::sync_mesh(BL::Depsgraph &b_depsgraph,
   oldsubd_faces.steal_data(mesh->subd_faces);
   oldsubd_face_corners.steal_data(mesh->subd_face_corners);
 
-  /* compares curve_keys rather than strands in order to handle quick hair
-   * adjustments in dynamic BVH - other methods could probably do this better*/
-  array<float3> oldcurve_keys;
-  array<float> oldcurve_radius;
-  oldcurve_keys.steal_data(mesh->curve_keys);
-  oldcurve_radius.steal_data(mesh->curve_radius);
-
-  /* ensure bvh rebuild (instead of refit) if has_voxel_attributes() changed */
-  bool oldhas_voxel_attributes = mesh->has_voxel_attributes();
-
   mesh->clear();
   mesh->used_shaders = used_shaders;
-  mesh->name = ustring(b_ob_data.name().c_str());
 
-  if (requested_geometry_flags != Mesh::GEOMETRY_NONE) {
+  mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
+
+  if (view_layer.use_surfaces) {
     /* Adaptive subdivision setup. Not for baking since that requires
      * exact mapping to the Blender mesh. */
-    if (scene->bake_manager->get_baking()) {
-      mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
-    }
-    else {
+    if (!scene->bake_manager->get_baking()) {
       mesh->subdivision_type = object_subdivision_type(b_ob, preview, experimental);
     }
 
     /* For some reason, meshes do not need this... */
     bool need_undeformed = mesh->need_attribute(scene, ATTR_STD_GENERATED);
-
     BL::Mesh b_mesh = object_to_mesh(
         b_data, b_ob, b_depsgraph, need_undeformed, mesh->subdivision_type);
 
     if (b_mesh) {
       /* Sync mesh itself. */
-      if (view_layer.use_surfaces && show_self) {
-        if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE)
-          create_subd_mesh(scene, mesh, b_ob, b_mesh, used_shaders, dicing_rate, max_subdivisions);
-        else
-          create_mesh(scene, mesh, b_mesh, used_shaders, false);
-
-        create_mesh_volume_attributes(scene, b_ob, mesh, b_scene.frame_current());
-      }
-
-      /* Sync hair curves. */
-      if (view_layer.use_hair && show_particles &&
-          mesh->subdivision_type == Mesh::SUBDIVISION_NONE) {
-        sync_curves(mesh, b_mesh, b_ob, false);
-      }
+      if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE)
+        create_subd_mesh(
+            scene, mesh, b_ob, b_mesh, mesh->used_shaders, dicing_rate, max_subdivisions);
+      else
+        create_mesh(scene, mesh, b_mesh, mesh->used_shaders, false);
 
       free_object_to_mesh(b_data, b_ob, b_mesh);
     }
   }
-  mesh->geometry_flags = requested_geometry_flags;
 
   /* mesh fluid motion mantaflow */
   sync_mesh_fluid_motion(b_ob, scene, mesh);
 
   /* tag update */
   bool rebuild = (oldtriangles != mesh->triangles) || (oldsubd_faces != mesh->subd_faces) ||
-                 (oldsubd_face_corners != mesh->subd_face_corners) ||
-                 (oldcurve_keys != mesh->curve_keys) || (oldcurve_radius != mesh->curve_radius) ||
-                 (oldhas_voxel_attributes != mesh->has_voxel_attributes());
+                 (oldsubd_face_corners != mesh->subd_face_corners);
 
   mesh->tag_update(scene, rebuild);
-
-  return mesh;
 }
 
-void BlenderSync::sync_mesh_motion(BL::Depsgraph &b_depsgraph,
-                                   BL::Object &b_ob,
-                                   Object *object,
-                                   float motion_time)
+void BlenderSync::sync_mesh_motion(BL::Depsgraph b_depsgraph,
+                                   BL::Object b_ob,
+                                   Mesh *mesh,
+                                   int motion_step)
 {
-  /* ensure we only sync instanced meshes once */
-  Mesh *mesh = object->mesh;
-
-  if (mesh_motion_synced.find(mesh) != mesh_motion_synced.end())
-    return;
-
-  mesh_motion_synced.insert(mesh);
-
-  /* ensure we only motion sync meshes that also had mesh synced, to avoid
-   * unnecessary work and to ensure that its attributes were clear */
-  if (mesh_synced.find(mesh) == mesh_synced.end())
-    return;
-
-  /* Find time matching motion step required by mesh. */
-  int motion_step = mesh->motion_step(motion_time);
-  if (motion_step < 0) {
+  /* Fluid motion blur already exported. */
+  BL::FluidDomainSettings b_fluid_domain = object_fluid_liquid_domain_find(b_ob);
+  if (b_fluid_domain) {
     return;
   }
 
-  /* skip empty meshes */
-  const size_t numverts = mesh->verts.size();
-  const size_t numkeys = mesh->curve_keys.size();
-
-  if (!numverts && !numkeys)
+  /* Skip if no vertices were exported. */
+  size_t numverts = mesh->verts.size();
+  if (numverts == 0) {
     return;
+  }
 
-  /* skip objects without deforming modifiers. this is not totally reliable,
-   * would need a more extensive check to see which objects are animated */
+  /* Skip objects without deforming modifiers. this is not totally reliable,
+   * would need a more extensive check to see which objects are animated. */
   BL::Mesh b_mesh(PointerRNA_NULL);
-
-  /* manta motion is exported immediate with mesh, skip here */
-  BL::FluidDomainSettings b_fluid_domain = object_fluid_domain_find(b_ob);
-  if (b_fluid_domain)
-    return;
-
   if (ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
     /* get derived mesh */
     b_mesh = object_to_mesh(b_data, b_ob, b_depsgraph, false, Mesh::SUBDIVISION_NONE);
   }
 
-  if (!b_mesh) {
-    /* if we have no motion blur on this frame, but on other frames, copy */
-    if (numverts) {
-      /* triangles */
-      Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-      if (attr_mP) {
-        Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
-        Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
-        float3 *P = &mesh->verts[0];
-        float3 *N = (attr_N) ? attr_N->data_float3() : NULL;
-
-        memcpy(attr_mP->data_float3() + motion_step * numverts, P, sizeof(float3) * numverts);
-        if (attr_mN)
-          memcpy(attr_mN->data_float3() + motion_step * numverts, N, sizeof(float3) * numverts);
-      }
-    }
-
-    if (numkeys) {
-      /* curves */
-      Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-      if (attr_mP) {
-        float3 *keys = &mesh->curve_keys[0];
-        memcpy(attr_mP->data_float3() + motion_step * numkeys, keys, sizeof(float3) * numkeys);
-      }
-    }
-
-    return;
-  }
-
   /* TODO(sergey): Perform preliminary check for number of vertices. */
-  if (numverts) {
+  if (b_mesh) {
+    /* Export deformed coordinates. */
     /* Find attributes. */
     Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
     Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
@@ -1254,14 +1084,13 @@ void BlenderSync::sync_mesh_motion(BL::Depsgraph &b_depsgraph,
         }
       }
     }
-  }
 
-  /* hair motion */
-  if (numkeys)
-    sync_curves(mesh, b_mesh, b_ob, true, motion_step);
+    free_object_to_mesh(b_data, b_ob, b_mesh);
+    return;
+  }
 
-  /* free derived mesh */
-  free_object_to_mesh(b_data, b_ob, b_mesh);
+  /* No deformation on this frame, copy coordinates if other frames did have it. */
+  mesh->copy_center_to_motion_step(motion_step);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 6981412bb88..4b29c28913b 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -15,14 +15,14 @@
  */
 
 #include "render/camera.h"
-#include "render/integrator.h"
 #include "render/graph.h"
+#include "render/integrator.h"
 #include "render/light.h"
 #include "render/mesh.h"
-#include "render/object.h"
-#include "render/scene.h"
 #include "render/nodes.h"
+#include "render/object.h"
 #include "render/particles.h"
+#include "render/scene.h"
 #include "render/shader.h"
 
 #include "blender/blender_object_cull.h"
@@ -67,10 +67,20 @@ bool BlenderSync::object_is_mesh(BL::Object &b_ob)
     return false;
   }
 
-  if (b_ob.type() == BL::Object::type_CURVE) {
+  BL::Object::type_enum type = b_ob.type();
+
+#ifdef WITH_NEW_OBJECT_TYPES
+  if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
+#else
+  if (type == BL::Object::type_VOLUME) {
+#endif
+    /* Will be exported attached to mesh. */
+    return true;
+  }
+  else if (type == BL::Object::type_CURVE) {
     /* Skip exporting curves without faces, overhead can be
      * significant if there are many for path animation. */
-    BL::Curve b_curve(b_ob.data());
+    BL::Curve b_curve(b_ob_data);
 
     return (b_curve.bevel_object() || b_curve.extrude() != 0.0f || b_curve.bevel_depth() != 0.0f ||
             b_curve.dimensions() == BL::Curve::dimensions_2D || b_ob.modifiers.length());
@@ -88,215 +98,13 @@ bool BlenderSync::object_is_light(BL::Object &b_ob)
   return (b_ob_data && b_ob_data.is_a(&RNA_Light));
 }
 
-static uint object_ray_visibility(BL::Object &b_ob)
-{
-  PointerRNA cvisibility = RNA_pointer_get(&b_ob.ptr, "cycles_visibility");
-  uint flag = 0;
-
-  flag |= get_boolean(cvisibility, "camera") ? PATH_RAY_CAMERA : 0;
-  flag |= get_boolean(cvisibility, "diffuse") ? PATH_RAY_DIFFUSE : 0;
-  flag |= get_boolean(cvisibility, "glossy") ? PATH_RAY_GLOSSY : 0;
-  flag |= get_boolean(cvisibility, "transmission") ? PATH_RAY_TRANSMIT : 0;
-  flag |= get_boolean(cvisibility, "shadow") ? PATH_RAY_SHADOW : 0;
-  flag |= get_boolean(cvisibility, "scatter") ? PATH_RAY_VOLUME_SCATTER : 0;
-
-  return flag;
-}
-
-/* Light */
-
-void BlenderSync::sync_light(BL::Object &b_parent,
-                             int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
-                             BL::Object &b_ob,
-                             BL::Object &b_ob_instance,
-                             int random_id,
-                             Transform &tfm,
-                             bool *use_portal)
-{
-  /* test if we need to sync */
-  Light *light;
-  ObjectKey key(b_parent, persistent_id, b_ob_instance);
-  BL::Light b_light(b_ob.data());
-
-  /* Update if either object or light data changed. */
-  if (!light_map.sync(&light, b_ob, b_parent, key)) {
-    Shader *shader;
-    if (!shader_map.sync(&shader, b_light)) {
-      if (light->is_portal)
-        *use_portal = true;
-      return;
-    }
-  }
-
-  /* type */
-  switch (b_light.type()) {
-    case BL::Light::type_POINT: {
-      BL::PointLight b_point_light(b_light);
-      light->size = b_point_light.shadow_soft_size();
-      light->type = LIGHT_POINT;
-      break;
-    }
-    case BL::Light::type_SPOT: {
-      BL::SpotLight b_spot_light(b_light);
-      light->size = b_spot_light.shadow_soft_size();
-      light->type = LIGHT_SPOT;
-      light->spot_angle = b_spot_light.spot_size();
-      light->spot_smooth = b_spot_light.spot_blend();
-      break;
-    }
-    /* Hemi were removed from 2.8 */
-    // case BL::Light::type_HEMI: {
-    //  light->type = LIGHT_DISTANT;
-    //  light->size = 0.0f;
-    //  break;
-    // }
-    case BL::Light::type_SUN: {
-      BL::SunLight b_sun_light(b_light);
-      light->angle = b_sun_light.angle();
-      light->type = LIGHT_DISTANT;
-      break;
-    }
-    case BL::Light::type_AREA: {
-      BL::AreaLight b_area_light(b_light);
-      light->size = 1.0f;
-      light->axisu = transform_get_column(&tfm, 0);
-      light->axisv = transform_get_column(&tfm, 1);
-      light->sizeu = b_area_light.size();
-      switch (b_area_light.shape()) {
-        case BL::AreaLight::shape_SQUARE:
-          light->sizev = light->sizeu;
-          light->round = false;
-          break;
-        case BL::AreaLight::shape_RECTANGLE:
-          light->sizev = b_area_light.size_y();
-          light->round = false;
-          break;
-        case BL::AreaLight::shape_DISK:
-          light->sizev = light->sizeu;
-          light->round = true;
-          break;
-        case BL::AreaLight::shape_ELLIPSE:
-          light->sizev = b_area_light.size_y();
-          light->round = true;
-          break;
-      }
-      light->type = LIGHT_AREA;
-      break;
-    }
-  }
-
-  /* strength */
-  light->strength = get_float3(b_light.color());
-  light->strength *= BL::PointLight(b_light).energy();
-
-  /* location and (inverted!) direction */
-  light->co = transform_get_column(&tfm, 3);
-  light->dir = -transform_get_column(&tfm, 2);
-  light->tfm = tfm;
-
-  /* shader */
-  vector<Shader *> used_shaders;
-  find_shader(b_light, used_shaders, scene->default_light);
-  light->shader = used_shaders[0];
-
-  /* shadow */
-  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-  PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
-  light->cast_shadow = get_boolean(clight, "cast_shadow");
-  light->use_mis = get_boolean(clight, "use_multiple_importance_sampling");
-
-  int samples = get_int(clight, "samples");
-  if (get_boolean(cscene, "use_square_samples"))
-    light->samples = samples * samples;
-  else
-    light->samples = samples;
-
-  light->max_bounces = get_int(clight, "max_bounces");
-
-  if (b_ob != b_ob_instance) {
-    light->random_id = random_id;
-  }
-  else {
-    light->random_id = hash_uint2(hash_string(b_ob.name().c_str()), 0);
-  }
-
-  if (light->type == LIGHT_AREA)
-    light->is_portal = get_boolean(clight, "is_portal");
-  else
-    light->is_portal = false;
-
-  if (light->is_portal)
-    *use_portal = true;
-
-  /* visibility */
-  uint visibility = object_ray_visibility(b_ob);
-  light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
-  light->use_glossy = (visibility & PATH_RAY_GLOSSY) != 0;
-  light->use_transmission = (visibility & PATH_RAY_TRANSMIT) != 0;
-  light->use_scatter = (visibility & PATH_RAY_VOLUME_SCATTER) != 0;
-
-  /* tag */
-  light->tag_update(scene);
-}
-
-void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
-{
-  BL::World b_world = b_scene.world();
-
-  if (b_world) {
-    PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-    PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
-
-    enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
-    int sampling_method = get_enum(cworld, "sampling_method", SAMPLING_NUM, SAMPLING_AUTOMATIC);
-    bool sample_as_light = (sampling_method != SAMPLING_NONE);
-
-    if (sample_as_light || use_portal) {
-      /* test if we need to sync */
-      Light *light;
-      ObjectKey key(b_world, 0, b_world);
-
-      if (light_map.sync(&light, b_world, b_world, key) || world_recalc ||
-          b_world.ptr.data != world_map) {
-        light->type = LIGHT_BACKGROUND;
-        if (sampling_method == SAMPLING_MANUAL) {
-          light->map_resolution = get_int(cworld, "sample_map_resolution");
-        }
-        else {
-          light->map_resolution = 0;
-        }
-        light->shader = scene->default_background;
-        light->use_mis = sample_as_light;
-        light->max_bounces = get_int(cworld, "max_bounces");
-
-        /* force enable light again when world is resynced */
-        light->is_enabled = true;
-
-        int samples = get_int(cworld, "samples");
-        if (get_boolean(cscene, "use_square_samples"))
-          light->samples = samples * samples;
-        else
-          light->samples = samples;
-
-        light->tag_update(scene);
-        light_map.set_recalc(b_world);
-      }
-    }
-  }
-
-  world_map = b_world.ptr.data;
-  world_recalc = false;
-  viewport_parameters = BlenderViewportParameters(b_v3d);
-}
-
 /* Object */
 
 Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
                                  BL::ViewLayer &b_view_layer,
                                  BL::DepsgraphObjectInstance &b_instance,
                                  float motion_time,
-                                 bool show_self,
-                                 bool show_particles,
+                                 bool use_particle_hair,
                                  bool show_lights,
                                  BlenderObjectCulling &culling,
                                  bool *use_portal)
@@ -378,7 +186,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
   }
 
   /* key to lookup object */
-  ObjectKey key(b_parent, persistent_id, b_ob_instance);
+  ObjectKey key(b_parent, persistent_id, b_ob_instance, use_particle_hair);
   Object *object;
 
   /* motion vector case */
@@ -393,8 +201,8 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
       }
 
       /* mesh deformation */
-      if (object->mesh)
-        sync_mesh_motion(b_depsgraph, b_ob, object, motion_time);
+      if (object->geometry)
+        sync_geometry_motion(b_depsgraph, b_ob, object, motion_time, use_particle_hair);
     }
 
     return object;
@@ -403,12 +211,12 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
   /* test if we need to sync */
   bool object_updated = false;
 
-  if (object_map.sync(&object, b_ob, b_parent, key))
+  if (object_map.add_or_update(&object, b_ob, b_parent, key))
     object_updated = true;
 
   /* mesh sync */
-  object->mesh = sync_mesh(
-      b_depsgraph, b_ob, b_ob_instance, object_updated, show_self, show_particles);
+  object->geometry = sync_geometry(
+      b_depsgraph, b_ob, b_ob_instance, object_updated, use_particle_hair);
 
   /* special case not tracked by object update flags */
 
@@ -450,7 +258,8 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
   /* object sync
    * transform comparison should not be needed, but duplis don't work perfect
    * in the depsgraph and may not signal changes, so this is a workaround */
-  if (object_updated || (object->mesh && object->mesh->need_update) || tfm != object->tfm) {
+  if (object_updated || (object->geometry && object->geometry->need_update) ||
+      tfm != object->tfm) {
     object->name = b_ob.name().c_str();
     object->pass_id = b_ob.pass_index();
     object->color = get_float3(b_ob.color());
@@ -459,23 +268,23 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
 
     /* motion blur */
     Scene::MotionType need_motion = scene->need_motion();
-    if (need_motion != Scene::MOTION_NONE && object->mesh) {
-      Mesh *mesh = object->mesh;
-      mesh->use_motion_blur = false;
-      mesh->motion_steps = 0;
+    if (need_motion != Scene::MOTION_NONE && object->geometry) {
+      Geometry *geom = object->geometry;
+      geom->use_motion_blur = false;
+      geom->motion_steps = 0;
 
       uint motion_steps;
 
       if (need_motion == Scene::MOTION_BLUR) {
-        motion_steps = object_motion_steps(b_parent, b_ob);
-        mesh->motion_steps = motion_steps;
+        motion_steps = object_motion_steps(b_parent, b_ob, Object::MAX_MOTION_STEPS);
+        geom->motion_steps = motion_steps;
         if (motion_steps && object_use_deform_motion(b_parent, b_ob)) {
-          mesh->use_motion_blur = true;
+          geom->use_motion_blur = true;
         }
       }
       else {
         motion_steps = 3;
-        mesh->motion_steps = motion_steps;
+        geom->motion_steps = motion_steps;
       }
 
       object->motion.clear();
@@ -526,13 +335,13 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
   if (!motion) {
     /* prepare for sync */
     light_map.pre_sync();
-    mesh_map.pre_sync();
+    geometry_map.pre_sync();
     object_map.pre_sync();
     particle_system_map.pre_sync();
     motion_times.clear();
   }
   else {
-    mesh_motion_synced.clear();
+    geometry_motion_synced.clear();
   }
 
   /* initialize culling */
@@ -552,22 +361,34 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
     BL::DepsgraphObjectInstance b_instance = *b_instance_iter;
     BL::Object b_ob = b_instance.object();
 
-    /* load per-object culling data */
+    /* Viewport visibility. */
+    const bool show_in_viewport = !b_v3d || b_ob.visible_in_viewport_get(b_v3d);
+    if (show_in_viewport == false) {
+      continue;
+    }
+
+    /* Load per-object culling data. */
     culling.init_object(scene, b_ob);
 
-    /* test if object needs to be hidden */
-    const bool show_self = b_instance.show_self();
-    const bool show_particles = b_instance.show_particles();
-    const bool show_in_viewport = !b_v3d || b_ob.visible_in_viewport_get(b_v3d);
+    /* Object itself. */
+    if (b_instance.show_self()) {
+      sync_object(b_depsgraph,
+                  b_view_layer,
+                  b_instance,
+                  motion_time,
+                  false,
+                  show_lights,
+                  culling,
+                  &use_portal);
+    }
 
-    if (show_in_viewport && (show_self || show_particles)) {
-      /* object itself */
+    /* Particle hair as separate object. */
+    if (b_instance.show_particles() && object_has_particle_hair(b_ob)) {
       sync_object(b_depsgraph,
                   b_view_layer,
                   b_instance,
                   motion_time,
-                  show_self,
-                  show_particles,
+                  true,
                   show_lights,
                   culling,
                   &use_portal);
@@ -584,8 +405,8 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
     /* handle removed data and modified pointers */
     if (light_map.post_sync())
       scene->light_manager->tag_update(scene);
-    if (mesh_map.post_sync())
-      scene->mesh_manager->tag_update(scene);
+    if (geometry_map.post_sync())
+      scene->geometry_manager->tag_update(scene);
     if (object_map.post_sync())
       scene->object_manager->tag_update(scene);
     if (particle_system_map.post_sync())
@@ -593,7 +414,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
   }
 
   if (motion)
-    mesh_motion_synced.clear();
+    geometry_motion_synced.clear();
 }
 
 void BlenderSync::sync_motion(BL::RenderSettings &b_render,
diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp
index 74f8fb1dc53..bebecb364eb 100644
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -19,6 +19,7 @@
 #include "render/camera.h"
 
 #include "blender/blender_object_cull.h"
+#include "blender/blender_util.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index d74f132ed60..e5eab1ae62b 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -39,7 +39,7 @@ bool BlenderSync::sync_dupli_particle(BL::Object &b_ob,
   object->hide_on_missing_motion = true;
 
   /* test if we need particle data */
-  if (!object->mesh->need_attribute(scene, ATTR_STD_PARTICLE))
+  if (!object->geometry->need_attribute(scene, ATTR_STD_PARTICLE))
     return false;
 
   /* don't handle child particles yet */
@@ -53,10 +53,10 @@ bool BlenderSync::sync_dupli_particle(BL::Object &b_ob,
   ParticleSystem *psys;
 
   bool first_use = !particle_system_map.is_used(key);
-  bool need_update = particle_system_map.sync(&psys, b_ob, b_instance.object(), key);
+  bool need_update = particle_system_map.add_or_update(&psys, b_ob, b_instance.object(), key);
 
   /* no update needed? */
-  if (!need_update && !object->mesh->need_update && !scene->object_manager->need_update)
+  if (!need_update && !object->geometry->need_update && !scene->object_manager->need_update)
     return true;
 
   /* first time used in this sync loop? clear and tag update */
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 335d4daf09c..89bcebda193 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -19,8 +19,9 @@
 #include "blender/CCL_api.h"
 
 #include "blender/blender_device.h"
-#include "blender/blender_sync.h"
 #include "blender/blender_session.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
 #include "render/denoising.h"
 #include "render/merge.h"
@@ -37,8 +38,8 @@
 #ifdef WITH_OSL
 #  include "render/osl.h"
 
-#  include <OSL/oslquery.h>
 #  include <OSL/oslconfig.h>
+#  include <OSL/oslquery.h>
 #endif
 
 #ifdef WITH_OPENCL
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index e2dea24fdd1..5ea96d6bdfd 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -41,8 +41,8 @@
 #include "util/util_progress.h"
 #include "util/util_time.h"
 
-#include "blender/blender_sync.h"
 #include "blender/blender_session.h"
+#include "blender/blender_sync.h"
 #include "blender/blender_util.h"
 
 CCL_NAMESPACE_BEGIN
@@ -138,14 +138,6 @@ void BlenderSession::create_session()
   scene = new Scene(scene_params, session->device);
   scene->name = b_scene.name();
 
-  /* setup callbacks for builtin image support */
-  scene->image_manager->builtin_image_info_cb = function_bind(
-      &BlenderSession::builtin_image_info, this, _1, _2, _3);
-  scene->image_manager->builtin_image_pixels_cb = function_bind(
-      &BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5, _6, _7);
-  scene->image_manager->builtin_image_float_pixels_cb = function_bind(
-      &BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5, _6, _7);
-
   session->scene = scene;
 
   /* There is no single depsgraph to use for the entire render.
@@ -166,7 +158,7 @@ void BlenderSession::create_session()
 
   /* set buffer parameters */
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_render, b_v3d, b_rv3d, scene->camera, width, height);
+      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
   session->reset(buffer_params, session_params.samples);
 
   b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -244,7 +236,7 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
   BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL);
   BL::RegionView3D b_null_region_view3d(PointerRNA_NULL);
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_render, b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
+      b_scene, b_render, b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
   session->reset(buffer_params, session_params.samples);
 
   b_engine.use_highlight_tiles(session_params.progressive_refine == false);
@@ -278,8 +270,6 @@ static ShaderEvalType get_shader_type(const string &pass_type)
     return SHADER_EVAL_GLOSSY_COLOR;
   else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0)
     return SHADER_EVAL_TRANSMISSION_COLOR;
-  else if (strcmp(shader_type, "SUBSURFACE_COLOR") == 0)
-    return SHADER_EVAL_SUBSURFACE_COLOR;
   else if (strcmp(shader_type, "EMIT") == 0)
     return SHADER_EVAL_EMISSION;
 
@@ -296,8 +286,6 @@ static ShaderEvalType get_shader_type(const string &pass_type)
     return SHADER_EVAL_GLOSSY;
   else if (strcmp(shader_type, "TRANSMISSION") == 0)
     return SHADER_EVAL_TRANSMISSION;
-  else if (strcmp(shader_type, "SUBSURFACE") == 0)
-    return SHADER_EVAL_SUBSURFACE;
 
   /* extra */
   else if (strcmp(shader_type, "ENVIRONMENT") == 0)
@@ -460,7 +448,7 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
   SessionParams session_params = BlenderSync::get_session_params(
       b_engine, b_userpref, b_scene, background);
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_render, b_v3d, b_rv3d, scene->camera, width, height);
+      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
   /* render each layer */
   BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
@@ -474,7 +462,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
   b_rlay_name = b_view_layer.name();
 
   /* add passes */
-  vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer);
+  vector<Pass> passes = sync->sync_render_passes(
+      b_rlay, b_view_layer, session_params.adaptive_sampling);
   buffer_params.passes = passes;
 
   PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
@@ -640,8 +629,6 @@ static int bake_pass_filter_get(const int pass_filter)
     flag |= BAKE_FILTER_GLOSSY;
   if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0)
     flag |= BAKE_FILTER_TRANSMISSION;
-  if ((pass_filter & BL::BakeSettings::pass_filter_SUBSURFACE) != 0)
-    flag |= BAKE_FILTER_SUBSURFACE;
 
   if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0)
     flag |= BAKE_FILTER_EMISSION;
@@ -706,7 +693,7 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
     SessionParams session_params = BlenderSync::get_session_params(
         b_engine, b_userpref, b_scene, background);
     BufferParams buffer_params = BlenderSync::get_buffer_params(
-        b_render, b_v3d, b_rv3d, scene->camera, width, height);
+        b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
 
     scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
 
@@ -720,9 +707,12 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
     int tri_offset = 0;
 
     for (size_t i = 0; i < scene->objects.size(); i++) {
-      if (strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
+      const Object *object = scene->objects[i];
+      const Geometry *geom = object->geometry;
+      if (object->name == b_object.name() && geom->type == Geometry::MESH) {
+        const Mesh *mesh = static_cast<const Mesh *>(geom);
         object_index = i;
-        tri_offset = scene->objects[i]->mesh->tri_offset;
+        tri_offset = mesh->prim_offset;
         break;
       }
     }
@@ -848,11 +838,11 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
   if (session->params.modified(session_params) || scene->params.modified(scene_params)) {
     free_session();
     create_session();
-    return;
   }
 
   /* increase samples, but never decrease */
   session->set_samples(session_params.samples);
+  session->set_denoising_start_sample(session_params.denoising_start_sample);
   session->set_pause(session_pause);
 
   /* copy recalc flags, outside of mutex so we can decide to do the real
@@ -883,10 +873,28 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
   else
     sync->sync_camera(b_render, b_camera_override, width, height, "");
 
+  /* get buffer parameters */
+  BufferParams buffer_params = BlenderSync::get_buffer_params(
+      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
+
+  if (session_params.device.type != DEVICE_OPTIX &&
+      session_params.device.denoising_devices.empty()) {
+    /* Cannot use OptiX denoising when it is not supported by the device. */
+    buffer_params.denoising_data_pass = false;
+  }
+  else {
+    session->set_denoising(buffer_params.denoising_data_pass, true);
+  }
+
+  if (scene->film->denoising_data_pass != buffer_params.denoising_data_pass) {
+    scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
+
+    /* Force a scene and session reset below. */
+    scene->film->tag_update(scene);
+  }
+
   /* reset if needed */
   if (scene->need_reset()) {
-    BufferParams buffer_params = BlenderSync::get_buffer_params(
-        b_render, b_v3d, b_rv3d, scene->camera, width, height);
     session->reset(buffer_params, session_params.samples);
 
     /* After session reset, so device is not accessing image data anymore. */
@@ -953,7 +961,7 @@ bool BlenderSession::draw(int w, int h)
       SessionParams session_params = BlenderSync::get_session_params(
           b_engine, b_userpref, b_scene, background);
       BufferParams buffer_params = BlenderSync::get_buffer_params(
-          b_render, b_v3d, b_rv3d, scene->camera, width, height);
+          b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
       bool session_pause = BlenderSync::get_session_pause(b_scene, background);
 
       if (session_pause == false) {
@@ -971,7 +979,7 @@ bool BlenderSession::draw(int w, int h)
 
   /* draw */
   BufferParams buffer_params = BlenderSync::get_buffer_params(
-      b_render, b_v3d, b_rv3d, scene->camera, width, height);
+      b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height);
   DeviceDrawParams draw_params;
 
   if (session->params.display_buffer_linear) {
@@ -1112,341 +1120,6 @@ void BlenderSession::test_cancel()
       session->progress.set_cancel("Cancelled");
 }
 
-/* builtin image file name is actually an image datablock name with
- * absolute sequence frame number concatenated via '@' character
- *
- * this function splits frame from builtin name
- */
-int BlenderSession::builtin_image_frame(const string &builtin_name)
-{
-  int last = builtin_name.find_last_of('@');
-  return atoi(builtin_name.substr(last + 1, builtin_name.size() - last - 1).c_str());
-}
-
-void BlenderSession::builtin_image_info(const string &builtin_name,
-                                        void *builtin_data,
-                                        ImageMetaData &metadata)
-{
-  /* empty image */
-  metadata.width = 1;
-  metadata.height = 1;
-
-  if (!builtin_data)
-    return;
-
-  /* recover ID pointer */
-  PointerRNA ptr;
-  RNA_id_pointer_create((ID *)builtin_data, &ptr);
-  BL::ID b_id(ptr);
-
-  if (b_id.is_a(&RNA_Image)) {
-    /* image data */
-    BL::Image b_image(b_id);
-
-    metadata.builtin_free_cache = !b_image.has_data();
-    metadata.is_float = b_image.is_float();
-    metadata.width = b_image.size()[0];
-    metadata.height = b_image.size()[1];
-    metadata.depth = 1;
-    metadata.channels = b_image.channels();
-
-    if (metadata.is_float) {
-      /* Float images are already converted on the Blender side,
-       * no need to do anything in Cycles. */
-      metadata.colorspace = u_colorspace_raw;
-    }
-  }
-  else if (b_id.is_a(&RNA_Object)) {
-    /* smoke volume data */
-    BL::Object b_ob(b_id);
-    BL::FluidDomainSettings b_domain = object_fluid_domain_find(b_ob);
-
-    metadata.is_float = true;
-    metadata.depth = 1;
-    metadata.channels = 1;
-
-    if (!b_domain)
-      return;
-
-    if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY) ||
-        builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME) ||
-        builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT) ||
-        builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE))
-      metadata.channels = 1;
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR))
-      metadata.channels = 4;
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY))
-      metadata.channels = 3;
-    else
-      return;
-
-    int3 resolution = get_int3(b_domain.domain_resolution());
-    int amplify = (b_domain.use_noise()) ? b_domain.noise_scale() : 1;
-
-    /* Velocity and heat data is always low-resolution. */
-    if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY) ||
-        builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
-      amplify = 1;
-    }
-
-    metadata.width = resolution.x * amplify;
-    metadata.height = resolution.y * amplify;
-    metadata.depth = resolution.z * amplify;
-  }
-  else {
-    /* TODO(sergey): Check we're indeed in shader node tree. */
-    PointerRNA ptr;
-    RNA_pointer_create(NULL, &RNA_Node, builtin_data, &ptr);
-    BL::Node b_node(ptr);
-    if (b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
-      BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
-      metadata.channels = 4;
-      metadata.width = b_point_density_node.resolution();
-      metadata.height = metadata.width;
-      metadata.depth = metadata.width;
-      metadata.is_float = true;
-    }
-  }
-}
-
-bool BlenderSession::builtin_image_pixels(const string &builtin_name,
-                                          void *builtin_data,
-                                          int tile,
-                                          unsigned char *pixels,
-                                          const size_t pixels_size,
-                                          const bool associate_alpha,
-                                          const bool free_cache)
-{
-  if (!builtin_data) {
-    return false;
-  }
-
-  const int frame = builtin_image_frame(builtin_name);
-
-  PointerRNA ptr;
-  RNA_id_pointer_create((ID *)builtin_data, &ptr);
-  BL::Image b_image(ptr);
-
-  const int width = b_image.size()[0];
-  const int height = b_image.size()[1];
-  const int channels = b_image.channels();
-
-  unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile);
-  const size_t num_pixels = ((size_t)width) * height;
-
-  if (image_pixels && num_pixels * channels == pixels_size) {
-    memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
-  }
-  else {
-    if (channels == 1) {
-      memset(pixels, 0, pixels_size * sizeof(unsigned char));
-    }
-    else {
-      const size_t num_pixels_safe = pixels_size / channels;
-      unsigned char *cp = pixels;
-      for (size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
-        cp[0] = 255;
-        cp[1] = 0;
-        cp[2] = 255;
-        if (channels == 4) {
-          cp[3] = 255;
-        }
-      }
-    }
-  }
-
-  if (image_pixels) {
-    MEM_freeN(image_pixels);
-  }
-
-  /* Free image buffers to save memory during render. */
-  if (free_cache) {
-    b_image.buffers_free();
-  }
-
-  if (associate_alpha) {
-    /* Premultiply, byte images are always straight for Blender. */
-    unsigned char *cp = pixels;
-    for (size_t i = 0; i < num_pixels; i++, cp += channels) {
-      cp[0] = (cp[0] * cp[3]) >> 8;
-      cp[1] = (cp[1] * cp[3]) >> 8;
-      cp[2] = (cp[2] * cp[3]) >> 8;
-    }
-  }
-  return true;
-}
-
-bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
-                                                void *builtin_data,
-                                                int tile,
-                                                float *pixels,
-                                                const size_t pixels_size,
-                                                const bool,
-                                                const bool free_cache)
-{
-  if (!builtin_data) {
-    return false;
-  }
-
-  PointerRNA ptr;
-  RNA_id_pointer_create((ID *)builtin_data, &ptr);
-  BL::ID b_id(ptr);
-
-  if (b_id.is_a(&RNA_Image)) {
-    /* image data */
-    BL::Image b_image(b_id);
-    int frame = builtin_image_frame(builtin_name);
-
-    const int width = b_image.size()[0];
-    const int height = b_image.size()[1];
-    const int channels = b_image.channels();
-
-    float *image_pixels;
-    image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile);
-    const size_t num_pixels = ((size_t)width) * height;
-
-    if (image_pixels && num_pixels * channels == pixels_size) {
-      memcpy(pixels, image_pixels, pixels_size * sizeof(float));
-    }
-    else {
-      if (channels == 1) {
-        memset(pixels, 0, num_pixels * sizeof(float));
-      }
-      else {
-        const size_t num_pixels_safe = pixels_size / channels;
-        float *fp = pixels;
-        for (int i = 0; i < num_pixels_safe; i++, fp += channels) {
-          fp[0] = 1.0f;
-          fp[1] = 0.0f;
-          fp[2] = 1.0f;
-          if (channels == 4) {
-            fp[3] = 1.0f;
-          }
-        }
-      }
-    }
-
-    if (image_pixels) {
-      MEM_freeN(image_pixels);
-    }
-
-    /* Free image buffers to save memory during render. */
-    if (free_cache) {
-      b_image.buffers_free();
-    }
-
-    return true;
-  }
-  else if (b_id.is_a(&RNA_Object)) {
-    /* smoke volume data */
-    BL::Object b_ob(b_id);
-    BL::FluidDomainSettings b_domain = object_fluid_domain_find(b_ob);
-
-    if (!b_domain) {
-      return false;
-    }
-#if WITH_FLUID
-    int3 resolution = get_int3(b_domain.domain_resolution());
-    int length, amplify = (b_domain.use_noise()) ? b_domain.noise_scale() : 1;
-
-    /* Velocity and heat data is always low-resolution. */
-    if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY) ||
-        builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
-      amplify = 1;
-    }
-
-    const int width = resolution.x * amplify;
-    const int height = resolution.y * amplify;
-    const int depth = resolution.z * amplify;
-    const size_t num_pixels = ((size_t)width) * height * depth;
-
-    if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
-      FluidDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
-      if (length == num_pixels) {
-        FluidDomainSettings_density_grid_get(&b_domain.ptr, pixels);
-        return true;
-      }
-    }
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME)) {
-      /* this is in range 0..1, and interpreted by the OpenGL smoke viewer
-       * as 1500..3000 K with the first part faded to zero density */
-      FluidDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
-      if (length == num_pixels) {
-        FluidDomainSettings_flame_grid_get(&b_domain.ptr, pixels);
-        return true;
-      }
-    }
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR)) {
-      /* the RGB is "premultiplied" by density for better interpolation results */
-      FluidDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
-      if (length == num_pixels * 4) {
-        FluidDomainSettings_color_grid_get(&b_domain.ptr, pixels);
-        return true;
-      }
-    }
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY)) {
-      FluidDomainSettings_velocity_grid_get_length(&b_domain.ptr, &length);
-      if (length == num_pixels * 3) {
-        FluidDomainSettings_velocity_grid_get(&b_domain.ptr, pixels);
-        return true;
-      }
-    }
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
-      FluidDomainSettings_heat_grid_get_length(&b_domain.ptr, &length);
-      if (length == num_pixels) {
-        FluidDomainSettings_heat_grid_get(&b_domain.ptr, pixels);
-        return true;
-      }
-    }
-    else if (builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) {
-      FluidDomainSettings_temperature_grid_get_length(&b_domain.ptr, &length);
-      if (length == num_pixels) {
-        FluidDomainSettings_temperature_grid_get(&b_domain.ptr, pixels);
-        return true;
-      }
-    }
-    else {
-      fprintf(
-          stderr, "Cycles error: unknown volume attribute %s, skipping\n", builtin_name.c_str());
-      pixels[0] = 0.0f;
-      return false;
-    }
-#endif
-    fprintf(stderr, "Cycles error: unexpected smoke volume resolution, skipping\n");
-  }
-  else {
-    /* We originally were passing view_layer here but in reality we need a
-     * a depsgraph to pass to the RE_point_density_minmax() function.
-     */
-    /* TODO(sergey): Check we're indeed in shader node tree. */
-    PointerRNA ptr;
-    RNA_pointer_create(NULL, &RNA_Node, builtin_data, &ptr);
-    BL::Node b_node(ptr);
-    if (b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
-      BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
-      int length;
-      b_point_density_node.calc_point_density(b_depsgraph, &length, &pixels);
-    }
-  }
-
-  return false;
-}
-
-void BlenderSession::builtin_images_load()
-{
-  /* Force builtin images to be loaded along with Blender data sync. This
-   * is needed because we may be reading from depsgraph evaluated data which
-   * can be freed by Blender before Cycles reads it.
-   *
-   * TODO: the assumption that no further access to builtin image data will
-   * happen is really weak, and likely to break in the future. We should find
-   * a better solution to hand over the data directly to the image manager
-   * instead of through callbacks whose timing is difficult to control. */
-  ImageManager *manager = session->scene->image_manager;
-  Device *device = session->device;
-  manager->device_load_builtin(device, session->scene, session->progress);
-}
-
 void BlenderSession::update_resumable_tile_manager(int num_samples)
 {
   const int num_resumable_chunks = BlenderSession::num_resumable_chunks,
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 2f25ec740f9..3e6498bb655 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -17,15 +17,19 @@
 #ifndef __BLENDER_SESSION_H__
 #define __BLENDER_SESSION_H__
 
+#include "RNA_blender_cpp.h"
+
 #include "device/device.h"
+
+#include "render/bake.h"
 #include "render/scene.h"
 #include "render/session.h"
-#include "render/bake.h"
 
 #include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
+class BlenderSync;
 class ImageMetaData;
 class Scene;
 class Session;
@@ -153,22 +157,6 @@ class BlenderSession {
                                      bool do_update_only);
   void do_write_update_render_tile(RenderTile &rtile, bool do_update_only, bool highlight);
 
-  int builtin_image_frame(const string &builtin_name);
-  void builtin_image_info(const string &builtin_name, void *builtin_data, ImageMetaData &metadata);
-  bool builtin_image_pixels(const string &builtin_name,
-                            void *builtin_data,
-                            int tile,
-                            unsigned char *pixels,
-                            const size_t pixels_size,
-                            const bool associate_alpha,
-                            const bool free_cache);
-  bool builtin_image_float_pixels(const string &builtin_name,
-                                  void *builtin_data,
-                                  int tile,
-                                  float *pixels,
-                                  const size_t pixels_size,
-                                  const bool associate_alpha,
-                                  const bool free_cache);
   void builtin_images_load();
 
   /* Update tile manager to reflect resumable render settings. */
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 206058259af..edde1fd243e 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -23,14 +23,15 @@
 #include "render/scene.h"
 #include "render/shader.h"
 
-#include "blender/blender_texture.h"
+#include "blender/blender_image.h"
 #include "blender/blender_sync.h"
+#include "blender/blender_texture.h"
 #include "blender/blender_util.h"
 
 #include "util/util_debug.h"
 #include "util/util_foreach.h"
-#include "util/util_string.h"
 #include "util/util_set.h"
+#include "util/util_string.h"
 #include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
@@ -323,6 +324,13 @@ static ShaderNode *add_node(Scene *scene,
     vector_math_node->type = (NodeVectorMathType)b_vector_math_node.operation();
     node = vector_math_node;
   }
+  else if (b_node.is_a(&RNA_ShaderNodeVectorRotate)) {
+    BL::ShaderNodeVectorRotate b_vector_rotate_node(b_node);
+    VectorRotateNode *vector_rotate_node = new VectorRotateNode();
+    vector_rotate_node->type = (NodeVectorRotateType)b_vector_rotate_node.rotation_type();
+    vector_rotate_node->invert = b_vector_rotate_node.invert();
+    node = vector_rotate_node;
+  }
   else if (b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
     BL::ShaderNodeVectorTransform b_vector_transform_node(b_node);
     VectorTransformNode *vtransform = new VectorTransformNode();
@@ -612,16 +620,16 @@ static ShaderNode *add_node(Scene *scene,
       /* create script node */
       BL::ShaderNodeScript b_script_node(b_node);
 
-      OSLShaderManager *manager = (OSLShaderManager *)scene->shader_manager;
+      ShaderManager *manager = scene->shader_manager;
       string bytecode_hash = b_script_node.bytecode_hash();
 
       if (!bytecode_hash.empty()) {
-        node = manager->osl_node("", bytecode_hash, b_script_node.bytecode());
+        node = OSLShaderManager::osl_node(manager, "", bytecode_hash, b_script_node.bytecode());
       }
       else {
         string absolute_filepath = blender_absolute_path(
             b_data, b_ntree, b_script_node.filepath());
-        node = manager->osl_node(absolute_filepath, "");
+        node = OSLShaderManager::osl_node(manager, absolute_filepath, "");
       }
     }
 #else
@@ -634,7 +642,27 @@ static ShaderNode *add_node(Scene *scene,
     BL::Image b_image(b_image_node.image());
     BL::ImageUser b_image_user(b_image_node.image_user());
     ImageTextureNode *image = new ImageTextureNode();
+
+    image->interpolation = get_image_interpolation(b_image_node);
+    image->extension = get_image_extension(b_image_node);
+    image->projection = (NodeImageProjection)b_image_node.projection();
+    image->projection_blend = b_image_node.projection_blend();
+    BL::TexMapping b_texture_mapping(b_image_node.texture_mapping());
+    get_tex_mapping(&image->tex_mapping, b_texture_mapping);
+
     if (b_image) {
+      PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
+      image->colorspace = get_enum_identifier(colorspace_ptr, "name");
+
+      image->animated = b_image_node.image_user().use_auto_refresh();
+      image->alpha_type = get_image_alpha_type(b_image);
+
+      image->tiles.clear();
+      BL::Image::tiles_iterator b_iter;
+      for (b_image.tiles.begin(b_iter); b_iter != b_image.tiles.end(); ++b_iter) {
+        image->tiles.push_back(b_iter->number());
+      }
+
       /* builtin images will use callback-based reading because
        * they could only be loaded correct from blender side
        */
@@ -651,46 +679,14 @@ static ShaderNode *add_node(Scene *scene,
          */
         int scene_frame = b_scene.frame_current();
         int image_frame = image_user_frame_number(b_image_user, scene_frame);
-        image->filename = b_image.name() + "@" + string_printf("%d", image_frame);
-        image->builtin_data = b_image.ptr.data;
+        image->handle = scene->image_manager->add_image(
+            new BlenderImageLoader(b_image, image_frame), image->image_params());
       }
       else {
         image->filename = image_user_file_path(
             b_image_user, b_image, b_scene.frame_current(), true);
-        image->builtin_data = NULL;
-      }
-
-      PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
-      image->colorspace = get_enum_identifier(colorspace_ptr, "name");
-
-      image->animated = b_image_node.image_user().use_auto_refresh();
-      image->alpha_type = get_image_alpha_type(b_image);
-
-      image->tiles.clear();
-      BL::Image::tiles_iterator b_iter;
-      for (b_image.tiles.begin(b_iter); b_iter != b_image.tiles.end(); ++b_iter) {
-        image->tiles.push_back(b_iter->number());
       }
-
-      /* TODO: restore */
-      /* TODO(sergey): Does not work properly when we change builtin type. */
-#if 0
-      if (b_image.is_updated()) {
-        scene->image_manager->tag_reload_image(image->filename.string(),
-                                               image->builtin_data,
-                                               get_image_interpolation(b_image_node),
-                                               get_image_extension(b_image_node),
-                                               image->use_alpha,
-                                               image->colorspace);
-      }
-#endif
     }
-    image->projection = (NodeImageProjection)b_image_node.projection();
-    image->interpolation = get_image_interpolation(b_image_node);
-    image->extension = get_image_extension(b_image_node);
-    image->projection_blend = b_image_node.projection_blend();
-    BL::TexMapping b_texture_mapping(b_image_node.texture_mapping());
-    get_tex_mapping(&image->tex_mapping, b_texture_mapping);
     node = image;
   }
   else if (b_node.is_a(&RNA_ShaderNodeTexEnvironment)) {
@@ -698,7 +694,19 @@ static ShaderNode *add_node(Scene *scene,
     BL::Image b_image(b_env_node.image());
     BL::ImageUser b_image_user(b_env_node.image_user());
     EnvironmentTextureNode *env = new EnvironmentTextureNode();
+
+    env->interpolation = get_image_interpolation(b_env_node);
+    env->projection = (NodeEnvironmentProjection)b_env_node.projection();
+    BL::TexMapping b_texture_mapping(b_env_node.texture_mapping());
+    get_tex_mapping(&env->tex_mapping, b_texture_mapping);
+
     if (b_image) {
+      PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
+      env->colorspace = get_enum_identifier(colorspace_ptr, "name");
+
+      env->animated = b_env_node.image_user().use_auto_refresh();
+      env->alpha_type = get_image_alpha_type(b_image);
+
       bool is_builtin = b_image.packed_file() || b_image.source() == BL::Image::source_GENERATED ||
                         b_image.source() == BL::Image::source_MOVIE ||
                         (b_engine.is_preview() && b_image.source() != BL::Image::source_SEQUENCE);
@@ -706,38 +714,14 @@ static ShaderNode *add_node(Scene *scene,
       if (is_builtin) {
         int scene_frame = b_scene.frame_current();
         int image_frame = image_user_frame_number(b_image_user, scene_frame);
-        env->filename = b_image.name() + "@" + string_printf("%d", image_frame);
-        env->builtin_data = b_image.ptr.data;
+        env->handle = scene->image_manager->add_image(new BlenderImageLoader(b_image, image_frame),
+                                                      env->image_params());
       }
       else {
         env->filename = image_user_file_path(
             b_image_user, b_image, b_scene.frame_current(), false);
-        env->builtin_data = NULL;
-      }
-
-      PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
-      env->colorspace = get_enum_identifier(colorspace_ptr, "name");
-
-      env->animated = b_env_node.image_user().use_auto_refresh();
-      env->alpha_type = get_image_alpha_type(b_image);
-
-      /* TODO: restore */
-      /* TODO(sergey): Does not work properly when we change builtin type. */
-#if 0
-      if (b_image.is_updated()) {
-        scene->image_manager->tag_reload_image(env->filename.string(),
-                                               env->builtin_data,
-                                               get_image_interpolation(b_env_node),
-                                               EXTENSION_REPEAT,
-                                               env->use_alpha,
-                                               env->colorspace);
       }
-#endif
     }
-    env->interpolation = get_image_interpolation(b_env_node);
-    env->projection = (NodeEnvironmentProjection)b_env_node.projection();
-    BL::TexMapping b_texture_mapping(b_env_node.texture_mapping());
-    get_tex_mapping(&env->tex_mapping, b_texture_mapping);
     node = env;
   }
   else if (b_node.is_a(&RNA_ShaderNodeTexGradient)) {
@@ -770,6 +754,8 @@ static ShaderNode *add_node(Scene *scene,
     BL::ShaderNodeTexWave b_wave_node(b_node);
     WaveTextureNode *wave = new WaveTextureNode();
     wave->type = (NodeWaveType)b_wave_node.wave_type();
+    wave->bands_direction = (NodeWaveBandsDirection)b_wave_node.bands_direction();
+    wave->rings_direction = (NodeWaveRingsDirection)b_wave_node.rings_direction();
     wave->profile = (NodeWaveProfile)b_wave_node.wave_profile();
     BL::TexMapping b_texture_mapping(b_wave_node.texture_mapping());
     get_tex_mapping(&wave->tex_mapping, b_texture_mapping);
@@ -878,23 +864,13 @@ static ShaderNode *add_node(Scene *scene,
   else if (b_node.is_a(&RNA_ShaderNodeTexPointDensity)) {
     BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
     PointDensityTextureNode *point_density = new PointDensityTextureNode();
-    point_density->filename = b_point_density_node.name();
     point_density->space = (NodeTexVoxelSpace)b_point_density_node.space();
     point_density->interpolation = get_image_interpolation(b_point_density_node);
-    point_density->builtin_data = b_point_density_node.ptr.data;
-    point_density->image_manager = scene->image_manager;
-
-    /* TODO(sergey): Use more proper update flag. */
-    if (true) {
-      point_density->add_image();
-      b_point_density_node.cache_point_density(b_depsgraph);
-      scene->image_manager->tag_reload_image(point_density->filename.string(),
-                                             point_density->builtin_data,
-                                             point_density->interpolation,
-                                             EXTENSION_CLIP,
-                                             IMAGE_ALPHA_AUTO,
-                                             u_colorspace_raw);
-    }
+    point_density->handle = scene->image_manager->add_image(
+        new BlenderPointDensityLoader(b_depsgraph, b_point_density_node),
+        point_density->image_params());
+
+    b_point_density_node.cache_point_density(b_depsgraph);
     node = point_density;
 
     /* Transformation form world space to texture space.
@@ -1255,7 +1231,7 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
     Shader *shader;
 
     /* test if we need to sync */
-    if (shader_map.sync(&shader, b_mat) || shader->need_sync_object || update_all) {
+    if (shader_map.add_or_update(&shader, b_mat) || shader->need_sync_object || update_all) {
       ShaderGraph *graph = new ShaderGraph();
 
       shader->name = b_mat.name().c_str();
@@ -1284,6 +1260,7 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
       shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
       shader->volume_sampling_method = get_volume_sampling(cmat);
       shader->volume_interpolation_method = get_volume_interpolation(cmat);
+      shader->volume_step_rate = get_float(cmat, "volume_step_rate");
       shader->displacement_method = get_displacement_method(cmat);
 
       shader->set_graph(graph);
@@ -1348,6 +1325,7 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
       shader->heterogeneous_volume = !get_boolean(cworld, "homogeneous_volume");
       shader->volume_sampling_method = get_volume_sampling(cworld);
       shader->volume_interpolation_method = get_volume_interpolation(cworld);
+      shader->volume_step_rate = get_float(cworld, "volume_step_size");
     }
     else if (new_viewport_parameters.use_scene_world && b_world) {
       BackgroundNode *background = new BackgroundNode();
@@ -1480,7 +1458,7 @@ void BlenderSync::sync_lights(BL::Depsgraph &b_depsgraph, bool update_all)
     Shader *shader;
 
     /* test if we need to sync */
-    if (shader_map.sync(&shader, b_light) || update_all) {
+    if (shader_map.add_or_update(&shader, b_light) || update_all) {
       ShaderGraph *graph = new ShaderGraph();
 
       /* create nodes */
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 20dbe23cdb7..28a737c3341 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -16,6 +16,7 @@
 
 #include "render/background.h"
 #include "render/camera.h"
+#include "render/curves.h"
 #include "render/film.h"
 #include "render/graph.h"
 #include "render/integrator.h"
@@ -25,19 +26,18 @@
 #include "render/object.h"
 #include "render/scene.h"
 #include "render/shader.h"
-#include "render/curves.h"
 
 #include "device/device.h"
 
 #include "blender/blender_device.h"
-#include "blender/blender_sync.h"
 #include "blender/blender_session.h"
+#include "blender/blender_sync.h"
 #include "blender/blender_util.h"
 
 #include "util/util_debug.h"
 #include "util/util_foreach.h"
-#include "util/util_opengl.h"
 #include "util/util_hash.h"
+#include "util/util_opengl.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -56,7 +56,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
       b_scene(b_scene),
       shader_map(&scene->shaders),
       object_map(&scene->objects),
-      mesh_map(&scene->meshes),
+      geometry_map(&scene->geometry),
       light_map(&scene->lights),
       particle_system_map(&scene->particle_systems),
       world_map(NULL),
@@ -108,10 +108,15 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
     }
 
     if (dicing_prop_changed) {
-      for (const pair<void *, Mesh *> &iter : mesh_map.key_to_scene_data()) {
-        Mesh *mesh = iter.second;
-        if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
-          mesh_map.set_recalc(iter.first);
+      for (const pair<GeometryKey, Geometry *> &iter : geometry_map.key_to_scene_data()) {
+        Geometry *geom = iter.second;
+        if (geom->type == Geometry::MESH) {
+          Mesh *mesh = static_cast<Mesh *>(geom);
+          if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
+            PointerRNA id_ptr;
+            RNA_id_pointer_create((::ID *)iter.first.id, &id_ptr);
+            geometry_map.set_recalc(BL::ID(id_ptr));
+          }
         }
       }
     }
@@ -146,7 +151,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
         if (updated_geometry ||
             (object_subdivision_type(b_ob, preview, experimental) != Mesh::SUBDIVISION_NONE)) {
           BL::ID key = BKE_object_is_modified(b_ob) ? b_ob : b_ob.data();
-          mesh_map.set_recalc(key);
+          geometry_map.set_recalc(key);
         }
       }
       else if (object_is_light(b_ob)) {
@@ -164,7 +169,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
     /* Mesh */
     else if (b_id.is_a(&RNA_Mesh)) {
       BL::Mesh b_mesh(b_id);
-      mesh_map.set_recalc(b_mesh);
+      geometry_map.set_recalc(b_mesh);
     }
     /* World */
     else if (b_id.is_a(&RNA_World)) {
@@ -173,6 +178,11 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
         world_recalc = true;
       }
     }
+    /* Volume */
+    else if (b_id.is_a(&RNA_Volume)) {
+      BL::Volume b_volume(b_id);
+      geometry_map.set_recalc(b_volume);
+    }
   }
 
   BlenderViewportParameters new_viewport_parameters(b_v3d);
@@ -211,7 +221,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
   sync_images();
   sync_curve_settings();
 
-  mesh_synced.clear(); /* use for objects and motion sync */
+  geometry_synced.clear(); /* use for objects and motion sync */
 
   if (scene->need_motion() == Scene::MOTION_PASS || scene->need_motion() == Scene::MOTION_NONE ||
       scene->camera->motion_position == Camera::MOTION_POSITION_CENTER) {
@@ -219,7 +229,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
   }
   sync_motion(b_render, b_depsgraph, b_v3d, b_override, width, height, python_thread_state);
 
-  mesh_synced.clear();
+  geometry_synced.clear();
 
   /* Shader sync done at the end, since object sync uses it.
    * false = don't delete unused shaders, not supported. */
@@ -252,7 +262,8 @@ void BlenderSync::sync_integrator()
   integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces");
 
   integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
-  integrator->volume_step_size = get_float(cscene, "volume_step_size");
+  integrator->volume_step_rate = (preview) ? get_float(cscene, "volume_preview_step_rate") :
+                                             get_float(cscene, "volume_step_rate");
 
   integrator->caustics_reflective = get_boolean(cscene, "caustics_reflective");
   integrator->caustics_refractive = get_boolean(cscene, "caustics_refractive");
@@ -291,6 +302,16 @@ void BlenderSync::sync_integrator()
   integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
   integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");
 
+  if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
+    integrator->sampling_pattern = SAMPLING_PATTERN_PMJ;
+    integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
+    integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold");
+  }
+  else {
+    integrator->adaptive_min_samples = INT_MAX;
+    integrator->adaptive_threshold = 0.0f;
+  }
+
   int diffuse_samples = get_int(cscene, "diffuse_samples");
   int glossy_samples = get_int(cscene, "glossy_samples");
   int transmission_samples = get_int(cscene, "transmission_samples");
@@ -307,6 +328,8 @@ void BlenderSync::sync_integrator()
     integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples;
     integrator->subsurface_samples = subsurface_samples * subsurface_samples;
     integrator->volume_samples = volume_samples * volume_samples;
+    const int n = integrator->adaptive_min_samples; /* Saturating square: n * n can overflow. */
+    integrator->adaptive_min_samples = (n > 0 && n > INT_MAX / n) ? INT_MAX : n * n;
   }
   else {
     integrator->diffuse_samples = diffuse_samples;
@@ -388,6 +411,7 @@ void BlenderSync::sync_view_layer(BL::SpaceView3D & /*b_v3d*/, BL::ViewLayer &b_
   view_layer.use_background_ao = b_view_layer.use_ao();
   view_layer.use_surfaces = b_view_layer.use_solid();
   view_layer.use_hair = b_view_layer.use_strand();
+  view_layer.use_volumes = b_view_layer.use_volumes();
 
   /* Material override. */
   view_layer.material_override = b_view_layer.material_override();
@@ -456,19 +480,16 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
   MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
   MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
   MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
-  MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT);
   MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT);
 
   MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
   MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
   MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
-  MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT);
   MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT);
 
   MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
   MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
   MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
-  MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR);
 
   MAP_PASS("Emit", PASS_EMISSION);
   MAP_PASS("Env", PASS_BACKGROUND);
@@ -482,6 +503,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
   MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
 #endif
   MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+  MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
+  MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
   if (string_startswith(name, cryptomatte_prefix)) {
     return PASS_CRYPTOMATTE;
   }
@@ -517,7 +540,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
   return -1;
 }
 
-vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
+vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
+                                             BL::ViewLayer &b_view_layer,
+                                             bool adaptive_sampling)
 {
   vector<Pass> passes;
 
@@ -551,8 +576,6 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
       MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
       MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
       MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
-      MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR);
-      MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND);
 #undef MAP_OPTION
     }
     b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
@@ -595,6 +618,10 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
     b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
   }
+  if (get_boolean(crp, "pass_debug_sample_count")) {
+    b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
+    Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
+  }
   if (get_boolean(crp, "use_pass_volume_direct")) {
     b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
     Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
@@ -606,12 +633,12 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
 
   /* Cryptomatte stores two ID/weight pairs per RGBA layer.
    * User facing parameter is the number of pairs. */
-  int crypto_depth = min(16, get_int(crp, "pass_crypto_depth")) / 2;
+  int crypto_depth = min(16, get_int(crp, "pass_crypto_depth"));
   scene->film->cryptomatte_depth = crypto_depth;
   scene->film->cryptomatte_passes = CRYPT_NONE;
   if (get_boolean(crp, "use_pass_crypto_object")) {
-    for (int i = 0; i < crypto_depth; ++i) {
-      string passname = cryptomatte_prefix + string_printf("Object%02d", i);
+    for (int i = 0; i < crypto_depth; i += 2) {
+      string passname = cryptomatte_prefix + string_printf("Object%02d", i / 2);
       b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
       Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
     }
@@ -619,8 +646,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
                                                         CRYPT_OBJECT);
   }
   if (get_boolean(crp, "use_pass_crypto_material")) {
-    for (int i = 0; i < crypto_depth; ++i) {
-      string passname = cryptomatte_prefix + string_printf("Material%02d", i);
+    for (int i = 0; i < crypto_depth; i += 2) {
+      string passname = cryptomatte_prefix + string_printf("Material%02d", i / 2);
       b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
       Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
     }
@@ -628,8 +655,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
                                                         CRYPT_MATERIAL);
   }
   if (get_boolean(crp, "use_pass_crypto_asset")) {
-    for (int i = 0; i < crypto_depth; ++i) {
-      string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
+    for (int i = 0; i < crypto_depth; i += 2) {
+      string passname = cryptomatte_prefix + string_printf("Asset%02d", i / 2);
       b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
       Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
     }
@@ -641,6 +668,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
                                                         CRYPT_ACCURATE);
   }
 
+  if (adaptive_sampling) { /* Added without b_engine.add_pass(), unlike the passes above. */
+    Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
+    if (!get_boolean(crp, "pass_debug_sample_count")) { /* Else added above as debug pass. */
+      Pass::add(PASS_SAMPLE_COUNT, passes);
+    }
+  }
+
   RNA_BEGIN (&crp, b_aov, "aovs") {
     bool is_color = (get_enum(b_aov, "type") == 1);
     string name = get_string(b_aov, "name");
@@ -832,6 +866,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
 
   /* other parameters */
   params.start_resolution = get_int(cscene, "preview_start_resolution");
+  params.denoising_start_sample = get_int(cscene, "preview_denoising_start_sample");
   params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
 
   /* other parameters */
@@ -841,20 +876,10 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
 
   /* progressive refine */
   BL::RenderSettings b_r = b_scene.render();
-  params.progressive_refine = (b_engine.is_preview() ||
-                               get_boolean(cscene, "use_progressive_refine")) &&
-                              !b_r.use_save_buffers();
-
-  if (params.progressive_refine) {
-    BL::Scene::view_layers_iterator b_view_layer;
-    for (b_scene.view_layers.begin(b_view_layer); b_view_layer != b_scene.view_layers.end();
-         ++b_view_layer) {
-      PointerRNA crl = RNA_pointer_get(&b_view_layer->ptr, "cycles");
-      if (get_boolean(crl, "use_denoising")) {
-        params.progressive_refine = false;
-      }
-    }
-  }
+  params.progressive_refine = b_engine.is_preview() ||
+                              get_boolean(cscene, "use_progressive_refine");
+  if (b_r.use_save_buffers())
+    params.progressive_refine = false;
 
   if (background) {
     if (params.progressive_refine)
@@ -889,6 +914,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
   params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
                          BlenderSession::print_render_stats;
 
+  params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
+
   return params;
 }
 
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index a80f484fb92..650b4f5bb4e 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -18,11 +18,11 @@
 #define __BLENDER_SYNC_H__
 
 #include "MEM_guardedalloc.h"
-#include "RNA_types.h"
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
+#include "RNA_types.h"
 
-#include "blender/blender_util.h"
+#include "blender/blender_id_map.h"
 #include "blender/blender_viewport.h"
 
 #include "render/scene.h"
@@ -40,6 +40,7 @@ class BlenderObjectCulling;
 class BlenderViewportParameters;
 class Camera;
 class Film;
+class Hair;
 class Light;
 class Mesh;
 class Object;
@@ -70,7 +71,9 @@ class BlenderSync {
                  int height,
                  void **python_thread_state);
   void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer);
-  vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+  vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer,
+                                  BL::ViewLayer &b_view_layer,
+                                  bool adaptive_sampling);
   void sync_integrator();
   void sync_camera(BL::RenderSettings &b_render,
                    BL::Object &b_override,
@@ -94,7 +97,8 @@ class BlenderSync {
                                           BL::Scene &b_scene,
                                           bool background);
   static bool get_session_pause(BL::Scene &b_scene, bool background);
-  static BufferParams get_buffer_params(BL::RenderSettings &b_render,
+  static BufferParams get_buffer_params(BL::Scene &b_scene,
+                                        BL::RenderSettings &b_render,
                                         BL::SpaceView3D &b_v3d,
                                         BL::RegionView3D &b_rv3d,
                                         Camera *cam,
@@ -118,28 +122,64 @@ class BlenderSync {
                    void **python_thread_state);
   void sync_film(BL::SpaceView3D &b_v3d);
   void sync_view();
+
+  /* Shader */
   void sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all);
   void sync_shaders(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d);
-  void sync_curve_settings();
-
   void sync_nodes(Shader *shader, BL::ShaderNodeTree &b_ntree);
-  Mesh *sync_mesh(BL::Depsgraph &b_depsgrpah,
-                  BL::Object &b_ob,
-                  BL::Object &b_ob_instance,
-                  bool object_updated,
-                  bool show_self,
-                  bool show_particles);
-  void sync_curves(
-      Mesh *mesh, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0);
+
+  /* Object */
   Object *sync_object(BL::Depsgraph &b_depsgraph,
                       BL::ViewLayer &b_view_layer,
                       BL::DepsgraphObjectInstance &b_instance,
                       float motion_time,
-                      bool show_self,
-                      bool show_particles,
+                      bool use_particle_hair,
                       bool show_lights,
                       BlenderObjectCulling &culling,
                       bool *use_portal);
+
+  /* Volume */
+  void sync_volume(BL::Object &b_ob, Mesh *mesh, const vector<Shader *> &used_shaders);
+
+  /* Mesh */
+  void sync_mesh(BL::Depsgraph b_depsgraph,
+                 BL::Object b_ob,
+                 Mesh *mesh,
+                 const vector<Shader *> &used_shaders);
+  void sync_mesh_motion(BL::Depsgraph b_depsgraph, BL::Object b_ob, Mesh *mesh, int motion_step);
+
+  /* Hair */
+  void sync_hair(BL::Depsgraph b_depsgraph,
+                 BL::Object b_ob,
+                 Geometry *geom,
+                 const vector<Shader *> &used_shaders);
+  void sync_hair_motion(BL::Depsgraph b_depsgraph,
+                        BL::Object b_ob,
+                        Geometry *geom,
+                        int motion_step);
+  void sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step = 0);
+  void sync_particle_hair(
+      Geometry *geom, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0);
+  void sync_curve_settings();
+  bool object_has_particle_hair(BL::Object b_ob);
+
+  /* Camera */
+  void sync_camera_motion(
+      BL::RenderSettings &b_render, BL::Object &b_ob, int width, int height, float motion_time);
+
+  /* Geometry */
+  Geometry *sync_geometry(BL::Depsgraph &b_depsgraph,
+                          BL::Object &b_ob,
+                          BL::Object &b_ob_instance,
+                          bool object_updated,
+                          bool use_particle_hair);
+  void sync_geometry_motion(BL::Depsgraph &b_depsgraph,
+                            BL::Object &b_ob,
+                            Object *object,
+                            float motion_time,
+                            bool use_particle_hair);
+
+  /* Light */
   void sync_light(BL::Object &b_parent,
                   int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
                   BL::Object &b_ob,
@@ -148,14 +188,8 @@ class BlenderSync {
                   Transform &tfm,
                   bool *use_portal);
   void sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal);
-  void sync_mesh_motion(BL::Depsgraph &b_depsgraph,
-                        BL::Object &b_ob,
-                        Object *object,
-                        float motion_time);
-  void sync_camera_motion(
-      BL::RenderSettings &b_render, BL::Object &b_ob, int width, int height, float motion_time);
 
-  /* particles */
+  /* Particles */
   bool sync_dupli_particle(BL::Object &b_ob,
                            BL::DepsgraphObjectInstance &b_instance,
                            Object *object);
@@ -179,11 +213,11 @@ class BlenderSync {
 
   id_map<void *, Shader> shader_map;
   id_map<ObjectKey, Object> object_map;
-  id_map<void *, Mesh> mesh_map;
+  id_map<GeometryKey, Geometry> geometry_map;
   id_map<ObjectKey, Light> light_map;
   id_map<ParticleSystemKey, ParticleSystem> particle_system_map;
-  set<Mesh *> mesh_synced;
-  set<Mesh *> mesh_motion_synced;
+  set<Geometry *> geometry_synced;
+  set<Geometry *> geometry_motion_synced;
   set<float> motion_times;
   void *world_map;
   bool world_recalc;
@@ -203,6 +237,7 @@ class BlenderSync {
           use_background_ao(true),
           use_surfaces(true),
           use_hair(true),
+          use_volumes(true),
           samples(0),
           bound_samples(false)
     {
@@ -214,6 +249,7 @@ class BlenderSync {
     bool use_background_ao;
     bool use_surfaces;
     bool use_hair;
+    bool use_volumes;
     int samples;
     bool bound_samples;
   } view_layer;
diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h
index 896bf62da70..8ab061aaed9 100644
--- a/intern/cycles/blender/blender_texture.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -17,8 +17,8 @@
 #ifndef __BLENDER_TEXTURE_H__
 #define __BLENDER_TEXTURE_H__
 
-#include <stdlib.h>
 #include "blender/blender_sync.h"
+#include <stdlib.h>
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index bea30a20b8c..ad90a5f8d52 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -483,7 +483,9 @@ static inline void mesh_texture_space(BL::Mesh &b_mesh, float3 &loc, float3 &siz
 }
 
 /* Object motion steps, returns 0 if no motion blur needed. */
-static inline uint object_motion_steps(BL::Object &b_parent, BL::Object &b_ob)
+static inline uint object_motion_steps(BL::Object &b_parent,
+                                       BL::Object &b_ob,
+                                       const int max_steps = INT_MAX)
 {
   /* Get motion enabled and steps from object itself. */
   PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
@@ -492,7 +494,7 @@ static inline uint object_motion_steps(BL::Object &b_parent, BL::Object &b_ob)
     return 0;
   }
 
-  uint steps = max(1, get_int(cobject, "motion_steps"));
+  int steps = max(1, get_int(cobject, "motion_steps"));
 
   /* Also check parent object, so motion blur and steps can be
    * controlled by dupligroup duplicator for linked groups. */
@@ -510,7 +512,7 @@ static inline uint object_motion_steps(BL::Object &b_parent, BL::Object &b_ob)
   /* Use uneven number of steps so we get one keyframe at the current frame,
    * and use 2^(steps - 1) so objects with more/fewer steps still have samples
    * at the same times, to avoid sampling at many different times. */
-  return (2 << (steps - 1)) + 1;
+  return min((2 << (steps - 1)) + 1, max_steps);
 }
 
 /* object uses deformation motion blur */
@@ -531,7 +533,7 @@ static inline bool object_use_deform_motion(BL::Object &b_parent, BL::Object &b_
   return use_deform_motion;
 }
 
-static inline BL::FluidDomainSettings object_fluid_domain_find(BL::Object &b_ob)
+static inline BL::FluidDomainSettings object_fluid_liquid_domain_find(BL::Object &b_ob)
 {
   BL::Object::modifiers_iterator b_mod;
 
@@ -539,8 +541,28 @@ static inline BL::FluidDomainSettings object_fluid_domain_find(BL::Object &b_ob)
     if (b_mod->is_a(&RNA_FluidModifier)) {
       BL::FluidModifier b_mmd(*b_mod);
 
-      if (b_mmd.fluid_type() == BL::FluidModifier::fluid_type_DOMAIN)
+      if (b_mmd.fluid_type() == BL::FluidModifier::fluid_type_DOMAIN &&
+          b_mmd.domain_settings().domain_type() == BL::FluidDomainSettings::domain_type_LIQUID) {
         return b_mmd.domain_settings();
+      }
+    }
+  }
+
+  return BL::FluidDomainSettings(PointerRNA_NULL);
+}
+
+static inline BL::FluidDomainSettings object_fluid_gas_domain_find(BL::Object &b_ob)
+{
+  BL::Object::modifiers_iterator b_mod;
+
+  for (b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
+    if (b_mod->is_a(&RNA_FluidModifier)) {
+      BL::FluidModifier b_mmd(*b_mod);
+
+      if (b_mmd.fluid_type() == BL::FluidModifier::fluid_type_DOMAIN &&
+          b_mmd.domain_settings().domain_type() == BL::FluidDomainSettings::domain_type_GAS) {
+        return b_mmd.domain_settings();
+      }
     }
   }
 
@@ -573,209 +595,20 @@ static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob,
   return Mesh::SUBDIVISION_NONE;
 }
 
-/* ID Map
- *
- * Utility class to keep in sync with blender data.
- * Used for objects, meshes, lights and shaders. */
-
-template<typename K, typename T> class id_map {
- public:
-  id_map(vector<T *> *scene_data_)
-  {
-    scene_data = scene_data_;
-  }
-
-  T *find(const BL::ID &id)
-  {
-    return find(id.ptr.owner_id);
-  }
-
-  T *find(const K &key)
-  {
-    if (b_map.find(key) != b_map.end()) {
-      T *data = b_map[key];
-      return data;
-    }
-
-    return NULL;
-  }
-
-  void set_recalc(const BL::ID &id)
-  {
-    b_recalc.insert(id.ptr.data);
-  }
-
-  void set_recalc(void *id_ptr)
-  {
-    b_recalc.insert(id_ptr);
-  }
-
-  bool has_recalc()
-  {
-    return !(b_recalc.empty());
-  }
-
-  void pre_sync()
-  {
-    used_set.clear();
-  }
-
-  bool sync(T **r_data, const BL::ID &id)
-  {
-    return sync(r_data, id, id, id.ptr.owner_id);
-  }
-
-  bool sync(T **r_data, const BL::ID &id, const BL::ID &parent, const K &key)
-  {
-    T *data = find(key);
-    bool recalc;
-
-    if (!data) {
-      /* add data if it didn't exist yet */
-      data = new T();
-      scene_data->push_back(data);
-      b_map[key] = data;
-      recalc = true;
-    }
-    else {
-      recalc = (b_recalc.find(id.ptr.data) != b_recalc.end());
-      if (parent.ptr.data)
-        recalc = recalc || (b_recalc.find(parent.ptr.data) != b_recalc.end());
-    }
-
-    used(data);
-
-    *r_data = data;
-    return recalc;
-  }
-
-  bool is_used(const K &key)
-  {
-    T *data = find(key);
-    return (data) ? used_set.find(data) != used_set.end() : false;
-  }
-
-  void used(T *data)
-  {
-    /* tag data as still in use */
-    used_set.insert(data);
-  }
-
-  void set_default(T *data)
-  {
-    b_map[NULL] = data;
-  }
-
-  bool post_sync(bool do_delete = true)
-  {
-    /* remove unused data */
-    vector<T *> new_scene_data;
-    typename vector<T *>::iterator it;
-    bool deleted = false;
-
-    for (it = scene_data->begin(); it != scene_data->end(); it++) {
-      T *data = *it;
-
-      if (do_delete && used_set.find(data) == used_set.end()) {
-        delete data;
-        deleted = true;
-      }
-      else
-        new_scene_data.push_back(data);
-    }
-
-    *scene_data = new_scene_data;
-
-    /* update mapping */
-    map<K, T *> new_map;
-    typedef pair<const K, T *> TMapPair;
-    typename map<K, T *>::iterator jt;
-
-    for (jt = b_map.begin(); jt != b_map.end(); jt++) {
-      TMapPair &pair = *jt;
-
-      if (used_set.find(pair.second) != used_set.end())
-        new_map[pair.first] = pair.second;
-    }
-
-    used_set.clear();
-    b_recalc.clear();
-    b_map = new_map;
-
-    return deleted;
-  }
-
-  const map<K, T *> &key_to_scene_data()
-  {
-    return b_map;
-  }
-
- protected:
-  vector<T *> *scene_data;
-  map<K, T *> b_map;
-  set<T *> used_set;
-  set<void *> b_recalc;
-};
-
-/* Object Key */
-
-enum { OBJECT_PERSISTENT_ID_SIZE = 16 };
-
-struct ObjectKey {
-  void *parent;
-  int id[OBJECT_PERSISTENT_ID_SIZE];
-  void *ob;
-
-  ObjectKey(void *parent_, int id_[OBJECT_PERSISTENT_ID_SIZE], void *ob_)
-      : parent(parent_), ob(ob_)
-  {
-    if (id_)
-      memcpy(id, id_, sizeof(id));
-    else
-      memset(id, 0, sizeof(id));
-  }
-
-  bool operator<(const ObjectKey &k) const
-  {
-    if (ob < k.ob) {
-      return true;
-    }
-    else if (ob == k.ob) {
-      if (parent < k.parent)
-        return true;
-      else if (parent == k.parent)
-        return memcmp(id, k.id, sizeof(id)) < 0;
-    }
-
-    return false;
-  }
-};
-
-/* Particle System Key */
-
-struct ParticleSystemKey {
-  void *ob;
-  int id[OBJECT_PERSISTENT_ID_SIZE];
-
-  ParticleSystemKey(void *ob_, int id_[OBJECT_PERSISTENT_ID_SIZE]) : ob(ob_)
-  {
-    if (id_)
-      memcpy(id, id_, sizeof(id));
-    else
-      memset(id, 0, sizeof(id));
-  }
+static inline uint object_ray_visibility(BL::Object &b_ob)
+{
+  PointerRNA cvisibility = RNA_pointer_get(&b_ob.ptr, "cycles_visibility");
+  uint flag = 0;
 
-  bool operator<(const ParticleSystemKey &k) const
-  {
-    /* first id is particle index, we don't compare that */
-    if (ob < k.ob)
-      return true;
-    else if (ob == k.ob)
-      return memcmp(id + 1, k.id + 1, sizeof(int) * (OBJECT_PERSISTENT_ID_SIZE - 1)) < 0;
+  flag |= get_boolean(cvisibility, "camera") ? PATH_RAY_CAMERA : 0;
+  flag |= get_boolean(cvisibility, "diffuse") ? PATH_RAY_DIFFUSE : 0;
+  flag |= get_boolean(cvisibility, "glossy") ? PATH_RAY_GLOSSY : 0;
+  flag |= get_boolean(cvisibility, "transmission") ? PATH_RAY_TRANSMIT : 0;
+  flag |= get_boolean(cvisibility, "shadow") ? PATH_RAY_SHADOW : 0;
+  flag |= get_boolean(cvisibility, "scatter") ? PATH_RAY_VOLUME_SCATTER : 0;
 
-    return false;
-  }
-};
+  return flag;
+}
 
 class EdgeMap {
  public:
diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp
index 73ef5f94720..93e84e28032 100644
--- a/intern/cycles/blender/blender_viewport.cpp
+++ b/intern/cycles/blender/blender_viewport.cpp
@@ -61,6 +61,17 @@ const bool BlenderViewportParameters::custom_viewport_parameters() const
   return !(use_scene_world && use_scene_lights);
 }
 
+bool BlenderViewportParameters::get_viewport_display_denoising(BL::SpaceView3D &b_v3d,
+                                                               BL::Scene &b_scene)
+{
+  /* Without a 3D viewport, display denoising is always reported as disabled. */
+  if (!b_v3d) {
+    return false;
+  }
+  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+  return get_enum(cscene, "preview_denoising") != 0;
+}
+
 PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceView3D &b_v3d)
 {
   PassType display_pass = PASS_NONE;
@@ -72,6 +83,11 @@ PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceVi
   return display_pass;
 }
 
+bool update_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene)
+{
+  return BlenderViewportParameters::get_viewport_display_denoising(b_v3d, b_scene);
+}
+
 PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes)
 {
   if (b_v3d) {
diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h
index f26d0d38115..3e44e552f1d 100644
--- a/intern/cycles/blender/blender_viewport.h
+++ b/intern/cycles/blender/blender_viewport.h
@@ -18,9 +18,9 @@
 #define __BLENDER_VIEWPORT_H__
 
 #include "MEM_guardedalloc.h"
-#include "RNA_types.h"
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
+#include "RNA_types.h"
 
 #include "render/film.h"
 #include "util/util_param.h"
@@ -44,11 +44,15 @@ class BlenderViewportParameters {
   friend class BlenderSync;
 
  public:
+  /* Get whether to enable denoising data pass in viewport. */
+  static bool get_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene);
   /* Retrieve the render pass that needs to be displayed on the given `SpaceView3D`
    * When the `b_v3d` parameter is not given `PASS_NONE` will be returned. */
   static PassType get_viewport_display_render_pass(BL::SpaceView3D &b_v3d);
 };
 
+bool update_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene);
+
 PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_volume.cpp b/intern/cycles/blender/blender_volume.cpp
new file mode 100644
index 00000000000..6254a1a1b24
--- /dev/null
+++ b/intern/cycles/blender/blender_volume.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/colorspace.h"
+#include "render/image.h"
+#include "render/image_vdb.h"
+#include "render/mesh.h"
+#include "render/object.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#ifdef WITH_OPENVDB
+#  include <openvdb/openvdb.h>
+openvdb::GridBase::ConstPtr BKE_volume_grid_openvdb_for_read(const struct Volume *volume,
+                                                             struct VolumeGrid *grid);
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* TODO: verify this is not loading unnecessary attributes. */
+class BlenderSmokeLoader : public ImageLoader {
+ public:
+  BlenderSmokeLoader(const BL::Object &b_ob, AttributeStandard attribute)
+      : b_ob(b_ob), attribute(attribute)
+  {
+  }
+
+  bool load_metadata(ImageMetaData &metadata) override
+  {
+    BL::FluidDomainSettings b_domain = object_fluid_gas_domain_find(b_ob);
+
+    if (!b_domain) {
+      return false;
+    }
+
+    if (attribute == ATTR_STD_VOLUME_DENSITY || attribute == ATTR_STD_VOLUME_FLAME ||
+        attribute == ATTR_STD_VOLUME_HEAT || attribute == ATTR_STD_VOLUME_TEMPERATURE) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT;
+      metadata.channels = 1;
+    }
+    else if (attribute == ATTR_STD_VOLUME_COLOR) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+      metadata.channels = 4;
+    }
+    else if (attribute == ATTR_STD_VOLUME_VELOCITY) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+      metadata.channels = 3;
+    }
+    else {
+      return false;
+    }
+
+    int3 resolution = get_int3(b_domain.domain_resolution());
+    int amplify = (b_domain.use_noise()) ? b_domain.noise_scale() : 1;
+
+    /* Velocity and heat data is always low-resolution. */
+    if (attribute == ATTR_STD_VOLUME_VELOCITY || attribute == ATTR_STD_VOLUME_HEAT) {
+      amplify = 1;
+    }
+
+    metadata.width = resolution.x * amplify;
+    metadata.height = resolution.y * amplify;
+    metadata.depth = resolution.z * amplify;
+
+    /* Create a matrix to transform from object space to mesh texture space.
+     * This does not work with deformations but that can probably only be done
+     * well with a volume grid mapping of coordinates. */
+    BL::Mesh b_mesh(b_ob.data());
+    float3 loc, size;
+    mesh_texture_space(b_mesh, loc, size);
+    metadata.transform_3d = transform_translate(-loc) * transform_scale(size);
+    metadata.use_transform_3d = true;
+
+    return true;
+  }
+
+  bool load_pixels(const ImageMetaData &, void *pixels, const size_t, const bool) override
+  {
+    /* smoke volume data */
+    BL::FluidDomainSettings b_domain = object_fluid_gas_domain_find(b_ob);
+
+    if (!b_domain) {
+      return false;
+    }
+#ifdef WITH_FLUID
+    int3 resolution = get_int3(b_domain.domain_resolution());
+    int length, amplify = (b_domain.use_noise()) ? b_domain.noise_scale() : 1;
+
+    /* Velocity and heat data is always low-resolution. */
+    if (attribute == ATTR_STD_VOLUME_VELOCITY || attribute == ATTR_STD_VOLUME_HEAT) {
+      amplify = 1;
+    }
+
+    const int width = resolution.x * amplify;
+    const int height = resolution.y * amplify;
+    const int depth = resolution.z * amplify;
+    const size_t num_pixels = ((size_t)width) * height * depth;
+
+    float *fpixels = (float *)pixels;
+
+    if (attribute == ATTR_STD_VOLUME_DENSITY) {
+      FluidDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
+      if (length == num_pixels) {
+        FluidDomainSettings_density_grid_get(&b_domain.ptr, fpixels);
+        return true;
+      }
+    }
+    else if (attribute == ATTR_STD_VOLUME_FLAME) {
+      /* this is in range 0..1, and interpreted by the OpenGL smoke viewer
+       * as 1500..3000 K with the first part faded to zero density */
+      FluidDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
+      if (length == num_pixels) {
+        FluidDomainSettings_flame_grid_get(&b_domain.ptr, fpixels);
+        return true;
+      }
+    }
+    else if (attribute == ATTR_STD_VOLUME_COLOR) {
+      /* the RGB is "premultiplied" by density for better interpolation results */
+      FluidDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
+      if (length == num_pixels * 4) {
+        FluidDomainSettings_color_grid_get(&b_domain.ptr, fpixels);
+        return true;
+      }
+    }
+    else if (attribute == ATTR_STD_VOLUME_VELOCITY) {
+      FluidDomainSettings_velocity_grid_get_length(&b_domain.ptr, &length);
+      if (length == num_pixels * 3) {
+        FluidDomainSettings_velocity_grid_get(&b_domain.ptr, fpixels);
+        return true;
+      }
+    }
+    else if (attribute == ATTR_STD_VOLUME_HEAT) {
+      FluidDomainSettings_heat_grid_get_length(&b_domain.ptr, &length);
+      if (length == num_pixels) {
+        FluidDomainSettings_heat_grid_get(&b_domain.ptr, fpixels);
+        return true;
+      }
+    }
+    else if (attribute == ATTR_STD_VOLUME_TEMPERATURE) {
+      FluidDomainSettings_temperature_grid_get_length(&b_domain.ptr, &length);
+      if (length == num_pixels) {
+        FluidDomainSettings_temperature_grid_get(&b_domain.ptr, fpixels);
+        return true;
+      }
+    }
+    else {
+      fprintf(stderr,
+              "Cycles error: unknown volume attribute %s, skipping\n",
+              Attribute::standard_name(attribute));
+      fpixels[0] = 0.0f;
+      return false;
+    }
+#else
+    (void)pixels;
+#endif
+    fprintf(stderr, "Cycles error: unexpected smoke volume resolution, skipping\n");
+    return false;
+  }
+
+  string name() const override
+  {
+    return Attribute::standard_name(attribute);
+  }
+
+  bool equals(const ImageLoader &other) const override
+  {
+    const BlenderSmokeLoader &other_loader = (const BlenderSmokeLoader &)other;
+    return b_ob == other_loader.b_ob && attribute == other_loader.attribute;
+  }
+
+  BL::Object b_ob;
+  AttributeStandard attribute;
+};
+
+static void sync_smoke_volume(Scene *scene, BL::Object &b_ob, Mesh *mesh, float frame)
+{
+  BL::FluidDomainSettings b_domain = object_fluid_gas_domain_find(b_ob);
+  if (!b_domain) {
+    return;
+  }
+
+  AttributeStandard attributes[] = {ATTR_STD_VOLUME_DENSITY,
+                                    ATTR_STD_VOLUME_COLOR,
+                                    ATTR_STD_VOLUME_FLAME,
+                                    ATTR_STD_VOLUME_HEAT,
+                                    ATTR_STD_VOLUME_TEMPERATURE,
+                                    ATTR_STD_VOLUME_VELOCITY,
+                                    ATTR_STD_NONE};
+
+  for (int i = 0; attributes[i] != ATTR_STD_NONE; i++) {
+    AttributeStandard std = attributes[i];
+    if (!mesh->need_attribute(scene, std)) {
+      continue;
+    }
+
+    mesh->volume_clipping = b_domain.clipping();
+
+    Attribute *attr = mesh->attributes.add(std);
+
+    ImageLoader *loader = new BlenderSmokeLoader(b_ob, std);
+    ImageParams params;
+    params.frame = frame;
+
+    attr->data_voxel() = scene->image_manager->add_image(loader, params);
+  }
+}
+
+class BlenderVolumeLoader : public VDBImageLoader {
+ public:
+  BlenderVolumeLoader(BL::Volume b_volume, const string &grid_name)
+      : VDBImageLoader(grid_name),
+        b_volume(b_volume),
+        b_volume_grid(PointerRNA_NULL),
+        unload(false)
+  {
+#ifdef WITH_OPENVDB
+    /* Find grid with matching name. */
+    BL::Volume::grids_iterator b_grid_iter;
+    for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) {
+      if (b_grid_iter->name() == grid_name) {
+        b_volume_grid = *b_grid_iter;
+      }
+    }
+#endif
+  }
+
+  bool load_metadata(ImageMetaData &metadata) override
+  {
+    if (!b_volume_grid) {
+      return false;
+    }
+
+    unload = !b_volume_grid.is_loaded();
+
+#ifdef WITH_OPENVDB
+    Volume *volume = (Volume *)b_volume.ptr.data;
+    VolumeGrid *volume_grid = (VolumeGrid *)b_volume_grid.ptr.data;
+    grid = BKE_volume_grid_openvdb_for_read(volume, volume_grid);
+#endif
+
+    return VDBImageLoader::load_metadata(metadata);
+  }
+
+  bool load_pixels(const ImageMetaData &metadata,
+                   void *pixels,
+                   const size_t pixel_size,
+                   const bool associate_alpha) override
+  {
+    if (!b_volume_grid) {
+      return false;
+    }
+
+    return VDBImageLoader::load_pixels(metadata, pixels, pixel_size, associate_alpha);
+  }
+
+  bool equals(const ImageLoader &other) const override
+  {
+    /* TODO: detect multiple volume datablocks with the same filepath. */
+    const BlenderVolumeLoader &other_loader = (const BlenderVolumeLoader &)other;
+    return b_volume == other_loader.b_volume && b_volume_grid == other_loader.b_volume_grid;
+  }
+
+  void cleanup() override
+  {
+    VDBImageLoader::cleanup();
+    if (b_volume_grid && unload) {
+      b_volume_grid.unload();
+    }
+  }
+
+  BL::Volume b_volume;
+  BL::VolumeGrid b_volume_grid;
+  bool unload;
+};
+
+static void sync_volume_object(BL::BlendData &b_data, BL::Object &b_ob, Scene *scene, Mesh *mesh)
+{
+  BL::Volume b_volume(b_ob.data());
+  b_volume.grids.load(b_data.ptr.data);
+
+  BL::VolumeRender b_render(b_volume.render());
+
+  mesh->volume_clipping = b_render.clipping();
+  mesh->volume_step_size = b_render.step_size();
+  mesh->volume_object_space = (b_render.space() == BL::VolumeRender::space_OBJECT);
+
+  /* Add a voxel attribute for each grid that the mesh needs as an attribute. */
+  BL::Volume::grids_iterator b_grid_iter;
+  for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) {
+    BL::VolumeGrid b_grid = *b_grid_iter;
+    ustring name = ustring(b_grid.name());
+    AttributeStandard std = ATTR_STD_NONE;
+
+    if (name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
+      std = ATTR_STD_VOLUME_DENSITY;
+    }
+    else if (name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR)) {
+      std = ATTR_STD_VOLUME_COLOR;
+    }
+    else if (name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME)) {
+      std = ATTR_STD_VOLUME_FLAME;
+    }
+    else if (name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) {
+      std = ATTR_STD_VOLUME_HEAT;
+    }
+    else if (name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) {
+      std = ATTR_STD_VOLUME_TEMPERATURE;
+    }
+    else if (name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY)) {
+      std = ATTR_STD_VOLUME_VELOCITY;
+    }
+
+    if ((std != ATTR_STD_NONE && mesh->need_attribute(scene, std)) ||
+        mesh->need_attribute(scene, name)) {
+      Attribute *attr = (std != ATTR_STD_NONE) ?
+                            mesh->attributes.add(std) :
+                            mesh->attributes.add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL);
+
+      ImageLoader *loader = new BlenderVolumeLoader(b_volume, name.string());
+      ImageParams params;
+      params.frame = b_volume.grids.frame();
+
+      attr->data_voxel() = scene->image_manager->add_image(loader, params);
+    }
+  }
+}
+
+/* Image slots of all voxel attributes; when these change the bounding mesh must be rebuilt. */
+static vector<int> get_voxel_image_slots(Mesh *mesh)
+{
+  vector<int> voxel_slots;
+  for (const Attribute &attribute : mesh->attributes.attributes) {
+    if (attribute.element != ATTR_ELEMENT_VOXEL) {
+      continue;
+    }
+    voxel_slots.push_back(attribute.data_voxel().svm_slot());
+  }
+  return voxel_slots;
+}
+
+void BlenderSync::sync_volume(BL::Object &b_ob, Mesh *mesh, const vector<Shader *> &used_shaders)
+{
+  vector<int> old_voxel_slots = get_voxel_image_slots(mesh);
+
+  mesh->clear();
+  mesh->used_shaders = used_shaders;
+
+  if (view_layer.use_volumes) {
+    if (b_ob.type() == BL::Object::type_VOLUME) {
+      /* Volume object. Create only attributes, bounding mesh will then
+       * be automatically generated later. */
+      sync_volume_object(b_data, b_ob, scene, mesh);
+    }
+    else {
+      /* Any other object type is synced as a smoke (fluid gas) domain. */
+      sync_smoke_volume(scene, b_ob, mesh, b_scene.frame_current());
+    }
+  }
+
+  /* Tag update, requesting a rebuild only when the voxel image slots changed. */
+  bool rebuild = (old_voxel_slots != get_voxel_image_slots(mesh));
+  mesh->tag_update(scene, rebuild);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 16c721da06a..e6502a40313 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -17,6 +17,7 @@
 
 #include "bvh/bvh.h"
 
+#include "render/hair.h"
 #include "render/mesh.h"
 #include "render/object.h"
 
@@ -99,31 +100,33 @@ int BVHStackEntry::encodeIdx() const
 
 /* BVH */
 
-BVH::BVH(const BVHParams &params_, const vector<Mesh *> &meshes_, const vector<Object *> &objects_)
-    : params(params_), meshes(meshes_), objects(objects_)
+BVH::BVH(const BVHParams &params_,
+         const vector<Geometry *> &geometry_,
+         const vector<Object *> &objects_)
+    : params(params_), geometry(geometry_), objects(objects_)
 {
 }
 
 BVH *BVH::create(const BVHParams &params,
-                 const vector<Mesh *> &meshes,
+                 const vector<Geometry *> &geometry,
                  const vector<Object *> &objects)
 {
   switch (params.bvh_layout) {
     case BVH_LAYOUT_BVH2:
-      return new BVH2(params, meshes, objects);
+      return new BVH2(params, geometry, objects);
     case BVH_LAYOUT_BVH4:
-      return new BVH4(params, meshes, objects);
+      return new BVH4(params, geometry, objects);
     case BVH_LAYOUT_BVH8:
-      return new BVH8(params, meshes, objects);
+      return new BVH8(params, geometry, objects);
     case BVH_LAYOUT_EMBREE:
 #ifdef WITH_EMBREE
-      return new BVHEmbree(params, meshes, objects);
+      return new BVHEmbree(params, geometry, objects);
 #else
       break;
 #endif
     case BVH_LAYOUT_OPTIX:
 #ifdef WITH_OPTIX
-      return new BVHOptiX(params, meshes, objects);
+      return new BVHOptiX(params, geometry, objects);
 #else
       break;
 #endif
@@ -217,36 +220,36 @@ void BVH::refit_primitives(int start, int end, BoundBox &bbox, uint &visibility)
     }
     else {
       /* Primitives. */
-      const Mesh *mesh = ob->mesh;
-
       if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
         /* Curves. */
-        int str_offset = (params.top_level) ? mesh->curve_offset : 0;
-        Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+        const Hair *hair = static_cast<const Hair *>(ob->geometry);
+        int prim_offset = (params.top_level) ? hair->prim_offset : 0;
+        Hair::Curve curve = hair->get_curve(pidx - prim_offset);
         int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
 
-        curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+        curve.bounds_grow(k, &hair->curve_keys[0], &hair->curve_radius[0], bbox);
 
         visibility |= PATH_RAY_CURVE;
 
         /* Motion curves. */
-        if (mesh->use_motion_blur) {
-          Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+        if (hair->use_motion_blur) {
+          Attribute *attr = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
           if (attr) {
-            size_t mesh_size = mesh->curve_keys.size();
-            size_t steps = mesh->motion_steps - 1;
+            size_t hair_size = hair->curve_keys.size();
+            size_t steps = hair->motion_steps - 1;
             float3 *key_steps = attr->data_float3();
 
             for (size_t i = 0; i < steps; i++)
-              curve.bounds_grow(k, key_steps + i * mesh_size, &mesh->curve_radius[0], bbox);
+              curve.bounds_grow(k, key_steps + i * hair_size, &hair->curve_radius[0], bbox);
           }
         }
       }
       else {
         /* Triangles. */
-        int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
-        Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+        const Mesh *mesh = static_cast<const Mesh *>(ob->geometry);
+        int prim_offset = (params.top_level) ? mesh->prim_offset : 0;
+        Mesh::Triangle triangle = mesh->get_triangle(pidx - prim_offset);
         const float3 *vpos = &mesh->verts[0];
 
         triangle.bounds_grow(vpos, bbox);
@@ -276,7 +279,7 @@ void BVH::pack_triangle(int idx, float4 tri_verts[3])
 {
   int tob = pack.prim_object[idx];
   assert(tob >= 0 && tob < objects.size());
-  const Mesh *mesh = objects[tob]->mesh;
+  const Mesh *mesh = static_cast<const Mesh *>(objects[tob]->geometry);
 
   int tidx = pack.prim_index[idx];
   Mesh::Triangle t = mesh->get_triangle(tidx);
@@ -347,15 +350,13 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
   const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8);
 
   /* Adjust primitive index to point to the triangle in the global array, for
-   * meshes with transform applied and already in the top level BVH.
+   * geometry with transform applied and already in the top level BVH.
    */
-  for (size_t i = 0; i < pack.prim_index.size(); i++)
+  for (size_t i = 0; i < pack.prim_index.size(); i++) {
     if (pack.prim_index[i] != -1) {
-      if (pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
-        pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset;
-      else
-        pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset;
+      pack.prim_index[i] += objects[pack.prim_object[i]]->geometry->prim_offset;
     }
+  }
 
   /* track offsets of instanced BVH data in global array */
   size_t prim_offset = pack.prim_index.size();
@@ -375,10 +376,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
   size_t pack_leaf_nodes_offset = leaf_nodes_size;
   size_t object_offset = 0;
 
-  foreach (Mesh *mesh, meshes) {
-    BVH *bvh = mesh->bvh;
+  foreach (Geometry *geom, geometry) {
+    BVH *bvh = geom->bvh;
 
-    if (mesh->need_build_bvh(params.bvh_layout)) {
+    if (geom->need_build_bvh(params.bvh_layout)) {
       prim_index_size += bvh->pack.prim_index.size();
       prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
       nodes_size += bvh->pack.nodes.size();
@@ -410,36 +411,35 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
   int4 *pack_leaf_nodes = (pack.leaf_nodes.size()) ? &pack.leaf_nodes[0] : NULL;
   float2 *pack_prim_time = (pack.prim_time.size()) ? &pack.prim_time[0] : NULL;
 
-  map<Mesh *, int> mesh_map;
+  map<Geometry *, int> geometry_map;
 
   /* merge */
   foreach (Object *ob, objects) {
-    Mesh *mesh = ob->mesh;
+    Geometry *geom = ob->geometry;
 
     /* We assume that if mesh doesn't need own BVH it was already included
      * into a top-level BVH and no packing here is needed.
      */
-    if (!mesh->need_build_bvh(params.bvh_layout)) {
+    if (!geom->need_build_bvh(params.bvh_layout)) {
       pack.object_node[object_offset++] = 0;
       continue;
     }
 
     /* if mesh already added once, don't add it again, but used set
      * node offset for this object */
-    map<Mesh *, int>::iterator it = mesh_map.find(mesh);
+    map<Geometry *, int>::iterator it = geometry_map.find(geom);
 
-    if (mesh_map.find(mesh) != mesh_map.end()) {
+    if (geometry_map.find(geom) != geometry_map.end()) {
       int noffset = it->second;
       pack.object_node[object_offset++] = noffset;
       continue;
     }
 
-    BVH *bvh = mesh->bvh;
+    BVH *bvh = geom->bvh;
 
     int noffset = nodes_offset;
     int noffset_leaf = nodes_leaf_offset;
-    int mesh_tri_offset = mesh->tri_offset;
-    int mesh_curve_offset = mesh->curve_offset;
+    int geom_prim_offset = geom->prim_offset;
 
     /* fill in node indexes for instances */
     if (bvh->pack.root_index == -1)
@@ -447,7 +447,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
     else
       pack.object_node[object_offset++] = noffset;
 
-    mesh_map[mesh] = pack.object_node[object_offset - 1];
+    geometry_map[geom] = pack.object_node[object_offset - 1];
 
     /* merge primitive, object and triangle indexes */
     if (bvh->pack.prim_index.size()) {
@@ -460,11 +460,11 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 
       for (size_t i = 0; i < bvh_prim_index_size; i++) {
         if (bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
-          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset;
+          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
           pack_prim_tri_index[pack_prim_index_offset] = -1;
         }
         else {
-          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset;
+          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
           pack_prim_tri_index[pack_prim_index_offset] = bvh_prim_tri_index[i] +
                                                         pack_prim_tri_verts_offset;
         }
@@ -535,8 +535,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 
         /* Modify offsets into arrays */
         int4 data = bvh_nodes[i + nsize_bbox];
-        int4 data1 = bvh_nodes[i + nsize_bbox - 1];
+
         if (use_obvh) {
+          int4 data1 = bvh_nodes[i + nsize_bbox - 1];
           data.z += (data.z < 0) ? -noffset_leaf : noffset;
           data.w += (data.w < 0) ? -noffset_leaf : noffset;
           data.x += (data.x < 0) ? -noffset_leaf : noffset;
@@ -545,6 +546,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
           data1.w += (data1.w < 0) ? -noffset_leaf : noffset;
           data1.x += (data1.x < 0) ? -noffset_leaf : noffset;
           data1.y += (data1.y < 0) ? -noffset_leaf : noffset;
+          pack_nodes[pack_nodes_offset + nsize_bbox] = data;
+          pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
         }
         else {
           data.z += (data.z < 0) ? -noffset_leaf : noffset;
@@ -553,10 +556,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
             data.x += (data.x < 0) ? -noffset_leaf : noffset;
             data.y += (data.y < 0) ? -noffset_leaf : noffset;
           }
-        }
-        pack_nodes[pack_nodes_offset + nsize_bbox] = data;
-        if (use_obvh) {
-          pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1;
+          pack_nodes[pack_nodes_offset + nsize_bbox] = data;
         }
 
         /* Usually this copies nothing, but we better
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 92082e4de86..bdde38640c9 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -33,7 +33,7 @@ struct BVHStackEntry;
 class BVHParams;
 class BoundBox;
 class LeafNode;
-class Mesh;
+class Geometry;
 class Object;
 class Progress;
 
@@ -84,11 +84,11 @@ class BVH {
  public:
   PackedBVH pack;
   BVHParams params;
-  vector<Mesh *> meshes;
+  vector<Geometry *> geometry;
   vector<Object *> objects;
 
   static BVH *create(const BVHParams &params,
-                     const vector<Mesh *> &meshes,
+                     const vector<Geometry *> &geometry,
                      const vector<Object *> &objects);
   virtual ~BVH()
   {
@@ -102,7 +102,9 @@ class BVH {
   void refit(Progress &progress);
 
  protected:
-  BVH(const BVHParams &params, const vector<Mesh *> &meshes, const vector<Object *> &objects);
+  BVH(const BVHParams &params,
+      const vector<Geometry *> &geometry,
+      const vector<Object *> &objects);
 
   /* Refit range of primitives. */
   void refit_primitives(int start, int end, BoundBox &bbox, uint &visibility);
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
index b1a9148c297..c903070429e 100644
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -26,9 +26,9 @@
 CCL_NAMESPACE_BEGIN
 
 BVH2::BVH2(const BVHParams &params_,
-           const vector<Mesh *> &meshes_,
+           const vector<Geometry *> &geometry_,
            const vector<Object *> &objects_)
-    : BVH(params_, meshes_, objects_)
+    : BVH(params_, geometry_, objects_)
 {
 }
 
diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h
index a3eaff9cf65..fa3e45b72d2 100644
--- a/intern/cycles/bvh/bvh2.h
+++ b/intern/cycles/bvh/bvh2.h
@@ -46,7 +46,9 @@ class BVH2 : public BVH {
  protected:
   /* constructor */
   friend class BVH;
-  BVH2(const BVHParams &params, const vector<Mesh *> &meshes, const vector<Object *> &objects);
+  BVH2(const BVHParams &params,
+       const vector<Geometry *> &geometry,
+       const vector<Object *> &objects);
 
   /* Building process. */
   virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
index 89b42ee1d21..143c3e54f94 100644
--- a/intern/cycles/bvh/bvh4.cpp
+++ b/intern/cycles/bvh/bvh4.cpp
@@ -32,9 +32,9 @@ CCL_NAMESPACE_BEGIN
  */
 
 BVH4::BVH4(const BVHParams &params_,
-           const vector<Mesh *> &meshes_,
+           const vector<Geometry *> &geometry_,
            const vector<Object *> &objects_)
-    : BVH(params_, meshes_, objects_)
+    : BVH(params_, geometry_, objects_)
 {
   params.bvh_layout = BVH_LAYOUT_BVH4;
 }
diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h
index c44f2833c84..afbb9007afb 100644
--- a/intern/cycles/bvh/bvh4.h
+++ b/intern/cycles/bvh/bvh4.h
@@ -46,7 +46,9 @@ class BVH4 : public BVH {
  protected:
   /* constructor */
   friend class BVH;
-  BVH4(const BVHParams &params, const vector<Mesh *> &meshes, const vector<Object *> &objects);
+  BVH4(const BVHParams &params,
+       const vector<Geometry *> &geometry,
+       const vector<Object *> &objects);
 
   /* Building process. */
   virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp
index d3516525f78..342dd9e85a5 100644
--- a/intern/cycles/bvh/bvh8.cpp
+++ b/intern/cycles/bvh/bvh8.cpp
@@ -28,6 +28,7 @@
 
 #include "bvh/bvh8.h"
 
+#include "render/hair.h"
 #include "render/mesh.h"
 #include "render/object.h"
 
@@ -37,9 +38,9 @@
 CCL_NAMESPACE_BEGIN
 
 BVH8::BVH8(const BVHParams &params_,
-           const vector<Mesh *> &meshes_,
+           const vector<Geometry *> &geometry_,
            const vector<Object *> &objects_)
-    : BVH(params_, meshes_, objects_)
+    : BVH(params_, geometry_, objects_)
 {
 }
 
@@ -429,37 +430,37 @@ void BVH8::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility)
       }
       else {
         /* Primitives. */
-        const Mesh *mesh = ob->mesh;
-
         if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
           /* Curves. */
-          int str_offset = (params.top_level) ? mesh->curve_offset : 0;
-          Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
+          const Hair *hair = static_cast<const Hair *>(ob->geometry);
+          int prim_offset = (params.top_level) ? hair->prim_offset : 0;
+          Hair::Curve curve = hair->get_curve(pidx - prim_offset);
           int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
 
-          curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);
+          curve.bounds_grow(k, &hair->curve_keys[0], &hair->curve_radius[0], bbox);
 
           visibility |= PATH_RAY_CURVE;
 
           /* Motion curves. */
-          if (mesh->use_motion_blur) {
-            Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+          if (hair->use_motion_blur) {
+            Attribute *attr = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
             if (attr) {
-              size_t mesh_size = mesh->curve_keys.size();
-              size_t steps = mesh->motion_steps - 1;
+              size_t hair_size = hair->curve_keys.size();
+              size_t steps = hair->motion_steps - 1;
               float3 *key_steps = attr->data_float3();
 
               for (size_t i = 0; i < steps; i++) {
-                curve.bounds_grow(k, key_steps + i * mesh_size, &mesh->curve_radius[0], bbox);
+                curve.bounds_grow(k, key_steps + i * hair_size, &hair->curve_radius[0], bbox);
               }
             }
           }
         }
         else {
           /* Triangles. */
-          int tri_offset = (params.top_level) ? mesh->tri_offset : 0;
-          Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
+          const Mesh *mesh = static_cast<const Mesh *>(ob->geometry);
+          int prim_offset = (params.top_level) ? mesh->prim_offset : 0;
+          Mesh::Triangle triangle = mesh->get_triangle(pidx - prim_offset);
           const float3 *vpos = &mesh->verts[0];
 
           triangle.bounds_grow(vpos, bbox);
diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h
index 5f26fd423e1..d23fa528e3e 100644
--- a/intern/cycles/bvh/bvh8.h
+++ b/intern/cycles/bvh/bvh8.h
@@ -57,7 +57,9 @@ class BVH8 : public BVH {
  protected:
   /* constructor */
   friend class BVH;
-  BVH8(const BVHParams &params, const vector<Mesh *> &meshes, const vector<Object *> &objects);
+  BVH8(const BVHParams &params,
+       const vector<Geometry *> &geometry,
+       const vector<Object *> &objects);
 
   /* Building process. */
   virtual BVHNode *widen_children_nodes(const BVHNode *root) override;
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 1d9b006e8cb..db156219f09 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -22,19 +22,20 @@
 #include "bvh/bvh_params.h"
 #include "bvh_split.h"
 
+#include "render/curves.h"
+#include "render/hair.h"
 #include "render/mesh.h"
 #include "render/object.h"
 #include "render/scene.h"
-#include "render/curves.h"
 
 #include "util/util_algorithm.h"
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_progress.h"
-#include "util/util_stack_allocator.h"
+#include "util/util_queue.h"
 #include "util/util_simd.h"
+#include "util/util_stack_allocator.h"
 #include "util/util_time.h"
-#include "util/util_queue.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -194,21 +195,21 @@ void BVHBuild::add_reference_triangles(BoundBox &root, BoundBox &center, Mesh *m
   }
 }
 
-void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh, int i)
+void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair, int i)
 {
   const Attribute *curve_attr_mP = NULL;
-  if (mesh->has_motion_blur()) {
-    curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  if (hair->has_motion_blur()) {
+    curve_attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
   }
-  const size_t num_curves = mesh->num_curves();
+  const size_t num_curves = hair->num_curves();
   for (uint j = 0; j < num_curves; j++) {
-    const Mesh::Curve curve = mesh->get_curve(j);
-    const float *curve_radius = &mesh->curve_radius[0];
+    const Hair::Curve curve = hair->get_curve(j);
+    const float *curve_radius = &hair->curve_radius[0];
     for (int k = 0; k < curve.num_keys - 1; k++) {
       if (curve_attr_mP == NULL) {
         /* Really simple logic for static hair. */
         BoundBox bounds = BoundBox::empty;
-        curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
+        curve.bounds_grow(k, &hair->curve_keys[0], curve_radius, bounds);
         if (bounds.valid()) {
           int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k);
           references.push_back(BVHReference(bounds, j, i, packed_type));
@@ -223,9 +224,9 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
          */
         /* TODO(sergey): Support motion steps for spatially split BVH. */
         BoundBox bounds = BoundBox::empty;
-        curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
-        const size_t num_keys = mesh->curve_keys.size();
-        const size_t num_steps = mesh->motion_steps;
+        curve.bounds_grow(k, &hair->curve_keys[0], curve_radius, bounds);
+        const size_t num_keys = hair->curve_keys.size();
+        const size_t num_steps = hair->motion_steps;
         const float3 *key_steps = curve_attr_mP->data_float3();
         for (size_t step = 0; step < num_steps - 1; step++) {
           curve.bounds_grow(k, key_steps + step * num_keys, curve_radius, bounds);
@@ -244,10 +245,10 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
          */
         const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
         const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
-        const size_t num_steps = mesh->motion_steps;
-        const float3 *curve_keys = &mesh->curve_keys[0];
+        const size_t num_steps = hair->motion_steps;
+        const float3 *curve_keys = &hair->curve_keys[0];
         const float3 *key_steps = curve_attr_mP->data_float3();
-        const size_t num_keys = mesh->curve_keys.size();
+        const size_t num_keys = hair->curve_keys.size();
         /* Calculate bounding box of the previous time step.
          * Will be reused later to avoid duplicated work on
          * calculating BVH time step boundbox.
@@ -302,13 +303,15 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh
   }
 }
 
-void BVHBuild::add_reference_mesh(BoundBox &root, BoundBox &center, Mesh *mesh, int i)
+void BVHBuild::add_reference_geometry(BoundBox &root, BoundBox &center, Geometry *geom, int i)
 {
-  if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
+  if (geom->type == Geometry::MESH) {
+    Mesh *mesh = static_cast<Mesh *>(geom);
     add_reference_triangles(root, center, mesh, i);
   }
-  if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-    add_reference_curves(root, center, mesh, i);
+  else if (geom->type == Geometry::HAIR) {
+    Hair *hair = static_cast<Hair *>(geom);
+    add_reference_curves(root, center, hair, i);
   }
 }
 
@@ -319,16 +322,30 @@ void BVHBuild::add_reference_object(BoundBox &root, BoundBox &center, Object *ob
   center.grow(ob->bounds.center2());
 }
 
-static size_t count_curve_segments(Mesh *mesh)
+static size_t count_curve_segments(Hair *hair)
 {
-  size_t num = 0, num_curves = mesh->num_curves();
+  size_t num = 0, num_curves = hair->num_curves();
 
   for (size_t i = 0; i < num_curves; i++)
-    num += mesh->get_curve(i).num_keys - 1;
+    num += hair->get_curve(i).num_keys - 1;
 
   return num;
 }
 
+static size_t count_primitives(Geometry *geom)
+{
+  /* Meshes contribute one primitive per triangle. */
+  if (geom->type == Geometry::MESH) {
+    return static_cast<Mesh *>(geom)->num_triangles();
+  }
+  /* Hair contributes one primitive per curve segment. */
+  if (geom->type == Geometry::HAIR) {
+    return count_curve_segments(static_cast<Hair *>(geom));
+  }
+  /* Any other geometry type adds no primitives. */
+  return 0;
+}
+
 void BVHBuild::add_references(BVHRange &root)
 {
   /* reserve space for references */
@@ -339,24 +356,14 @@ void BVHBuild::add_references(BVHRange &root)
       if (!ob->is_traceable()) {
         continue;
       }
-      if (!ob->mesh->is_instanced()) {
-        if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
-          num_alloc_references += ob->mesh->num_triangles();
-        }
-        if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-          num_alloc_references += count_curve_segments(ob->mesh);
-        }
+      if (!ob->geometry->is_instanced()) {
+        num_alloc_references += count_primitives(ob->geometry);
       }
       else
         num_alloc_references++;
     }
     else {
-      if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
-        num_alloc_references += ob->mesh->num_triangles();
-      }
-      if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-        num_alloc_references += count_curve_segments(ob->mesh);
-      }
+      num_alloc_references += count_primitives(ob->geometry);
     }
   }
 
@@ -372,13 +379,13 @@ void BVHBuild::add_references(BVHRange &root)
         ++i;
         continue;
       }
-      if (!ob->mesh->is_instanced())
-        add_reference_mesh(bounds, center, ob->mesh, i);
+      if (!ob->geometry->is_instanced())
+        add_reference_geometry(bounds, center, ob->geometry, i);
       else
         add_reference_object(bounds, center, ob, i);
     }
     else
-      add_reference_mesh(bounds, center, ob->mesh, i);
+      add_reference_geometry(bounds, center, ob->geometry, i);
 
     i++;
 
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 9685e26cfac..3fe4c3799e2 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -35,6 +35,8 @@ class BVHNode;
 class BVHSpatialSplitBuildTask;
 class BVHParams;
 class InnerNode;
+class Geometry;
+class Hair;
 class Mesh;
 class Object;
 class Progress;
@@ -65,8 +67,8 @@ class BVHBuild {
 
   /* Adding references. */
   void add_reference_triangles(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
-  void add_reference_curves(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
-  void add_reference_mesh(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
+  void add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair, int i);
+  void add_reference_geometry(BoundBox &root, BoundBox &center, Geometry *geom, int i);
   void add_reference_object(BoundBox &root, BoundBox &center, Object *ob, int i);
   void add_references(BVHRange &root);
 
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 3e4978a2c0a..9356adf3ea5 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -35,9 +35,9 @@
 
 #ifdef WITH_EMBREE
 
+#  include <embree3/rtcore_geometry.h>
 #  include <pmmintrin.h>
 #  include <xmmintrin.h>
-#  include <embree3/rtcore_geometry.h>
 
 #  include "bvh/bvh_embree.h"
 
@@ -45,10 +45,11 @@
  */
 #  include "kernel/bvh/bvh_embree.h"
 #  include "kernel/kernel_compat_cpu.h"
-#  include "kernel/split/kernel_split_data_types.h"
 #  include "kernel/kernel_globals.h"
 #  include "kernel/kernel_random.h"
+#  include "kernel/split/kernel_split_data_types.h"
 
+#  include "render/hair.h"
 #  include "render/mesh.h"
 #  include "render/object.h"
 #  include "util/util_foreach.h"
@@ -57,6 +58,11 @@
 
 CCL_NAMESPACE_BEGIN
 
+static_assert(Object::MAX_MOTION_STEPS <= RTC_MAX_TIME_STEP_COUNT,
+              "Object and Embree max motion steps inconsistent");
+static_assert(Object::MAX_MOTION_STEPS == Geometry::MAX_MOTION_STEPS,
+              "Object and Geometry max motion steps inconsistent");
+
 #  define IS_HAIR(x) (x & 1)
 
 /* This gets called by Embree at every valid ray/object intersection.
@@ -301,10 +307,24 @@ RTCDevice BVHEmbree::rtc_shared_device = NULL;
 int BVHEmbree::rtc_shared_users = 0;
 thread_mutex BVHEmbree::rtc_shared_mutex;
 
+static size_t count_primitives(Geometry *geom)
+{
+  /* Triangles for meshes, curve segments for hair, zero otherwise. */
+  size_t num_prims = 0;
+  if (geom->type == Geometry::MESH) {
+    num_prims = static_cast<Mesh *>(geom)->num_triangles();
+  }
+  else if (geom->type == Geometry::HAIR) {
+    num_prims = static_cast<Hair *>(geom)->num_segments();
+  }
+
+  return num_prims;
+}
+
 BVHEmbree::BVHEmbree(const BVHParams &params_,
-                     const vector<Mesh *> &meshes_,
+                     const vector<Geometry *> &geometry_,
                      const vector<Object *> &objects_)
-    : BVH(params_, meshes_, objects_),
+    : BVH(params_, geometry_, objects_),
       scene(NULL),
       mem_used(0),
       top_level(NULL),
@@ -325,7 +345,7 @@ BVHEmbree::BVHEmbree(const BVHParams &params_,
     if (ret != 1) {
       assert(0);
       VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED flag."
-                 "Ray visiblity will not work.";
+                 "Ray visibility will not work.";
     }
     ret = rtcGetDeviceProperty(rtc_shared_device, RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED);
     if (ret != 1) {
@@ -436,29 +456,15 @@ void BVHEmbree::build(Progress &progress, Stats *stats_)
       if (!ob->is_traceable()) {
         continue;
       }
-      if (!ob->mesh->is_instanced()) {
-        if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
-          prim_count += ob->mesh->num_triangles();
-        }
-        if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-          for (size_t j = 0; j < ob->mesh->num_curves(); ++j) {
-            prim_count += ob->mesh->get_curve(j).num_segments();
-          }
-        }
+      if (!ob->geometry->is_instanced()) {
+        prim_count += count_primitives(ob->geometry);
       }
       else {
         ++prim_count;
       }
     }
     else {
-      if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && ob->mesh->num_triangles() > 0) {
-        prim_count += ob->mesh->num_triangles();
-      }
-      if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-        for (size_t j = 0; j < ob->mesh->num_curves(); ++j) {
-          prim_count += ob->mesh->get_curve(j).num_segments();
-        }
-      }
+      prim_count += count_primitives(ob->geometry);
     }
   }
 
@@ -477,7 +483,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats_)
         ++i;
         continue;
       }
-      if (!ob->mesh->is_instanced()) {
+      if (!ob->geometry->is_instanced()) {
         add_object(ob, i);
       }
       else {
@@ -528,36 +534,57 @@ BVHNode *BVHEmbree::widen_children_nodes(const BVHNode * /*root*/)
 
 void BVHEmbree::add_object(Object *ob, int i)
 {
-  Mesh *mesh = ob->mesh;
-  if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
-    add_triangles(ob, i);
+  Geometry *geom = ob->geometry;
+
+  if (geom->type == Geometry::MESH) {
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    if (mesh->num_triangles() > 0) {
+      add_triangles(ob, mesh, i);
+    }
   }
-  if (params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
-    add_curves(ob, i);
+  else if (geom->type == Geometry::HAIR) {
+    Hair *hair = static_cast<Hair *>(geom);
+    if (hair->num_curves() > 0) {
+      add_curves(ob, hair, i);
+    }
   }
 }
 
 void BVHEmbree::add_instance(Object *ob, int i)
 {
-  if (!ob || !ob->mesh) {
+  if (!ob || !ob->geometry) {
     assert(0);
     return;
   }
-  BVHEmbree *instance_bvh = (BVHEmbree *)(ob->mesh->bvh);
+  BVHEmbree *instance_bvh = (BVHEmbree *)(ob->geometry->bvh);
 
   if (instance_bvh->top_level != this) {
     instance_bvh->top_level = this;
   }
 
-  const size_t num_motion_steps = ob->use_motion() ? ob->motion.size() : 1;
+  const size_t num_object_motion_steps = ob->use_motion() ? ob->motion.size() : 1;
+  const size_t num_motion_steps = min(num_object_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+  assert(num_object_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
+
   RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_INSTANCE);
   rtcSetGeometryInstancedScene(geom_id, instance_bvh->scene);
   rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
 
   if (ob->use_motion()) {
+    array<DecomposedTransform> decomp(ob->motion.size());
+    transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
     for (size_t step = 0; step < num_motion_steps; ++step) {
-      rtcSetGeometryTransform(
-          geom_id, step, RTC_FORMAT_FLOAT3X4_ROW_MAJOR, (const float *)&ob->motion[step]);
+      RTCQuaternionDecomposition rtc_decomp;
+      rtcInitQuaternionDecomposition(&rtc_decomp);
+      rtcQuaternionDecompositionSetQuaternion(
+          &rtc_decomp, decomp[step].x.w, decomp[step].x.x, decomp[step].x.y, decomp[step].x.z);
+      rtcQuaternionDecompositionSetScale(
+          &rtc_decomp, decomp[step].y.w, decomp[step].z.w, decomp[step].w.w);
+      rtcQuaternionDecompositionSetTranslation(
+          &rtc_decomp, decomp[step].y.x, decomp[step].y.y, decomp[step].y.z);
+      rtcQuaternionDecompositionSetSkew(
+          &rtc_decomp, decomp[step].z.x, decomp[step].z.y, decomp[step].w.x);
+      rtcSetGeometryTransformQuaternion(geom_id, step, &rtc_decomp);
     }
   }
   else {
@@ -570,30 +597,28 @@ void BVHEmbree::add_instance(Object *ob, int i)
   pack.prim_tri_index.push_back_slow(-1);
 
   rtcSetGeometryUserData(geom_id, (void *)instance_bvh->scene);
-  rtcSetGeometryMask(geom_id, ob->visibility);
+  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
 
   rtcCommitGeometry(geom_id);
   rtcAttachGeometryByID(scene, geom_id, i * 2);
   rtcReleaseGeometry(geom_id);
 }
 
-void BVHEmbree::add_triangles(Object *ob, int i)
+void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
 {
   size_t prim_offset = pack.prim_index.size();
-  Mesh *mesh = ob->mesh;
   const Attribute *attr_mP = NULL;
-  size_t num_motion_steps = 1;
+  size_t num_geometry_motion_steps = 1;
   if (mesh->has_motion_blur()) {
     attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
     if (attr_mP) {
-      num_motion_steps = mesh->motion_steps;
-      if (num_motion_steps > RTC_MAX_TIME_STEP_COUNT) {
-        assert(0);
-        num_motion_steps = RTC_MAX_TIME_STEP_COUNT;
-      }
+      num_geometry_motion_steps = mesh->motion_steps;
     }
   }
 
+  const size_t num_motion_steps = min(num_geometry_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+  assert(num_geometry_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
+
   const size_t num_triangles = mesh->num_triangles();
   RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_TRIANGLE);
   rtcSetGeometryBuildQuality(geom_id, build_quality);
@@ -635,7 +660,7 @@ void BVHEmbree::add_triangles(Object *ob, int i)
   rtcSetGeometryUserData(geom_id, (void *)prim_offset);
   rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
   rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
-  rtcSetGeometryMask(geom_id, ob->visibility);
+  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
 
   rtcCommitGeometry(geom_id);
   rtcAttachGeometryByID(scene, geom_id, i * 2);
@@ -684,31 +709,37 @@ void BVHEmbree::update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh)
   }
 }
 
-void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh)
+void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair)
 {
   const Attribute *attr_mP = NULL;
   size_t num_motion_steps = 1;
-  if (mesh->has_motion_blur()) {
-    attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  if (hair->has_motion_blur()) {
+    attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
     if (attr_mP) {
-      num_motion_steps = mesh->motion_steps;
+      num_motion_steps = hair->motion_steps;
     }
   }
 
-  const size_t num_curves = mesh->num_curves();
+  const size_t num_curves = hair->num_curves();
   size_t num_keys = 0;
   for (size_t j = 0; j < num_curves; ++j) {
-    const Mesh::Curve c = mesh->get_curve(j);
+    const Hair::Curve c = hair->get_curve(j);
     num_keys += c.num_keys;
   }
 
+  /* Catmull-Rom splines need extra CVs at the beginning and end of each curve. */
+  size_t num_keys_embree = num_keys;
+  if (use_curves) {
+    num_keys_embree += num_curves * 2;
+  }
+
   /* Copy the CV data to Embree */
   const int t_mid = (num_motion_steps - 1) / 2;
-  const float *curve_radius = &mesh->curve_radius[0];
+  const float *curve_radius = &hair->curve_radius[0];
   for (int t = 0; t < num_motion_steps; ++t) {
     const float3 *verts;
     if (t == t_mid || attr_mP == NULL) {
-      verts = &mesh->curve_keys[0];
+      verts = &hair->curve_keys[0];
     }
     else {
       int t_ = (t > t_mid) ? (t - 1) : t;
@@ -716,42 +747,28 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh
     }
 
     float4 *rtc_verts = (float4 *)rtcSetNewGeometryBuffer(
-        geom_id, RTC_BUFFER_TYPE_VERTEX, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_keys);
-    float4 *rtc_tangents = NULL;
-    if (use_curves) {
-      rtc_tangents = (float4 *)rtcSetNewGeometryBuffer(
-          geom_id, RTC_BUFFER_TYPE_TANGENT, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_keys);
-      assert(rtc_tangents);
-    }
+        geom_id, RTC_BUFFER_TYPE_VERTEX, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_keys_embree);
+
     assert(rtc_verts);
     if (rtc_verts) {
-      if (use_curves && rtc_tangents) {
-        const size_t num_curves = mesh->num_curves();
+      if (use_curves) {
+        const size_t num_curves = hair->num_curves();
         for (size_t j = 0; j < num_curves; ++j) {
-          Mesh::Curve c = mesh->get_curve(j);
+          Hair::Curve c = hair->get_curve(j);
           int fk = c.first_key;
-          rtc_verts[0] = float3_to_float4(verts[fk]);
-          rtc_verts[0].w = curve_radius[fk];
-          rtc_tangents[0] = float3_to_float4(verts[fk + 1] - verts[fk]);
-          rtc_tangents[0].w = curve_radius[fk + 1] - curve_radius[fk];
-          ++fk;
           int k = 1;
-          for (; k < c.num_segments(); ++k, ++fk) {
+          for (; k < c.num_keys + 1; ++k, ++fk) {
             rtc_verts[k] = float3_to_float4(verts[fk]);
             rtc_verts[k].w = curve_radius[fk];
-            rtc_tangents[k] = float3_to_float4((verts[fk + 1] - verts[fk - 1]) * 0.5f);
-            rtc_tangents[k].w = (curve_radius[fk + 1] - curve_radius[fk - 1]) * 0.5f;
           }
-          rtc_verts[k] = float3_to_float4(verts[fk]);
-          rtc_verts[k].w = curve_radius[fk];
-          rtc_tangents[k] = float3_to_float4(verts[fk] - verts[fk - 1]);
-          rtc_tangents[k].w = curve_radius[fk] - curve_radius[fk - 1];
-          rtc_verts += c.num_keys;
-          rtc_tangents += c.num_keys;
+          /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */
+          rtc_verts[0] = rtc_verts[1];
+          rtc_verts[k] = rtc_verts[k - 1];
+          rtc_verts += c.num_keys + 2;
         }
       }
       else {
-        for (size_t j = 0; j < num_keys; ++j) {
+        for (size_t j = 0; j < num_keys_embree; ++j) {
           rtc_verts[j] = float3_to_float4(verts[j]);
           rtc_verts[j].w = curve_radius[j];
         }
@@ -760,23 +777,25 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh
   }
 }
 
-void BVHEmbree::add_curves(Object *ob, int i)
+void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
 {
   size_t prim_offset = pack.prim_index.size();
-  const Mesh *mesh = ob->mesh;
   const Attribute *attr_mP = NULL;
-  size_t num_motion_steps = 1;
-  if (mesh->has_motion_blur()) {
-    attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  size_t num_geometry_motion_steps = 1;
+  if (hair->has_motion_blur()) {
+    attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
     if (attr_mP) {
-      num_motion_steps = mesh->motion_steps;
+      num_geometry_motion_steps = hair->motion_steps;
     }
   }
 
-  const size_t num_curves = mesh->num_curves();
+  const size_t num_motion_steps = min(num_geometry_motion_steps, RTC_MAX_TIME_STEP_COUNT);
+  assert(num_geometry_motion_steps <= RTC_MAX_TIME_STEP_COUNT);
+
+  const size_t num_curves = hair->num_curves();
   size_t num_segments = 0;
   for (size_t j = 0; j < num_curves; ++j) {
-    Mesh::Curve c = mesh->get_curve(j);
+    Hair::Curve c = hair->get_curve(j);
     assert(c.num_segments() > 0);
     num_segments += c.num_segments();
   }
@@ -793,8 +812,8 @@ void BVHEmbree::add_curves(Object *ob, int i)
 
   enum RTCGeometryType type = (!use_curves) ?
                                   RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE :
-                                  (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE :
-                                                 RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE);
+                                  (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE :
+                                                 RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE);
 
   RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type);
   rtcSetGeometryTessellationRate(geom_id, curve_subdivisions);
@@ -802,9 +821,13 @@ void BVHEmbree::add_curves(Object *ob, int i)
       geom_id, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT, sizeof(int), num_segments);
   size_t rtc_index = 0;
   for (size_t j = 0; j < num_curves; ++j) {
-    Mesh::Curve c = mesh->get_curve(j);
+    Hair::Curve c = hair->get_curve(j);
     for (size_t k = 0; k < c.num_segments(); ++k) {
       rtc_indices[rtc_index] = c.first_key + k;
+      if (use_curves) {
+        /* Room for extra CVs at Catmull-Rom splines. */
+        rtc_indices[rtc_index] += j * 2;
+      }
       /* Cycles specific data. */
       pack.prim_object[prim_object_size + rtc_index] = i;
       pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(
@@ -819,12 +842,12 @@ void BVHEmbree::add_curves(Object *ob, int i)
   rtcSetGeometryBuildQuality(geom_id, build_quality);
   rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
 
-  update_curve_vertex_buffer(geom_id, mesh);
+  update_curve_vertex_buffer(geom_id, hair);
 
   rtcSetGeometryUserData(geom_id, (void *)prim_offset);
   rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func);
   rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
-  rtcSetGeometryMask(geom_id, ob->visibility);
+  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
 
   rtcCommitGeometry(geom_id);
   rtcAttachGeometryByID(scene, geom_id, i * 2 + 1);
@@ -840,10 +863,7 @@ void BVHEmbree::pack_nodes(const BVHNode *)
 
   for (size_t i = 0; i < pack.prim_index.size(); ++i) {
     if (pack.prim_index[i] != -1) {
-      if (pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
-        pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset;
-      else
-        pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset;
+      pack.prim_index[i] += objects[pack.prim_object[i]]->geometry->prim_offset;
     }
   }
 
@@ -857,22 +877,22 @@ void BVHEmbree::pack_nodes(const BVHNode *)
   size_t pack_prim_tri_verts_offset = prim_tri_verts_size;
   size_t object_offset = 0;
 
-  map<Mesh *, int> mesh_map;
+  map<Geometry *, int> geometry_map;
 
   foreach (Object *ob, objects) {
-    Mesh *mesh = ob->mesh;
-    BVH *bvh = mesh->bvh;
+    Geometry *geom = ob->geometry;
+    BVH *bvh = geom->bvh;
 
-    if (mesh->need_build_bvh(BVH_LAYOUT_EMBREE)) {
-      if (mesh_map.find(mesh) == mesh_map.end()) {
+    if (geom->need_build_bvh(BVH_LAYOUT_EMBREE)) {
+      if (geometry_map.find(geom) == geometry_map.end()) {
         prim_index_size += bvh->pack.prim_index.size();
         prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
-        mesh_map[mesh] = 1;
+        geometry_map[geom] = 1;
       }
     }
   }
 
-  mesh_map.clear();
+  geometry_map.clear();
 
   pack.prim_index.resize(prim_index_size);
   pack.prim_type.resize(prim_index_size);
@@ -890,38 +910,37 @@ void BVHEmbree::pack_nodes(const BVHNode *)
 
   /* merge */
   foreach (Object *ob, objects) {
-    Mesh *mesh = ob->mesh;
+    Geometry *geom = ob->geometry;
 
     /* We assume that if mesh doesn't need own BVH it was already included
      * into a top-level BVH and no packing here is needed.
      */
-    if (!mesh->need_build_bvh(BVH_LAYOUT_EMBREE)) {
+    if (!geom->need_build_bvh(BVH_LAYOUT_EMBREE)) {
       pack.object_node[object_offset++] = prim_offset;
       continue;
     }
 
-    /* if mesh already added once, don't add it again, but used set
+    /* if geom already added once, don't add it again, but reuse the stored
      * node offset for this object */
-    map<Mesh *, int>::iterator it = mesh_map.find(mesh);
+    map<Geometry *, int>::iterator it = geometry_map.find(geom);
 
-    if (mesh_map.find(mesh) != mesh_map.end()) {
+    if (geometry_map.find(geom) != geometry_map.end()) {
       int noffset = it->second;
       pack.object_node[object_offset++] = noffset;
       continue;
     }
 
-    BVHEmbree *bvh = (BVHEmbree *)mesh->bvh;
+    BVHEmbree *bvh = (BVHEmbree *)geom->bvh;
 
     rtc_memory_monitor_func(stats, unaccounted_mem, true);
     unaccounted_mem = 0;
 
-    int mesh_tri_offset = mesh->tri_offset;
-    int mesh_curve_offset = mesh->curve_offset;
+    int geom_prim_offset = geom->prim_offset;
 
     /* fill in node indexes for instances */
     pack.object_node[object_offset++] = prim_offset;
 
-    mesh_map[mesh] = pack.object_node[object_offset - 1];
+    geometry_map[geom] = pack.object_node[object_offset - 1];
 
     /* merge primitive, object and triangle indexes */
     if (bvh->pack.prim_index.size()) {
@@ -932,11 +951,11 @@ void BVHEmbree::pack_nodes(const BVHNode *)
 
       for (size_t i = 0; i < bvh_prim_index_size; ++i) {
         if (bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
-          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset;
+          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
           pack_prim_tri_index[pack_prim_index_offset] = -1;
         }
         else {
-          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset;
+          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
           pack_prim_tri_index[pack_prim_index_offset] = bvh_prim_tri_index[i] +
                                                         pack_prim_tri_verts_offset;
         }
@@ -966,15 +985,22 @@ void BVHEmbree::refit_nodes()
   /* Update all vertex buffers, then tell Embree to rebuild/-fit the BVHs. */
   unsigned geom_id = 0;
   foreach (Object *ob, objects) {
-    if (!params.top_level || (ob->is_traceable() && !ob->mesh->is_instanced())) {
-      if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && ob->mesh->num_triangles() > 0) {
-        update_tri_vertex_buffer(rtcGetGeometry(scene, geom_id), ob->mesh);
-        rtcCommitGeometry(rtcGetGeometry(scene, geom_id));
+    if (!params.top_level || (ob->is_traceable() && !ob->geometry->is_instanced())) {
+      Geometry *geom = ob->geometry;
+
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        if (mesh->num_triangles() > 0) {
+          update_tri_vertex_buffer(rtcGetGeometry(scene, geom_id), mesh);
+          rtcCommitGeometry(rtcGetGeometry(scene, geom_id));
+        }
       }
-
-      if (params.primitive_mask & PRIMITIVE_ALL_CURVE && ob->mesh->num_curves() > 0) {
-        update_curve_vertex_buffer(rtcGetGeometry(scene, geom_id + 1), ob->mesh);
-        rtcCommitGeometry(rtcGetGeometry(scene, geom_id + 1));
+      else if (geom->type == Geometry::HAIR) {
+        Hair *hair = static_cast<Hair *>(geom);
+        if (hair->num_curves() > 0) {
+          update_curve_vertex_buffer(rtcGetGeometry(scene, geom_id + 1), hair);
+          rtcCommitGeometry(rtcGetGeometry(scene, geom_id + 1));
+        }
       }
     }
     geom_id += 2;
diff --git a/intern/cycles/bvh/bvh_embree.h b/intern/cycles/bvh/bvh_embree.h
index 123e87dd9b0..eb121d060b7 100644
--- a/intern/cycles/bvh/bvh_embree.h
+++ b/intern/cycles/bvh/bvh_embree.h
@@ -31,6 +31,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+class Geometry;
+class Hair;
 class Mesh;
 
 class BVHEmbree : public BVH {
@@ -47,7 +49,7 @@ class BVHEmbree : public BVH {
  protected:
   friend class BVH;
   BVHEmbree(const BVHParams &params,
-            const vector<Mesh *> &meshes,
+            const vector<Geometry *> &geometry,
             const vector<Object *> &objects);
 
   virtual void pack_nodes(const BVHNode *) override;
@@ -55,8 +57,8 @@ class BVHEmbree : public BVH {
 
   void add_object(Object *ob, int i);
   void add_instance(Object *ob, int i);
-  void add_curves(Object *ob, int i);
-  void add_triangles(Object *ob, int i);
+  void add_curves(const Object *ob, const Hair *hair, int i);
+  void add_triangles(const Object *ob, const Mesh *mesh, int i);
 
   ssize_t mem_used;
 
@@ -69,7 +71,7 @@ class BVHEmbree : public BVH {
  private:
   void delete_rtcScene();
   void update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh);
-  void update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh);
+  void update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair);
 
   static RTCDevice rtc_shared_device;
   static int rtc_shared_users;
diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp
index 86d755ab06a..26b64c24db5 100644
--- a/intern/cycles/bvh/bvh_optix.cpp
+++ b/intern/cycles/bvh/bvh_optix.cpp
@@ -18,17 +18,20 @@
 #ifdef WITH_OPTIX
 
 #  include "bvh/bvh_optix.h"
+#  include "render/geometry.h"
+#  include "render/hair.h"
 #  include "render/mesh.h"
 #  include "render/object.h"
+#  include "util/util_foreach.h"
 #  include "util/util_logging.h"
 #  include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
 BVHOptiX::BVHOptiX(const BVHParams &params_,
-                   const vector<Mesh *> &meshes_,
+                   const vector<Geometry *> &geometry_,
                    const vector<Object *> &objects_)
-    : BVH(params_, meshes_, objects_)
+    : BVH(params_, geometry_, objects_)
 {
 }
 
@@ -56,47 +59,52 @@ void BVHOptiX::copy_to_device(Progress &progress, DeviceScene *dscene)
 void BVHOptiX::pack_blas()
 {
   // Bottom-level BVH can contain multiple primitive types, so merge them:
-  assert(meshes.size() == 1 && objects.size() == 1);  // These are build per-mesh
-  Mesh *const mesh = meshes[0];
-
-  if (params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
-    const size_t num_curves = mesh->num_curves();
-    const size_t num_segments = mesh->num_segments();
-    pack.prim_type.reserve(pack.prim_type.size() + num_segments);
-    pack.prim_index.reserve(pack.prim_index.size() + num_segments);
-    pack.prim_object.reserve(pack.prim_object.size() + num_segments);
-    // 'pack.prim_time' is only used in geom_curve_intersect.h
-    // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH
-
-    uint type = PRIMITIVE_CURVE;
-    if (mesh->use_motion_blur && mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
-      type = PRIMITIVE_MOTION_CURVE;
-
-    for (size_t j = 0; j < num_curves; ++j) {
-      const Mesh::Curve curve = mesh->get_curve(j);
-      for (size_t k = 0; k < curve.num_segments(); ++k) {
-        pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(type, k));
-        // Each curve segment points back to its curve index
-        pack.prim_index.push_back_reserved(j);
-        pack.prim_object.push_back_reserved(0);
+  assert(geometry.size() == 1 && objects.size() == 1);  // These are built per-mesh
+  Geometry *const geom = geometry[0];
+
+  if (geom->type == Geometry::HAIR) {
+    Hair *const hair = static_cast<Hair *const>(geom);
+    if (hair->num_curves() > 0) {
+      const size_t num_curves = hair->num_curves();
+      const size_t num_segments = hair->num_segments();
+      pack.prim_type.reserve(pack.prim_type.size() + num_segments);
+      pack.prim_index.reserve(pack.prim_index.size() + num_segments);
+      pack.prim_object.reserve(pack.prim_object.size() + num_segments);
+      // 'pack.prim_time' is only used in geom_curve_intersect.h
+      // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH
+
+      uint type = PRIMITIVE_CURVE;
+      if (hair->use_motion_blur && hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
+        type = PRIMITIVE_MOTION_CURVE;
+
+      for (size_t j = 0; j < num_curves; ++j) {
+        const Hair::Curve curve = hair->get_curve(j);
+        for (size_t k = 0; k < curve.num_segments(); ++k) {
+          pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(type, k));
+          // Each curve segment points back to its curve index
+          pack.prim_index.push_back_reserved(j);
+          pack.prim_object.push_back_reserved(0);
+        }
       }
     }
   }
-
-  if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
-    const size_t num_triangles = mesh->num_triangles();
-    pack.prim_type.reserve(pack.prim_type.size() + num_triangles);
-    pack.prim_index.reserve(pack.prim_index.size() + num_triangles);
-    pack.prim_object.reserve(pack.prim_object.size() + num_triangles);
-
-    uint type = PRIMITIVE_TRIANGLE;
-    if (mesh->use_motion_blur && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
-      type = PRIMITIVE_MOTION_TRIANGLE;
-
-    for (size_t k = 0; k < num_triangles; ++k) {
-      pack.prim_type.push_back_reserved(type);
-      pack.prim_index.push_back_reserved(k);
-      pack.prim_object.push_back_reserved(0);
+  else if (geom->type == Geometry::MESH) {
+    Mesh *const mesh = static_cast<Mesh *const>(geom);
+    if (mesh->num_triangles() > 0) {
+      const size_t num_triangles = mesh->num_triangles();
+      pack.prim_type.reserve(pack.prim_type.size() + num_triangles);
+      pack.prim_index.reserve(pack.prim_index.size() + num_triangles);
+      pack.prim_object.reserve(pack.prim_object.size() + num_triangles);
+
+      uint type = PRIMITIVE_TRIANGLE;
+      if (mesh->use_motion_blur && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION))
+        type = PRIMITIVE_MOTION_TRIANGLE;
+
+      for (size_t k = 0; k < num_triangles; ++k) {
+        pack.prim_type.push_back_reserved(type);
+        pack.prim_index.push_back_reserved(k);
+        pack.prim_object.push_back_reserved(0);
+      }
     }
   }
 
@@ -116,8 +124,8 @@ void BVHOptiX::pack_tlas()
   // Calculate total packed size
   size_t prim_index_size = 0;
   size_t prim_tri_verts_size = 0;
-  foreach (Mesh *mesh, meshes) {
-    BVH *const bvh = mesh->bvh;
+  foreach (Geometry *geom, geometry) {
+    BVH *const bvh = geom->bvh;
     prim_index_size += bvh->pack.prim_index.size();
     prim_tri_verts_size += bvh->pack.prim_tri_verts.size();
   }
@@ -141,13 +149,12 @@ void BVHOptiX::pack_tlas()
   pack.prim_tri_verts.resize(prim_tri_verts_size);
   float4 *pack_prim_tri_verts = pack.prim_tri_verts.data();
 
-  // Top-level BVH should only contain instances, see 'Mesh::need_build_bvh'
+  // Top-level BVH should only contain instances, see 'Geometry::need_build_bvh'
   // Iterate over scene mesh list instead of objects, since the 'prim_offset' is calculated based
   // on that list, which may be ordered differently from the object list.
-  foreach (Mesh *mesh, meshes) {
-    PackedBVH &bvh_pack = mesh->bvh->pack;
-    int mesh_tri_offset = mesh->tri_offset;
-    int mesh_curve_offset = mesh->curve_offset;
+  foreach (Geometry *geom, geometry) {
+    PackedBVH &bvh_pack = geom->bvh->pack;
+    int geom_prim_offset = geom->prim_offset;
 
     // Merge primitive, object and triangle indexes
     if (!bvh_pack.prim_index.empty()) {
@@ -158,16 +165,16 @@ void BVHOptiX::pack_tlas()
 
       for (size_t i = 0; i < bvh_pack.prim_index.size(); i++, pack_offset++) {
         if (bvh_pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
-          pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_curve_offset;
+          pack_prim_index[pack_offset] = bvh_prim_index[i] + geom_prim_offset;
           pack_prim_tri_index[pack_offset] = -1;
         }
         else {
-          pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_tri_offset;
+          pack_prim_index[pack_offset] = bvh_prim_index[i] + geom_prim_offset;
           pack_prim_tri_index[pack_offset] = bvh_prim_tri_index[i] + pack_verts_offset;
         }
 
         pack_prim_type[pack_offset] = bvh_prim_type[i];
-        pack_prim_object[pack_offset] = 0;  // Unused for instanced meshes
+        pack_prim_object[pack_offset] = 0;  // Unused for instanced geometry
         pack_prim_visibility[pack_offset] = bvh_prim_visibility[i];
       }
     }
@@ -182,15 +189,24 @@ void BVHOptiX::pack_tlas()
     }
   }
 
-  // Merge visibility flags of all objects and fix object indices for non-instanced meshes
+  // Merge visibility flags of all objects and fix object indices for non-instanced geometry
   foreach (Object *ob, objects) {
-    Mesh *const mesh = ob->mesh;
-    for (size_t i = 0; i < mesh->num_primitives(); ++i) {
-      if (!ob->mesh->is_instanced()) {
-        assert(pack.prim_object[mesh->prim_offset + i] == 0);
-        pack.prim_object[mesh->prim_offset + i] = ob->get_device_index();
+    Geometry *const geom = ob->geometry;
+    size_t num_primitives = 0;
+
+    if (geom->type == Geometry::MESH) {
+      num_primitives = static_cast<Mesh *const>(geom)->num_triangles();
+    }
+    else if (geom->type == Geometry::HAIR) {
+      num_primitives = static_cast<Hair *const>(geom)->num_segments();
+    }
+
+    for (size_t i = 0; i < num_primitives; ++i) {
+      if (!geom->is_instanced()) {
+        assert(pack.prim_object[geom->optix_prim_offset + i] == 0);
+        pack.prim_object[geom->optix_prim_offset + i] = ob->get_device_index();
       }
-      pack.prim_visibility[mesh->prim_offset + i] |= ob->visibility_for_tracing();
+      pack.prim_visibility[geom->optix_prim_offset + i] |= ob->visibility_for_tracing();
     }
   }
 }
diff --git a/intern/cycles/bvh/bvh_optix.h b/intern/cycles/bvh/bvh_optix.h
index 35033fe635f..e4745b093b5 100644
--- a/intern/cycles/bvh/bvh_optix.h
+++ b/intern/cycles/bvh/bvh_optix.h
@@ -26,11 +26,16 @@
 
 CCL_NAMESPACE_BEGIN
 
+class Geometry;
+class Optix;
+
 class BVHOptiX : public BVH {
   friend class BVH;
 
  public:
-  BVHOptiX(const BVHParams &params, const vector<Mesh *> &meshes, const vector<Object *> &objects);
+  BVHOptiX(const BVHParams &params,
+           const vector<Geometry *> &geometry,
+           const vector<Object *> &objects);
   virtual ~BVHOptiX();
 
   virtual void build(Progress &progress, Stats *) override;
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 2731662a39d..5e2c4b63f1b 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -69,9 +69,6 @@ class BVHParams {
   /* BVH layout to be built. */
   BVHLayout bvh_layout;
 
-  /* Mask of primitives to be included into the BVH. */
-  int primitive_mask;
-
   /* Use unaligned bounding boxes.
    * Only used for curves BVH.
    */
@@ -120,8 +117,6 @@ class BVHParams {
     bvh_layout = BVH_LAYOUT_BVH2;
     use_unaligned_nodes = false;
 
-    primitive_mask = PRIMITIVE_ALL;
-
     num_motion_curve_steps = 0;
     num_motion_triangle_steps = 0;
 
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index bd261c10d55..acdca0f13ad 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -20,6 +20,7 @@
 #include "bvh/bvh_build.h"
 #include "bvh/bvh_sort.h"
 
+#include "render/hair.h"
 #include "render/mesh.h"
 #include "render/object.h"
 
@@ -378,7 +379,7 @@ void BVHSpatialSplit::split_triangle_primitive(const Mesh *mesh,
   }
 }
 
-void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh,
+void BVHSpatialSplit::split_curve_primitive(const Hair *hair,
                                             const Transform *tfm,
                                             int prim_index,
                                             int segment_index,
@@ -388,11 +389,11 @@ void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh,
                                             BoundBox &right_bounds)
 {
   /* curve split: NOTE - Currently ignores curve width and needs to be fixed.*/
-  Mesh::Curve curve = mesh->get_curve(prim_index);
+  Hair::Curve curve = hair->get_curve(prim_index);
   const int k0 = curve.first_key + segment_index;
   const int k1 = k0 + 1;
-  float3 v0 = mesh->curve_keys[k0];
-  float3 v1 = mesh->curve_keys[k1];
+  float3 v0 = hair->curve_keys[k0];
+  float3 v1 = hair->curve_keys[k1];
 
   if (tfm != NULL) {
     v0 = transform_point(tfm, v0);
@@ -436,13 +437,13 @@ void BVHSpatialSplit::split_triangle_reference(const BVHReference &ref,
 }
 
 void BVHSpatialSplit::split_curve_reference(const BVHReference &ref,
-                                            const Mesh *mesh,
+                                            const Hair *hair,
                                             int dim,
                                             float pos,
                                             BoundBox &left_bounds,
                                             BoundBox &right_bounds)
 {
-  split_curve_primitive(mesh,
+  split_curve_primitive(hair,
                         NULL,
                         ref.prim_index(),
                         PRIMITIVE_UNPACK_SEGMENT(ref.prim_type()),
@@ -455,15 +456,22 @@ void BVHSpatialSplit::split_curve_reference(const BVHReference &ref,
 void BVHSpatialSplit::split_object_reference(
     const Object *object, int dim, float pos, BoundBox &left_bounds, BoundBox &right_bounds)
 {
-  Mesh *mesh = object->mesh;
-  for (int tri_idx = 0; tri_idx < mesh->num_triangles(); ++tri_idx) {
-    split_triangle_primitive(mesh, &object->tfm, tri_idx, dim, pos, left_bounds, right_bounds);
+  Geometry *geom = object->geometry;
+
+  if (geom->type == Geometry::MESH) {
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    for (int tri_idx = 0; tri_idx < mesh->num_triangles(); ++tri_idx) {
+      split_triangle_primitive(mesh, &object->tfm, tri_idx, dim, pos, left_bounds, right_bounds);
+    }
   }
-  for (int curve_idx = 0; curve_idx < mesh->num_curves(); ++curve_idx) {
-    Mesh::Curve curve = mesh->get_curve(curve_idx);
-    for (int segment_idx = 0; segment_idx < curve.num_keys - 1; ++segment_idx) {
-      split_curve_primitive(
-          mesh, &object->tfm, curve_idx, segment_idx, dim, pos, left_bounds, right_bounds);
+  else if (geom->type == Geometry::HAIR) {
+    Hair *hair = static_cast<Hair *>(geom);
+    for (int curve_idx = 0; curve_idx < hair->num_curves(); ++curve_idx) {
+      Hair::Curve curve = hair->get_curve(curve_idx);
+      for (int segment_idx = 0; segment_idx < curve.num_keys - 1; ++segment_idx) {
+        split_curve_primitive(
+            hair, &object->tfm, curve_idx, segment_idx, dim, pos, left_bounds, right_bounds);
+      }
     }
   }
 }
@@ -481,13 +489,14 @@ void BVHSpatialSplit::split_reference(const BVHBuild &builder,
 
   /* loop over vertices/edges. */
   const Object *ob = builder.objects[ref.prim_object()];
-  const Mesh *mesh = ob->mesh;
 
   if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+    Mesh *mesh = static_cast<Mesh *>(ob->geometry);
     split_triangle_reference(ref, mesh, dim, pos, left_bounds, right_bounds);
   }
   else if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
-    split_curve_reference(ref, mesh, dim, pos, left_bounds, right_bounds);
+    Hair *hair = static_cast<Hair *>(ob->geometry);
+    split_curve_reference(ref, hair, dim, pos, left_bounds, right_bounds);
   }
   else {
     split_object_reference(ob, dim, pos, left_bounds, right_bounds);
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index eddd1c27f49..5f2e41cf343 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -24,6 +24,8 @@
 CCL_NAMESPACE_BEGIN
 
 class BVHBuild;
+class Hair;
+class Mesh;
 struct Transform;
 
 /* Object Split */
@@ -113,7 +115,7 @@ class BVHSpatialSplit {
                                 float pos,
                                 BoundBox &left_bounds,
                                 BoundBox &right_bounds);
-  void split_curve_primitive(const Mesh *mesh,
+  void split_curve_primitive(const Hair *hair,
                              const Transform *tfm,
                              int prim_index,
                              int segment_index,
@@ -134,7 +136,7 @@ class BVHSpatialSplit {
                                 BoundBox &left_bounds,
                                 BoundBox &right_bounds);
   void split_curve_reference(const BVHReference &ref,
-                             const Mesh *mesh,
+                             const Hair *hair,
                              int dim,
                              float pos,
                              BoundBox &left_bounds,
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index 1843ca403a5..f0995f343fe 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -16,7 +16,7 @@
 
 #include "bvh/bvh_unaligned.h"
 
-#include "render/mesh.h"
+#include "render/hair.h"
 #include "render/object.h"
 
 #include "bvh/bvh_binning.h"
@@ -71,10 +71,10 @@ bool BVHUnaligned::compute_aligned_space(const BVHReference &ref, Transform *ali
   if (type & PRIMITIVE_CURVE) {
     const int curve_index = ref.prim_index();
     const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
-    const Mesh *mesh = object->mesh;
-    const Mesh::Curve &curve = mesh->get_curve(curve_index);
+    const Hair *hair = static_cast<const Hair *>(object->geometry);
+    const Hair::Curve &curve = hair->get_curve(curve_index);
     const int key = curve.first_key + segment;
-    const float3 v1 = mesh->curve_keys[key], v2 = mesh->curve_keys[key + 1];
+    const float3 v1 = hair->curve_keys[key], v2 = hair->curve_keys[key + 1];
     float length;
     const float3 axis = normalize_len(v2 - v1, &length);
     if (length > 1e-6f) {
@@ -96,10 +96,10 @@ BoundBox BVHUnaligned::compute_aligned_prim_boundbox(const BVHReference &prim,
   if (type & PRIMITIVE_CURVE) {
     const int curve_index = prim.prim_index();
     const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
-    const Mesh *mesh = object->mesh;
-    const Mesh::Curve &curve = mesh->get_curve(curve_index);
+    const Hair *hair = static_cast<const Hair *>(object->geometry);
+    const Hair::Curve &curve = hair->get_curve(curve_index);
     curve.bounds_grow(
-        segment, &mesh->curve_keys[0], &mesh->curve_radius[0], aligned_space, bounds);
+        segment, &hair->curve_keys[0], &hair->curve_radius[0], aligned_space, bounds);
   }
   else {
     bounds = prim.bounds().transformed(&aligned_space);
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 5bf681792ca..0b082b11cf7 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -135,7 +135,7 @@ if(CYCLES_STANDALONE_REPOSITORY)
   ####
   # embree
   if(WITH_CYCLES_EMBREE)
-    find_package(embree 3.2.4 REQUIRED)
+    find_package(embree 3.8.0 REQUIRED)
   endif()
 
   ####
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 35a79356957..aa5b65a2b73 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -34,13 +34,17 @@ set(SRC
   device_task.cpp
 )
 
+set(SRC_CUDA
+  cuda/device_cuda.h
+  cuda/device_cuda_impl.cpp
+)
+
 set(SRC_OPENCL
-  opencl/opencl.h
+  opencl/device_opencl.h
+  opencl/device_opencl_impl.cpp
   opencl/memory_manager.h
-
-  opencl/opencl_split.cpp
-  opencl/opencl_util.cpp
   opencl/memory_manager.cpp
+  opencl/opencl_util.cpp
 )
 
 if(WITH_CYCLES_NETWORK)
@@ -98,4 +102,4 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
new file mode 100644
index 00000000000..3e397da895b
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+#  include "device/device.h"
+#  include "device/device_denoising.h"
+#  include "device/device_split_kernel.h"
+
+#  include "util/util_map.h"
+
+#  ifdef WITH_CUDA_DYNLOAD
+#    include "cuew.h"
+#  else
+#    include "util/util_opengl.h"
+#    include <cuda.h>
+#    include <cudaGL.h>
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDASplitKernel;
+
+class CUDADevice : public Device {
+
+  friend class CUDASplitKernelFunction;
+  friend class CUDASplitKernel;
+  friend class CUDAContextScope;
+
+ public:
+  DedicatedTaskPool task_pool;
+  CUdevice cuDevice;
+  CUcontext cuContext;
+  CUmodule cuModule, cuFilterModule;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  bool move_texture_to_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  int can_map_host;
+  int cuDevId;
+  int cuDevArchitecture;
+  bool first_error;
+  CUDASplitKernel *split_kernel;
+
+  struct CUDAMem {
+    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    CUtexObject texobject;
+    CUarray array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, CUDAMem> CUDAMemMap;
+  CUDAMemMap cuda_mem_map;
+
+  struct PixelMem {
+    GLuint cuPBO;
+    CUgraphicsResource cuPBOresource;
+    GLuint cuTexId;
+    int w, h;
+  };
+  map<device_ptr, PixelMem> pixel_mem_map;
+
+  /* Bindless Textures */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+
+  /* Kernels */
+  struct {
+    bool loaded;
+
+    CUfunction adaptive_stopping;
+    CUfunction adaptive_filter_x;
+    CUfunction adaptive_filter_y;
+    CUfunction adaptive_scale_samples;
+    int adaptive_num_threads_per_block;
+  } functions;
+
+  static bool have_precompiled_kernels();
+
+  virtual bool show_samples() const;
+
+  virtual BVHLayoutMask get_bvh_layout_mask() const;
+
+  void cuda_error_documentation();
+
+  bool cuda_error_(CUresult result, const string &stmt);
+
+  void cuda_error_message(const string &message);
+
+  CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
+
+  virtual ~CUDADevice();
+
+  bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
+
+  bool use_adaptive_compilation();
+
+  bool use_split_kernel();
+
+  virtual string compile_kernel_get_common_cflags(
+      const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
+
+  string compile_kernel(const DeviceRequestedFeatures &requested_features,
+                        const char *name,
+                        const char *base = "cuda",
+                        bool force_ptx = false);
+
+  virtual bool load_kernels(const DeviceRequestedFeatures &requested_features);
+
+  void load_functions();
+
+  void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
+
+  void init_host_memory();
+
+  void load_texture_info();
+
+  void move_textures_to_host(size_t size, bool for_texture);
+
+  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+  void generic_copy_to(device_memory &mem);
+
+  void generic_free(device_memory &mem);
+
+  void mem_alloc(device_memory &mem);
+
+  void mem_copy_to(device_memory &mem);
+
+  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
+
+  void mem_zero(device_memory &mem);
+
+  void mem_free(device_memory &mem);
+
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/);
+
+  virtual void const_copy_to(const char *name, void *host, size_t size);
+
+  void global_alloc(device_memory &mem);
+
+  void global_free(device_memory &mem);
+
+  void tex_alloc(device_texture &mem);
+
+  void tex_free(device_texture &mem);
+
+  bool denoising_non_local_means(device_ptr image_ptr,
+                                 device_ptr guide_ptr,
+                                 device_ptr variance_ptr,
+                                 device_ptr out_ptr,
+                                 DenoisingTask *task);
+
+  bool denoising_construct_transform(DenoisingTask *task);
+
+  bool denoising_accumulate(device_ptr color_ptr,
+                            device_ptr color_variance_ptr,
+                            device_ptr scale_ptr,
+                            int frame,
+                            DenoisingTask *task);
+
+  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
+
+  bool denoising_combine_halves(device_ptr a_ptr,
+                                device_ptr b_ptr,
+                                device_ptr mean_ptr,
+                                device_ptr variance_ptr,
+                                int r,
+                                int4 rect,
+                                DenoisingTask *task);
+
+  bool denoising_divide_shadow(device_ptr a_ptr,
+                               device_ptr b_ptr,
+                               device_ptr sample_variance_ptr,
+                               device_ptr sv_variance_ptr,
+                               device_ptr buffer_variance_ptr,
+                               DenoisingTask *task);
+
+  bool denoising_get_feature(int mean_offset,
+                             int variance_offset,
+                             device_ptr mean_ptr,
+                             device_ptr variance_ptr,
+                             float scale,
+                             DenoisingTask *task);
+
+  bool denoising_write_feature(int out_offset,
+                               device_ptr from_ptr,
+                               device_ptr buffer_ptr,
+                               DenoisingTask *task);
+
+  bool denoising_detect_outliers(device_ptr image_ptr,
+                                 device_ptr variance_ptr,
+                                 device_ptr depth_ptr,
+                                 device_ptr output_ptr,
+                                 DenoisingTask *task);
+
+  void denoise(RenderTile &rtile, DenoisingTask &denoising);
+
+  void adaptive_sampling_filter(uint filter_sample,
+                                WorkTile *wtile,
+                                CUdeviceptr d_wtile,
+                                CUstream stream = 0);
+  void adaptive_sampling_post(RenderTile &rtile,
+                              WorkTile *wtile,
+                              CUdeviceptr d_wtile,
+                              CUstream stream = 0);
+
+  void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
+
+  void film_convert(DeviceTask &task,
+                    device_ptr buffer,
+                    device_ptr rgba_byte,
+                    device_ptr rgba_half);
+
+  void shader(DeviceTask &task);
+
+  CUdeviceptr map_pixels(device_ptr mem);
+
+  void unmap_pixels(device_ptr mem);
+
+  void pixels_alloc(device_memory &mem);
+
+  void pixels_copy_from(device_memory &mem, int y, int w, int h);
+
+  void pixels_free(device_memory &mem);
+
+  void draw_pixels(device_memory &mem,
+                   int y,
+                   int w,
+                   int h,
+                   int width,
+                   int height,
+                   int dx,
+                   int dy,
+                   int dw,
+                   int dh,
+                   bool transparent,
+                   const DeviceDrawParams &draw_params);
+
+  void thread_run(DeviceTask *task);
+
+  virtual void task_add(DeviceTask &task);
+
+  virtual void task_wait();
+
+  virtual void task_cancel();
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
new file mode 100644
index 00000000000..0f261ef2f70
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -0,0 +1,2620 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+#  include <climits>
+#  include <limits.h>
+#  include <stdio.h>
+#  include <stdlib.h>
+#  include <string.h>
+
+#  include "device/cuda/device_cuda.h"
+#  include "device/device_intern.h"
+#  include "device/device_split_kernel.h"
+
+#  include "render/buffers.h"
+
+#  include "kernel/filter/filter_defines.h"
+
+#  include "util/util_debug.h"
+#  include "util/util_foreach.h"
+#  include "util/util_logging.h"
+#  include "util/util_map.h"
+#  include "util/util_md5.h"
+#  include "util/util_opengl.h"
+#  include "util/util_path.h"
+#  include "util/util_string.h"
+#  include "util/util_system.h"
+#  include "util/util_time.h"
+#  include "util/util_types.h"
+#  include "util/util_windows.h"
+
+#  include "kernel/split/kernel_split_data_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+#  ifndef WITH_CUDA_DYNLOAD
+
+/* Transparently implement some functions, so that the majority of the file does
+ * not need to worry about the difference between dynamically loaded and linked
+ * CUDA at all.
+ */
+
+namespace {
+
+const char *cuewErrorString(CUresult result)
+{
+  /* Replacement used when CUDA is linked directly instead of dynamically
+   * loaded. We can only give the numeric error code here without major code
+   * duplication; that should be enough since dynamic loading is only being
+   * disabled by folks who know what they're doing anyway.
+   *
+   * The buffer is thread-local, so concurrent callers do not overwrite each
+   * other's text. The returned pointer stays valid until the calling thread
+   * invokes this function again.
+   */
+  static thread_local string error;
+  error = string_printf("%d", (int)result);
+  return error.c_str();
+}
+
+/* Path of the CUDA compiler executable when CUDA is linked directly. The value
+ * comes from CYCLES_CUDA_NVCC_EXECUTABLE, presumably defined by the build
+ * system. compile_kernel() uses it as the nvcc command. */
+const char *cuewCompilerPath()
+{
+  return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+  /* Returns the toolkit version as major * 10 + minor, which is the form
+   * compile_kernel() compares against (80, 101) and prints with / 10 and % 10.
+   * Assumes CUDA_VERSION is encoded as major * 1000 + minor * 10, as in cuda.h
+   * -- TODO confirm. */
+  const int major_part = CUDA_VERSION / 100;
+  const int minor_part = (CUDA_VERSION % 100) / 10;
+  return major_part + minor_part;
+}
+
+} /* namespace */
+#  endif /* WITH_CUDA_DYNLOAD */
+
+class CUDADevice;
+
+/* Split kernel front-end for CUDA devices. Declares the DeviceSplitKernel
+ * virtual functions it overrides; the definitions are outside this
+ * declaration. */
+class CUDASplitKernel : public DeviceSplitKernel {
+  /* Device this split kernel operates on (passed to the constructor). */
+  CUDADevice *device;
+
+ public:
+  explicit CUDASplitKernel(CUDADevice *device);
+
+  /* Size in bytes of the split state buffer for num_threads threads
+   * (presumably queried from the device -- definition not visible here). */
+  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
+
+  /* Enqueue the data-init kernel of the split pipeline. Returns a success
+   * flag. */
+  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+                                              RenderTile &rtile,
+                                              int num_global_elements,
+                                              device_memory &kernel_globals,
+                                              device_memory &kernel_data_,
+                                              device_memory &split_data,
+                                              device_memory &ray_state,
+                                              device_memory &queue_index,
+                                              device_memory &use_queues_flag,
+                                              device_memory &work_pool_wgs);
+
+  /* Look up the split kernel function called kernel_name; the requested
+   * features argument is unnamed in this declaration. */
+  virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+                                                         const DeviceRequestedFeatures &);
+  /* Local and global work sizes used when launching split kernels. */
+  virtual int2 split_kernel_local_size();
+  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+};
+
+/* Utility to push/pop CUDA context.
+ *
+ * Used throughout this file as a local variable at the top of a scope that
+ * issues CUDA calls. Presumably the constructor pushes the device's context and
+ * the destructor pops it -- the definitions are outside this declaration. */
+class CUDAContextScope {
+ public:
+  CUDAContextScope(CUDADevice *device);
+  ~CUDAContextScope();
+
+ private:
+  /* Device whose context this scope manages. */
+  CUDADevice *device;
+};
+
+bool CUDADevice::have_precompiled_kernels()
+{
+  /* Kernels provided with the application are looked up under "lib" (see
+   * compile_kernel()); the existence of that directory is used as the
+   * indicator that precompiled kernels are available. */
+  return path_exists(path_get("lib"));
+}
+
+/* Whether the sample count may be shown for this device. Always true. */
+bool CUDADevice::show_samples() const
+{
+  /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+  return true;
+}
+
+/* BVH layouts this device reports as supported: BVH2 only. */
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+  return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::cuda_error_documentation()
+{
+  /* The documentation hint is printed once per device, with the first error. */
+  if (!first_error) {
+    return;
+  }
+  first_error = false;
+
+  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+  fprintf(stderr,
+          "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+}
+
+#  define cuda_assert(stmt) \
+    { \
+      CUresult result = stmt; \
+\
+      if (result != CUDA_SUCCESS) { \
+        string message = string_printf( \
+            "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
+        if (error_msg == "") \
+          error_msg = message; \
+        fprintf(stderr, "%s\n", message.c_str()); \
+        /*cuda_abort();*/ \
+        cuda_error_documentation(); \
+      } \
+    } \
+    (void)0
+
+/* Report a failed CUDA call.
+ *
+ * result: value returned by the driver call.
+ * stmt:   label of the call, used in the message.
+ *
+ * Returns true when result is an error (after reporting it), false on
+ * CUDA_SUCCESS. */
+bool CUDADevice::cuda_error_(CUresult result, const string &stmt)
+{
+  const bool failed = (result != CUDA_SUCCESS);
+
+  if (failed) {
+    /* cuda_error_message() records the first error, prints the message and
+     * shows the documentation hint. */
+    cuda_error_message(
+        string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)));
+  }
+
+  return failed;
+}
+
+/* Convenience wrapper around cuda_error_(): passes the stringified statement as
+ * the label, and evaluates to true when the call failed. */
+#  define cuda_error(stmt) cuda_error_(stmt, #  stmt)
+
+void CUDADevice::cuda_error_message(const string &message)
+{
+  if (error_msg == "")
+    error_msg = message;
+  fprintf(stderr, "%s\n", message.c_str());
+  cuda_error_documentation();
+}
+
+/* Set up the CUDA device: initialize the driver API, look up the device and
+ * create a context for it.
+ *
+ * A failure of cuInit, cuDeviceGet or context creation is reported through
+ * cuda_error()/cuda_error_() and ends construction early. load_kernels()
+ * refuses to run while cuContext is 0. */
+CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
+    : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+  first_error = true;
+  background = background_;
+
+  cuDevId = info.num;
+  cuDevice = 0;
+  cuContext = 0;
+
+  cuModule = 0;
+  cuFilterModule = 0;
+
+  split_kernel = NULL;
+
+  need_texture_info = false;
+
+  device_texture_headroom = 0;
+  device_working_headroom = 0;
+  move_texture_to_host = false;
+  map_host_limit = 0;
+  map_host_used = 0;
+  can_map_host = 0;
+
+  functions.loaded = false;
+
+  /* Initialize CUDA. */
+  if (cuda_error(cuInit(0)))
+    return;
+
+  /* Setup device and context. */
+  if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+    return;
+
+  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+   * so we can predict which memory to map to host. */
+  cuda_assert(
+      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+  unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+  if (can_map_host) {
+    ctx_flags |= CU_CTX_MAP_HOST;
+    init_host_memory();
+  }
+
+  /* Create context. */
+  CUresult result;
+
+  if (background) {
+    result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+  }
+  else {
+    result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+    /* No OpenGL interop available: fall back to a plain context and treat the
+     * device as a background device from here on. */
+    if (result != CUDA_SUCCESS) {
+      result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+      background = true;
+    }
+  }
+
+  if (cuda_error_(result, "cuCtxCreate"))
+    return;
+
+  /* The attribute queries are not checked, so start from zero rather than
+   * reading indeterminate values if one of them fails. */
+  int major = 0, minor = 0;
+  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+  cuDevArchitecture = major * 100 + minor * 10;
+
+  /* Pop context set by cuCtxCreate. */
+  cuCtxPopCurrent(NULL);
+}
+
+/* Stop outstanding tasks, release the split kernel and texture info, then
+ * destroy the CUDA context if one was created. */
+CUDADevice::~CUDADevice()
+{
+  task_pool.stop();
+
+  delete split_kernel;
+
+  texture_info.free();
+
+  /* The constructor returns early when initialization fails, leaving cuContext
+   * at 0. Destroying a null context would only report a spurious error. */
+  if (cuContext) {
+    cuda_assert(cuCtxDestroy(cuContext));
+  }
+}
+
+/* Check that the device has a supported compute capability.
+ *
+ * Returns false, after reporting an error message, when the major compute
+ * capability is below 3. The requested features are not consulted. */
+bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
+{
+  /* The attribute queries are not checked. Starting from zero makes a failed
+   * query report the device as unsupported instead of reading indeterminate
+   * values. */
+  int major = 0, minor = 0;
+  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+  /* We only support sm_30 and above */
+  if (major < 3) {
+    cuda_error_message(
+        string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
+                      major,
+                      minor));
+    return false;
+  }
+
+  return true;
+}
+
+/* Whether adaptive kernel compilation is enabled in the CUDA debug flags. When
+ * enabled, compile_kernel() skips the kernels provided in "lib". */
+bool CUDADevice::use_adaptive_compilation()
+{
+  return DebugFlags().cuda.adaptive_compile;
+}
+
+/* Whether the split kernel is enabled in the CUDA debug flags. load_kernels()
+ * uses this to choose between "kernel_split" and "kernel". */
+bool CUDADevice::use_split_kernel()
+{
+  return DebugFlags().cuda.split_kernel;
+}
+
+/* Common NVCC flags which stay the same regardless of shading model and kernel
+ * sources md5, and only depend on the compiler or compilation settings.
+ *
+ * requested_features: supplies extra build options, used only for non-filter
+ *                     kernels when adaptive compilation is enabled.
+ * filter:             true when building the filter kernel.
+ * split:              true when building the split kernel (adds -D__SPLIT__).
+ *
+ * Returns the flags as a single space-separated string.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(
+    const DeviceRequestedFeatures &requested_features, bool filter, bool split)
+{
+  const int pointer_bits = system_cpu_bits();
+  /* The kernel source directory doubles as the include path. */
+  const string kernel_include_dir = path_get("source");
+
+  string flags = string_printf(
+      "-m%d "
+      "--ptxas-options=\"-v\" "
+      "--use_fast_math "
+      "-DNVCC "
+      "-I\"%s\"",
+      pointer_bits,
+      kernel_include_dir.c_str());
+
+  const bool add_feature_options = !filter && use_adaptive_compilation();
+  if (add_feature_options) {
+    flags += " " + requested_features.get_build_options();
+  }
+
+  /* User supplied flags from the environment are appended verbatim. */
+  if (const char *env_flags = getenv("CYCLES_CUDA_EXTRA_CFLAGS")) {
+    flags += " ";
+    flags += env_flags;
+  }
+
+#  ifdef WITH_CYCLES_DEBUG
+  flags += " -D__KERNEL_DEBUG__";
+#  endif
+
+  if (split) {
+    flags += " -D__SPLIT__";
+  }
+
+  return flags;
+}
+
+/* Locate or build the kernel binary `name` for this device and return its path.
+ *
+ * Lookup order:
+ * 1. Kernels provided with Blender under "lib" (cubin first unless force_ptx,
+ *    then PTX). Skipped entirely when adaptive compilation is enabled.
+ * 2. A previously compiled kernel in the local cache, keyed by an md5 of the
+ *    kernel sources and the common compiler flags.
+ * 3. Compiling "<base>/<name>.cu" with nvcc into that cache location. On
+ *    Windows this step is not attempted when precompiled kernels exist and
+ *    adaptive compilation is off; an error is reported instead.
+ *
+ * requested_features: forwarded to compile_kernel_get_common_cflags().
+ * name:               kernel name; "filter" / "split" substrings select flags.
+ * base:               directory below kernel/kernels containing the source.
+ * force_ptx:          build/lookup PTX (compute_XY) instead of cubin (sm_XY).
+ *
+ * Returns an empty string on failure. */
+string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
+                                  const char *name,
+                                  const char *base,
+                                  bool force_ptx)
+{
+  /* Compute kernel name. The attribute queries are not checked, so start from
+   * zero rather than reading indeterminate values if one of them fails. */
+  int major = 0, minor = 0;
+  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+  /* Attempt to use kernel provided with Blender. */
+  if (!use_adaptive_compilation()) {
+    if (!force_ptx) {
+      const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+      VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+      if (path_exists(cubin)) {
+        VLOG(1) << "Using precompiled kernel.";
+        return cubin;
+      }
+    }
+
+    const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
+    VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+    if (path_exists(ptx)) {
+      VLOG(1) << "Using precompiled kernel.";
+      return ptx;
+    }
+  }
+
+  /* Try to use locally compiled kernel. */
+  string source_path = path_get("source");
+  const string source_md5 = path_files_md5_hash(source_path);
+
+  /* We include cflags into md5 so changing cuda toolkit or changing other
+   * compiler command line arguments makes sure cubin gets re-built.
+   */
+  string common_cflags = compile_kernel_get_common_cflags(
+      requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
+  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+  const char *const kernel_arch = force_ptx ? "compute" : "sm";
+  const string cubin_file = string_printf(
+      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+  const string cubin = path_cache_get(path_join("kernels", cubin_file));
+  VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+  if (path_exists(cubin)) {
+    VLOG(1) << "Using locally compiled kernel.";
+    return cubin;
+  }
+
+#  ifdef _WIN32
+  /* With precompiled kernels present, a missing binary for this device is
+   * reported as an error instead of attempting a local build. */
+  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+    if (major < 3) {
+      cuda_error_message(
+          string_printf("CUDA device requires compute capability 3.0 or up, "
+                        "found %d.%d. Your GPU is not supported.",
+                        major,
+                        minor));
+    }
+    else {
+      cuda_error_message(
+          string_printf("CUDA binary kernel for this graphics card compute "
+                        "capability (%d.%d) not found.",
+                        major,
+                        minor));
+    }
+    return string();
+  }
+#  endif
+
+  /* Compile. */
+  const char *const nvcc = cuewCompilerPath();
+  if (nvcc == NULL) {
+    cuda_error_message(
+        "CUDA nvcc compiler not found. "
+        "Install CUDA toolkit in default location.");
+    return string();
+  }
+
+  /* Version is encoded as major * 10 + minor. */
+  const int nvcc_cuda_version = cuewCompilerVersion();
+  VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+  if (nvcc_cuda_version < 80) {
+    printf(
+        "Unsupported CUDA version %d.%d detected, "
+        "you need CUDA 8.0 or newer.\n",
+        nvcc_cuda_version / 10,
+        nvcc_cuda_version % 10);
+    return string();
+  }
+  else if (nvcc_cuda_version != 101) {
+    printf(
+        "CUDA version %d.%d detected, build may succeed but only "
+        "CUDA 10.1 is officially supported.\n",
+        nvcc_cuda_version / 10,
+        nvcc_cuda_version % 10);
+  }
+
+  double starttime = time_dt();
+
+  path_create_directories(cubin);
+
+  /* Full path of the kernel source: <source>/kernel/kernels/<base>/<name>.cu */
+  source_path = path_join(path_join(source_path, "kernel"),
+                          path_join("kernels", path_join(base, string_printf("%s.cu", name))));
+
+  string command = string_printf(
+      "\"%s\" "
+      "-arch=%s_%d%d "
+      "--%s \"%s\" "
+      "-o \"%s\" "
+      "%s",
+      nvcc,
+      kernel_arch,
+      major,
+      minor,
+      kernel_ext,
+      source_path.c_str(),
+      cubin.c_str(),
+      common_cflags.c_str());
+
+  printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+#  ifdef _WIN32
+  command = "call " + command;
+#  endif
+  if (system(command.c_str()) != 0) {
+    cuda_error_message(
+        "Failed to execute compilation command, "
+        "see console for details.");
+    return string();
+  }
+
+  /* Verify if compilation succeeded */
+  if (!path_exists(cubin)) {
+    cuda_error_message(
+        "CUDA kernel compilation failed, "
+        "see console for details.");
+    return string();
+  }
+
+  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+  return cubin;
+}
+
+/* Find or compile the render and filter kernels and load them as CUDA modules.
+ *
+ * Returns true only when both modules were loaded (or were already loaded by
+ * an earlier call). Local memory reservation and function lookup are only
+ * performed once both modules are valid. */
+bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
+{
+  /* TODO(sergey): Support kernels re-load for CUDA devices.
+   *
+   * Currently re-loading kernel will invalidate memory pointers,
+   * causing problems in cuCtxSynchronize.
+   */
+  if (cuFilterModule && cuModule) {
+    VLOG(1) << "Skipping kernel reload, not currently supported.";
+    return true;
+  }
+
+  /* check if cuda init succeeded */
+  if (cuContext == 0)
+    return false;
+
+  /* check if GPU is supported */
+  if (!support_device(requested_features))
+    return false;
+
+  /* get kernel */
+  const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
+  string cubin = compile_kernel(requested_features, kernel_name);
+  if (cubin.empty())
+    return false;
+
+  const char *filter_name = "filter";
+  string filter_cubin = compile_kernel(requested_features, filter_name);
+  if (filter_cubin.empty())
+    return false;
+
+  /* open module */
+  CUDAContextScope scope(this);
+
+  string cubin_data;
+  CUresult result;
+
+  if (path_read_text(cubin, cubin_data))
+    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+  else
+    result = CUDA_ERROR_FILE_NOT_FOUND;
+
+  /* Remember the outcome of each load separately: `result` is reused for the
+   * filter module below, and a successful filter load must not hide a failed
+   * kernel load. */
+  const bool kernel_loaded = (result == CUDA_SUCCESS);
+
+  if (cuda_error_(result, "cuModuleLoad"))
+    cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+
+  if (path_read_text(filter_cubin, cubin_data))
+    result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+  else
+    result = CUDA_ERROR_FILE_NOT_FOUND;
+
+  const bool filter_loaded = (result == CUDA_SUCCESS);
+
+  if (cuda_error_(result, "cuModuleLoad"))
+    cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
+  const bool success = kernel_loaded && filter_loaded;
+
+  if (success) {
+    /* Both steps look up functions in cuModule, so they need a valid module. */
+    reserve_local_memory(requested_features);
+    load_functions();
+  }
+
+  return success;
+}
+
+/* Look up the adaptive sampling kernel functions in cuModule and cache them in
+ * `functions`. Runs at most once per device: functions.loaded is set before
+ * the lookups, so later calls return immediately. Failures are reported by
+ * cuda_assert but do not stop the remaining lookups. */
+void CUDADevice::load_functions()
+{
+  /* TODO: load all functions here. */
+  if (functions.loaded) {
+    return;
+  }
+  functions.loaded = true;
+
+  cuda_assert(cuModuleGetFunction(
+      &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
+  cuda_assert(cuModuleGetFunction(
+      &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
+  cuda_assert(cuModuleGetFunction(
+      &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
+  cuda_assert(cuModuleGetFunction(
+      &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
+
+  /* Request the L1-preferring cache configuration for each function. */
+  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
+
+  /* Only the threads-per-block value is kept; it is queried for
+   * adaptive_scale_samples and stored for all adaptive kernels. */
+  int unused_min_blocks;
+  cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
+                                               &functions.adaptive_num_threads_per_block,
+                                               functions.adaptive_scale_samples,
+                                               NULL,
+                                               0,
+                                               0));
+}
+
+/* Launch the path tracing kernel once with no work, so that the local memory
+ * it needs is reserved before scene data is allocated. Does nothing for the
+ * split kernel. The amount reserved is logged as the difference in free device
+ * memory before and after the launch. */
+void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
+{
+  if (use_split_kernel()) {
+    /* Split kernel mostly uses global memory and adaptive compilation,
+     * difficult to predict how much is needed currently. */
+    return;
+  }
+
+  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+   * needed for kernel launches, so that we can reliably figure out when
+   * to allocate scene data in mapped host memory. */
+  CUDAContextScope scope(this);
+
+  size_t total = 0, free_before = 0, free_after = 0;
+  cuMemGetInfo(&free_before, &total);
+
+  /* Get kernel function. */
+  CUfunction cuPathTrace;
+
+  if (requested_features.use_integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+  }
+
+  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+  int min_blocks, num_threads_per_block;
+  cuda_assert(cuOccupancyMaxPotentialBlockSize(
+      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+  /* Launch kernel, using just 1 block appears sufficient to reserve
+   * memory for all multiprocessors. It would be good to do this in
+   * parallel for the multi GPU case still to make it faster. */
+  /* Null work tile pointer and zero work size are passed as the kernel
+   * arguments. */
+  CUdeviceptr d_work_tiles = 0;
+  uint total_work_size = 0;
+
+  void *args[] = {&d_work_tiles, &total_work_size};
+
+  cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+  cuda_assert(cuCtxSynchronize());
+
+  cuMemGetInfo(&free_after, &total);
+  VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+          << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+#  if 0
+  /* For testing mapped host memory, fill up device memory. */
+  const size_t keep_mb = 1024;
+
+  while (free_after > keep_mb * 1024 * 1024LL) {
+    CUdeviceptr tmp;
+    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+    cuMemGetInfo(&free_after, &total);
+  }
+#  endif
+}
+
+/* Compute the limit for mapped host memory and set the device memory
+ * headrooms. Called from the constructor when the device can map host
+ * memory. */
+void CUDADevice::init_host_memory()
+{
+  /* Limit the amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system memory
+   * free, whichever is smaller. */
+  const size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  const size_t system_ram = system_physical_ram();
+
+  if (system_ram == 0) {
+    VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+  else if (system_ram / 2 > default_limit) {
+    /* Large system: keep 4 GB free. */
+    map_host_limit = system_ram - default_limit;
+  }
+  else {
+    /* Small system: keep half of the RAM free. */
+    map_host_limit = system_ram / 2;
+  }
+
+  /* Amount of device memory to keep free after texture memory and working
+   * memory allocations respectively. The working memory headroom is set lower
+   * so that some space is left after all texture memory allocations. */
+  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
+  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
+
+  VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+          << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+/* Upload the texture info array to the device if it was flagged as needing an
+ * update, and clear the flag. */
+void CUDADevice::load_texture_info()
+{
+  if (!need_texture_info) {
+    return;
+  }
+
+  texture_info.copy_to_device();
+  need_texture_info = false;
+}
+
+/* Free up device memory by re-allocating texture/global allocations in mapped
+ * host memory.
+ *
+ * size:        number of bytes that should be freed on the device.
+ * for_texture: when true the request is made on behalf of a texture
+ *              allocation, and only image textures (data_height > 1) are
+ *              candidates for moving.
+ *
+ * Allocations are moved one at a time, largest first with images preferred,
+ * until `size` bytes have been moved or no candidate is left. */
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+      device_memory &mem = *pair.first;
+      CUDAMem *cmem = &pair.second;
+
+      /* Same classification as in generic_alloc(): texture info itself is
+       * never treated as a texture. */
+      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+                        (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* Already in host memory. */
+      if (cmem->use_mapped_host) {
+        continue;
+      }
+
+      /* For other textures, only move image textures. */
+      if (for_texture && !is_image) {
+        continue;
+      }
+
+      /* Try to move largest allocation, prefer moving images. */
+      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+        max_is_image = is_image;
+        max_size = mem.device_size;
+        max_mem = &mem;
+      }
+    }
+
+    /* Move to host memory. This part is mutex protected since
+     * multiple CUDA devices could be moving the memory. The
+     * first one will do it, and the rest will adopt the pointer. */
+    if (max_mem) {
+      VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+      static thread_mutex move_mutex;
+      thread_scoped_lock lock(move_mutex);
+
+      /* Preserve the original device pointer, in case of multi device
+       * we can't change it because the pointer mapping would break. */
+      device_ptr prev_pointer = max_mem->device_pointer;
+      size_t prev_size = max_mem->device_size;
+
+      /* mem_copy_to() frees and re-allocates texture/global memory; with
+       * move_texture_to_host set, generic_alloc() skips device memory. */
+      mem_copy_to(*max_mem);
+      /* Subtract what was moved, clamping at zero. */
+      size = (max_size >= size) ? 0 : size - max_size;
+
+      max_mem->device_pointer = prev_pointer;
+      max_mem->device_size = prev_size;
+    }
+    else {
+      /* Nothing left that can be moved. */
+      break;
+    }
+  }
+
+  /* Update texture info array with new pointers. */
+  load_texture_info();
+
+  move_texture_to_host = false;
+}
+
+/* Allocate memory for `mem`, in device memory when there is room and in mapped
+ * host memory otherwise.
+ *
+ * mem:           allocation to back; device_pointer and device_size are set.
+ * pitch_padding: extra bytes added on top of mem.memory_size().
+ *
+ * Returns the entry for `mem` in cuda_mem_map, or NULL when no memory could be
+ * allocated.
+ *
+ * NOTE(review): device_size and the memory statistics are updated before the
+ * NULL check, so a failed allocation still appears to be counted in stats --
+ * confirm whether that is intended. */
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+  CUDAContextScope scope(this);
+
+  CUdeviceptr device_pointer = 0;
+  size_t size = mem.memory_size() + pitch_padding;
+
+  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+  const char *status = "";
+
+  /* First try allocating in device memory, respecting headroom. We make
+   * an exception for texture info. It is small and frequently accessed,
+   * so treat it as working memory.
+   *
+   * If there is not enough room for working memory, we will try to move
+   * textures to host memory, assuming the performance impact would have
+   * been worse for working memory. */
+  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+  bool is_image = is_texture && (mem.data_height > 1);
+
+  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+  size_t total = 0, free = 0;
+  cuMemGetInfo(&free, &total);
+
+  /* Move textures to host memory if needed. */
+  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+    move_textures_to_host(size + headroom - free, is_texture);
+    cuMemGetInfo(&free, &total);
+  }
+
+  /* Allocate in device memory. */
+  if (!move_texture_to_host && (size + headroom) < free) {
+    mem_alloc_result = cuMemAlloc(&device_pointer, size);
+    if (mem_alloc_result == CUDA_SUCCESS) {
+      status = " in device memory";
+    }
+  }
+
+  /* Fall back to mapped host memory if needed and possible. */
+
+  void *shared_pointer = 0;
+
+  if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+    if (mem.shared_pointer) {
+      /* Another device already allocated host memory. */
+      mem_alloc_result = CUDA_SUCCESS;
+      shared_pointer = mem.shared_pointer;
+    }
+    else if (map_host_used + size < map_host_limit) {
+      /* Allocate host memory ourselves. */
+      mem_alloc_result = cuMemHostAlloc(
+          &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+      assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+             (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+    }
+
+    if (mem_alloc_result == CUDA_SUCCESS) {
+      /* Get the device-side address of the mapped host allocation. */
+      cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+      map_host_used += size;
+      status = " in host memory";
+    }
+    else {
+      status = " failed, out of host memory";
+    }
+  }
+
+  /* Report the failure. This also replaces the host-only status text set
+   * above. */
+  if (mem_alloc_result != CUDA_SUCCESS) {
+    status = " failed, out of device and host memory";
+    cuda_assert(mem_alloc_result);
+  }
+
+  if (mem.name) {
+    VLOG(1) << "Buffer allocate: " << mem.name << ", "
+            << string_human_readable_number(mem.memory_size()) << " bytes. ("
+            << string_human_readable_size(mem.memory_size()) << ")" << status;
+  }
+
+  mem.device_pointer = (device_ptr)device_pointer;
+  mem.device_size = size;
+  stats.mem_alloc(size);
+
+  if (!mem.device_pointer) {
+    return NULL;
+  }
+
+  /* Insert into map of allocations. */
+  CUDAMem *cmem = &cuda_mem_map[&mem];
+  if (shared_pointer != 0) {
+    /* Replace host pointer with our host allocation. Only works if
+     * CUDA memory layout is the same and has no pitch padding. Also
+     * does not work if we move textures to host during a render,
+     * since other devices might be using the memory. */
+
+    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+        mem.host_pointer != shared_pointer) {
+      memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device. */
+
+      mem.host_free();
+      mem.host_pointer = shared_pointer;
+    }
+    /* Each device using the mapped allocation holds one reference, released
+     * in generic_free(). */
+    mem.shared_pointer = shared_pointer;
+    mem.shared_counter++;
+    cmem->use_mapped_host = true;
+  }
+  else {
+    cmem->use_mapped_host = false;
+  }
+
+  return cmem;
+}
+
+/* Copy the host data of `mem` to its device allocation, unless the device
+ * reads the host allocation directly. */
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+  /* Nothing to copy without both a source and a destination. */
+  if (!mem.host_pointer || !mem.device_pointer) {
+    return;
+  }
+
+  /* The copy is skipped only when this device uses mapped host memory and the
+   * host pointer is that very mapping. If use_mapped_host is false, the device
+   * uses memory from cuMemAlloc regardless of mem.host_pointer and
+   * mem.shared_pointer, and the data must be copied from mem.host_pointer. */
+  const bool host_is_mapped_allocation = cuda_mem_map[&mem].use_mapped_host &&
+                                         mem.host_pointer == mem.shared_pointer;
+  if (host_is_mapped_allocation) {
+    return;
+  }
+
+  const CUDAContextScope scope(this);
+  cuda_assert(
+      cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+}
+
+/* Release the allocation made by generic_alloc() for `mem`, update statistics
+ * and remove it from cuda_mem_map. Does nothing when `mem` has no device
+ * pointer. */
+void CUDADevice::generic_free(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    return;
+  }
+
+  CUDAContextScope scope(this);
+  const CUDAMem &cmem = cuda_mem_map[&mem];
+
+  if (!cmem.use_mapped_host) {
+    /* Free device memory. */
+    cuMemFree(mem.device_pointer);
+  }
+  else {
+    /* Mapped host memory is reference counted: it is only freed once the
+     * last user releases it. */
+    assert(mem.shared_pointer);
+    if (mem.shared_pointer) {
+      assert(mem.shared_counter > 0);
+      mem.shared_counter--;
+      if (mem.shared_counter == 0) {
+        /* Do not leave host_pointer dangling if it was the mapping. */
+        if (mem.host_pointer == mem.shared_pointer) {
+          mem.host_pointer = 0;
+        }
+        cuMemFreeHost(mem.shared_pointer);
+        mem.shared_pointer = 0;
+      }
+    }
+    map_host_used -= mem.device_size;
+  }
+
+  stats.mem_free(mem.device_size);
+  mem.device_pointer = 0;
+  mem.device_size = 0;
+
+  cuda_mem_map.erase(cuda_mem_map.find(&mem));
+}
+
+/* Allocate device memory for `mem`. Pixel buffers go through pixels_alloc()
+ * unless running in background mode; texture and global memory are not
+ * supported here (assert); everything else uses generic_alloc(). */
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+  if (mem.type == MEM_PIXELS && !background) {
+    pixels_alloc(mem);
+    return;
+  }
+
+  if (mem.type == MEM_TEXTURE) {
+    assert(!"mem_alloc not supported for textures.");
+    return;
+  }
+
+  if (mem.type == MEM_GLOBAL) {
+    assert(!"mem_alloc not supported for global memory.");
+    return;
+  }
+
+  generic_alloc(mem);
+}
+
+/* Copy `mem` from host to device, allocating if needed. Global and texture
+ * memory is freed and allocated again; pixel buffers are not supported
+ * (assert). */
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+  if (mem.type == MEM_PIXELS) {
+    assert(!"mem_copy_to not supported for pixels.");
+    return;
+  }
+
+  if (mem.type == MEM_GLOBAL) {
+    global_free(mem);
+    global_alloc(mem);
+    return;
+  }
+
+  if (mem.type == MEM_TEXTURE) {
+    tex_free((device_texture &)mem);
+    tex_alloc((device_texture &)mem);
+    return;
+  }
+
+  /* Generic memory: allocate on first use, then copy. */
+  if (!mem.device_pointer) {
+    generic_alloc(mem);
+  }
+  generic_copy_to(mem);
+}
+
+/* Copy a range of rows of `mem` from the device back to the host.
+ *
+ * y:    first row to copy.
+ * w, h: width of a row and number of rows, in elements.
+ * elem: size of one element in bytes.
+ *
+ * Pixel buffers go through pixels_copy_from() unless running in background
+ * mode. Texture and global memory are not supported (assert). If there is no
+ * device allocation, the host range is zeroed instead. Nothing happens when
+ * `mem` has no host pointer. */
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+  if (mem.type == MEM_PIXELS && !background) {
+    pixels_copy_from(mem, y, w, h);
+  }
+  else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+    assert(!"mem_copy_from not supported for textures.");
+  }
+  else if (mem.host_pointer) {
+    /* Widen to size_t before multiplying: the products evaluated in int
+     * overflow for large buffers. */
+    const size_t size = (size_t)elem * w * h;
+    const size_t offset = (size_t)elem * y * w;
+
+    if (mem.device_pointer) {
+      const CUDAContextScope scope(this);
+      cuda_assert(cuMemcpyDtoH(
+          (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+    }
+    else {
+      memset((char *)mem.host_pointer + offset, 0, size);
+    }
+  }
+}
+
+/* Zero the contents of `mem`, allocating it first if needed. Gives up silently
+ * when the allocation fails. */
+void CUDADevice::mem_zero(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    mem_alloc(mem);
+    if (!mem.device_pointer) {
+      return;
+    }
+  }
+
+  /* If use_mapped_host of mem is false, mem.device_pointer refers to device
+   * memory regardless of mem.host_pointer and mem.shared_pointer, so it is
+   * cleared with a device memset. Only when the host pointer is the mapped
+   * allocation itself is it cleared from the host side. */
+  const bool host_is_mapped_allocation = cuda_mem_map[&mem].use_mapped_host &&
+                                         mem.host_pointer == mem.shared_pointer;
+
+  if (!host_is_mapped_allocation) {
+    const CUDAContextScope scope(this);
+    cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+  }
+  else if (mem.host_pointer) {
+    memset(mem.host_pointer, 0, mem.memory_size());
+  }
+}
+
+/* Free the device memory of `mem`, dispatching on its memory type. */
+void CUDADevice::mem_free(device_memory &mem)
+{
+  if (mem.type == MEM_PIXELS && !background) {
+    pixels_free(mem);
+    return;
+  }
+
+  if (mem.type == MEM_GLOBAL) {
+    global_free(mem);
+    return;
+  }
+
+  if (mem.type == MEM_TEXTURE) {
+    tex_free((device_texture &)mem);
+    return;
+  }
+
+  generic_free(mem);
+}
+
+/* Return a device pointer into `mem`, `offset` elements past its start. No new
+ * memory is allocated, and the size argument is unused. */
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+  char *const base = (char *)mem.device_pointer;
+  return (device_ptr)(base + mem.memory_elements_size(offset));
+}
+
+/* Copy `size` bytes from `host` into the global variable `name` of the loaded
+ * kernel module (cuModule).
+ *
+ * NOTE(review): the size reported by the module for the variable is not
+ * compared against `size` (the check is commented out) -- confirm callers
+ * always pass a matching size. */
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+  CUDAContextScope scope(this);
+  CUdeviceptr mem;
+  size_t bytes;
+
+  /* Look up the device address and size of the module global. */
+  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+  // assert(bytes == size);
+  cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+/* Allocate and upload `mem`, then store its device pointer in the module global that carries
+ * the same name as the buffer. */
+void CUDADevice::global_alloc(device_memory &mem)
+{
+  const CUDAContextScope scope(this);
+
+  /* Create the device allocation and fill it from the host data. */
+  generic_alloc(mem);
+  generic_copy_to(mem);
+
+  /* Publish the address to the kernels. */
+  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+/* Free a buffer created by global_alloc(). Does nothing when there is no device pointer. */
+void CUDADevice::global_free(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    return;
+  }
+
+  const CUDAContextScope scope(this);
+  generic_free(mem);
+}
+
+/* Upload the image in `mem` to the device and create a bindless texture object for it.
+ *
+ * Depending on the image dimensions the data is stored as a 3D CUDA array (data_depth > 1),
+ * as pitch-aligned linear memory (data_height > 0) or as plain linear memory. The resulting
+ * texture object handle is written to texture_info[mem.slot] and need_texture_info is set so
+ * that the table is (re-)uploaded.
+ *
+ * The function returns early, without creating a texture object, when the data type has no
+ * matching CUDA array format or when the device allocation fails. */
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+  CUDAContextScope scope(this);
+
+  /* Element size and total size of the host data, in bytes. */
+  size_t dsize = datatype_size(mem.data_type);
+  size_t size = mem.memory_size();
+
+  /* Translate the extension mode into a CUDA address mode. */
+  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+  switch (mem.info.extension) {
+    case EXTENSION_REPEAT:
+      address_mode = CU_TR_ADDRESS_MODE_WRAP;
+      break;
+    case EXTENSION_EXTEND:
+      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+      break;
+    case EXTENSION_CLIP:
+      address_mode = CU_TR_ADDRESS_MODE_BORDER;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* Closest interpolation maps to point sampling, everything else to linear filtering. */
+  CUfilter_mode filter_mode;
+  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+    filter_mode = CU_TR_FILTER_MODE_POINT;
+  }
+  else {
+    filter_mode = CU_TR_FILTER_MODE_LINEAR;
+  }
+
+  /* Image Texture Storage */
+  CUarray_format_enum format;
+  switch (mem.data_type) {
+    case TYPE_UCHAR:
+      format = CU_AD_FORMAT_UNSIGNED_INT8;
+      break;
+    case TYPE_UINT16:
+      format = CU_AD_FORMAT_UNSIGNED_INT16;
+      break;
+    case TYPE_UINT:
+      format = CU_AD_FORMAT_UNSIGNED_INT32;
+      break;
+    case TYPE_INT:
+      format = CU_AD_FORMAT_SIGNED_INT32;
+      break;
+    case TYPE_FLOAT:
+      format = CU_AD_FORMAT_FLOAT;
+      break;
+    case TYPE_HALF:
+      format = CU_AD_FORMAT_HALF;
+      break;
+    default:
+      /* No CUDA array format for this data type. */
+      assert(0);
+      return;
+  }
+
+  CUDAMem *cmem = NULL;
+  CUarray array_3d = NULL;
+  /* Bytes per row in the host data; the device pitch only differs for 2D textures. */
+  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+  size_t dst_pitch = src_pitch;
+
+  if (mem.data_depth > 1) {
+    /* 3D texture using array, there is no API for linear memory. */
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+
+    desc.Width = mem.data_width;
+    desc.Height = mem.data_height;
+    desc.Depth = mem.data_depth;
+    desc.Format = format;
+    desc.NumChannels = mem.data_elements;
+    desc.Flags = 0;
+
+    VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+            << string_human_readable_number(mem.memory_size()) << " bytes. ("
+            << string_human_readable_size(mem.memory_size()) << ")";
+
+    cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+    /* array_3d is still NULL when the creation above failed. */
+    if (!array_3d) {
+      return;
+    }
+
+    CUDA_MEMCPY3D param;
+    memset(&param, 0, sizeof(param));
+    param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+    param.dstArray = array_3d;
+    param.srcMemoryType = CU_MEMORYTYPE_HOST;
+    param.srcHost = mem.host_pointer;
+    param.srcPitch = src_pitch;
+    param.WidthInBytes = param.srcPitch;
+    param.Height = mem.data_height;
+    param.Depth = mem.data_depth;
+
+    cuda_assert(cuMemcpy3D(&param));
+
+    /* The array is not created through generic_alloc(), so do its bookkeeping here. */
+    mem.device_pointer = (device_ptr)array_3d;
+    mem.device_size = size;
+    stats.mem_alloc(size);
+
+    cmem = &cuda_mem_map[&mem];
+    cmem->texobject = 0;
+    cmem->array = array_3d;
+  }
+  else if (mem.data_height > 0) {
+    /* 2D texture, using pitch aligned linear memory. */
+    int alignment = 0;
+    cuda_assert(
+        cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+    dst_pitch = align_up(src_pitch, alignment);
+    size_t dst_size = dst_pitch * mem.data_height;
+
+    /* Request the row padding on top of the regular memory size. */
+    cmem = generic_alloc(mem, dst_size - mem.memory_size());
+    if (!cmem) {
+      return;
+    }
+
+    CUDA_MEMCPY2D param;
+    memset(&param, 0, sizeof(param));
+    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    param.dstDevice = mem.device_pointer;
+    param.dstPitch = dst_pitch;
+    param.srcMemoryType = CU_MEMORYTYPE_HOST;
+    param.srcHost = mem.host_pointer;
+    param.srcPitch = src_pitch;
+    param.WidthInBytes = param.srcPitch;
+    param.Height = mem.data_height;
+
+    cuda_assert(cuMemcpy2DUnaligned(&param));
+  }
+  else {
+    /* 1D texture, using linear memory. */
+    cmem = generic_alloc(mem);
+    if (!cmem) {
+      return;
+    }
+
+    cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+  }
+
+  /* Kepler+, bindless textures. */
+  CUDA_RESOURCE_DESC resDesc;
+  memset(&resDesc, 0, sizeof(resDesc));
+
+  /* Describe the storage chosen above; the branches mirror the allocation branches. */
+  if (array_3d) {
+    resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+    resDesc.res.array.hArray = array_3d;
+    resDesc.flags = 0;
+  }
+  else if (mem.data_height > 0) {
+    resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+    resDesc.res.pitch2D.devPtr = mem.device_pointer;
+    resDesc.res.pitch2D.format = format;
+    resDesc.res.pitch2D.numChannels = mem.data_elements;
+    resDesc.res.pitch2D.height = mem.data_height;
+    resDesc.res.pitch2D.width = mem.data_width;
+    resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+  }
+  else {
+    resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+    resDesc.res.linear.devPtr = mem.device_pointer;
+    resDesc.res.linear.format = format;
+    resDesc.res.linear.numChannels = mem.data_elements;
+    resDesc.res.linear.sizeInBytes = mem.device_size;
+  }
+
+  /* Same address mode on all three axes, sampled with normalized coordinates. */
+  CUDA_TEXTURE_DESC texDesc;
+  memset(&texDesc, 0, sizeof(texDesc));
+  texDesc.addressMode[0] = address_mode;
+  texDesc.addressMode[1] = address_mode;
+  texDesc.addressMode[2] = address_mode;
+  texDesc.filterMode = filter_mode;
+  texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+  cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+  /* Resize once */
+  const uint slot = mem.slot;
+  if (slot >= texture_info.size()) {
+    /* Allocate some slots in advance, to reduce amount
+     * of re-allocations. */
+    texture_info.resize(slot + 128);
+  }
+
+  /* Set Mapping and tag that we need to (re-)upload to device */
+  texture_info[slot] = mem.info;
+  texture_info[slot].data = (uint64_t)cmem->texobject;
+  need_texture_info = true;
+}
+
+/* Destroy the texture object of `mem` and release its storage. Textures backed by a CUDA array
+ * are torn down here, all others go through generic_free(). Does nothing when `mem` has no
+ * device pointer. */
+void CUDADevice::tex_free(device_texture &mem)
+{
+  if (!mem.device_pointer) {
+    return;
+  }
+
+  CUDAContextScope scope(this);
+  const CUDAMem &cmem = cuda_mem_map[&mem];
+
+  /* Free bindless texture. */
+  if (cmem.texobject) {
+    cuTexObjectDestroy(cmem.texobject);
+  }
+
+  if (!cmem.array) {
+    /* Not array backed, the generic path releases the memory. */
+    generic_free(mem);
+    return;
+  }
+
+  /* Free array, then undo the bookkeeping done for it at allocation time. */
+  cuArrayDestroy(cmem.array);
+  stats.mem_free(mem.device_size);
+  mem.device_pointer = 0;
+  mem.device_size = 0;
+
+  cuda_mem_map.erase(cuda_mem_map.find(&mem));
+}
+
+/* Compute a 2D launch configuration covering a w x h domain for kernel `func`.
+ * Declares the locals `threads_per_block`, `threads`, `xblocks` and `yblocks` in the invoking
+ * scope, so it can be used only once per scope. `threads` is the truncated square root of the
+ * kernel's maximum threads per block; the block counts are w and h divided by `threads`,
+ * rounded up. */
+#  define CUDA_GET_BLOCKSIZE(func, w, h) \
+    int threads_per_block; \
+    cuda_assert( \
+        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+    int threads = (int)sqrt((float)threads_per_block); \
+    int xblocks = ((w) + threads - 1) / threads; \
+    int yblocks = ((h) + threads - 1) / threads;
+
+/* Launch `func` on the default stream with the configuration that CUDA_GET_BLOCKSIZE declared
+ * in the same scope (threads x threads blocks, no dynamic shared memory). */
+#  define CUDA_LAUNCH_KERNEL(func, args) \
+    cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
+
+/* Similar to CUDA_GET_BLOCKSIZE, but for 1-dimensional blocks: each block holds
+ * `threads_per_block` threads along x, `xblocks` is w divided by that count rounded up, and
+ * `yblocks` is h unchanged. No `threads` local is declared. */
+#  define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+    int threads_per_block; \
+    cuda_assert( \
+        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+    int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
+    int yblocks = h;
+
+/* Launch `func` on the default stream with the configuration that CUDA_GET_BLOCKSIZE_1D
+ * declared in the same scope. */
+#  define CUDA_LAUNCH_KERNEL_1D(func, args) \
+    cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
+
+/* Run the non-local means kernels of the filter module.
+ *
+ * image_ptr:    Image that is filtered.
+ * guide_ptr:    Image the patch differences are computed on.
+ * variance_ptr: Variance passed along with the guide image.
+ * out_ptr:      Output, cleared here before the kernels accumulate into it.
+ * task:         Supplies the buffer layout, the NLM parameters and the temporary memory.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
+                                           device_ptr guide_ptr,
+                                           device_ptr variance_ptr,
+                                           device_ptr out_ptr,
+                                           DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Filter parameters. */
+  int r = task->nlm_state.r;
+  int f = task->nlm_state.f;
+  float a = task->nlm_state.a;
+  float k_2 = task->nlm_state.k_2;
+
+  /* Buffer layout. */
+  int w = task->buffer.width;
+  int h = task->buffer.h;
+  int stride = task->buffer.stride;
+  int pass_stride = task->buffer.pass_stride;
+  int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+  int frame_offset = 0;
+
+  /* One shift per offset inside the (2r + 1) x (2r + 1) search window. */
+  int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+  if (have_error()) {
+    return false;
+  }
+
+  /* The temporary buffer holds three consecutive regions: the differences, the blurred
+   * differences (each pass_stride * num_shifts floats) and the accumulated weights. */
+  const size_t shifts_bytes = sizeof(float) * pass_stride * num_shifts;
+  CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
+  CUdeviceptr blurDifference = difference + shifts_bytes;
+  CUdeviceptr weightAccum = difference + 2 * shifts_bytes;
+  CUdeviceptr scale_ptr = 0;
+
+  cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
+  cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
+
+  /* Accumulation: the scope keeps the launch configuration locals of the macro private. */
+  {
+    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+    cuda_assert(cuModuleGetFunction(
+        &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+    cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+    cuda_assert(cuModuleGetFunction(
+        &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+    cuda_assert(cuModuleGetFunction(
+        &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+
+    cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+
+    /* The configuration queried for the first kernel is reused for every launch below. */
+    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
+
+    void *calc_difference_args[] = {&guide_ptr,
+                                    &variance_ptr,
+                                    &scale_ptr,
+                                    &difference,
+                                    &w,
+                                    &h,
+                                    &stride,
+                                    &pass_stride,
+                                    &r,
+                                    &channel_offset,
+                                    &frame_offset,
+                                    &a,
+                                    &k_2};
+    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+    void *calc_weight_args[] = {
+        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+    void *update_output_args[] = {&blurDifference,
+                                  &image_ptr,
+                                  &out_ptr,
+                                  &weightAccum,
+                                  &w,
+                                  &h,
+                                  &stride,
+                                  &pass_stride,
+                                  &channel_offset,
+                                  &r,
+                                  &f};
+
+    /* Difference, blur, weight, blur again, then accumulate into the output. */
+    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+  }
+
+  /* Normalization of the output with the accumulated weights, using a 2D launch. */
+  {
+    CUfunction cuNLMNormalize;
+    cuda_assert(
+        cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+    void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+
+    CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+    CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+    cuda_assert(cuCtxSynchronize());
+  }
+
+  return !have_error();
+}
+
+/* Launch the "construct transform" filter kernel over the task->storage.w x task->storage.h
+ * domain and wait for it to finish. Returns false when the device is in an error state before
+ * or after running. */
+bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Kernel arguments, in the order of the kernel signature. */
+  void *args[] = {&task->buffer.mem.device_pointer,
+                  &task->tile_info_mem.device_pointer,
+                  &task->storage.transform.device_pointer,
+                  &task->storage.rank.device_pointer,
+                  &task->filter_area,
+                  &task->rect,
+                  &task->radius,
+                  &task->pca_threshold,
+                  &task->buffer.pass_stride,
+                  &task->buffer.frame_stride,
+                  &task->buffer.use_time};
+
+  CUfunction cuFilterConstructTransform;
+  cuda_assert(cuModuleGetFunction(
+      &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+  /* Unlike the other filter kernels this one asks for the shared memory cache preference. */
+  cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+
+  CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
+  CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Compute NLM weights for one frame and feed them to the "construct gramian" kernel.
+ *
+ * color_ptr:          Color input of the difference kernel.
+ * color_variance_ptr: Variance matching color_ptr.
+ * scale_ptr:          Scale input of the difference kernel.
+ * frame:              Frame index, used for the frame offset and the tile_info->frames lookup.
+ * task:               Supplies the buffer layout, the storage buffers and temporary memory.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
+                                      device_ptr color_variance_ptr,
+                                      device_ptr scale_ptr,
+                                      int frame,
+                                      DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Filter parameters; f and a are fixed here. */
+  int r = task->radius;
+  int f = 4;
+  float a = 1.0f;
+  float k_2 = task->nlm_k_2;
+
+  /* Buffer layout. */
+  int w = task->reconstruction_state.source_w;
+  int h = task->reconstruction_state.source_h;
+  int stride = task->buffer.stride;
+  int pass_stride = task->buffer.pass_stride;
+  int frame_offset = frame * task->buffer.frame_stride;
+  int t = task->tile_info->frames[frame];
+
+  /* One shift per offset inside the (2r + 1) x (2r + 1) search window. */
+  int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+  if (have_error()) {
+    return false;
+  }
+
+  /* The blurred differences follow the differences inside the temporary buffer. */
+  CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
+  CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+
+  CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+  cuda_assert(cuModuleGetFunction(
+      &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+  cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+  cuda_assert(
+      cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+  cuda_assert(cuModuleGetFunction(
+      &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+  /* The configuration queried for the first kernel is reused for every launch below. */
+  CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+                        task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+                        num_shifts);
+
+  /* Note that pass_stride is passed twice to the difference kernel. */
+  void *calc_difference_args[] = {&color_ptr,
+                                  &color_variance_ptr,
+                                  &scale_ptr,
+                                  &difference,
+                                  &w,
+                                  &h,
+                                  &stride,
+                                  &pass_stride,
+                                  &r,
+                                  &pass_stride,
+                                  &frame_offset,
+                                  &a,
+                                  &k_2};
+  void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+  void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+  void *construct_gramian_args[] = {&t,
+                                    &blurDifference,
+                                    &task->buffer.mem.device_pointer,
+                                    &task->storage.transform.device_pointer,
+                                    &task->storage.rank.device_pointer,
+                                    &task->storage.XtWX.device_pointer,
+                                    &task->storage.XtWY.device_pointer,
+                                    &task->reconstruction_state.filter_window,
+                                    &w,
+                                    &h,
+                                    &stride,
+                                    &pass_stride,
+                                    &r,
+                                    &f,
+                                    &frame_offset,
+                                    &task->buffer.use_time};
+
+  /* Difference, blur, weight, blur again, then accumulate into the gramian. */
+  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+  CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Launch the "finalize" filter kernel, which writes to `output_ptr` from the rank, XtWX and
+ * XtWY storage buffers of `task`, and wait for it to finish.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
+{
+  /* Same preamble as the other denoising callbacks: skip the work on a failed device, and
+   * make sure the device context is current for the driver calls below instead of relying
+   * on the caller having set it up. */
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  CUfunction cuFinalize;
+  cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+  cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+  void *finalize_args[] = {&output_ptr,
+                           &task->storage.rank.device_pointer,
+                           &task->storage.XtWX.device_pointer,
+                           &task->storage.XtWY.device_pointer,
+                           &task->filter_area,
+                           &task->reconstruction_state.buffer_params.x,
+                           &task->render_buffer.samples};
+  CUDA_GET_BLOCKSIZE(
+      cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
+  CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Launch the "combine halves" filter kernel on the buffers a_ptr and b_ptr, producing mean_ptr
+ * and variance_ptr, and wait for it to finish. The launch grid is sized from task->rect while
+ * the kernel itself receives the `rect` and `r` arguments.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
+                                          device_ptr b_ptr,
+                                          device_ptr mean_ptr,
+                                          device_ptr variance_ptr,
+                                          int r,
+                                          int4 rect,
+                                          DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Kernel arguments: outputs first, then the two inputs. */
+  void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
+
+  CUfunction cuFilterCombineHalves;
+  cuda_assert(cuModuleGetFunction(
+      &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+  cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+
+  CUDA_GET_BLOCKSIZE(
+      cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+  CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Launch the "divide shadow" filter kernel over the extent of task->rect and wait for it to
+ * finish. The device pointers are forwarded to the kernel together with the sample count,
+ * tile info, rect, pass stride and offset taken from `task`.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
+                                         device_ptr b_ptr,
+                                         device_ptr sample_variance_ptr,
+                                         device_ptr sv_variance_ptr,
+                                         device_ptr buffer_variance_ptr,
+                                         DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Kernel arguments, in the order of the kernel signature. */
+  void *args[] = {&task->render_buffer.samples,
+                  &task->tile_info_mem.device_pointer,
+                  &a_ptr,
+                  &b_ptr,
+                  &sample_variance_ptr,
+                  &sv_variance_ptr,
+                  &buffer_variance_ptr,
+                  &task->rect,
+                  &task->render_buffer.pass_stride,
+                  &task->render_buffer.offset};
+
+  CUfunction cuFilterDivideShadow;
+  cuda_assert(cuModuleGetFunction(
+      &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+  cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+
+  CUDA_GET_BLOCKSIZE(
+      cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+  CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Launch the "get feature" filter kernel over the extent of task->rect and wait for it to
+ * finish. The offsets, output pointers and scale are forwarded to the kernel together with
+ * the sample count, tile info, rect, pass stride and offset taken from `task`.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_get_feature(int mean_offset,
+                                       int variance_offset,
+                                       device_ptr mean_ptr,
+                                       device_ptr variance_ptr,
+                                       float scale,
+                                       DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Kernel arguments, in the order of the kernel signature. */
+  void *args[] = {&task->render_buffer.samples,
+                  &task->tile_info_mem.device_pointer,
+                  &mean_offset,
+                  &variance_offset,
+                  &mean_ptr,
+                  &variance_ptr,
+                  &scale,
+                  &task->rect,
+                  &task->render_buffer.pass_stride,
+                  &task->render_buffer.offset};
+
+  CUfunction cuFilterGetFeature;
+  cuda_assert(
+      cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+  cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+
+  CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+  CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Launch the "write feature" filter kernel and wait for it to finish. The launch grid is sized
+ * from the z and w components of task->filter_area.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_write_feature(int out_offset,
+                                         device_ptr from_ptr,
+                                         device_ptr buffer_ptr,
+                                         DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Kernel arguments, in the order of the kernel signature. */
+  void *args[] = {&task->render_buffer.samples,
+                  &task->reconstruction_state.buffer_params,
+                  &task->filter_area,
+                  &from_ptr,
+                  &buffer_ptr,
+                  &out_offset,
+                  &task->rect};
+
+  CUfunction cuFilterWriteFeature;
+  cuda_assert(cuModuleGetFunction(
+      &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+  cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+
+  CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+  CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Launch the "detect outliers" filter kernel over the extent of task->rect and wait for it to
+ * finish. The four device pointers are forwarded to the kernel together with the rect and the
+ * pass stride of the task buffer.
+ *
+ * Returns false when the device is in an error state before or after running. */
+bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
+                                           device_ptr variance_ptr,
+                                           device_ptr depth_ptr,
+                                           device_ptr output_ptr,
+                                           DenoisingTask *task)
+{
+  if (have_error()) {
+    return false;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Kernel arguments, in the order of the kernel signature. */
+  void *args[] = {&image_ptr,
+                  &variance_ptr,
+                  &depth_ptr,
+                  &output_ptr,
+                  &task->rect,
+                  &task->buffer.pass_stride};
+
+  CUfunction cuFilterDetectOutliers;
+  cuda_assert(cuModuleGetFunction(
+      &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+  cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+
+  CUDA_GET_BLOCKSIZE(
+      cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+  CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+  cuda_assert(cuCtxSynchronize());
+
+  return !have_error();
+}
+
+/* Denoise one render tile: describe the tile in `denoising`, hook up the CUDA implementations
+ * of the denoising callbacks and run the task. */
+void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
+{
+  /* Tile area and sample count, with temporary memory flagged as GPU memory. */
+  denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+  denoising.render_buffer.samples = rtile.sample;
+  denoising.buffer.gpu_temporary_mem = true;
+
+  /* Prefiltering callbacks. Each binds this device and the task as trailing argument. */
+  denoising.functions.divide_shadow = function_bind(
+      &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+  denoising.functions.non_local_means = function_bind(
+      &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+  denoising.functions.combine_halves = function_bind(
+      &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+  denoising.functions.get_feature = function_bind(
+      &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+  denoising.functions.write_feature = function_bind(
+      &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+  denoising.functions.detect_outliers = function_bind(
+      &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+  /* Reconstruction callbacks. */
+  denoising.functions.construct_transform = function_bind(
+      &CUDADevice::denoising_construct_transform, this, &denoising);
+  denoising.functions.accumulate = function_bind(
+      &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+  denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
+
+  denoising.run_denoising(&rtile);
+}
+
+/* Queue the three adaptive sampling kernels (stopping, filter x, filter y) on `stream` for the
+ * work tile at device address `d_wtile`. Nothing is synchronized here.
+ *
+ * filter_sample: Sample index handed to the kernels.
+ * wtile:         Host copy of the work tile, read for its width and height. */
+void CUDADevice::adaptive_sampling_filter(uint filter_sample,
+                                          WorkTile *wtile,
+                                          CUdeviceptr d_wtile,
+                                          CUstream stream)
+{
+  const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+
+  /* All launches share one argument list; only the work size behind it changes. */
+  uint total_work_size;
+  uint num_blocks;
+  void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
+
+  /* These are a series of tiny kernels because there is no grid synchronization
+   * from within a kernel, so multiple kernel launches it is. */
+
+  /* Stopping: one work item per pixel of the tile. */
+  total_work_size = wtile->h * wtile->w;
+  num_blocks = divide_up(total_work_size, num_threads_per_block);
+  cuda_assert(cuLaunchKernel(
+      functions.adaptive_stopping, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, stream, args2, 0));
+
+  /* Filter along x: one work item per row. */
+  total_work_size = wtile->h;
+  num_blocks = divide_up(total_work_size, num_threads_per_block);
+  cuda_assert(cuLaunchKernel(
+      functions.adaptive_filter_x, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, stream, args2, 0));
+
+  /* Filter along y: one work item per column. */
+  total_work_size = wtile->w;
+  num_blocks = divide_up(total_work_size, num_threads_per_block);
+  cuda_assert(cuLaunchKernel(
+      functions.adaptive_filter_y, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, stream, args2, 0));
+}
+
+/* Queue the adaptive "scale samples" kernel on `stream` for the work tile at device address
+ * `d_wtile`, with one work item per pixel of the tile. Nothing is synchronized here.
+ *
+ * rtile: Provides the start sample and current sample handed to the kernel.
+ * wtile: Host copy of the work tile, read for its width and height. */
+void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
+                                        WorkTile *wtile,
+                                        CUdeviceptr d_wtile,
+                                        CUstream stream)
+{
+  const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+
+  uint total_work_size = wtile->h * wtile->w;
+  uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+  void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
+
+  cuda_assert(cuLaunchKernel(
+      functions.adaptive_scale_samples, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, stream, args, 0));
+}
+
+/* Render all requested samples of `rtile` with the (branched) path tracing kernel.
+ *
+ * task:       Selects the integrator, the adaptive sampling settings, and receives progress
+ *             updates and cancel requests.
+ * rtile:      Tile to render; its `sample` member is advanced after every batch.
+ * work_tiles: Device vector that is (re)allocated to hold the single work tile used here.
+ *
+ * The time spent in this function is accumulated into rtile.buffers->render_time. */
+void CUDADevice::path_trace(DeviceTask &task,
+                            RenderTile &rtile,
+                            device_vector<WorkTile> &work_tiles)
+{
+  scoped_timer timer(&rtile.buffers->render_time);
+
+  if (have_error()) {
+    return;
+  }
+
+  CUDAContextScope scope(this);
+
+  /* Get kernel function. */
+  CUfunction cuPathTrace;
+  if (task.integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+  }
+
+  if (have_error()) {
+    return;
+  }
+
+  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+  /* Allocate work tile and mirror the render tile into it. */
+  work_tiles.alloc(1);
+
+  WorkTile *wtile = work_tiles.data();
+  wtile->x = rtile.x;
+  wtile->y = rtile.y;
+  wtile->w = rtile.w;
+  wtile->h = rtile.h;
+  wtile->offset = rtile.offset;
+  wtile->stride = rtile.stride;
+  wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+  /* Prepare work size. More step samples render faster, but for now we
+   * remain conservative for GPUs connected to a display to avoid driver
+   * timeouts and display freezing. */
+  int min_blocks, num_threads_per_block;
+  cuda_assert(cuOccupancyMaxPotentialBlockSize(
+      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+  if (!info.display_device) {
+    min_blocks *= 8;
+  }
+
+  /* Number of samples rendered per kernel launch. */
+  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+  if (task.adaptive_sampling.use) {
+    step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+  }
+
+  /* Render all samples. */
+  int start_sample = rtile.start_sample;
+  int end_sample = rtile.start_sample + rtile.num_samples;
+
+  for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+    /* Setup and copy work tile to device. */
+    wtile->start_sample = sample;
+    wtile->num_samples = min(step_samples, end_sample - sample);
+    work_tiles.copy_to_device();
+
+    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+    /* Launch kernel. */
+    void *args[] = {&d_work_tiles, &total_work_size};
+
+    cuda_assert(
+        cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+    uint filter_sample = sample + wtile->num_samples - 1;
+    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+    }
+
+    cuda_assert(cuCtxSynchronize());
+
+    /* Update progress. */
+    rtile.sample = sample + wtile->num_samples;
+    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+    /* Stop on cancel, unless the task asks for its queue to be finished. */
+    if (task.get_cancel() && task.need_finish_queue == false) {
+      break;
+    }
+  }
+
+  /* Finalize adaptive sampling. */
+  if (task.adaptive_sampling.use) {
+    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+    adaptive_sampling_post(rtile, wtile, d_work_tiles);
+    cuda_assert(cuCtxSynchronize());
+    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+  }
+}
+
+void CUDADevice::film_convert(DeviceTask &task,
+                              device_ptr buffer,
+                              device_ptr rgba_byte,
+                              device_ptr rgba_half)
+{
+  if (have_error())
+    return;
+  /* Launch the convert kernel on `buffer` and the mapped rgba_byte / rgba_half pixels. */
+  CUDAContextScope scope(this);
+
+  CUfunction cuFilmConvert;
+  CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
+  CUdeviceptr d_buffer = (CUdeviceptr)buffer;
+
+  /* Get the kernel function: half float variant when rgba_half is set, byte otherwise. */
+  if (rgba_half) {
+    cuda_assert(
+        cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+  }
+
+  float sample_scale = 1.0f / (task.sample + 1);
+
+  /* Pass in parameters: cuLaunchKernel takes an array of pointers to the arguments. */
+  void *args[] = {&d_rgba,
+                  &d_buffer,
+                  &sample_scale,
+                  &task.x,
+                  &task.y,
+                  &task.w,
+                  &task.h,
+                  &task.offset,
+                  &task.stride};
+
+  /* Launch kernel on a 2D grid of square blocks that covers task.w x task.h. */
+  int threads_per_block;
+  cuda_assert(cuFuncGetAttribute(
+      &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
+
+  int xthreads = (int)sqrt(threads_per_block);
+  int ythreads = (int)sqrt(threads_per_block);
+  int xblocks = (task.w + xthreads - 1) / xthreads;
+  int yblocks = (task.h + ythreads - 1) / ythreads;
+
+  cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+
+  cuda_assert(cuLaunchKernel(cuFilmConvert,
+                             xblocks,
+                             yblocks,
+                             1, /* blocks */
+                             xthreads,
+                             ythreads,
+                             1, /* threads */
+                             0,
+                             0,
+                             args,
+                             0));
+
+  unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
+
+  cuda_assert(cuCtxSynchronize());
+}
+
+void CUDADevice::shader(DeviceTask &task)
+{
+  if (have_error())
+    return;
+  /* Evaluate the bake, displace or background kernel over the task's shader range. */
+  CUDAContextScope scope(this);
+
+  CUfunction cuShader;
+  CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
+  CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
+
+  /* get kernel function */
+  if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
+  }
+  else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
+  }
+
+  /* The launch configuration is the same for every chunk, so query and set it only once. */
+  int threads_per_block;
+  cuda_assert(cuFuncGetAttribute(
+      &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+  cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+
+  /* do tasks in smaller chunks, so we can cancel it */
+  const int shader_chunk_size = 65536;
+  const int start = task.shader_x;
+  const int end = task.shader_x + task.shader_w;
+  int offset = task.offset;
+
+  bool canceled = false;
+  for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
+    for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+      int shader_w = min(shader_chunk_size, end - shader_x);
+
+      /* pass in parameters; shader_filter is only passed to the bake kernel */
+      void *args[8];
+      int arg = 0;
+      args[arg++] = &d_input;
+      args[arg++] = &d_output;
+      args[arg++] = &task.shader_eval_type;
+      if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+        args[arg++] = &task.shader_filter;
+      }
+      args[arg++] = &shader_x;
+      args[arg++] = &shader_w;
+      args[arg++] = &offset;
+      args[arg++] = &sample;
+
+      /* launch kernel as a 1D grid with enough blocks to cover the chunk */
+      int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
+      cuda_assert(cuLaunchKernel(cuShader,
+                                 xblocks,
+                                 1,
+                                 1, /* blocks */
+                                 threads_per_block,
+                                 1,
+                                 1, /* threads */
+                                 0,
+                                 0,
+                                 args,
+                                 0));
+
+      cuda_assert(cuCtxSynchronize());
+
+      if (task.get_cancel()) {
+        canceled = true;
+        break;
+      }
+    }
+
+    task.update_progress(NULL);
+  }
+}
+
+CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
+{
+  if (!background) {
+    /* Map the PBO registered for mem and return its CUDA device address. */
+    PixelMem pmem = pixel_mem_map[mem];
+    /* Zero-initialize so a failed mapping cannot return an indeterminate pointer. */
+    CUdeviceptr buffer = 0;
+    size_t bytes;
+    cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+    cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+    return buffer;
+  }
+  /* `background` is set (no PBO in use): mem itself is used as the device pointer. */
+  return (CUdeviceptr)mem;
+}
+
+void CUDADevice::unmap_pixels(device_ptr mem)
+{
+  if (!background) {
+    PixelMem pmem = pixel_mem_map[mem];
+    /* Counterpart of map_pixels(): unmap the graphics resource registered for mem. */
+    cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+  }
+}
+
+void CUDADevice::pixels_alloc(device_memory &mem)
+{
+  PixelMem pmem;
+  /* Create a GL pixel buffer and texture for mem, and register the buffer with CUDA. */
+  pmem.w = mem.data_width;
+  pmem.h = mem.data_height;
+
+  CUDAContextScope scope(this);
+
+  glGenBuffers(1, &pmem.cuPBO);
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+  if (mem.data_type == TYPE_HALF)
+    glBufferData(
+        GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
+  else
+    glBufferData(
+        GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  /* Matching RGBA texture (half float or 8 bit per channel) with nearest filtering. */
+  glActiveTexture(GL_TEXTURE0);
+  glGenTextures(1, &pmem.cuTexId);
+  glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+  if (mem.data_type == TYPE_HALF)
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+  else
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glBindTexture(GL_TEXTURE_2D, 0);
+
+  CUresult result = cuGraphicsGLRegisterBuffer(
+      &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+  /* On success the GL texture id is used as device pointer and as key into pixel_mem_map. */
+  if (result == CUDA_SUCCESS) {
+    mem.device_pointer = pmem.cuTexId;
+    pixel_mem_map[mem.device_pointer] = pmem;
+
+    mem.device_size = mem.memory_size();
+    stats.mem_alloc(mem.device_size);
+
+    return;
+  }
+  else {
+    /* Failed to register the buffer: delete the GL objects and fall back to no interop. */
+    glDeleteBuffers(1, &pmem.cuPBO);
+    glDeleteTextures(1, &pmem.cuTexId);
+
+    background = true;
+  }
+}
+
+void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
+{
+  PixelMem pmem = pixel_mem_map[mem.device_pointer];
+  CUDAContextScope scope(this); /* Copies h rows of width w, from row y, back to the host. */
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+  uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+  if (pixels) { /* NULL if mapping failed: skip the copy rather than dereference it. */
+    size_t offset = sizeof(uchar) * 4 * y * w;
+    memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
+    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+  }
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+void CUDADevice::pixels_free(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    return; /* No device pointer, nothing to release. */
+  }
+  PixelMem pmem = pixel_mem_map[mem.device_pointer];
+  CUDAContextScope scope(this);
+  /* Unregister the buffer from CUDA, then delete the GL buffer and texture. */
+  cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+  glDeleteBuffers(1, &pmem.cuPBO);
+  glDeleteTextures(1, &pmem.cuTexId);
+  /* Drop the map entry (present, since the lookup above creates it if missing). */
+  pixel_mem_map.erase(mem.device_pointer);
+  mem.device_pointer = 0;
+
+  stats.mem_free(mem.device_size);
+  mem.device_size = 0;
+}
+
+void CUDADevice::draw_pixels(device_memory &mem,
+                             int y,
+                             int w,
+                             int h,
+                             int width,
+                             int height,
+                             int dx,
+                             int dy,
+                             int dw,
+                             int dh,
+                             bool transparent,
+                             const DeviceDrawParams &draw_params)
+{
+  assert(mem.type == MEM_PIXELS);
+  /* With GL interop draw directly from the PBO, otherwise defer to Device::draw_pixels(). */
+  if (!background) {
+    const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+    PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+    CUDAContextScope scope(this);
+
+    /* for multi devices, this assumes the inefficient method that we allocate
+     * all pixels on the device even though we only render to a subset */
+    size_t offset = 4 * y * w;
+
+    if (mem.data_type == TYPE_HALF)
+      offset *= sizeof(GLhalf);
+    else
+      offset *= sizeof(uint8_t);
+    /* Copy w x h pixels from the PBO (bound as unpack buffer) into the display texture. */
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+    glActiveTexture(GL_TEXTURE0);
+    glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+    if (mem.data_type == TYPE_HALF) {
+      glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
+    }
+    else {
+      glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
+    }
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+    GLint shader_program;
+    if (use_fallback_shader) {
+      if (!bind_fallback_display_space_shader(dw, dh)) {
+        glBindTexture(GL_TEXTURE_2D, 0); /* Do not leave the texture bound on failure. */
+        return;
+      }
+      shader_program = fallback_shader_program;
+    }
+    else {
+      draw_params.bind_display_space_shader_cb();
+      glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+    }
+    /* Enable blending only once drawing is certain, so the early return cannot leak it. */
+    if (transparent) {
+      glEnable(GL_BLEND);
+      glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+    }
+
+    if (!vertex_buffer) {
+      glGenBuffers(1, &vertex_buffer);
+    }
+
+    glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+    /* invalidate old contents -
+     * avoids stalling if buffer is still waiting in queue to be rendered */
+    glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+    float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+    if (vpointer) {
+      /* texture coordinate - vertex pair */
+      vpointer[0] = 0.0f;
+      vpointer[1] = 0.0f;
+      vpointer[2] = dx;
+      vpointer[3] = dy;
+
+      vpointer[4] = (float)w / (float)pmem.w;
+      vpointer[5] = 0.0f;
+      vpointer[6] = (float)width + dx;
+      vpointer[7] = dy;
+
+      vpointer[8] = (float)w / (float)pmem.w;
+      vpointer[9] = (float)h / (float)pmem.h;
+      vpointer[10] = (float)width + dx;
+      vpointer[11] = (float)height + dy;
+
+      vpointer[12] = 0.0f;
+      vpointer[13] = (float)h / (float)pmem.h;
+      vpointer[14] = dx;
+      vpointer[15] = (float)height + dy;
+
+      glUnmapBuffer(GL_ARRAY_BUFFER);
+    }
+
+    GLuint vertex_array_object;
+    glGenVertexArrays(1, &vertex_array_object);
+    glBindVertexArray(vertex_array_object);
+
+    GLuint texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+    GLuint position_attribute = glGetAttribLocation(shader_program, "pos");
+
+    glEnableVertexAttribArray(texcoord_attribute);
+    glEnableVertexAttribArray(position_attribute);
+
+    glVertexAttribPointer(
+        texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+    glVertexAttribPointer(position_attribute,
+                          2,
+                          GL_FLOAT,
+                          GL_FALSE,
+                          4 * sizeof(float),
+                          (const GLvoid *)(sizeof(float) * 2));
+
+    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+    /* The VAO is created on every call; deleting it also unbinds it and prevents a leak. */
+    glDeleteVertexArrays(1, &vertex_array_object);
+
+    if (use_fallback_shader) {
+      glUseProgram(0);
+    }
+    else {
+      draw_params.unbind_display_space_shader_cb();
+    }
+
+    if (transparent) {
+      glDisable(GL_BLEND);
+    }
+
+    glBindTexture(GL_TEXTURE_2D, 0);
+
+    return;
+  }
+
+  Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
+}
+
+void CUDADevice::thread_run(DeviceTask *task)
+{
+  CUDAContextScope scope(this);
+  /* Handles RENDER (tile loop), SHADER and DENOISE_BUFFER tasks; other types do nothing. */
+  if (task->type == DeviceTask::RENDER) {
+    DeviceRequestedFeatures requested_features;
+    if (use_split_kernel()) {
+      if (split_kernel == NULL) { /* created on first use */
+        split_kernel = new CUDASplitKernel(this);
+        split_kernel->load_kernels(requested_features);
+      }
+    }
+
+    device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+    /* Keep rendering tiles until acquire_tile() has no more tiles to hand out. */
+    RenderTile tile;
+    DenoisingTask denoising(this, *task);
+
+    while (task->acquire_tile(this, tile, task->tile_types)) {
+      if (tile.task == RenderTile::PATH_TRACE) {
+        if (use_split_kernel()) {
+          device_only_memory<uchar> void_buffer(this, "void_buffer");
+          split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+        }
+        else {
+          path_trace(*task, tile, work_tiles);
+        }
+      }
+      else if (tile.task == RenderTile::DENOISE) {
+        tile.sample = tile.start_sample + tile.num_samples;
+
+        denoise(tile, denoising);
+
+        task->update_progress(&tile, tile.w * tile.h);
+      }
+
+      task->release_tile(tile);
+      /* On cancel, stop unless the task asked for its queue to be finished. */
+      if (task->get_cancel()) {
+        if (task->need_finish_queue == false)
+          break;
+      }
+    }
+
+    work_tiles.free();
+  }
+  else if (task->type == DeviceTask::SHADER) {
+    shader(*task);
+
+    cuda_assert(cuCtxSynchronize());
+  }
+  else if (task->type == DeviceTask::DENOISE_BUFFER) {
+    RenderTile tile;
+    tile.x = task->x;
+    tile.y = task->y;
+    tile.w = task->w;
+    tile.h = task->h;
+    tile.buffer = task->buffer;
+    tile.sample = task->sample + task->num_samples;
+    tile.num_samples = task->num_samples;
+    tile.start_sample = task->sample;
+    tile.offset = task->offset;
+    tile.stride = task->stride;
+    tile.buffers = task->buffers;
+    /* Denoise the whole task area as a single tile built from the task fields above. */
+    DenoisingTask denoising(this, *task);
+    denoise(tile, denoising);
+    task->update_progress(&tile, tile.w * tile.h);
+  }
+}
+
+class CUDADeviceTask : public DeviceTask {
+ public:
+  CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
+  {
+    run = function_bind(&CUDADevice::thread_run, device, this); /* bound to this task copy */
+  }
+};
+
+void CUDADevice::task_add(DeviceTask &task)
+{
+  CUDAContextScope scope(this);
+  /* Film conversion runs immediately below; every other task type is pushed to task_pool. */
+  /* Load texture info. */
+  load_texture_info();
+
+  /* Synchronize all memory copies before executing task. */
+  cuda_assert(cuCtxSynchronize());
+
+  if (task.type == DeviceTask::FILM_CONVERT) {
+    /* Must be done in the main thread due to OpenGL access, so it is called directly here. */
+    film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+  }
+  else {
+    task_pool.push(new CUDADeviceTask(this, task));
+  }
+}
+
+void CUDADevice::task_wait()
+{
+  task_pool.wait(); /* presumably blocks until tasks pushed by task_add() finish - confirm */
+}
+
+void CUDADevice::task_cancel()
+{
+  task_pool.cancel(); /* forwards the cancel request to the pool used by task_add() */
+}
+
+/* Redefine cuda_assert for use outside of the CUDADevice class, now that the class is complete.
+ * This version reports through a `device` pointer in scope and keeps the first error message.
+ */
+#  undef cuda_assert
+#  define cuda_assert(stmt) \
+    { \
+      CUresult result = stmt; \
+\
+      if (result != CUDA_SUCCESS) { \
+        string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+        if (device->error_msg == "") \
+          device->error_msg = message; \
+        fprintf(stderr, "%s\n", message.c_str()); \
+        /*cuda_abort();*/ \
+        device->cuda_error_documentation(); \
+      } \
+    } \
+    (void)0
+
+/* CUDA context scope: pushes the device's context on construction, pops it on destruction. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+  cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+  cuda_assert(cuCtxPopCurrent(NULL));
+}
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction {
+  CUDADevice *device;
+  CUfunction func;
+
+ public:
+  CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
+  {
+  }
+
+  /* Enqueue the kernel without arguments (kg and data are unused); returns false on error. */
+  bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
+  {
+    return enqueue(dim, NULL);
+  }
+
+  /* Enqueue the kernel with args; returns false if the device has an error before or after. */
+  bool enqueue(const KernelDimensions &dim, void *args[])
+  {
+    if (device->have_error())
+      return false;
+
+    CUDAContextScope scope(device);
+
+    /* we ignore dim.local_size for now, as this is faster */
+    int threads_per_block;
+    cuda_assert(
+        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+    /* 1D launch: global_size[0] * global_size[1] threads, rounded up to whole blocks. */
+    int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
+                  threads_per_block;
+
+    cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+    cuda_assert(cuLaunchKernel(func,
+                               xblocks,
+                               1,
+                               1, /* blocks */
+                               threads_per_block,
+                               1,
+                               1, /* threads */
+                               0,
+                               0,
+                               args,
+                               0));
+
+    return !device->have_error();
+  }
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{ /* No further setup: the device pointer is stored by the initializer list. */
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
+                                            device_memory & /*data*/,
+                                            size_t num_threads)
+{
+  CUDAContextScope scope(device);
+  /* Launch a single-thread kernel that writes the state size into a one-element buffer. */
+  device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+  size_buffer.alloc(1);
+  size_buffer.zero_to_device();
+
+  uint threads = num_threads;
+  CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
+  /* Kernel parameters as pointers to each argument, laid out in a struct and cast to void**. */
+  struct args_t {
+    uint *num_threads;
+    CUdeviceptr *size;
+  };
+
+  args_t args = {&threads, &d_size};
+
+  CUfunction state_buffer_size;
+  cuda_assert(
+      cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+  cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
+  /* Read the single result element back to the host before freeing the buffer. */
+  size_buffer.copy_from_device(0, 1, 1);
+  size_t size = size_buffer[0];
+  size_buffer.free();
+
+  return size;
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
+                                                     RenderTile &rtile,
+                                                     int num_global_elements,
+                                                     device_memory & /*kernel_globals*/,
+                                                     device_memory & /*kernel_data*/,
+                                                     device_memory &split_data,
+                                                     device_memory &ray_state,
+                                                     device_memory &queue_index,
+                                                     device_memory &use_queues_flag,
+                                                     device_memory &work_pool_wgs)
+{
+  CUDAContextScope scope(device);
+  /* Gather the arguments and enqueue kernel_cuda_path_trace_data_init; false on error. */
+  CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
+  CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
+  CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
+  CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
+  CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
+
+  CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
+
+  int end_sample = rtile.start_sample + rtile.num_samples;
+  int queue_size = dim.global_size[0] * dim.global_size[1];
+  /* One pointer per kernel parameter; the struct is passed to the launch as a void** array. */
+  struct args_t {
+    CUdeviceptr *split_data_buffer;
+    int *num_elements;
+    CUdeviceptr *ray_state;
+    int *start_sample;
+    int *end_sample;
+    int *sx;
+    int *sy;
+    int *sw;
+    int *sh;
+    int *offset;
+    int *stride;
+    CUdeviceptr *queue_index;
+    int *queuesize;
+    CUdeviceptr *use_queues_flag;
+    CUdeviceptr *work_pool_wgs;
+    int *num_samples;
+    CUdeviceptr *buffer;
+  };
+
+  args_t args = {&d_split_data,
+                 &num_global_elements,
+                 &d_ray_state,
+                 &rtile.start_sample,
+                 &end_sample,
+                 &rtile.x,
+                 &rtile.y,
+                 &rtile.w,
+                 &rtile.h,
+                 &rtile.offset,
+                 &rtile.stride,
+                 &d_queue_index,
+                 &queue_size,
+                 &d_use_queues_flag,
+                 &d_work_pool_wgs,
+                 &rtile.num_samples,
+                 &d_buffer};
+
+  CUfunction data_init;
+  cuda_assert(
+      cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+  if (device->have_error()) {
+    return false;
+  }
+  /* The enqueue result is not used directly; success is judged from the device error state. */
+  CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
+
+  return !device->have_error();
+}
+
+SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
+                                                                const DeviceRequestedFeatures &)
+{
+  CUDAContextScope scope(device);
+  CUfunction func;
+  /* Look up "kernel_cuda_<kernel_name>" in the device module; NULL if the device has an error. */
+  cuda_assert(
+      cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+  if (device->have_error()) {
+    device->cuda_error_message(
+        string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+    return NULL;
+  }
+  /* The wrapper is allocated with new and returned to the caller. */
+  return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+  return make_int2(32, 1); /* fixed local size of 32 x 1 */
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
+                                               device_memory &data,
+                                               DeviceTask * /*task*/)
+{
+  CUDAContextScope scope(device);
+  size_t free;
+  size_t total;
+  /* Derive the global size from half of the device memory reported as free right now. */
+  cuda_assert(cuMemGetInfo(&free, &total));
+
+  VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
+          << " bytes. (" << string_human_readable_size(free) << ").";
+  /* NOTE(review): side may be 0 for num_elements < 1024, so the division would fault - confirm. */
+  size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+  size_t side = round_down((int)sqrt(num_elements), 32);
+  int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+  VLOG(1) << "Global size: " << global_size << ".";
+  return global_size;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 76670351734..d94d409175b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -25,11 +25,11 @@
 #include "util/util_logging.h"
 #include "util/util_math.h"
 #include "util/util_opengl.h"
-#include "util/util_time.h"
+#include "util/util_string.h"
 #include "util/util_system.h"
+#include "util/util_time.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
-#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -366,6 +366,15 @@ void Device::draw_pixels(device_memory &rgba,
 
 Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
 {
+#ifdef WITH_MULTI
+  if (!info.multi_devices.empty()) {
+    /* Always create a multi device when info contains multiple devices.
+     * This is done so that the type can still be e.g. DEVICE_CPU to indicate
+     * that it is a homogeneous collection of devices, which simplifies checks. */
+    return device_multi_create(info, stats, profiler, background);
+  }
+#endif
+
   Device *device;
 
   switch (info.type) {
@@ -388,11 +397,6 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
         device = NULL;
       break;
 #endif
-#ifdef WITH_MULTI
-    case DEVICE_MULTI:
-      device = device_multi_create(info, stats, profiler, background);
-      break;
-#endif
 #ifdef WITH_NETWORK
     case DEVICE_NETWORK:
       device = device_network_create(info, stats, profiler, "127.0.0.1");
@@ -586,7 +590,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
   }
 
   DeviceInfo info;
-  info.type = DEVICE_MULTI;
+  info.type = subdevices.front().type;
   info.id = "MULTI";
   info.description = "Multi Device";
   info.num = 0;
@@ -624,6 +628,14 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
       info.multi_devices.push_back(device);
     }
 
+    /* Create unique ID for this combination of devices. */
+    info.id += device.id;
+
+    /* Set device type to MULTI if subdevices are not of a common type. */
+    if (device.type != info.type) {
+      info.type = DEVICE_MULTI;
+    }
+
     /* Accumulate device info. */
     info.has_half_images &= device.has_half_images;
     info.has_volume_decoupled &= device.has_volume_decoupled;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 66fcac921d3..a98ac171709 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -27,8 +27,8 @@
 #include "util/util_list.h"
 #include "util/util_stats.h"
 #include "util/util_string.h"
-#include "util/util_thread.h"
 #include "util/util_texture.h"
+#include "util/util_thread.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
 
@@ -83,6 +83,7 @@ class DeviceInfo {
   bool has_profiling;        /* Supports runtime collection of profiling info. */
   int cpu_threads;
   vector<DeviceInfo> multi_devices;
+  vector<DeviceInfo> denoising_devices;
 
   DeviceInfo()
   {
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c2843a61e6d..57e8523e02a 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -29,16 +29,19 @@
 #include "device/device_intern.h"
 #include "device/device_split_kernel.h"
 
+// clang-format off
 #include "kernel/kernel.h"
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/kernel_types.h"
 #include "kernel/split/kernel_split_data.h"
 #include "kernel/kernel_globals.h"
+#include "kernel/kernel_adaptive_sampling.h"
 
 #include "kernel/filter/filter.h"
 
 #include "kernel/osl/osl_shader.h"
 #include "kernel/osl/osl_globals.h"
+// clang-format on
 
 #include "render/buffers.h"
 #include "render/coverage.h"
@@ -261,7 +264,7 @@ class CPUDevice : public Device {
 
   CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
       : Device(info_, stats_, profiler_, background_),
-        texture_info(this, "__texture_info", MEM_TEXTURE),
+        texture_info(this, "__texture_info", MEM_GLOBAL),
 #define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
         REGISTER_KERNEL(path_trace),
         REGISTER_KERNEL(convert_to_half_float),
@@ -317,6 +320,10 @@ class CPUDevice : public Device {
     REGISTER_SPLIT_KERNEL(next_iteration_setup);
     REGISTER_SPLIT_KERNEL(indirect_subsurface);
     REGISTER_SPLIT_KERNEL(buffer_update);
+    REGISTER_SPLIT_KERNEL(adaptive_stopping);
+    REGISTER_SPLIT_KERNEL(adaptive_filter_x);
+    REGISTER_SPLIT_KERNEL(adaptive_filter_y);
+    REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
 #undef REGISTER_SPLIT_KERNEL
 #undef KERNEL_FUNCTIONS
   }
@@ -338,7 +345,10 @@ class CPUDevice : public Device {
     if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
       bvh_layout_mask |= BVH_LAYOUT_BVH4;
     }
-#if defined(__x86_64__) || defined(_M_X64)
+    /* MSVC does not support the -march=native switch and you always end up  */
+    /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */
+    /* that kernel BVH8 even if the CPU flags would allow for it. */
+#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE))
     if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
       bvh_layout_mask |= BVH_LAYOUT_BVH8;
     }
@@ -362,6 +372,9 @@ class CPUDevice : public Device {
     if (mem.type == MEM_TEXTURE) {
       assert(!"mem_alloc not supported for textures.");
     }
+    else if (mem.type == MEM_GLOBAL) {
+      assert(!"mem_alloc not supported for global memory.");
+    }
     else {
       if (mem.name) {
         VLOG(1) << "Buffer allocate: " << mem.name << ", "
@@ -386,9 +399,13 @@ class CPUDevice : public Device {
 
   void mem_copy_to(device_memory &mem)
   {
-    if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
-      tex_alloc(mem);
+    if (mem.type == MEM_GLOBAL) {
+      global_free(mem);
+      global_alloc(mem);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      tex_free((device_texture &)mem);
+      tex_alloc((device_texture &)mem);
     }
     else if (mem.type == MEM_PIXELS) {
       assert(!"mem_copy_to not supported for pixels.");
@@ -420,8 +437,11 @@ class CPUDevice : public Device {
 
   void mem_free(device_memory &mem)
   {
-    if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
+    if (mem.type == MEM_GLOBAL) {
+      global_free(mem);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      tex_free((device_texture &)mem);
     }
     else if (mem.device_pointer) {
       if (mem.type == MEM_DEVICE_ONLY) {
@@ -443,51 +463,50 @@ class CPUDevice : public Device {
     kernel_const_copy(&kernel_globals, name, host, size);
   }
 
-  void tex_alloc(device_memory &mem)
+  void global_alloc(device_memory &mem)
   {
-    VLOG(1) << "Texture allocate: " << mem.name << ", "
+    VLOG(1) << "Global memory allocate: " << mem.name << ", "
             << string_human_readable_number(mem.memory_size()) << " bytes. ("
             << string_human_readable_size(mem.memory_size()) << ")";
 
-    if (mem.interpolation == INTERPOLATION_NONE) {
-      /* Data texture. */
-      kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
-    }
-    else {
-      /* Image Texture. */
-      int flat_slot = 0;
-      if (string_startswith(mem.name, "__tex_image")) {
-        int pos = string(mem.name).rfind("_");
-        flat_slot = atoi(mem.name + pos + 1);
-      }
-      else {
-        assert(0);
-      }
-
-      if (flat_slot >= texture_info.size()) {
-        /* Allocate some slots in advance, to reduce amount
-         * of re-allocations. */
-        texture_info.resize(flat_slot + 128);
-      }
+    kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
 
-      TextureInfo &info = texture_info[flat_slot];
-      info.data = (uint64_t)mem.host_pointer;
-      info.cl_buffer = 0;
-      info.interpolation = mem.interpolation;
-      info.extension = mem.extension;
-      info.width = mem.data_width;
-      info.height = mem.data_height;
-      info.depth = mem.data_depth;
+    mem.device_pointer = (device_ptr)mem.host_pointer;
+    mem.device_size = mem.memory_size();
+    stats.mem_alloc(mem.device_size);
+  }
 
-      need_texture_info = true;
+  void global_free(device_memory &mem)
+  {
+    if (mem.device_pointer) {
+      mem.device_pointer = 0;
+      stats.mem_free(mem.device_size);
+      mem.device_size = 0;
     }
+  }
+
+  void tex_alloc(device_texture &mem)
+  {
+    VLOG(1) << "Texture allocate: " << mem.name << ", "
+            << string_human_readable_number(mem.memory_size()) << " bytes. ("
+            << string_human_readable_size(mem.memory_size()) << ")";
 
     mem.device_pointer = (device_ptr)mem.host_pointer;
     mem.device_size = mem.memory_size();
     stats.mem_alloc(mem.device_size);
+
+    const uint slot = mem.slot;
+    if (slot >= texture_info.size()) {
+      /* Allocate some slots in advance, to reduce amount of re-allocations. */
+      texture_info.resize(slot + 128);
+    }
+
+    texture_info[slot] = mem.info;
+    texture_info[slot].data = (uint64_t)mem.host_pointer;
+    need_texture_info = true;
   }
 
-  void tex_free(device_memory &mem)
+  void tex_free(device_texture &mem)
   {
     if (mem.device_pointer) {
       mem.device_pointer = 0;
@@ -508,13 +527,14 @@ class CPUDevice : public Device {
 
   void thread_run(DeviceTask *task)
   {
-    if (task->type == DeviceTask::RENDER) {
+    if (task->type == DeviceTask::RENDER)
       thread_render(*task);
-    }
-    else if (task->type == DeviceTask::FILM_CONVERT)
-      thread_film_convert(*task);
     else if (task->type == DeviceTask::SHADER)
       thread_shader(*task);
+    else if (task->type == DeviceTask::FILM_CONVERT)
+      thread_film_convert(*task);
+    else if (task->type == DeviceTask::DENOISE_BUFFER)
+      thread_denoise(*task);
   }
 
   class CPUDeviceTask : public DeviceTask {
@@ -819,6 +839,49 @@ class CPUDevice : public Device {
     return true;
   }
 
+  bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile)
+  { /* Runs the x and y adaptive filter kernels over the tile; see the return comment. */
+    WorkTile wtile; /* Tile rectangle, offset/stride and buffer passed to the kernels. */
+    wtile.x = tile.x;
+    wtile.y = tile.y;
+    wtile.w = tile.w;
+    wtile.h = tile.h;
+    wtile.offset = tile.offset;
+    wtile.stride = tile.stride;
+    wtile.buffer = (float *)tile.buffer;
+
+    bool any = false; /* Accumulates (OR) the results of all filter kernel calls. */
+    for (int y = tile.y; y < tile.y + tile.h; ++y) { /* One x-filter call per tile row. */
+      any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
+    }
+    for (int x = tile.x; x < tile.x + tile.w; ++x) { /* One y-filter call per tile column. */
+      any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
+    }
+    return (!any); /* True only if every kernel call returned false. */
+  }
+
+  void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
+  { /* Per tile pixel: derive a sample multiplier and pass it to kernel_adaptive_post_adjust. */
+    float *render_buffer = (float *)tile.buffer;
+    for (int y = tile.y; y < tile.y + tile.h; y++) {
+      for (int x = tile.x; x < tile.x + tile.w; x++) {
+        int index = tile.offset + x + y * tile.stride; /* Pixel index in the buffer. */
+        ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+        if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { /* Negative: flip the sign. */
+          buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+          float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
+                                                      buffer[kernel_data.film.pass_sample_count]);
+          if (sample_multiplier != 1.0f) { /* Skip the call when it would be a no-op scale. */
+            kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+          }
+        }
+        else { /* Non-negative count: scale by sample / (sample - 1). */
+          kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
+        } /* NOTE(review): divisor is 0 when tile.sample == 1; confirm that cannot occur. */
+      }
+    }
+  }
+
   void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
   {
     const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
@@ -851,14 +914,27 @@ class CPUDevice : public Device {
           path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
         }
       }
-
       tile.sample = sample + 1;
 
+      if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+        const bool stop = adaptive_sampling_filter(kg, tile);
+        if (stop) {
+          const int num_progress_samples = end_sample - sample;
+          tile.sample = end_sample;
+          task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+          break;
+        }
+      }
+
       task.update_progress(&tile, tile.w * tile.h);
     }
     if (use_coverage) {
       coverage.finalize();
     }
+
+    if (task.adaptive_sampling.use) {
+      adaptive_sampling_post(tile, kg);
+    }
   }
 
   void denoise(DenoisingTask &denoising, RenderTile &tile)
@@ -923,7 +999,7 @@ class CPUDevice : public Device {
     DenoisingTask denoising(this, task);
     denoising.profiler = &kg->profiler;
 
-    while (task.acquire_tile(this, tile)) {
+    while (task.acquire_tile(this, tile, task.tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         if (use_split_kernel) {
           device_only_memory<uchar> void_buffer(this, "void_buffer");
@@ -954,6 +1030,33 @@ class CPUDevice : public Device {
     delete split_kernel;
   }
 
+  void thread_denoise(DeviceTask &task) /* Denoise the buffer described by the task itself. */
+  {
+    RenderTile tile; /* Single tile filled in from the task's own fields. */
+    tile.x = task.x;
+    tile.y = task.y;
+    tile.w = task.w;
+    tile.h = task.h;
+    tile.buffer = task.buffer;
+    tile.sample = task.sample + task.num_samples; /* One past the last sample of the range. */
+    tile.num_samples = task.num_samples;
+    tile.start_sample = task.sample;
+    tile.offset = task.offset;
+    tile.stride = task.stride;
+    tile.buffers = task.buffers;
+
+    DenoisingTask denoising(this, task);
+
+    ProfilingState denoising_profiler_state; /* Local state, registered only for this call. */
+    profiler.add_state(&denoising_profiler_state);
+    denoising.profiler = &denoising_profiler_state;
+
+    denoise(denoising, tile);
+    task.update_progress(&tile, tile.w * tile.h); /* Reports the tile's pixel count. */
+
+    profiler.remove_state(&denoising_profiler_state); /* Unregister before the local dies. */
+  }
+
   void thread_film_convert(DeviceTask &task)
   {
     float sample_scale = 1.0f / (task.sample + 1);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dfd80d678fd..9a703b45c0a 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -14,2562 +14,21 @@
  * limitations under the License.
  */
 
-#include <climits>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#ifdef WITH_CUDA
 
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
+#  include "device/cuda/device_cuda.h"
+#  include "device/device.h"
+#  include "device/device_intern.h"
 
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#ifdef WITH_CUDA_DYNLOAD
-#  include "cuew.h"
-#else
-#  include "util/util_opengl.h"
-#  include <cuda.h>
-#  include <cudaGL.h>
-#endif
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_md5.h"
-#include "util/util_opengl.h"
-#include "util/util_path.h"
-#include "util/util_string.h"
-#include "util/util_system.h"
-#include "util/util_types.h"
-#include "util/util_time.h"
-#include "util/util_windows.h"
-
-#include "kernel/split/kernel_split_data_types.h"
+#  include "util/util_logging.h"
+#  include "util/util_string.h"
+#  include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so majority of the file does not need
- * to worry about difference between dynamically loaded and linked CUDA at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
-  /* We can only give error code here without major code duplication, that
-   * should be enough since dynamic loading is only being disabled by folks
-   * who knows what they're doing anyway.
-   *
-   * NOTE: Avoid call from several threads.
-   */
-  static string error;
-  error = string_printf("%d", result);
-  return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
-  return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
-  return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-#endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
-  CUDADevice *device;
-
- public:
-  explicit CUDASplitKernel(CUDADevice *device);
-
-  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
-  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
-                                              RenderTile &rtile,
-                                              int num_global_elements,
-                                              device_memory &kernel_globals,
-                                              device_memory &kernel_data_,
-                                              device_memory &split_data,
-                                              device_memory &ray_state,
-                                              device_memory &queue_index,
-                                              device_memory &use_queues_flag,
-                                              device_memory &work_pool_wgs);
-
-  virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
-                                                         const DeviceRequestedFeatures &);
-  virtual int2 split_kernel_local_size();
-  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
-  CUDAContextScope(CUDADevice *device);
-  ~CUDAContextScope();
-
- private:
-  CUDADevice *device;
-};
-
-class CUDADevice : public Device {
- public:
-  DedicatedTaskPool task_pool;
-  CUdevice cuDevice;
-  CUcontext cuContext;
-  CUmodule cuModule, cuFilterModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
-  int cuDevId;
-  int cuDevArchitecture;
-  bool first_error;
-  CUDASplitKernel *split_kernel;
-
-  struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    CUtexObject texobject;
-    CUarray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, CUDAMem> CUDAMemMap;
-  CUDAMemMap cuda_mem_map;
-
-  struct PixelMem {
-    GLuint cuPBO;
-    CUgraphicsResource cuPBOresource;
-    GLuint cuTexId;
-    int w, h;
-  };
-  map<device_ptr, PixelMem> pixel_mem_map;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
-  CUdeviceptr cuda_device_ptr(device_ptr mem)
-  {
-    return (CUdeviceptr)mem;
-  }
-
-  static bool have_precompiled_kernels()
-  {
-    string cubins_path = path_get("lib");
-    return path_exists(cubins_path);
-  }
-
-  virtual bool show_samples() const
-  {
-    /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
-    return true;
-  }
-
-  virtual BVHLayoutMask get_bvh_layout_mask() const
-  {
-    return BVH_LAYOUT_BVH2;
-  }
-
-  /*#ifdef NDEBUG
-#define cuda_abort()
-#else
-#define cuda_abort() abort()
-#endif*/
-  void cuda_error_documentation()
-  {
-    if (first_error) {
-      fprintf(stderr,
-              "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
-      fprintf(stderr,
-              "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
-      first_error = false;
-    }
-  }
-
-#define cuda_assert(stmt) \
-  { \
-    CUresult result = stmt; \
-\
-    if (result != CUDA_SUCCESS) { \
-      string message = string_printf( \
-          "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
-      if (error_msg == "") \
-        error_msg = message; \
-      fprintf(stderr, "%s\n", message.c_str()); \
-      /*cuda_abort();*/ \
-      cuda_error_documentation(); \
-    } \
-  } \
-  (void)0
-
-  bool cuda_error_(CUresult result, const string &stmt)
-  {
-    if (result == CUDA_SUCCESS)
-      return false;
-
-    string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
-    if (error_msg == "")
-      error_msg = message;
-    fprintf(stderr, "%s\n", message.c_str());
-    cuda_error_documentation();
-    return true;
-  }
-
-#define cuda_error(stmt) cuda_error_(stmt, #stmt)
-
-  void cuda_error_message(const string &message)
-  {
-    if (error_msg == "")
-      error_msg = message;
-    fprintf(stderr, "%s\n", message.c_str());
-    cuda_error_documentation();
-  }
-
-  CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
-      : Device(info, stats, profiler, background_),
-        texture_info(this, "__texture_info", MEM_TEXTURE)
-  {
-    first_error = true;
-    background = background_;
-
-    cuDevId = info.num;
-    cuDevice = 0;
-    cuContext = 0;
-
-    cuModule = 0;
-    cuFilterModule = 0;
-
-    split_kernel = NULL;
-
-    need_texture_info = false;
-
-    device_texture_headroom = 0;
-    device_working_headroom = 0;
-    move_texture_to_host = false;
-    map_host_limit = 0;
-    map_host_used = 0;
-    can_map_host = 0;
-
-    /* Intialize CUDA. */
-    if (cuda_error(cuInit(0)))
-      return;
-
-    /* Setup device and context. */
-    if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
-      return;
-
-    /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
-     * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
-     * so we can predict which memory to map to host. */
-    cuda_assert(
-        cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
-    unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
-    if (can_map_host) {
-      ctx_flags |= CU_CTX_MAP_HOST;
-      init_host_memory();
-    }
-
-    /* Create context. */
-    CUresult result;
-
-    if (background) {
-      result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
-    }
-    else {
-      result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
-      if (result != CUDA_SUCCESS) {
-        result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
-        background = true;
-      }
-    }
-
-    if (cuda_error_(result, "cuCtxCreate"))
-      return;
-
-    int major, minor;
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-    cuDevArchitecture = major * 100 + minor * 10;
-
-    /* Pop context set by cuCtxCreate. */
-    cuCtxPopCurrent(NULL);
-  }
-
-  ~CUDADevice()
-  {
-    task_pool.stop();
-
-    delete split_kernel;
-
-    texture_info.free();
-
-    cuda_assert(cuCtxDestroy(cuContext));
-  }
-
-  bool support_device(const DeviceRequestedFeatures & /*requested_features*/)
-  {
-    int major, minor;
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
-    /* We only support sm_30 and above */
-    if (major < 3) {
-      cuda_error_message(string_printf(
-          "CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
-          major,
-          minor));
-      return false;
-    }
-
-    return true;
-  }
-
-  bool use_adaptive_compilation()
-  {
-    return DebugFlags().cuda.adaptive_compile;
-  }
-
-  bool use_split_kernel()
-  {
-    return DebugFlags().cuda.split_kernel;
-  }
-
-  /* Common NVCC flags which stays the same regardless of shading model,
-   * kernel sources md5 and only depends on compiler or compilation settings.
-   */
-  string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
-                                          bool filter = false,
-                                          bool split = false)
-  {
-    const int machine = system_cpu_bits();
-    const string source_path = path_get("source");
-    const string include_path = source_path;
-    string cflags = string_printf(
-        "-m%d "
-        "--ptxas-options=\"-v\" "
-        "--use_fast_math "
-        "-DNVCC "
-        "-I\"%s\"",
-        machine,
-        include_path.c_str());
-    if (!filter && use_adaptive_compilation()) {
-      cflags += " " + requested_features.get_build_options();
-    }
-    const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
-    if (extra_cflags) {
-      cflags += string(" ") + string(extra_cflags);
-    }
-#ifdef WITH_CYCLES_DEBUG
-    cflags += " -D__KERNEL_DEBUG__";
-#endif
-
-    if (split) {
-      cflags += " -D__SPLIT__";
-    }
-
-    return cflags;
-  }
-
-  bool compile_check_compiler()
-  {
-    const char *nvcc = cuewCompilerPath();
-    if (nvcc == NULL) {
-      cuda_error_message(
-          "CUDA nvcc compiler not found. "
-          "Install CUDA toolkit in default location.");
-      return false;
-    }
-    const int cuda_version = cuewCompilerVersion();
-    VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << ".";
-    const int major = cuda_version / 10, minor = cuda_version % 10;
-    if (cuda_version == 0) {
-      cuda_error_message("CUDA nvcc compiler version could not be parsed.");
-      return false;
-    }
-    if (cuda_version < 80) {
-      printf(
-          "Unsupported CUDA version %d.%d detected, "
-          "you need CUDA 8.0 or newer.\n",
-          major,
-          minor);
-      return false;
-    }
-    else if (cuda_version != 101) {
-      printf(
-          "CUDA version %d.%d detected, build may succeed but only "
-          "CUDA 10.1 is officially supported.\n",
-          major,
-          minor);
-    }
-    return true;
-  }
-
-  string compile_kernel(const DeviceRequestedFeatures &requested_features,
-                        bool filter = false,
-                        bool split = false)
-  {
-    const char *name, *source;
-    if (filter) {
-      name = "filter";
-      source = "filter.cu";
-    }
-    else if (split) {
-      name = "kernel_split";
-      source = "kernel_split.cu";
-    }
-    else {
-      name = "kernel";
-      source = "kernel.cu";
-    }
-    /* Compute cubin name. */
-    int major, minor;
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
-    /* Attempt to use kernel provided with Blender. */
-    if (!use_adaptive_compilation()) {
-      const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
-      VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
-      if (path_exists(cubin)) {
-        VLOG(1) << "Using precompiled kernel.";
-        return cubin;
-      }
-      const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
-      VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
-      if (path_exists(ptx)) {
-        VLOG(1) << "Using precompiled kernel.";
-        return ptx;
-      }
-    }
-
-    const string common_cflags = compile_kernel_get_common_cflags(
-        requested_features, filter, split);
-
-    /* Try to use locally compiled kernel. */
-    const string source_path = path_get("source");
-    const string kernel_md5 = path_files_md5_hash(source_path);
-
-    /* We include cflags into md5 so changing cuda toolkit or changing other
-     * compiler command line arguments makes sure cubin gets re-built.
-     */
-    const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
-
-    const string cubin_file = string_printf(
-        "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str());
-    const string cubin = path_cache_get(path_join("kernels", cubin_file));
-    VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
-    if (path_exists(cubin)) {
-      VLOG(1) << "Using locally compiled kernel.";
-      return cubin;
-    }
-
-#ifdef _WIN32
-    if (have_precompiled_kernels()) {
-      if (major < 3) {
-        cuda_error_message(
-            string_printf("CUDA device requires compute capability 3.0 or up, "
-                          "found %d.%d. Your GPU is not supported.",
-                          major,
-                          minor));
-      }
-      else {
-        cuda_error_message(
-            string_printf("CUDA binary kernel for this graphics card compute "
-                          "capability (%d.%d) not found.",
-                          major,
-                          minor));
-      }
-      return "";
-    }
-#endif
-
-    /* Compile. */
-    if (!compile_check_compiler()) {
-      return "";
-    }
-    const char *nvcc = cuewCompilerPath();
-    const string kernel = path_join(path_join(source_path, "kernel"),
-                                    path_join("kernels", path_join("cuda", source)));
-    double starttime = time_dt();
-    printf("Compiling CUDA kernel ...\n");
-
-    path_create_directories(cubin);
-
-    string command = string_printf(
-        "\"%s\" "
-        "-arch=sm_%d%d "
-        "--cubin \"%s\" "
-        "-o \"%s\" "
-        "%s ",
-        nvcc,
-        major,
-        minor,
-        kernel.c_str(),
-        cubin.c_str(),
-        common_cflags.c_str());
-
-    printf("%s\n", command.c_str());
-
-    if (system(command.c_str()) == -1) {
-      cuda_error_message(
-          "Failed to execute compilation command, "
-          "see console for details.");
-      return "";
-    }
-
-    /* Verify if compilation succeeded */
-    if (!path_exists(cubin)) {
-      cuda_error_message(
-          "CUDA kernel compilation failed, "
-          "see console for details.");
-      return "";
-    }
-
-    printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
-    return cubin;
-  }
-
-  bool load_kernels(const DeviceRequestedFeatures &requested_features)
-  {
-    /* TODO(sergey): Support kernels re-load for CUDA devices.
-     *
-     * Currently re-loading kernel will invalidate memory pointers,
-     * causing problems in cuCtxSynchronize.
-     */
-    if (cuFilterModule && cuModule) {
-      VLOG(1) << "Skipping kernel reload, not currently supported.";
-      return true;
-    }
-
-    /* check if cuda init succeeded */
-    if (cuContext == 0)
-      return false;
-
-    /* check if GPU is supported */
-    if (!support_device(requested_features))
-      return false;
-
-    /* get kernel */
-    string cubin = compile_kernel(requested_features, false, use_split_kernel());
-    if (cubin == "")
-      return false;
-
-    string filter_cubin = compile_kernel(requested_features, true, false);
-    if (filter_cubin == "")
-      return false;
-
-    /* open module */
-    CUDAContextScope scope(this);
-
-    string cubin_data;
-    CUresult result;
-
-    if (path_read_text(cubin, cubin_data))
-      result = cuModuleLoadData(&cuModule, cubin_data.c_str());
-    else
-      result = CUDA_ERROR_FILE_NOT_FOUND;
-
-    if (cuda_error_(result, "cuModuleLoad"))
-      cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
-
-    if (path_read_text(filter_cubin, cubin_data))
-      result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
-    else
-      result = CUDA_ERROR_FILE_NOT_FOUND;
-
-    if (cuda_error_(result, "cuModuleLoad"))
-      cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
-
-    if (result == CUDA_SUCCESS) {
-      reserve_local_memory(requested_features);
-    }
-
-    return (result == CUDA_SUCCESS);
-  }
-
-  void reserve_local_memory(const DeviceRequestedFeatures &requested_features)
-  {
-    if (use_split_kernel()) {
-      /* Split kernel mostly uses global memory and adaptive compilation,
-       * difficult to predict how much is needed currently. */
-      return;
-    }
-
-    /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
-     * needed for kernel launches, so that we can reliably figure out when
-     * to allocate scene data in mapped host memory. */
-    CUDAContextScope scope(this);
-
-    size_t total = 0, free_before = 0, free_after = 0;
-    cuMemGetInfo(&free_before, &total);
-
-    /* Get kernel function. */
-    CUfunction cuPathTrace;
-
-    if (requested_features.use_integrator_branched) {
-      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
-    }
-    else {
-      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
-    }
-
-    cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
-    int min_blocks, num_threads_per_block;
-    cuda_assert(cuOccupancyMaxPotentialBlockSize(
-        &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-
-    /* Launch kernel, using just 1 block appears sufficient to reserve
-     * memory for all multiprocessors. It would be good to do this in
-     * parallel for the multi GPU case still to make it faster. */
-    CUdeviceptr d_work_tiles = 0;
-    uint total_work_size = 0;
-
-    void *args[] = {&d_work_tiles, &total_work_size};
-
-    cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-    cuda_assert(cuCtxSynchronize());
-
-    cuMemGetInfo(&free_after, &total);
-    VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
-            << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-#if 0
-    /* For testing mapped host memory, fill up device memory. */
-    const size_t keep_mb = 1024;
-
-    while (free_after > keep_mb * 1024 * 1024LL) {
-      CUdeviceptr tmp;
-      cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
-      cuMemGetInfo(&free_after, &total);
-    }
-#endif
-  }
-
-  void init_host_memory()
-  {
-    /* Limit amount of host mapped memory, because allocating too much can
-     * cause system instability. Leave at least half or 4 GB of system
-     * memory free, whichever is smaller. */
-    size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-    size_t system_ram = system_physical_ram();
-
-    if (system_ram > 0) {
-      if (system_ram / 2 > default_limit) {
-        map_host_limit = system_ram - default_limit;
-      }
-      else {
-        map_host_limit = system_ram / 2;
-      }
-    }
-    else {
-      VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
-      map_host_limit = 0;
-    }
-
-    /* Amount of device memory to keep is free after texture memory
-     * and working memory allocations respectively. We set the working
-     * memory limit headroom lower so that some space is left after all
-     * texture memory allocations. */
-    device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-    device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-    VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-  }
-
-  void load_texture_info()
-  {
-    if (need_texture_info) {
-      texture_info.copy_to_device();
-      need_texture_info = false;
-    }
-  }
-
-  void move_textures_to_host(size_t size, bool for_texture)
-  {
-    /* Signal to reallocate textures in host memory only. */
-    move_texture_to_host = true;
-
-    while (size > 0) {
-      /* Find suitable memory allocation to move. */
-      device_memory *max_mem = NULL;
-      size_t max_size = 0;
-      bool max_is_image = false;
-
-      foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
-        device_memory &mem = *pair.first;
-        CUDAMem *cmem = &pair.second;
-
-        bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-        bool is_image = is_texture && (mem.data_height > 1);
-
-        /* Can't move this type of memory. */
-        if (!is_texture || cmem->array) {
-          continue;
-        }
-
-        /* Already in host memory. */
-        if (cmem->use_mapped_host) {
-          continue;
-        }
-
-        /* For other textures, only move image textures. */
-        if (for_texture && !is_image) {
-          continue;
-        }
-
-        /* Try to move largest allocation, prefer moving images. */
-        if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-          max_is_image = is_image;
-          max_size = mem.device_size;
-          max_mem = &mem;
-        }
-      }
-
-      /* Move to host memory. This part is mutex protected since
-       * multiple CUDA devices could be moving the memory. The
-       * first one will do it, and the rest will adopt the pointer. */
-      if (max_mem) {
-        VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
-        static thread_mutex move_mutex;
-        thread_scoped_lock lock(move_mutex);
-
-        /* Preserve the original device pointer, in case of multi device
-         * we can't change it because the pointer mapping would break. */
-        device_ptr prev_pointer = max_mem->device_pointer;
-        size_t prev_size = max_mem->device_size;
-
-        tex_free(*max_mem);
-        tex_alloc(*max_mem);
-        size = (max_size >= size) ? 0 : size - max_size;
-
-        max_mem->device_pointer = prev_pointer;
-        max_mem->device_size = prev_size;
-      }
-      else {
-        break;
-      }
-    }
-
-    /* Update texture info array with new pointers. */
-    load_texture_info();
-
-    move_texture_to_host = false;
-  }
-
-  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
-  {
-    CUDAContextScope scope(this);
-
-    CUdeviceptr device_pointer = 0;
-    size_t size = mem.memory_size() + pitch_padding;
-
-    CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-    const char *status = "";
-
-    /* First try allocating in device memory, respecting headroom. We make
-     * an exception for texture info. It is small and frequently accessed,
-     * so treat it as working memory.
-     *
-     * If there is not enough room for working memory, we will try to move
-     * textures to host memory, assuming the performance impact would have
-     * been worse for working memory. */
-    bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-    bool is_image = is_texture && (mem.data_height > 1);
-
-    size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-    size_t total = 0, free = 0;
-    cuMemGetInfo(&free, &total);
-
-    /* Move textures to host memory if needed. */
-    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-      move_textures_to_host(size + headroom - free, is_texture);
-      cuMemGetInfo(&free, &total);
-    }
-
-    /* Allocate in device memory. */
-    if (!move_texture_to_host && (size + headroom) < free) {
-      mem_alloc_result = cuMemAlloc(&device_pointer, size);
-      if (mem_alloc_result == CUDA_SUCCESS) {
-        status = " in device memory";
-      }
-    }
-
-    /* Fall back to mapped host memory if needed and possible. */
-
-    void *shared_pointer = 0;
-
-    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
-      if (mem.shared_pointer) {
-        /* Another device already allocated host memory. */
-        mem_alloc_result = CUDA_SUCCESS;
-        shared_pointer = mem.shared_pointer;
-      }
-      else if (map_host_used + size < map_host_limit) {
-        /* Allocate host memory ourselves. */
-        mem_alloc_result = cuMemHostAlloc(
-            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
-        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
-               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
-      }
-
-      if (mem_alloc_result == CUDA_SUCCESS) {
-        cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
-        map_host_used += size;
-        status = " in host memory";
-      }
-      else {
-        status = " failed, out of host memory";
-      }
-    }
-
-    if (mem_alloc_result != CUDA_SUCCESS) {
-      status = " failed, out of device and host memory";
-      cuda_assert(mem_alloc_result);
-    }
-
-    if (mem.name) {
-      VLOG(1) << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-    }
-
-    mem.device_pointer = (device_ptr)device_pointer;
-    mem.device_size = size;
-    stats.mem_alloc(size);
-
-    if (!mem.device_pointer) {
-      return NULL;
-    }
-
-    /* Insert into map of allocations. */
-    CUDAMem *cmem = &cuda_mem_map[&mem];
-    if (shared_pointer != 0) {
-      /* Replace host pointer with our host allocation. Only works if
-       * CUDA memory layout is the same and has no pitch padding. Also
-       * does not work if we move textures to host during a render,
-       * since other devices might be using the memory. */
-
-      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-          mem.host_pointer != shared_pointer) {
-        memcpy(shared_pointer, mem.host_pointer, size);
-
-        /* A Call to device_memory::host_free() should be preceded by
-         * a call to device_memory::device_free() for host memory
-         * allocated by a device to be handled properly. Two exceptions
-         * are here and a call in OptiXDevice::generic_alloc(), where
-         * the current host memory can be assumed to be allocated by
-         * device_memory::host_alloc(), not by a device */
-
-        mem.host_free();
-        mem.host_pointer = shared_pointer;
-      }
-      mem.shared_pointer = shared_pointer;
-      mem.shared_counter++;
-      cmem->use_mapped_host = true;
-    }
-    else {
-      cmem->use_mapped_host = false;
-    }
-
-    return cmem;
-  }
-
-  /* Upload the host-side contents of `mem` into its device allocation.
-   * Does nothing unless both a host pointer and a device pointer are set. */
-  void generic_copy_to(device_memory &mem)
-  {
-    if (!mem.host_pointer || !mem.device_pointer) {
-      return;
-    }
-
-    CUDAContextScope scope(this);
-
-    /* The explicit copy is skipped only when this device uses the mapped
-     * host allocation AND mem.host_pointer is that very allocation. If
-     * use_mapped_host is false the device uses memory from cuMemAlloc,
-     * regardless of mem.host_pointer and mem.shared_pointer, and must be
-     * filled from mem.host_pointer. */
-    const bool uses_mapped_host = cuda_mem_map[&mem].use_mapped_host;
-    const bool host_is_mapping = (mem.host_pointer == mem.shared_pointer);
-    if (uses_mapped_host && host_is_mapping) {
-      return;
-    }
-
-    cuda_assert(cuMemcpyHtoD(
-        cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
-  }
-
-  /* Release the allocation backing `mem` on this device and remove its
-   * entry from cuda_mem_map. No-op when mem has no device pointer. */
-  void generic_free(device_memory &mem)
-  {
-    if (!mem.device_pointer) {
-      return;
-    }
-
-    CUDAContextScope scope(this);
-    const CUDAMem &entry = cuda_mem_map[&mem];
-
-    if (!entry.use_mapped_host) {
-      /* Free device memory. */
-      cuMemFree(mem.device_pointer);
-    }
-    else {
-      /* Mapped host memory is reference counted through
-       * mem.shared_counter; the host allocation itself is released
-       * only when the counter drops to zero. */
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        mem.shared_counter--;
-        if (mem.shared_counter == 0) {
-          /* Do not leave the host pointer dangling if it aliased the
-           * mapping that is about to be freed. */
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          cuMemFreeHost(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-
-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
-
-    cuda_mem_map.erase(cuda_mem_map.find(&mem));
-  }
-
-  /* Allocate device storage for `mem`, dispatching on its memory type. */
-  void mem_alloc(device_memory &mem)
-  {
-    const bool use_pixels_path = (mem.type == MEM_PIXELS) && !background;
-    if (use_pixels_path) {
-      pixels_alloc(mem);
-      return;
-    }
-
-    if (mem.type == MEM_TEXTURE) {
-      /* Textures are not allocated through this entry point. */
-      assert(!"mem_alloc not supported for textures.");
-      return;
-    }
-
-    generic_alloc(mem);
-  }
-
-  /* Copy the host data of `mem` to the device, dispatching on its type. */
-  void mem_copy_to(device_memory &mem)
-  {
-    switch (mem.type) {
-      case MEM_PIXELS:
-        assert(!"mem_copy_to not supported for pixels.");
-        break;
-      case MEM_TEXTURE:
-        /* Textures are freed and re-created on every upload. */
-        tex_free(mem);
-        tex_alloc(mem);
-        break;
-      default:
-        /* Allocate on first use, then upload. */
-        if (!mem.device_pointer) {
-          generic_alloc(mem);
-        }
-        generic_copy_to(mem);
-        break;
-    }
-  }
-
-  /* Copy a band of rows of `mem` back from the device into host memory.
-   *
-   * y: first row of the band.
-   * w: row width.
-   * h: number of rows to copy.
-   * elem: per-element size factor; the results are used as byte offsets and
-   *       byte counts (the offset is applied to a byte pointer).
-   *
-   * If mem has a host pointer but no device pointer, the band is zeroed on
-   * the host instead. */
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-  {
-    if (mem.type == MEM_PIXELS && !background) {
-      pixels_copy_from(mem, y, w, h);
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      assert(!"mem_copy_from not supported for textures.");
-    }
-    else {
-      CUDAContextScope scope(this);
-      /* Widen before multiplying: the products of three ints can exceed
-       * INT_MAX for large buffers, which would overflow if evaluated in
-       * int and only then converted to size_t. */
-      const size_t offset = (size_t)elem * y * w;
-      const size_t size = (size_t)elem * w * h;
-
-      if (mem.host_pointer && mem.device_pointer) {
-        cuda_assert(cuMemcpyDtoH(
-            (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size));
-      }
-      else if (mem.host_pointer) {
-        memset((char *)mem.host_pointer + offset, 0, size);
-      }
-    }
-  }
-
-  /* Zero the contents of `mem` on the host and on the device, allocating
-   * the device side first if it does not exist yet. */
-  void mem_zero(device_memory &mem)
-  {
-    if (!mem.device_pointer) {
-      mem_alloc(mem);
-    }
-
-    if (mem.host_pointer) {
-      memset(mem.host_pointer, 0, mem.memory_size());
-    }
-
-    /* Allocation may not have produced a device pointer. */
-    if (!mem.device_pointer) {
-      return;
-    }
-
-    /* When this device uses the mapped host allocation and the host
-     * pointer is that allocation, the memset above already cleared the
-     * memory. If use_mapped_host is false, mem.device_pointer refers to
-     * device memory regardless of mem.host_pointer and
-     * mem.shared_pointer, so it has to be cleared separately. */
-    if (cuda_mem_map[&mem].use_mapped_host && mem.host_pointer == mem.shared_pointer) {
-      return;
-    }
-
-    CUDAContextScope scope(this);
-    cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
-  }
-
-  /* Free the device storage of `mem`, dispatching on its memory type. */
-  void mem_free(device_memory &mem)
-  {
-    if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
-    }
-    else if (mem.type != MEM_PIXELS || background) {
-      /* Everything that is neither a texture nor a pixel buffer of a
-       * non-background device goes through the generic path. */
-      generic_free(mem);
-    }
-    else {
-      pixels_free(mem);
-    }
-  }
-
-  /* Return a pointer into the existing allocation of `mem`, advanced from
-   * its device pointer by mem.memory_elements_size(offset). The size
-   * argument is unused. */
-  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
-  {
-    char *const base = (char *)mem.device_pointer;
-    return (device_ptr)(base + mem.memory_elements_size(offset));
-  }
-
-  /* Copy `size` bytes from `host` into the global named `name` of cuModule. */
-  void const_copy_to(const char *name, void *host, size_t size)
-  {
-    CUDAContextScope scope(this);
-    CUdeviceptr mem;
-    size_t bytes;
-
-    /* Look up the device address and size of the module global. */
-    cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
-    /* NOTE: the queried size is not validated against `size`; the check
-     * `assert(bytes == size)` is disabled. */
-    cuda_assert(cuMemcpyHtoD(mem, host, size));
-  }
-
-  /* Allocate and upload the texture described by `mem`.
-   *
-   * With INTERPOLATION_NONE the data goes into plain device memory and the
-   * resulting device pointer is written into the module global named after
-   * mem.name. Otherwise the data is stored as a 3D array, pitch-aligned 2D
-   * memory or linear 1D memory, a texture object is created for it, and the
-   * matching texture_info slot is filled in and flagged for re-upload.
-   *
-   * Returns early without creating a texture object when the data type is
-   * unsupported or an allocation yields no memory. */
-  void tex_alloc(device_memory &mem)
-  {
-    CUDAContextScope scope(this);
-
-    /* General variables for both architectures */
-    string bind_name = mem.name;
-    size_t dsize = datatype_size(mem.data_type);
-    size_t size = mem.memory_size();
-
-    /* Translate the extension mode into a CUDA address mode. */
-    CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-    switch (mem.extension) {
-      case EXTENSION_REPEAT:
-        address_mode = CU_TR_ADDRESS_MODE_WRAP;
-        break;
-      case EXTENSION_EXTEND:
-        address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-        break;
-      case EXTENSION_CLIP:
-        address_mode = CU_TR_ADDRESS_MODE_BORDER;
-        break;
-      default:
-        assert(0);
-        break;
-    }
-
-    /* Closest interpolation uses point filtering, everything else linear. */
-    CUfilter_mode filter_mode;
-    if (mem.interpolation == INTERPOLATION_CLOSEST) {
-      filter_mode = CU_TR_FILTER_MODE_POINT;
-    }
-    else {
-      filter_mode = CU_TR_FILTER_MODE_LINEAR;
-    }
-
-    /* Data Storage */
-    if (mem.interpolation == INTERPOLATION_NONE) {
-      generic_alloc(mem);
-      generic_copy_to(mem);
-
-      CUdeviceptr cumem;
-      size_t cubytes;
-
-      cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
-      /* Store the device pointer in the global, using the width that the
-       * global was declared with. */
-      if (cubytes == 8) {
-        /* 64 bit device pointer */
-        uint64_t ptr = mem.device_pointer;
-        cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
-      }
-      else {
-        /* 32 bit device pointer */
-        uint32_t ptr = (uint32_t)mem.device_pointer;
-        cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
-      }
-      return;
-    }
-
-    /* Image Texture Storage */
-    CUarray_format_enum format;
-    switch (mem.data_type) {
-      case TYPE_UCHAR:
-        format = CU_AD_FORMAT_UNSIGNED_INT8;
-        break;
-      case TYPE_UINT16:
-        format = CU_AD_FORMAT_UNSIGNED_INT16;
-        break;
-      case TYPE_UINT:
-        format = CU_AD_FORMAT_UNSIGNED_INT32;
-        break;
-      case TYPE_INT:
-        format = CU_AD_FORMAT_SIGNED_INT32;
-        break;
-      case TYPE_FLOAT:
-        format = CU_AD_FORMAT_FLOAT;
-        break;
-      case TYPE_HALF:
-        format = CU_AD_FORMAT_HALF;
-        break;
-      default:
-        /* Unsupported data type: nothing is allocated. */
-        assert(0);
-        return;
-    }
-
-    CUDAMem *cmem = NULL;
-    CUarray array_3d = NULL;
-    /* Row size in bytes of the tightly packed host data. */
-    size_t src_pitch = mem.data_width * dsize * mem.data_elements;
-    size_t dst_pitch = src_pitch;
-
-    if (mem.data_depth > 1) {
-      /* 3D texture using array, there is no API for linear memory. */
-      CUDA_ARRAY3D_DESCRIPTOR desc;
-
-      desc.Width = mem.data_width;
-      desc.Height = mem.data_height;
-      desc.Depth = mem.data_depth;
-      desc.Format = format;
-      desc.NumChannels = mem.data_elements;
-      desc.Flags = 0;
-
-      VLOG(1) << "Array 3D allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")";
-
-      cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
-      if (!array_3d) {
-        return;
-      }
-
-      /* Copy the host data into the array. */
-      CUDA_MEMCPY3D param;
-      memset(&param, 0, sizeof(param));
-      param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-      param.dstArray = array_3d;
-      param.srcMemoryType = CU_MEMORYTYPE_HOST;
-      param.srcHost = mem.host_pointer;
-      param.srcPitch = src_pitch;
-      param.WidthInBytes = param.srcPitch;
-      param.Height = mem.data_height;
-      param.Depth = mem.data_depth;
-
-      cuda_assert(cuMemcpy3D(&param));
-
-      /* Arrays bypass generic_alloc(), so the bookkeeping it would do is
-       * performed here. */
-      mem.device_pointer = (device_ptr)array_3d;
-      mem.device_size = size;
-      stats.mem_alloc(size);
-
-      cmem = &cuda_mem_map[&mem];
-      cmem->texobject = 0;
-      cmem->array = array_3d;
-    }
-    else if (mem.data_height > 0) {
-      /* 2D texture, using pitch aligned linear memory. */
-      int alignment = 0;
-      cuda_assert(
-          cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-      dst_pitch = align_up(src_pitch, alignment);
-      size_t dst_size = dst_pitch * mem.data_height;
-
-      /* The second argument is the number of bytes needed on top of
-       * mem.memory_size() to hold the padded rows. */
-      cmem = generic_alloc(mem, dst_size - mem.memory_size());
-      if (!cmem) {
-        return;
-      }
-
-      /* Copy row by row from the packed host layout to the padded one. */
-      CUDA_MEMCPY2D param;
-      memset(&param, 0, sizeof(param));
-      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-      param.dstDevice = mem.device_pointer;
-      param.dstPitch = dst_pitch;
-      param.srcMemoryType = CU_MEMORYTYPE_HOST;
-      param.srcHost = mem.host_pointer;
-      param.srcPitch = src_pitch;
-      param.WidthInBytes = param.srcPitch;
-      param.Height = mem.data_height;
-
-      cuda_assert(cuMemcpy2DUnaligned(&param));
-    }
-    else {
-      /* 1D texture, using linear memory. */
-      cmem = generic_alloc(mem);
-      if (!cmem) {
-        return;
-      }
-
-      cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
-    }
-
-    /* Kepler+, bindless textures. */
-    /* The slot index is the number following the last '_' in the name. */
-    int flat_slot = 0;
-    if (string_startswith(mem.name, "__tex_image")) {
-      int pos = string(mem.name).rfind("_");
-      flat_slot = atoi(mem.name + pos + 1);
-    }
-    else {
-      assert(0);
-    }
-
-    /* Describe the storage chosen above to the texture object. */
-    CUDA_RESOURCE_DESC resDesc;
-    memset(&resDesc, 0, sizeof(resDesc));
-
-    if (array_3d) {
-      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-      resDesc.res.array.hArray = array_3d;
-      resDesc.flags = 0;
-    }
-    else if (mem.data_height > 0) {
-      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
-      resDesc.res.pitch2D.devPtr = mem.device_pointer;
-      resDesc.res.pitch2D.format = format;
-      resDesc.res.pitch2D.numChannels = mem.data_elements;
-      resDesc.res.pitch2D.height = mem.data_height;
-      resDesc.res.pitch2D.width = mem.data_width;
-      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
-    }
-    else {
-      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
-      resDesc.res.linear.devPtr = mem.device_pointer;
-      resDesc.res.linear.format = format;
-      resDesc.res.linear.numChannels = mem.data_elements;
-      resDesc.res.linear.sizeInBytes = mem.device_size;
-    }
-
-    /* The same address mode is applied to all three dimensions. */
-    CUDA_TEXTURE_DESC texDesc;
-    memset(&texDesc, 0, sizeof(texDesc));
-    texDesc.addressMode[0] = address_mode;
-    texDesc.addressMode[1] = address_mode;
-    texDesc.addressMode[2] = address_mode;
-    texDesc.filterMode = filter_mode;
-    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
-    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
-    /* Resize once */
-    if (flat_slot >= texture_info.size()) {
-      /* Allocate some slots in advance, to reduce amount
-       * of re-allocations. */
-      texture_info.resize(flat_slot + 128);
-    }
-
-    /* Set Mapping and tag that we need to (re-)upload to device */
-    TextureInfo &info = texture_info[flat_slot];
-    info.data = (uint64_t)cmem->texobject;
-    info.cl_buffer = 0;
-    info.interpolation = mem.interpolation;
-    info.extension = mem.extension;
-    info.width = mem.data_width;
-    info.height = mem.data_height;
-    info.depth = mem.data_depth;
-    need_texture_info = true;
-  }
-
-  /* Destroy the texture object of `mem`, if any, and release its storage.
-   * No-op when mem has no device pointer. */
-  void tex_free(device_memory &mem)
-  {
-    if (!mem.device_pointer) {
-      return;
-    }
-
-    CUDAContextScope scope(this);
-    const CUDAMem &entry = cuda_mem_map[&mem];
-
-    /* Free bindless texture. */
-    if (entry.texobject) {
-      cuTexObjectDestroy(entry.texobject);
-    }
-
-    if (!entry.array) {
-      /* Not array backed: the generic path releases the memory and the
-       * map entry. */
-      generic_free(mem);
-      return;
-    }
-
-    /* Free array and do the bookkeeping that generic_free() would
-     * otherwise take care of. */
-    cuArrayDestroy(entry.array);
-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
-
-    cuda_mem_map.erase(cuda_mem_map.find(&mem));
-  }
-
-/* Query the maximum number of threads per block for `func` and derive a
- * square block of threads x threads, where threads is the truncated square
- * root of that maximum, plus the grid size needed to cover w x h.
- * Declares threads_per_block, threads, xblocks and yblocks in the enclosing
- * scope, where CUDA_LAUNCH_KERNEL picks them up. */
-#define CUDA_GET_BLOCKSIZE(func, w, h) \
-  int threads_per_block; \
-  cuda_assert( \
-      cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
-  int threads = (int)sqrt((float)threads_per_block); \
-  int xblocks = ((w) + threads - 1) / threads; \
-  int yblocks = ((h) + threads - 1) / threads;
-
-/* Launch `func` with the 2D configuration declared by CUDA_GET_BLOCKSIZE
- * in the same scope. */
-#define CUDA_LAUNCH_KERNEL(func, args) \
-  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks: each block has
- * threads_per_block x 1 threads, xblocks covers w and yblocks equals h. */
-#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
-  int threads_per_block; \
-  cuda_assert( \
-      cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
-  int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
-  int yblocks = h;
-
-/* Launch `func` with the 1D configuration declared by
- * CUDA_GET_BLOCKSIZE_1D in the same scope. */
-#define CUDA_LAUNCH_KERNEL_1D(func, args) \
-  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
-
-  /* Run the non-local means filter kernels of cuFilterModule on one image.
-   *
-   * image_ptr: input passed to the update_output kernel.
-   * guide_ptr, variance_ptr: inputs passed to the calc_difference kernel.
-   * out_ptr: output buffer; its first pass_stride floats are cleared before
-   *          the kernels run and it is normalized by the final kernel.
-   * task: supplies the buffer dimensions, the NLM parameters and the
-   *       temporary device buffer.
-   *
-   * Returns false if have_error() reports an error before or after the
-   * kernels were issued. */
-  bool denoising_non_local_means(device_ptr image_ptr,
-                                 device_ptr guide_ptr,
-                                 device_ptr variance_ptr,
-                                 device_ptr out_ptr,
-                                 DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    int stride = task->buffer.stride;
-    int w = task->buffer.width;
-    int h = task->buffer.h;
-    int r = task->nlm_state.r;
-    int f = task->nlm_state.f;
-    float a = task->nlm_state.a;
-    float k_2 = task->nlm_state.k_2;
-
-    int pass_stride = task->buffer.pass_stride;
-    /* One shift for every offset in a (2r + 1) x (2r + 1) window. */
-    int num_shifts = (2 * r + 1) * (2 * r + 1);
-    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-    int frame_offset = 0;
-
-    if (have_error())
-      return false;
-
-    /* The temporary buffer is partitioned into three consecutive regions:
-     * difference and blurDifference, each pass_stride * num_shifts floats,
-     * followed by weightAccum. */
-    CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
-    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-    CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
-    CUdeviceptr scale_ptr = 0;
-
-    /* Start from zeroed accumulators. */
-    cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
-    cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
-    {
-      CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
-      cuda_assert(cuModuleGetFunction(
-          &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-      cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-      cuda_assert(cuModuleGetFunction(
-          &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-      cuda_assert(cuModuleGetFunction(
-          &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
-      cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-      cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
-      cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
-      cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
-      /* The launch configuration queried for cuNLMCalcDifference is reused
-       * for every 1D launch in this scope. */
-      CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
-      void *calc_difference_args[] = {&guide_ptr,
-                                      &variance_ptr,
-                                      &scale_ptr,
-                                      &difference,
-                                      &w,
-                                      &h,
-                                      &stride,
-                                      &pass_stride,
-                                      &r,
-                                      &channel_offset,
-                                      &frame_offset,
-                                      &a,
-                                      &k_2};
-      void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-      void *calc_weight_args[] = {
-          &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-      void *update_output_args[] = {&blurDifference,
-                                    &image_ptr,
-                                    &out_ptr,
-                                    &weightAccum,
-                                    &w,
-                                    &h,
-                                    &stride,
-                                    &pass_stride,
-                                    &channel_offset,
-                                    &r,
-                                    &f};
-
-      /* The blur kernel is launched twice with the same arguments: once
-       * after the difference pass and once after the weight pass. */
-      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
-    }
-
-    {
-      /* Normalization uses a 2D launch, hence the separate scope for the
-       * variables declared by CUDA_GET_BLOCKSIZE. */
-      CUfunction cuNLMNormalize;
-      cuda_assert(cuModuleGetFunction(
-          &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
-      cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
-      void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
-      CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
-      CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-      cuda_assert(cuCtxSynchronize());
-    }
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_construct_transform" over the task's storage
-   * area (task->storage.w x task->storage.h) and wait for it to finish.
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_construct_transform(DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilterConstructTransform;
-    cuda_assert(cuModuleGetFunction(
-        &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
-    /* Unlike most filter kernels here, this one is configured to prefer
-     * shared memory over L1. */
-    cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
-    CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
-    /* Kernel parameters, passed by address. */
-    void *args[] = {&task->buffer.mem.device_pointer,
-                    &task->tile_info_mem.device_pointer,
-                    &task->storage.transform.device_pointer,
-                    &task->storage.rank.device_pointer,
-                    &task->filter_area,
-                    &task->rect,
-                    &task->radius,
-                    &task->pca_threshold,
-                    &task->buffer.pass_stride,
-                    &task->buffer.frame_stride,
-                    &task->buffer.use_time};
-    CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Run the NLM difference/blur/weight kernels for one frame and feed the
-   * result into "kernel_cuda_filter_nlm_construct_gramian", which receives
-   * the task's XtWX and XtWY storage.
-   *
-   * color_ptr, color_variance_ptr, scale_ptr: inputs passed to the
-   *     calc_difference kernel.
-   * frame: frame index; selects the frame offset and the entry of
-   *     task->tile_info->frames passed to the gramian kernel.
-   * task: supplies dimensions, strides, storage and the temporary buffer.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_accumulate(device_ptr color_ptr,
-                            device_ptr color_variance_ptr,
-                            device_ptr scale_ptr,
-                            int frame,
-                            DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    /* Filter parameters; f and a are fixed constants here. */
-    int r = task->radius;
-    int f = 4;
-    float a = 1.0f;
-    float k_2 = task->nlm_k_2;
-
-    int w = task->reconstruction_state.source_w;
-    int h = task->reconstruction_state.source_h;
-    int stride = task->buffer.stride;
-    int frame_offset = frame * task->buffer.frame_stride;
-    int t = task->tile_info->frames[frame];
-
-    int pass_stride = task->buffer.pass_stride;
-    /* One shift for every offset in a (2r + 1) x (2r + 1) window. */
-    int num_shifts = (2 * r + 1) * (2 * r + 1);
-
-    if (have_error())
-      return false;
-
-    /* The temporary buffer holds difference followed by blurDifference,
-     * each pass_stride * num_shifts floats. */
-    CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
-    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
-    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
-    cuda_assert(cuModuleGetFunction(
-        &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-    cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-    cuda_assert(cuModuleGetFunction(
-        &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-    cuda_assert(cuModuleGetFunction(
-        &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
-    cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-    cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
-    cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
-    cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
-    /* The launch configuration queried for cuNLMCalcDifference is reused
-     * for every launch below. */
-    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
-                          task->reconstruction_state.source_w *
-                              task->reconstruction_state.source_h,
-                          num_shifts);
-
-    /* pass_stride appears twice: once as the pass stride and once in the
-     * position where denoising_non_local_means() passes channel_offset. */
-    void *calc_difference_args[] = {&color_ptr,
-                                    &color_variance_ptr,
-                                    &scale_ptr,
-                                    &difference,
-                                    &w,
-                                    &h,
-                                    &stride,
-                                    &pass_stride,
-                                    &r,
-                                    &pass_stride,
-                                    &frame_offset,
-                                    &a,
-                                    &k_2};
-    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-    void *calc_weight_args[] = {
-        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-    void *construct_gramian_args[] = {&t,
-                                      &blurDifference,
-                                      &task->buffer.mem.device_pointer,
-                                      &task->storage.transform.device_pointer,
-                                      &task->storage.rank.device_pointer,
-                                      &task->storage.XtWX.device_pointer,
-                                      &task->storage.XtWY.device_pointer,
-                                      &task->reconstruction_state.filter_window,
-                                      &w,
-                                      &h,
-                                      &stride,
-                                      &pass_stride,
-                                      &r,
-                                      &f,
-                                      &frame_offset,
-                                      &task->buffer.use_time};
-
-    /* The blur kernel is launched twice with the same arguments. */
-    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_finalize" over the reconstruction source area
-   * and wait for it to finish.
-   *
-   * output_ptr: device buffer passed to the kernel as its output.
-   * task: supplies the rank, XtWX and XtWY storage, the filter area, the
-   *       buffer parameters and the sample count.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-  {
-    /* Same preamble as the other denoising callbacks: do not issue work
-     * when an error is already pending, and make sure this device's
-     * context is current for the driver calls below. */
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFinalize;
-    cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
-    cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
-    /* Kernel parameters, passed by address. */
-    void *finalize_args[] = {&output_ptr,
-                             &task->storage.rank.device_pointer,
-                             &task->storage.XtWX.device_pointer,
-                             &task->storage.XtWY.device_pointer,
-                             &task->filter_area,
-                             &task->reconstruction_state.buffer_params.x,
-                             &task->render_buffer.samples};
-    CUDA_GET_BLOCKSIZE(
-        cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
-    CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_combine_halves" and wait for it to finish.
-   *
-   * a_ptr, b_ptr: the two input buffers.
-   * mean_ptr, variance_ptr: buffers passed to the kernel ahead of the inputs.
-   * r, rect: forwarded to the kernel unchanged.
-   * task: supplies the rectangle used to size the launch grid.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_combine_halves(device_ptr a_ptr,
-                                device_ptr b_ptr,
-                                device_ptr mean_ptr,
-                                device_ptr variance_ptr,
-                                int r,
-                                int4 rect,
-                                DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilterCombineHalves;
-    cuda_assert(cuModuleGetFunction(
-        &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
-    cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
-    /* NOTE(review): the grid is sized from task->rect while the kernel
-     * receives the `rect` argument - confirm the two are meant to differ. */
-    CUDA_GET_BLOCKSIZE(
-        cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
-    CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_divide_shadow" over task->rect and wait for
-   * it to finish.
-   *
-   * a_ptr, b_ptr, sample_variance_ptr, sv_variance_ptr, buffer_variance_ptr:
-   *     device buffers forwarded to the kernel in this order.
-   * task: supplies the sample count, tile info, rectangle, pass stride and
-   *     offset.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_divide_shadow(device_ptr a_ptr,
-                               device_ptr b_ptr,
-                               device_ptr sample_variance_ptr,
-                               device_ptr sv_variance_ptr,
-                               device_ptr buffer_variance_ptr,
-                               DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilterDivideShadow;
-    cuda_assert(cuModuleGetFunction(
-        &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
-    cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
-    /* Grid covers the extent of task->rect (z - x by w - y). */
-    CUDA_GET_BLOCKSIZE(
-        cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    /* Kernel parameters, passed by address. */
-    void *args[] = {&task->render_buffer.samples,
-                    &task->tile_info_mem.device_pointer,
-                    &a_ptr,
-                    &b_ptr,
-                    &sample_variance_ptr,
-                    &sv_variance_ptr,
-                    &buffer_variance_ptr,
-                    &task->rect,
-                    &task->render_buffer.pass_stride,
-                    &task->render_buffer.offset};
-    CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_get_feature" over task->rect and wait for it
-   * to finish.
-   *
-   * mean_offset, variance_offset: offsets forwarded to the kernel.
-   * mean_ptr, variance_ptr: device buffers forwarded to the kernel.
-   * scale: scale factor forwarded to the kernel.
-   * task: supplies the sample count, tile info, rectangle, pass stride and
-   *     offset.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_get_feature(int mean_offset,
-                             int variance_offset,
-                             device_ptr mean_ptr,
-                             device_ptr variance_ptr,
-                             float scale,
-                             DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilterGetFeature;
-    cuda_assert(cuModuleGetFunction(
-        &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
-    cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
-    /* Grid covers the extent of task->rect (z - x by w - y). */
-    CUDA_GET_BLOCKSIZE(
-        cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    /* Kernel parameters, passed by address. */
-    void *args[] = {&task->render_buffer.samples,
-                    &task->tile_info_mem.device_pointer,
-                    &mean_offset,
-                    &variance_offset,
-                    &mean_ptr,
-                    &variance_ptr,
-                    &scale,
-                    &task->rect,
-                    &task->render_buffer.pass_stride,
-                    &task->render_buffer.offset};
-    CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_write_feature" and wait for it to finish.
-   *
-   * out_offset: offset forwarded to the kernel.
-   * from_ptr, buffer_ptr: device buffers forwarded to the kernel.
-   * task: supplies the sample count, buffer parameters, filter area and
-   *     rectangle.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_write_feature(int out_offset,
-                               device_ptr from_ptr,
-                               device_ptr buffer_ptr,
-                               DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilterWriteFeature;
-    cuda_assert(cuModuleGetFunction(
-        &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
-    cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
-    /* Grid is sized from the z and w components of the filter area rather
-     * than from task->rect. */
-    CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
-    /* Kernel parameters, passed by address. */
-    void *args[] = {&task->render_buffer.samples,
-                    &task->reconstruction_state.buffer_params,
-                    &task->filter_area,
-                    &from_ptr,
-                    &buffer_ptr,
-                    &out_offset,
-                    &task->rect};
-    CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Launch "kernel_cuda_filter_detect_outliers" over task->rect and wait
-   * for it to finish.
-   *
-   * image_ptr, variance_ptr, depth_ptr, output_ptr: device buffers forwarded
-   *     to the kernel in this order.
-   * task: supplies the rectangle and the pass stride.
-   *
-   * Returns false if have_error() reports an error before or after. */
-  bool denoising_detect_outliers(device_ptr image_ptr,
-                                 device_ptr variance_ptr,
-                                 device_ptr depth_ptr,
-                                 device_ptr output_ptr,
-                                 DenoisingTask *task)
-  {
-    if (have_error())
-      return false;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilterDetectOutliers;
-    cuda_assert(cuModuleGetFunction(
-        &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
-    cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
-    /* Grid covers the extent of task->rect (z - x by w - y). */
-    CUDA_GET_BLOCKSIZE(
-        cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    /* Kernel parameters, passed by address. */
-    void *args[] = {&image_ptr,
-                    &variance_ptr,
-                    &depth_ptr,
-                    &output_ptr,
-                    &task->rect,
-                    &task->buffer.pass_stride};
-
-    CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
-    cuda_assert(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  /* Denoise one render tile: bind this device's denoising_* methods into
-   * the task's callback table, fill in the tile parameters and run the
-   * task. */
-  void denoise(RenderTile &rtile, DenoisingTask &denoising)
-  {
-    /* Each callback gets `this` and the task bound; the placeholders stand
-     * for the arguments supplied by the caller of the callback. */
-    denoising.functions.construct_transform = function_bind(
-        &CUDADevice::denoising_construct_transform, this, &denoising);
-    denoising.functions.accumulate = function_bind(
-        &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
-    denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
-    denoising.functions.divide_shadow = function_bind(
-        &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
-    denoising.functions.non_local_means = function_bind(
-        &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
-    denoising.functions.combine_halves = function_bind(
-        &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
-    denoising.functions.get_feature = function_bind(
-        &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
-    denoising.functions.write_feature = function_bind(
-        &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
-    denoising.functions.detect_outliers = function_bind(
-        &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
-    /* The filter area is stored as (x, y, width, height) of the tile. */
-    denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
-    denoising.render_buffer.samples = rtile.sample;
-    denoising.buffer.gpu_temporary_mem = true;
-
-    denoising.run_denoising(&rtile);
-  }
-
-  void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
-  {
-    scoped_timer timer(&rtile.buffers->render_time);
-
-    if (have_error())
-      return;
-
-    CUDAContextScope scope(this);
-    CUfunction cuPathTrace;
-
-    /* Get kernel function. */
-    if (task.integrator_branched) {
-      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
-    }
-    else {
-      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
-    }
-
-    if (have_error()) {
-      return;
-    }
-
-    cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
-    /* Allocate work tile. */
-    work_tiles.alloc(1);
-
-    WorkTile *wtile = work_tiles.data();
-    wtile->x = rtile.x;
-    wtile->y = rtile.y;
-    wtile->w = rtile.w;
-    wtile->h = rtile.h;
-    wtile->offset = rtile.offset;
-    wtile->stride = rtile.stride;
-    wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
-
-    /* Prepare work size. More step samples render faster, but for now we
-     * remain conservative for GPUs connected to a display to avoid driver
-     * timeouts and display freezing. */
-    int min_blocks, num_threads_per_block;
-    cuda_assert(cuOccupancyMaxPotentialBlockSize(
-        &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-    if (!info.display_device) {
-      min_blocks *= 8;
-    }
-
-    uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-    /* Render all samples. */
-    int start_sample = rtile.start_sample;
-    int end_sample = rtile.start_sample + rtile.num_samples;
-
-    for (int sample = start_sample; sample < end_sample; sample += step_samples) {
-      /* Setup and copy work tile to device. */
-      wtile->start_sample = sample;
-      wtile->num_samples = min(step_samples, end_sample - sample);
-      work_tiles.copy_to_device();
-
-      CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
-      uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-      uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-      /* Launch kernel. */
-      void *args[] = {&d_work_tiles, &total_work_size};
-
-      cuda_assert(cuLaunchKernel(
-          cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-      cuda_assert(cuCtxSynchronize());
-
-      /* Update progress. */
-      rtile.sample = sample + wtile->num_samples;
-      task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
-      if (task.get_cancel()) {
-        if (task.need_finish_queue == false)
-          break;
-      }
-    }
-  }
-
-  void film_convert(DeviceTask &task,
-                    device_ptr buffer,
-                    device_ptr rgba_byte,
-                    device_ptr rgba_half)
-  {
-    if (have_error())
-      return;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuFilmConvert;
-    CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
-    CUdeviceptr d_buffer = cuda_device_ptr(buffer);
-
-    /* get kernel function */
-    if (rgba_half) {
-      cuda_assert(
-          cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
-    }
-    else {
-      cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
-    }
-
-    float sample_scale = 1.0f / (task.sample + 1);
-
-    /* pass in parameters */
-    void *args[] = {&d_rgba,
-                    &d_buffer,
-                    &sample_scale,
-                    &task.x,
-                    &task.y,
-                    &task.w,
-                    &task.h,
-                    &task.offset,
-                    &task.stride};
-
-    /* launch kernel */
-    int threads_per_block;
-    cuda_assert(cuFuncGetAttribute(
-        &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
-    int xthreads = (int)sqrt(threads_per_block);
-    int ythreads = (int)sqrt(threads_per_block);
-    int xblocks = (task.w + xthreads - 1) / xthreads;
-    int yblocks = (task.h + ythreads - 1) / ythreads;
-
-    cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
-    cuda_assert(cuLaunchKernel(cuFilmConvert,
-                               xblocks,
-                               yblocks,
-                               1, /* blocks */
-                               xthreads,
-                               ythreads,
-                               1, /* threads */
-                               0,
-                               0,
-                               args,
-                               0));
-
-    unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
-    cuda_assert(cuCtxSynchronize());
-  }
-
-  void shader(DeviceTask &task)
-  {
-    if (have_error())
-      return;
-
-    CUDAContextScope scope(this);
-
-    CUfunction cuShader;
-    CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
-    CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-
-    /* get kernel function */
-    if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
-      cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
-    }
-    else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
-      cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
-    }
-    else {
-      cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
-    }
-
-    /* do tasks in smaller chunks, so we can cancel it */
-    const int shader_chunk_size = 65536;
-    const int start = task.shader_x;
-    const int end = task.shader_x + task.shader_w;
-    int offset = task.offset;
-
-    bool canceled = false;
-    for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
-      for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
-        int shader_w = min(shader_chunk_size, end - shader_x);
-
-        /* pass in parameters */
-        void *args[8];
-        int arg = 0;
-        args[arg++] = &d_input;
-        args[arg++] = &d_output;
-        args[arg++] = &task.shader_eval_type;
-        if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
-          args[arg++] = &task.shader_filter;
-        }
-        args[arg++] = &shader_x;
-        args[arg++] = &shader_w;
-        args[arg++] = &offset;
-        args[arg++] = &sample;
-
-        /* launch kernel */
-        int threads_per_block;
-        cuda_assert(cuFuncGetAttribute(
-            &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
-        int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
-        cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
-        cuda_assert(cuLaunchKernel(cuShader,
-                                   xblocks,
-                                   1,
-                                   1, /* blocks */
-                                   threads_per_block,
-                                   1,
-                                   1, /* threads */
-                                   0,
-                                   0,
-                                   args,
-                                   0));
-
-        cuda_assert(cuCtxSynchronize());
-
-        if (task.get_cancel()) {
-          canceled = true;
-          break;
-        }
-      }
-
-      task.update_progress(NULL);
-    }
-  }
-
-  CUdeviceptr map_pixels(device_ptr mem)
-  {
-    if (!background) {
-      PixelMem pmem = pixel_mem_map[mem];
-      CUdeviceptr buffer;
-
-      size_t bytes;
-      cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
-      cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
-      return buffer;
-    }
-
-    return cuda_device_ptr(mem);
-  }
-
-  void unmap_pixels(device_ptr mem)
-  {
-    if (!background) {
-      PixelMem pmem = pixel_mem_map[mem];
-
-      cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
-    }
-  }
-
-  void pixels_alloc(device_memory &mem)
-  {
-    PixelMem pmem;
-
-    pmem.w = mem.data_width;
-    pmem.h = mem.data_height;
-
-    CUDAContextScope scope(this);
-
-    glGenBuffers(1, &pmem.cuPBO);
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-    if (mem.data_type == TYPE_HALF)
-      glBufferData(
-          GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
-    else
-      glBufferData(
-          GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-    glActiveTexture(GL_TEXTURE0);
-    glGenTextures(1, &pmem.cuTexId);
-    glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-    if (mem.data_type == TYPE_HALF)
-      glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
-    else
-      glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glBindTexture(GL_TEXTURE_2D, 0);
-
-    CUresult result = cuGraphicsGLRegisterBuffer(
-        &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
-    if (result == CUDA_SUCCESS) {
-      mem.device_pointer = pmem.cuTexId;
-      pixel_mem_map[mem.device_pointer] = pmem;
-
-      mem.device_size = mem.memory_size();
-      stats.mem_alloc(mem.device_size);
-
-      return;
-    }
-    else {
-      /* failed to register buffer, fallback to no interop */
-      glDeleteBuffers(1, &pmem.cuPBO);
-      glDeleteTextures(1, &pmem.cuTexId);
-
-      background = true;
-    }
-  }
-
-  void pixels_copy_from(device_memory &mem, int y, int w, int h)
-  {
-    PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-    CUDAContextScope scope(this);
-
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-    uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
-    size_t offset = sizeof(uchar) * 4 * y * w;
-    memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
-    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-  }
-
-  void pixels_free(device_memory &mem)
-  {
-    if (mem.device_pointer) {
-      PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-      CUDAContextScope scope(this);
-
-      cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
-      glDeleteBuffers(1, &pmem.cuPBO);
-      glDeleteTextures(1, &pmem.cuTexId);
-
-      pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
-      mem.device_pointer = 0;
-
-      stats.mem_free(mem.device_size);
-      mem.device_size = 0;
-    }
-  }
-
-  void draw_pixels(device_memory &mem,
-                   int y,
-                   int w,
-                   int h,
-                   int width,
-                   int height,
-                   int dx,
-                   int dy,
-                   int dw,
-                   int dh,
-                   bool transparent,
-                   const DeviceDrawParams &draw_params)
-  {
-    assert(mem.type == MEM_PIXELS);
-
-    if (!background) {
-      const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-      PixelMem pmem = pixel_mem_map[mem.device_pointer];
-      float *vpointer;
-
-      CUDAContextScope scope(this);
-
-      /* for multi devices, this assumes the inefficient method that we allocate
-       * all pixels on the device even though we only render to a subset */
-      size_t offset = 4 * y * w;
-
-      if (mem.data_type == TYPE_HALF)
-        offset *= sizeof(GLhalf);
-      else
-        offset *= sizeof(uint8_t);
-
-      glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-      glActiveTexture(GL_TEXTURE0);
-      glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-      if (mem.data_type == TYPE_HALF) {
-        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
-      }
-      else {
-        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
-      }
-      glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-      if (transparent) {
-        glEnable(GL_BLEND);
-        glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
-      }
-
-      GLint shader_program;
-      if (use_fallback_shader) {
-        if (!bind_fallback_display_space_shader(dw, dh)) {
-          return;
-        }
-        shader_program = fallback_shader_program;
-      }
-      else {
-        draw_params.bind_display_space_shader_cb();
-        glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
-      }
-
-      if (!vertex_buffer) {
-        glGenBuffers(1, &vertex_buffer);
-      }
-
-      glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
-      /* invalidate old contents -
-       * avoids stalling if buffer is still waiting in queue to be rendered */
-      glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
-      vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
-      if (vpointer) {
-        /* texture coordinate - vertex pair */
-        vpointer[0] = 0.0f;
-        vpointer[1] = 0.0f;
-        vpointer[2] = dx;
-        vpointer[3] = dy;
-
-        vpointer[4] = (float)w / (float)pmem.w;
-        vpointer[5] = 0.0f;
-        vpointer[6] = (float)width + dx;
-        vpointer[7] = dy;
-
-        vpointer[8] = (float)w / (float)pmem.w;
-        vpointer[9] = (float)h / (float)pmem.h;
-        vpointer[10] = (float)width + dx;
-        vpointer[11] = (float)height + dy;
-
-        vpointer[12] = 0.0f;
-        vpointer[13] = (float)h / (float)pmem.h;
-        vpointer[14] = dx;
-        vpointer[15] = (float)height + dy;
-
-        glUnmapBuffer(GL_ARRAY_BUFFER);
-      }
-
-      GLuint vertex_array_object;
-      GLuint position_attribute, texcoord_attribute;
-
-      glGenVertexArrays(1, &vertex_array_object);
-      glBindVertexArray(vertex_array_object);
-
-      texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
-      position_attribute = glGetAttribLocation(shader_program, "pos");
-
-      glEnableVertexAttribArray(texcoord_attribute);
-      glEnableVertexAttribArray(position_attribute);
-
-      glVertexAttribPointer(
-          texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
-      glVertexAttribPointer(position_attribute,
-                            2,
-                            GL_FLOAT,
-                            GL_FALSE,
-                            4 * sizeof(float),
-                            (const GLvoid *)(sizeof(float) * 2));
-
-      glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
-      if (use_fallback_shader) {
-        glUseProgram(0);
-      }
-      else {
-        draw_params.unbind_display_space_shader_cb();
-      }
-
-      if (transparent) {
-        glDisable(GL_BLEND);
-      }
-
-      glBindTexture(GL_TEXTURE_2D, 0);
-
-      return;
-    }
-
-    Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-  }
-
-  void thread_run(DeviceTask *task)
-  {
-    CUDAContextScope scope(this);
-
-    if (task->type == DeviceTask::RENDER) {
-      DeviceRequestedFeatures requested_features;
-      if (use_split_kernel()) {
-        if (split_kernel == NULL) {
-          split_kernel = new CUDASplitKernel(this);
-          split_kernel->load_kernels(requested_features);
-        }
-      }
-
-      device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-      /* keep rendering tiles until done */
-      RenderTile tile;
-      DenoisingTask denoising(this, *task);
-
-      while (task->acquire_tile(this, tile)) {
-        if (tile.task == RenderTile::PATH_TRACE) {
-          if (use_split_kernel()) {
-            device_only_memory<uchar> void_buffer(this, "void_buffer");
-            split_kernel->path_trace(task, tile, void_buffer, void_buffer);
-          }
-          else {
-            path_trace(*task, tile, work_tiles);
-          }
-        }
-        else if (tile.task == RenderTile::DENOISE) {
-          tile.sample = tile.start_sample + tile.num_samples;
-
-          denoise(tile, denoising);
-
-          task->update_progress(&tile, tile.w * tile.h);
-        }
-
-        task->release_tile(tile);
-
-        if (task->get_cancel()) {
-          if (task->need_finish_queue == false)
-            break;
-        }
-      }
-
-      work_tiles.free();
-    }
-    else if (task->type == DeviceTask::SHADER) {
-      shader(*task);
-
-      cuda_assert(cuCtxSynchronize());
-    }
-  }
-
-  class CUDADeviceTask : public DeviceTask {
-   public:
-    CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
-    {
-      run = function_bind(&CUDADevice::thread_run, device, this);
-    }
-  };
-
-  void task_add(DeviceTask &task)
-  {
-    CUDAContextScope scope(this);
-
-    /* Load texture info. */
-    load_texture_info();
-
-    /* Synchronize all memory copies before executing task. */
-    cuda_assert(cuCtxSynchronize());
-
-    if (task.type == DeviceTask::FILM_CONVERT) {
-      /* must be done in main thread due to opengl access */
-      film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-    }
-    else {
-      task_pool.push(new CUDADeviceTask(this, task));
-    }
-  }
-
-  void task_wait()
-  {
-    task_pool.wait();
-  }
-
-  void task_cancel()
-  {
-    task_pool.cancel();
-  }
-
-  friend class CUDASplitKernelFunction;
-  friend class CUDASplitKernel;
-  friend class CUDAContextScope;
-};
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-#undef cuda_assert
-#define cuda_assert(stmt) \
-  { \
-    CUresult result = stmt; \
-\
-    if (result != CUDA_SUCCESS) { \
-      string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
-      if (device->error_msg == "") \
-        device->error_msg = message; \
-      fprintf(stderr, "%s\n", message.c_str()); \
-      /*cuda_abort();*/ \
-      device->cuda_error_documentation(); \
-    } \
-  } \
-  (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
-  cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
-  cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
-  CUDADevice *device;
-  CUfunction func;
-
- public:
-  CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
-  {
-  }
-
-  /* enqueue the kernel, returns false if there is an error */
-  bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
-  {
-    return enqueue(dim, NULL);
-  }
-
-  /* enqueue the kernel, returns false if there is an error */
-  bool enqueue(const KernelDimensions &dim, void *args[])
-  {
-    if (device->have_error())
-      return false;
-
-    CUDAContextScope scope(device);
-
-    /* we ignore dim.local_size for now, as this is faster */
-    int threads_per_block;
-    cuda_assert(
-        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
-    int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
-                  threads_per_block;
-
-    cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
-    cuda_assert(cuLaunchKernel(func,
-                               xblocks,
-                               1,
-                               1, /* blocks */
-                               threads_per_block,
-                               1,
-                               1, /* threads */
-                               0,
-                               0,
-                               args,
-                               0));
-
-    return !device->have_error();
-  }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
-                                            device_memory & /*data*/,
-                                            size_t num_threads)
-{
-  CUDAContextScope scope(device);
-
-  device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
-  size_buffer.alloc(1);
-  size_buffer.zero_to_device();
-
-  uint threads = num_threads;
-  CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
-
-  struct args_t {
-    uint *num_threads;
-    CUdeviceptr *size;
-  };
-
-  args_t args = {&threads, &d_size};
-
-  CUfunction state_buffer_size;
-  cuda_assert(
-      cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
-  cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
-  size_buffer.copy_from_device(0, 1, 1);
-  size_t size = size_buffer[0];
-  size_buffer.free();
-
-  return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
-                                                     RenderTile &rtile,
-                                                     int num_global_elements,
-                                                     device_memory & /*kernel_globals*/,
-                                                     device_memory & /*kernel_data*/,
-                                                     device_memory &split_data,
-                                                     device_memory &ray_state,
-                                                     device_memory &queue_index,
-                                                     device_memory &use_queues_flag,
-                                                     device_memory &work_pool_wgs)
-{
-  CUDAContextScope scope(device);
-
-  CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
-  CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
-  CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
-  CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
-  CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
-
-  CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
-
-  int end_sample = rtile.start_sample + rtile.num_samples;
-  int queue_size = dim.global_size[0] * dim.global_size[1];
-
-  struct args_t {
-    CUdeviceptr *split_data_buffer;
-    int *num_elements;
-    CUdeviceptr *ray_state;
-    int *start_sample;
-    int *end_sample;
-    int *sx;
-    int *sy;
-    int *sw;
-    int *sh;
-    int *offset;
-    int *stride;
-    CUdeviceptr *queue_index;
-    int *queuesize;
-    CUdeviceptr *use_queues_flag;
-    CUdeviceptr *work_pool_wgs;
-    int *num_samples;
-    CUdeviceptr *buffer;
-  };
-
-  args_t args = {&d_split_data,
-                 &num_global_elements,
-                 &d_ray_state,
-                 &rtile.start_sample,
-                 &end_sample,
-                 &rtile.x,
-                 &rtile.y,
-                 &rtile.w,
-                 &rtile.h,
-                 &rtile.offset,
-                 &rtile.stride,
-                 &d_queue_index,
-                 &queue_size,
-                 &d_use_queues_flag,
-                 &d_work_pool_wgs,
-                 &rtile.num_samples,
-                 &d_buffer};
-
-  CUfunction data_init;
-  cuda_assert(
-      cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
-  if (device->have_error()) {
-    return false;
-  }
-
-  CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
-  return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
-                                                                const DeviceRequestedFeatures &)
-{
-  CUDAContextScope scope(device);
-  CUfunction func;
-
-  cuda_assert(
-      cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
-  if (device->have_error()) {
-    device->cuda_error_message(
-        string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
-    return NULL;
-  }
-
-  return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
-  return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
-                                               device_memory &data,
-                                               DeviceTask * /*task*/)
-{
-  CUDAContextScope scope(device);
-  size_t free;
-  size_t total;
-
-  cuda_assert(cuMemGetInfo(&free, &total));
-
-  VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
-          << " bytes. (" << string_human_readable_size(free) << ").";
-
-  size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
-  size_t side = round_down((int)sqrt(num_elements), 32);
-  int2 global_size = make_int2(side, round_down(num_elements / side, 16));
-  VLOG(1) << "Global size: " << global_size << ".";
-  return global_size;
-}
-
 bool device_cuda_init()
 {
-#ifdef WITH_CUDA_DYNLOAD
+#  ifdef WITH_CUDA_DYNLOAD
   static bool initialized = false;
   static bool result = false;
 
@@ -2584,7 +43,6 @@ bool device_cuda_init()
       VLOG(1) << "Found precompiled kernels";
       result = true;
     }
-#  ifndef _WIN32
     else if (cuewCompilerPath() != NULL) {
       VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
       result = true;
@@ -2593,7 +51,6 @@ bool device_cuda_init()
       VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
               << " unable to use CUDA";
     }
-#  endif
   }
   else {
     VLOG(1) << "CUEW initialization failed: "
@@ -2602,9 +59,9 @@ bool device_cuda_init()
   }
 
   return result;
-#else  /* WITH_CUDA_DYNLOAD */
+#  else  /* WITH_CUDA_DYNLOAD */
   return true;
-#endif /* WITH_CUDA_DYNLOAD */
+#  endif /* WITH_CUDA_DYNLOAD */
 }
 
 Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
@@ -2614,7 +71,7 @@ Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, b
 
 static CUresult device_cuda_safe_init()
 {
-#ifdef _WIN32
+#  ifdef _WIN32
   __try {
     return cuInit(0);
   }
@@ -2625,9 +82,9 @@ static CUresult device_cuda_safe_init()
   }
 
   return CUDA_ERROR_NO_DEVICE;
-#else
+#  else
   return cuInit(0);
-#endif
+#  endif
 }
 
 void device_cuda_info(vector<DeviceInfo> &devices)
@@ -2739,13 +196,13 @@ string device_cuda_capabilities()
     }
     capabilities += string("\t") + name + "\n";
     int value;
-#define GET_ATTR(attr) \
-  { \
-    if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \
-      capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \
+#  define GET_ATTR(attr) \
+    { \
+      if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \
+        capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \
+      } \
     } \
-  } \
-  (void)0
+    (void)0
     /* TODO(sergey): Strip all attributes which are not useful for us
      * or does not depend on the driver.
      */
@@ -2836,7 +293,7 @@ string device_cuda_capabilities()
     GET_ATTR(MANAGED_MEMORY);
     GET_ATTR(MULTI_GPU_BOARD);
     GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
-#undef GET_ATTR
+#  undef GET_ATTR
     capabilities += "\n";
   }
 
@@ -2844,3 +301,5 @@ string device_cuda_capabilities()
 }
 
 CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 5b8b86886c4..0c229ac24cf 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -17,9 +17,15 @@
 #ifndef __DEVICE_INTERN_H__
 #define __DEVICE_INTERN_H__
 
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
 
 Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
 bool device_opencl_init();
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 3a99a49dffc..671cd7c29f3 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "device/device.h"
 #include "device/device_memory.h"
+#include "device/device.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -31,8 +31,6 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
       data_depth(0),
       type(type),
       name(name),
-      interpolation(INTERPOLATION_NONE),
-      extension(EXTENSION_REPEAT),
       device(device),
       device_pointer(0),
       host_pointer(0),
@@ -76,7 +74,7 @@ void device_memory::host_free()
 
 void device_memory::device_alloc()
 {
-  assert(!device_pointer && type != MEM_TEXTURE);
+  assert(!device_pointer && type != MEM_TEXTURE && type != MEM_GLOBAL);
   device->mem_alloc(*this);
 }
 
@@ -96,7 +94,7 @@ void device_memory::device_copy_to()
 
 void device_memory::device_copy_from(int y, int w, int h, int elem)
 {
-  assert(type != MEM_TEXTURE && type != MEM_READ_ONLY);
+  assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
   device->mem_copy_from(*this, y, w, h, elem);
 }
 
@@ -139,4 +137,106 @@ device_sub_ptr::~device_sub_ptr()
   device->mem_free_sub_ptr(ptr);
 }
 
+/* Device Texture */
+
+/* Describe texture memory for `slot`: the element type and channel count are derived
+ * from `image_data_type`, and the sampling settings are recorded in `info`. This
+ * constructor itself allocates nothing, see alloc(). */
+device_texture::device_texture(Device *device,
+                               const char *name,
+                               const uint slot,
+                               ImageDataType image_data_type,
+                               InterpolationType interpolation,
+                               ExtensionType extension)
+    : device_memory(device, name, MEM_TEXTURE), slot(slot)
+{
+  /* Fill in `info` before the switch so it is initialized on every path, including the
+   * invalid-type case below, which returns early when assert() is compiled out. */
+  memset(&info, 0, sizeof(info));
+  info.data_type = image_data_type;
+  info.interpolation = interpolation;
+  info.extension = extension;
+
+  switch (image_data_type) {
+    case IMAGE_DATA_TYPE_FLOAT4:
+      data_type = TYPE_FLOAT;
+      data_elements = 4;
+      break;
+    case IMAGE_DATA_TYPE_FLOAT:
+      data_type = TYPE_FLOAT;
+      data_elements = 1;
+      break;
+    case IMAGE_DATA_TYPE_BYTE4:
+      data_type = TYPE_UCHAR;
+      data_elements = 4;
+      break;
+    case IMAGE_DATA_TYPE_BYTE:
+      data_type = TYPE_UCHAR;
+      data_elements = 1;
+      break;
+    case IMAGE_DATA_TYPE_HALF4:
+      data_type = TYPE_HALF;
+      data_elements = 4;
+      break;
+    case IMAGE_DATA_TYPE_HALF:
+      data_type = TYPE_HALF;
+      data_elements = 1;
+      break;
+    case IMAGE_DATA_TYPE_USHORT4:
+      data_type = TYPE_UINT16;
+      data_elements = 4;
+      break;
+    case IMAGE_DATA_TYPE_USHORT:
+      data_type = TYPE_UINT16;
+      data_elements = 1;
+      break;
+    case IMAGE_DATA_NUM_TYPES:
+      /* Not a valid texture data type. */
+      assert(0);
+      return;
+  }
+}
+
+/* Free the device allocation, then the host buffer. */
+device_texture::~device_texture()
+{
+  device_free();
+  host_free();
+}
+
+/* Host memory allocation.
+ *
+ * Size the host buffer for a width x height x depth texture and return it. If the
+ * total size changed, existing device and host memory is freed and a new host buffer
+ * is allocated; otherwise the current buffer is kept. The dimensions are recorded in
+ * the data_* members and in `info` either way. */
+void *device_texture::alloc(const size_t width, const size_t height, const size_t depth)
+{
+  const size_t new_size = size(width, height, depth);
+
+  if (new_size != data_size) {
+    device_free();
+    host_free();
+    host_pointer = host_alloc(data_elements * datatype_size(data_type) * new_size);
+    assert(device_pointer == 0);
+  }
+
+  data_size = new_size;
+  data_width = width;
+  data_height = height;
+  data_depth = depth;
+
+  info.width = width;
+  info.height = height;
+  info.depth = depth;
+
+  return host_pointer;
+}
+
+/* Copy the host data to the device. */
+void device_texture::copy_to_device()
+{
+  device_copy_to();
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 60740807568..1c20db900bc 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -23,6 +23,7 @@
 
 #include "util/util_array.h"
 #include "util/util_half.h"
+#include "util/util_string.h"
 #include "util/util_texture.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
@@ -31,7 +32,14 @@ CCL_NAMESPACE_BEGIN
 
 class Device;
 
-enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE, MEM_DEVICE_ONLY, MEM_TEXTURE, MEM_PIXELS };
+enum MemoryType {
+  MEM_READ_ONLY,
+  MEM_READ_WRITE,
+  MEM_DEVICE_ONLY,
+  MEM_GLOBAL,
+  MEM_TEXTURE,
+  MEM_PIXELS
+};
 
 /* Supported Data Types */
 
@@ -208,8 +216,6 @@ class device_memory {
   size_t data_depth;
   MemoryType type;
   const char *name;
-  InterpolationType interpolation;
-  ExtensionType extension;
 
   /* Pointers. */
   Device *device;
@@ -310,7 +316,7 @@ template<typename T> class device_only_memory : public device_memory {
  * in and copied to the device with copy_to_device(). Or alternatively
  * allocated and set to zero on the device with zero_to_device().
  *
- * When using memory type MEM_TEXTURE, a pointer to this memory will be
+ * When using memory type MEM_GLOBAL, a pointer to this memory will be
  * automatically attached to kernel globals, using the provided name
  * matching an entry in kernel_textures.h. */
 
@@ -427,6 +433,11 @@ template<typename T> class device_vector : public device_memory {
     device_copy_to();
   }
 
+  void copy_from_device()
+  {
+    device_copy_from(0, data_width, data_height, sizeof(T));
+  }
+
   void copy_from_device(int y, int w, int h)
   {
     device_copy_from(y, w, h, sizeof(T));
@@ -498,6 +509,33 @@ class device_sub_ptr {
   device_ptr ptr;
 };
 
+/* Device Texture
+ *
+ * 2D or 3D image texture memory. */
+
+class device_texture : public device_memory {
+ public:
+  device_texture(Device *device,
+                 const char *name,
+                 const uint slot,
+                 ImageDataType image_data_type,
+                 InterpolationType interpolation,
+                 ExtensionType extension);
+  ~device_texture();
+
+  void *alloc(const size_t width, const size_t height, const size_t depth = 0);
+  void copy_to_device();
+
+  uint slot;
+  TextureInfo info;
+
+ protected:
+  size_t size(const size_t width, const size_t height, const size_t depth)
+  {
+    return width * ((height == 0) ? 1 : height) * ((depth == 0) ? 1 : depth);
+  }
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __DEVICE_MEMORY_H__ */
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index b8587eb0a62..3636ecaa7a1 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include <stdlib.h>
 #include <sstream>
+#include <stdlib.h>
 
 #include "device/device.h"
 #include "device/device_intern.h"
@@ -42,7 +42,7 @@ class MultiDevice : public Device {
     map<device_ptr, device_ptr> ptr_map;
   };
 
-  list<SubDevice> devices;
+  list<SubDevice> devices, denoising_devices;
   device_ptr unique_key;
 
   MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
@@ -61,6 +61,12 @@ class MultiDevice : public Device {
       }
     }
 
+    foreach (DeviceInfo &subinfo, info.denoising_devices) {
+      Device *device = Device::create(subinfo, sub_stats_, profiler, background);
+
+      denoising_devices.push_back(SubDevice(device));
+    }
+
 #ifdef WITH_NETWORK
     /* try to add network devices */
     ServerDiscovery discovery(true);
@@ -80,17 +86,18 @@ class MultiDevice : public Device {
   {
     foreach (SubDevice &sub, devices)
       delete sub.device;
+    foreach (SubDevice &sub, denoising_devices)
+      delete sub.device;
   }
 
   const string &error_message()
   {
-    foreach (SubDevice &sub, devices) {
-      if (sub.device->error_message() != "") {
-        if (error_msg == "")
-          error_msg = sub.device->error_message();
-        break;
-      }
-    }
+    error_msg.clear();
+
+    foreach (SubDevice &sub, devices)
+      error_msg += sub.device->error_message();
+    foreach (SubDevice &sub, denoising_devices)
+      error_msg += sub.device->error_message();
 
     return error_msg;
   }
@@ -118,6 +125,12 @@ class MultiDevice : public Device {
       if (!sub.device->load_kernels(requested_features))
         return false;
 
+    if (requested_features.use_denoising) {
+      foreach (SubDevice &sub, denoising_devices)
+        if (!sub.device->load_kernels(requested_features))
+          return false;
+    }
+
     return true;
   }
 
@@ -127,6 +140,12 @@ class MultiDevice : public Device {
       if (!sub.device->wait_for_availability(requested_features))
         return false;
 
+    if (requested_features.use_denoising) {
+      foreach (SubDevice &sub, denoising_devices)
+        if (!sub.device->wait_for_availability(requested_features))
+          return false;
+    }
+
     return true;
   }
 
@@ -150,19 +169,28 @@ class MultiDevice : public Device {
           break;
       }
     }
+
     return result;
   }
 
   bool build_optix_bvh(BVH *bvh)
   {
-    // Broadcast acceleration structure build to all devices
-    foreach (SubDevice &sub, devices) {
+    // Broadcast acceleration structure build to all render devices
+    foreach (SubDevice &sub, devices)
       if (!sub.device->build_optix_bvh(bvh))
         return false;
-    }
+
     return true;
   }
 
+  virtual void *osl_memory()
+  {
+    if (devices.size() > 1) {
+      return NULL;
+    }
+    return devices.front().device->osl_memory();
+  }
+
   void mem_alloc(device_memory &mem)
   {
     device_ptr key = unique_key++;
@@ -236,6 +264,17 @@ class MultiDevice : public Device {
       sub.ptr_map[key] = mem.device_pointer;
     }
 
+    if (strcmp(mem.name, "RenderBuffers") == 0) {
+      foreach (SubDevice &sub, denoising_devices) {
+        mem.device = sub.device;
+        mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+        mem.device_size = existing_size;
+
+        sub.device->mem_zero(mem);
+        sub.ptr_map[key] = mem.device_pointer;
+      }
+    }
+
     mem.device = this;
     mem.device_pointer = key;
     stats.mem_alloc(mem.device_size - existing_size);
@@ -255,6 +294,17 @@ class MultiDevice : public Device {
       sub.ptr_map.erase(sub.ptr_map.find(key));
     }
 
+    if (strcmp(mem.name, "RenderBuffers") == 0) {
+      foreach (SubDevice &sub, denoising_devices) {
+        mem.device = sub.device;
+        mem.device_pointer = sub.ptr_map[key];
+        mem.device_size = existing_size;
+
+        sub.device->mem_free(mem);
+        sub.ptr_map.erase(sub.ptr_map.find(key));
+      }
+    }
+
     mem.device = this;
     mem.device_pointer = 0;
     mem.device_size = 0;
@@ -302,10 +352,21 @@ class MultiDevice : public Device {
 
   void map_tile(Device *sub_device, RenderTile &tile)
   {
+    if (!tile.buffer) {
+      return;
+    }
+
     foreach (SubDevice &sub, devices) {
       if (sub.device == sub_device) {
-        if (tile.buffer)
-          tile.buffer = sub.ptr_map[tile.buffer];
+        tile.buffer = sub.ptr_map[tile.buffer];
+        return;
+      }
+    }
+
+    foreach (SubDevice &sub, denoising_devices) {
+      if (sub.device == sub_device) {
+        tile.buffer = sub.ptr_map[tile.buffer];
+        return;
       }
     }
   }
@@ -320,6 +381,12 @@ class MultiDevice : public Device {
       i++;
     }
 
+    foreach (SubDevice &sub, denoising_devices) {
+      if (sub.device == sub_device)
+        return i;
+      i++;
+    }
+
     return -1;
   }
 
@@ -330,24 +397,41 @@ class MultiDevice : public Device {
         continue;
       }
 
+      device_vector<float> &mem = tiles[i].buffers->buffer;
+      tiles[i].buffer = mem.device_pointer;
+
+      if (mem.device == this && denoising_devices.empty()) {
+        /* Skip unnecessary copies in viewport mode (buffer covers the
+         * whole image), but still need to fix up the tile device pointer. */
+        map_tile(sub_device, tiles[i]);
+        continue;
+      }
+
       /* If the tile was rendered on another device, copy its memory to
        * to the current device now, for the duration of the denoising task.
        * Note that this temporarily modifies the RenderBuffers and calls
        * the device, so this function is not thread safe. */
-      device_vector<float> &mem = tiles[i].buffers->buffer;
       if (mem.device != sub_device) {
         /* Only copy from device to host once. This is faster, but
          * also required for the case where a CPU thread is denoising
          * a tile rendered on the GPU. In that case we have to avoid
-         * overwriting the buffer being denoised by the CPU thread. */
+         * overwriting the buffer being de-noised by the CPU thread. */
         if (!tiles[i].buffers->map_neighbor_copied) {
           tiles[i].buffers->map_neighbor_copied = true;
-          mem.copy_from_device(0, mem.data_size, 1);
+          mem.copy_from_device();
         }
 
-        mem.swap_device(sub_device, 0, 0);
+        if (mem.device == this) {
+          /* Can re-use memory if tile is already allocated on the sub device. */
+          map_tile(sub_device, tiles[i]);
+          mem.swap_device(sub_device, mem.device_size, tiles[i].buffer);
+        }
+        else {
+          mem.swap_device(sub_device, 0, 0);
+        }
 
         mem.copy_to_device();
+
         tiles[i].buffer = mem.device_pointer;
         tiles[i].device_size = mem.device_size;
 
@@ -358,11 +442,17 @@ class MultiDevice : public Device {
 
   void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles)
   {
-    /* Copy denoised result back to the host. */
     device_vector<float> &mem = tiles[9].buffers->buffer;
+
+    if (mem.device == this && denoising_devices.empty()) {
+      return;
+    }
+
+    /* Copy denoised result back to the host. */
     mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer);
-    mem.copy_from_device(0, mem.data_size, 1);
+    mem.copy_from_device();
     mem.restore_device();
+
     /* Copy denoised result to the original device. */
     mem.copy_to_device();
 
@@ -372,7 +462,9 @@ class MultiDevice : public Device {
       }
 
       device_vector<float> &mem = tiles[i].buffers->buffer;
-      if (mem.device != sub_device) {
+
+      if (mem.device != sub_device && mem.device != this) {
+        /* Free up memory again if it was allocated for the copy above. */
         mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer);
         sub_device->mem_free(mem);
         mem.restore_device();
@@ -398,10 +490,29 @@ class MultiDevice : public Device {
 
   void task_add(DeviceTask &task)
   {
+    list<SubDevice> task_devices = devices;
+    if (!denoising_devices.empty()) {
+      if (task.type == DeviceTask::DENOISE_BUFFER) {
+        /* Denoising tasks should be redirected to the denoising devices entirely. */
+        task_devices = denoising_devices;
+      }
+      else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
+        const uint tile_types = task.tile_types;
+        /* For normal rendering tasks only redirect the denoising part to the denoising devices.
+         * Do not need to split the task here, since they all run through 'acquire_tile'. */
+        task.tile_types = RenderTile::DENOISE;
+        foreach (SubDevice &sub, denoising_devices) {
+          sub.device->task_add(task);
+        }
+        /* Rendering itself should still be executed on the rendering devices. */
+        task.tile_types = tile_types ^ RenderTile::DENOISE;
+      }
+    }
+
     list<DeviceTask> tasks;
-    task.split(tasks, devices.size());
+    task.split(tasks, task_devices.size());
 
-    foreach (SubDevice &sub, devices) {
+    foreach (SubDevice &sub, task_devices) {
       if (!tasks.empty()) {
         DeviceTask subtask = tasks.front();
         tasks.pop_front();
@@ -426,12 +537,16 @@ class MultiDevice : public Device {
   {
     foreach (SubDevice &sub, devices)
       sub.device->task_wait();
+    foreach (SubDevice &sub, denoising_devices)
+      sub.device->task_wait();
   }
 
   void task_cancel()
   {
     foreach (SubDevice &sub, devices)
       sub.device->task_cancel();
+    foreach (SubDevice &sub, denoising_devices)
+      sub.device->task_cancel();
   }
 
  protected:
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 80334ad8f22..2742cbf53aa 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
+#include "device/device_network.h"
 #include "device/device.h"
 #include "device/device_intern.h"
-#include "device/device_network.h"
 
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 5b69b815cc6..e74c4508ab6 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -19,19 +19,19 @@
 
 #ifdef WITH_NETWORK
 
-#  include <boost/archive/text_iarchive.hpp>
-#  include <boost/archive/text_oarchive.hpp>
 #  include <boost/archive/binary_iarchive.hpp>
 #  include <boost/archive/binary_oarchive.hpp>
+#  include <boost/archive/text_iarchive.hpp>
+#  include <boost/archive/text_oarchive.hpp>
 #  include <boost/array.hpp>
 #  include <boost/asio.hpp>
 #  include <boost/bind.hpp>
 #  include <boost/serialization/vector.hpp>
 #  include <boost/thread.hpp>
 
+#  include <deque>
 #  include <iostream>
 #  include <sstream>
-#  include <deque>
 
 #  include "render/buffers.h"
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index b07596c60ff..891b73351a0 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,8 +16,8 @@
 
 #ifdef WITH_OPENCL
 
-#  include "device/opencl/opencl.h"
-
+#  include "device/opencl/device_opencl.h"
+#  include "device/device.h"
 #  include "device/device_intern.h"
 
 #  include "util/util_foreach.h"
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index c1106b367ca..42d7b00314c 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -17,30 +17,28 @@
 
 #ifdef WITH_OPTIX
 
-#  include "device/device.h"
-#  include "device/device_intern.h"
-#  include "device/device_denoising.h"
 #  include "bvh/bvh.h"
-#  include "render/scene.h"
+#  include "device/cuda/device_cuda.h"
+#  include "device/device_denoising.h"
+#  include "device/device_intern.h"
+#  include "render/buffers.h"
+#  include "render/hair.h"
 #  include "render/mesh.h"
 #  include "render/object.h"
-#  include "render/buffers.h"
+#  include "render/scene.h"
+#  include "util/util_debug.h"
+#  include "util/util_logging.h"
 #  include "util/util_md5.h"
 #  include "util/util_path.h"
 #  include "util/util_time.h"
-#  include "util/util_debug.h"
-#  include "util/util_logging.h"
-
-#  undef _WIN32_WINNT  // Need minimum API support for Windows 7
-#  define _WIN32_WINNT _WIN32_WINNT_WIN7
 
 #  ifdef WITH_CUDA_DYNLOAD
 #    include <cuew.h>
 // Do not use CUDA SDK headers when using CUEW
 #    define OPTIX_DONT_INCLUDE_CUDA
 #  endif
-#  include <optix_stubs.h>
 #  include <optix_function_table_definition.h>
+#  include <optix_stubs.h>
 
 // TODO(pmours): Disable this once drivers have native support
 #  define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
@@ -110,31 +108,23 @@ struct KernelParams {
     } \
     (void)0
 
-#  define CUDA_GET_BLOCKSIZE(func, w, h) \
-    int threads; \
-    check_result_cuda_ret( \
-        cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
-    threads = (int)sqrt((float)threads); \
-    int xblocks = ((w) + threads - 1) / threads; \
-    int yblocks = ((h) + threads - 1) / threads;
-
-#  define CUDA_LAUNCH_KERNEL(func, args) \
-    check_result_cuda_ret(cuLaunchKernel( \
-        func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));
-
-/* Similar as above, but for 1-dimensional blocks. */
-#  define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
-    int threads; \
-    check_result_cuda_ret( \
-        cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
-    int xblocks = ((w) + threads - 1) / threads; \
-    int yblocks = h;
-
-#  define CUDA_LAUNCH_KERNEL_1D(func, args) \
-    check_result_cuda_ret(cuLaunchKernel( \
-        func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));
+#  define launch_filter_kernel(func_name, w, h, args) \
+    { \
+      CUfunction func; \
+      check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
+      check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
+      int threads; \
+      check_result_cuda_ret( \
+          cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+      threads = (int)sqrt((float)threads); \
+      int xblocks = ((w) + threads - 1) / threads; \
+      int yblocks = ((h) + threads - 1) / threads; \
+      check_result_cuda_ret( \
+          cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
+    } \
+    (void)0
 
-class OptiXDevice : public Device {
+class OptiXDevice : public CUDADevice {
 
   // List of OptiX program groups
   enum {
@@ -183,77 +173,37 @@ class OptiXDevice : public Device {
   // Use a pool with multiple threads to support launches with multiple CUDA streams
   TaskPool task_pool;
 
-  // CUDA/OptiX context handles
-  CUdevice cuda_device = 0;
-  CUcontext cuda_context = NULL;
   vector<CUstream> cuda_stream;
   OptixDeviceContext context = NULL;
 
-  // Need CUDA kernel module for some utility functions
-  CUmodule cuda_module = NULL;
-  CUmodule cuda_filter_module = NULL;
-  // All necessary OptiX kernels are in one module
-  OptixModule optix_module = NULL;
+  OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
   OptixPipeline pipelines[NUM_PIPELINES] = {};
 
   bool motion_blur = false;
-  bool need_texture_info = false;
   device_vector<SbtRecord> sbt_data;
-  device_vector<TextureInfo> texture_info;
   device_only_memory<KernelParams> launch_params;
   vector<CUdeviceptr> as_mem;
   OptixTraversableHandle tlas_handle = 0;
 
-  // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
-  int can_map_host = 0;
-  size_t map_host_used = 0;
-  size_t map_host_limit = 0;
-  size_t device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  size_t device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-  map<device_memory *, CUDAMem> cuda_mem_map;
-  bool move_texture_to_host = false;
-
   OptixDenoiser denoiser = NULL;
-  vector<pair<int2, CUdeviceptr>> denoiser_state;
+  device_only_memory<unsigned char> denoiser_state;
+  int denoiser_input_passes = 0;
 
  public:
   OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
-      : Device(info_, stats_, profiler_, background_),
+      : CUDADevice(info_, stats_, profiler_, background_),
         sbt_data(this, "__sbt", MEM_READ_ONLY),
-        texture_info(this, "__texture_info", MEM_TEXTURE),
-        launch_params(this, "__params")
+        launch_params(this, "__params"),
+        denoiser_state(this, "__denoiser_state")
   {
     // Store number of CUDA streams in device info
     info.cpu_threads = DebugFlags().optix.cuda_streams;
 
-    // Initialize CUDA driver API
-    check_result_cuda(cuInit(0));
-
-    // Retrieve the primary CUDA context for this device
-    check_result_cuda(cuDeviceGet(&cuda_device, info.num));
-    check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device));
-
-    // Make that CUDA context current
-    const CUDAContextScope scope(cuda_context);
-
-    // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp)
-    size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-    size_t system_ram = system_physical_ram();
-    if (system_ram > 0) {
-      if (system_ram / 2 > default_limit) {
-        map_host_limit = system_ram - default_limit;
-      }
-      else {
-        map_host_limit = system_ram / 2;
-      }
-    }
-    else {
-      VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+    // Make the CUDA context current
+    if (!cuContext) {
+      return;  // Do not initialize if CUDA context creation failed already
     }
-
-    // Check device support for pinned host memory
-    check_result_cuda(
-        cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device));
+    const CUDAContextScope scope(cuContext);
 
     // Create OptiX context for this device
     OptixDeviceContextOptions options = {};
@@ -277,7 +227,7 @@ class OptiXDevice : public Device {
           }
         };
 #  endif
-    check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context));
+    check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
 #  ifdef WITH_CYCLES_LOGGING
     check_result_optix(optixDeviceContextSetLogCallback(
         context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
@@ -292,37 +242,26 @@ class OptiXDevice : public Device {
     launch_params.data_elements = sizeof(KernelParams);
     // Allocate launch parameter buffer memory on device
     launch_params.alloc_to_device(info.cpu_threads);
-
-    // Create denoiser state entries for all threads (but do not allocate yet)
-    denoiser_state.resize(info.cpu_threads);
   }
   ~OptiXDevice()
   {
     // Stop processing any more tasks
     task_pool.stop();
 
+    // Make CUDA context current
+    const CUDAContextScope scope(cuContext);
+
     // Free all acceleration structures
     for (CUdeviceptr mem : as_mem) {
       cuMemFree(mem);
     }
 
-    // Free denoiser state for all threads
-    for (const pair<int2, CUdeviceptr> &state : denoiser_state) {
-      cuMemFree(state.second);
-    }
-
     sbt_data.free();
     texture_info.free();
     launch_params.free();
-
-    // Make CUDA context current
-    const CUDAContextScope scope(cuda_context);
+    denoiser_state.free();
 
     // Unload modules
-    if (cuda_module != NULL)
-      cuModuleUnload(cuda_module);
-    if (cuda_filter_module != NULL)
-      cuModuleUnload(cuda_filter_module);
     if (optix_module != NULL)
       optixModuleDestroy(optix_module);
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
@@ -336,9 +275,7 @@ class OptiXDevice : public Device {
     if (denoiser != NULL)
       optixDenoiserDestroy(denoiser);
 
-    // Destroy OptiX and CUDA context
     optixDeviceContextDestroy(context);
-    cuDevicePrimaryCtxRelease(cuda_device);
   }
 
  private:
@@ -354,10 +291,34 @@ class OptiXDevice : public Device {
     return BVH_LAYOUT_OPTIX;
   }
 
+  string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
+                                          bool filter,
+                                          bool /*split*/) override
+  {
+    // Split kernel is not supported in OptiX
+    string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
+        requested_features, filter, false);
+
+    // Add OptiX SDK include directory to include paths
+    const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+    if (optix_sdk_path) {
+      common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+    }
+
+    return common_cflags;
+  }
+
   bool load_kernels(const DeviceRequestedFeatures &requested_features) override
   {
-    if (have_error())
-      return false;  // Abort early if context creation failed already
+    if (have_error()) {
+      // Abort early if context creation failed already
+      return false;
+    }
+
+    // Load CUDA modules because we need some of the utility kernels
+    if (!CUDADevice::load_kernels(requested_features)) {
+      return false;
+    }
 
     // Disable baking for now, since its kernel is not well-suited for inlining and is very slow
     if (requested_features.use_baking) {
@@ -370,7 +331,7 @@ class OptiXDevice : public Device {
       return false;
     }
 
-    const CUDAContextScope scope(cuda_context);
+    const CUDAContextScope scope(cuContext);
 
     // Unload existing OptiX module and pipelines first
     if (optix_module != NULL) {
@@ -421,9 +382,11 @@ class OptiXDevice : public Device {
     }
 
     {  // Load and compile PTX module with OptiX kernels
-      string ptx_data;
-      const string ptx_filename = "lib/kernel_optix.ptx";
-      if (!path_read_text(path_get(ptx_filename), ptx_data)) {
+      string ptx_data, ptx_filename = path_get("lib/kernel_optix.ptx");
+      if (use_adaptive_compilation()) {
+        ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
+      }
+      if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
         set_error("Failed loading OptiX kernel " + ptx_filename + ".");
         return false;
       }
@@ -438,34 +401,6 @@ class OptiXDevice : public Device {
                                                       &optix_module));
     }
 
-    {  // Load CUDA modules because we need some of the utility kernels
-      int major, minor;
-      cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
-      cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num);
-
-      if (cuda_module == NULL) {  // Avoid reloading module if it was already loaded
-        string cubin_data;
-        const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
-        if (!path_read_text(path_get(cubin_filename), cubin_data)) {
-          set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + ".");
-          return false;
-        }
-
-        check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data()));
-      }
-
-      if (requested_features.use_denoising && cuda_filter_module == NULL) {
-        string filter_data;
-        const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor);
-        if (!path_read_text(path_get(filter_filename), filter_data)) {
-          set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + ".");
-          return false;
-        }
-
-        check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data()));
-      }
-    }
-
     // Create program groups
     OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
     OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
@@ -542,9 +477,9 @@ class OptiXDevice : public Device {
     // Calculate maximum trace continuation stack size
     unsigned int trace_css = stack_size[PG_HITD].cssCH;
     // This is based on the maximum of closest-hit and any-hit/intersection programs
-    trace_css = max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
-    trace_css = max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
-    trace_css = max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
 
     OptixPipelineLinkOptions link_options;
     link_options.maxTraceDepth = 1;
@@ -613,8 +548,9 @@ class OptiXDevice : public Device {
                               &pipelines[PIP_SHADER_EVAL]));
 
       // Calculate continuation stack size based on the maximum of all ray generation stack sizes
-      const unsigned int css = max(stack_size[PG_BAKE].cssRG,
-                                   max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) +
+      const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
+                                        std::max(stack_size[PG_DISP].cssRG,
+                                                 stack_size[PG_BACK].cssRG)) +
                                link_options.maxTraceDepth * trace_css;
 
       check_result_optix_ret(optixPipelineSetStackSize(
@@ -635,12 +571,17 @@ class OptiXDevice : public Device {
       return;  // Abort early if there was an error previously
 
     if (task.type == DeviceTask::RENDER) {
+      if (thread_index != 0) {
+        // Only execute denoising in a single thread (see also 'task_add')
+        task.tile_types &= ~RenderTile::DENOISE;
+      }
+
       RenderTile tile;
-      while (task.acquire_tile(this, tile)) {
+      while (task.acquire_tile(this, tile, task.tile_types)) {
         if (tile.task == RenderTile::PATH_TRACE)
           launch_render(task, tile, thread_index);
         else if (tile.task == RenderTile::DENOISE)
-          launch_denoise(task, tile, thread_index);
+          launch_denoise(task, tile);
         task.release_tile(tile);
         if (task.get_cancel() && !task.need_finish_queue)
           break;  // User requested cancellation
@@ -651,8 +592,21 @@ class OptiXDevice : public Device {
     else if (task.type == DeviceTask::SHADER) {
       launch_shader_eval(task, thread_index);
     }
-    else if (task.type == DeviceTask::FILM_CONVERT) {
-      launch_film_convert(task, thread_index);
+    else if (task.type == DeviceTask::DENOISE_BUFFER) {
+      // Set up a single tile that covers the whole task and denoise it
+      RenderTile tile;
+      tile.x = task.x;
+      tile.y = task.y;
+      tile.w = task.w;
+      tile.h = task.h;
+      tile.buffer = task.buffer;
+      tile.num_samples = task.num_samples;
+      tile.start_sample = task.sample;
+      tile.offset = task.offset;
+      tile.stride = task.stride;
+      tile.buffers = task.buffers;
+
+      launch_denoise(task, tile);
     }
   }
 
@@ -674,21 +628,24 @@ class OptiXDevice : public Device {
 
     const int end_sample = rtile.start_sample + rtile.num_samples;
     // Keep this number reasonable to avoid running into TDRs
-    const int step_samples = (info.display_device ? 8 : 32);
+    int step_samples = (info.display_device ? 8 : 32);
+    if (task.adaptive_sampling.use) {
+      step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+    }
+
     // Offset into launch params buffer so that streams use separate data
     device_ptr launch_params_ptr = launch_params.device_pointer +
                                    thread_index * launch_params.data_elements;
 
-    const CUDAContextScope scope(cuda_context);
+    const CUDAContextScope scope(cuContext);
 
     for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
       // Copy work tile information to device
       wtile.num_samples = min(step_samples, end_sample - sample);
       wtile.start_sample = sample;
-      check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile),
-                                          &wtile,
-                                          sizeof(wtile),
-                                          cuda_stream[thread_index]));
+      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+      check_result_cuda(
+          cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
 
       OptixShaderBindingTable sbt_params = {};
       sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
@@ -713,6 +670,12 @@ class OptiXDevice : public Device {
                                      wtile.h,
                                      1));
 
+      // Run the adaptive sampling kernels at selected samples aligned to step samples.
+      uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
+      if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+        adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+      }
+
       // Wait for launch to finish
       check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
 
@@ -724,13 +687,23 @@ class OptiXDevice : public Device {
       if (task.get_cancel() && !task.need_finish_queue)
         return;  // Cancel rendering
     }
+
+    // Finalize adaptive sampling
+    if (task.adaptive_sampling.use) {
+      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+      adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+      task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
+    }
   }
 
-  bool launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index)
+  bool launch_denoise(DeviceTask &task, RenderTile &rtile)
   {
-    int total_samples = rtile.start_sample + rtile.num_samples;
+    // Update current sample (for display and NLM denoising task)
+    rtile.sample = rtile.start_sample + rtile.num_samples;
 
-    const CUDAContextScope scope(cuda_context);
+    // Make CUDA context current now, since it is used for both denoising tasks
+    const CUDAContextScope scope(cuContext);
 
     // Choose between OptiX and NLM denoising
     if (task.denoising_use_optix) {
@@ -742,6 +715,7 @@ class OptiXDevice : public Device {
       RenderTile rtiles[10];
       rtiles[4] = rtile;
       task.map_neighbor_tiles(rtiles, this);
+      rtile = rtiles[4];  // Tile may have been modified by mapping code
 
       // Calculate size of the tile to denoise (including overlap)
       int4 rect = make_int4(
@@ -808,47 +782,40 @@ class OptiXDevice : public Device {
         tile_info->y[3] = rtiles[7].y + rtiles[7].h;
         tile_info_mem.copy_to_device();
 
-        CUfunction filter_copy_func;
-        check_result_cuda_ret(cuModuleGetFunction(
-            &filter_copy_func, cuda_filter_module, "kernel_cuda_filter_copy_input"));
-        check_result_cuda_ret(cuFuncSetCacheConfig(filter_copy_func, CU_FUNC_CACHE_PREFER_L1));
-
         void *args[] = {
             &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
-        CUDA_GET_BLOCKSIZE(filter_copy_func, rect_size.x, rect_size.y);
-        CUDA_LAUNCH_KERNEL(filter_copy_func, args);
+        launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
       }
 
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
       device_only_memory<float> input_rgb(this, "denoiser input rgb");
-      {
-        input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 *
-                                  task.denoising.optix_input_passes);
-
-        CUfunction convert_to_rgb_func;
-        check_result_cuda_ret(cuModuleGetFunction(
-            &convert_to_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_to_rgb"));
-        check_result_cuda_ret(cuFuncSetCacheConfig(convert_to_rgb_func, CU_FUNC_CACHE_PREFER_L1));
-
-        void *args[] = {&input_rgb.device_pointer,
-                        &input_ptr,
-                        &rect_size.x,
-                        &rect_size.y,
-                        &input_stride,
-                        &task.pass_stride,
-                        const_cast<int *>(pass_offset),
-                        &task.denoising.optix_input_passes,
-                        &total_samples};
-        CUDA_GET_BLOCKSIZE(convert_to_rgb_func, rect_size.x, rect_size.y);
-        CUDA_LAUNCH_KERNEL(convert_to_rgb_func, args);
-
-        input_ptr = input_rgb.device_pointer;
-        pixel_stride = 3 * sizeof(float);
-        input_stride = rect_size.x * pixel_stride;
-      }
+      input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.optix_input_passes);
+
+      void *input_args[] = {&input_rgb.device_pointer,
+                            &input_ptr,
+                            &rect_size.x,
+                            &rect_size.y,
+                            &input_stride,
+                            &task.pass_stride,
+                            const_cast<int *>(pass_offset),
+                            &task.denoising.optix_input_passes,
+                            &rtile.sample};
+      launch_filter_kernel(
+          "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
+
+      input_ptr = input_rgb.device_pointer;
+      pixel_stride = 3 * sizeof(float);
+      input_stride = rect_size.x * pixel_stride;
 #  endif
 
-      if (denoiser == NULL) {
+      const bool recreate_denoiser = (denoiser == NULL) ||
+                                     (task.denoising.optix_input_passes != denoiser_input_passes);
+      if (recreate_denoiser) {
+        if (denoiser != NULL) {
+          optixDenoiserDestroy(denoiser);  // Destroy existing handle before creating new one
+          denoiser = NULL;  // A failed re-creation below must not leave a dangling handle
+        }
+
         // Create OptiX denoiser handle on demand when it is first used
         OptixDenoiserOptions denoiser_options;
         assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
@@ -858,35 +825,35 @@ class OptiXDevice : public Device {
         check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
         check_result_optix_ret(
             optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
+
+        // OptiX denoiser handle was created with the requested number of input passes
+        denoiser_input_passes = task.denoising.optix_input_passes;
       }
 
       OptixDenoiserSizes sizes = {};
       check_result_optix_ret(
           optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
 
-      auto &state = denoiser_state[thread_index].second;
-      auto &state_size = denoiser_state[thread_index].first;
       const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
       const size_t scratch_offset = sizes.stateSizeInBytes;
 
       // Allocate denoiser state if tile size has changed since last setup
-      if (state_size.x != rect_size.x || state_size.y != rect_size.y) {
-        if (state) {
-          cuMemFree(state);
-          state = 0;
-        }
-        check_result_cuda_ret(cuMemAlloc(&state, scratch_offset + scratch_size));
+      if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
+                                denoiser_state.data_height != rect_size.y)) {
+        denoiser_state.alloc_to_device(scratch_offset + scratch_size);
 
+        // Initialize denoiser state for the current tile size
         check_result_optix_ret(optixDenoiserSetup(denoiser,
-                                                  cuda_stream[thread_index],
+                                                  0,
                                                   rect_size.x,
                                                   rect_size.y,
-                                                  state,
+                                                  denoiser_state.device_pointer,
                                                   scratch_offset,
-                                                  state + scratch_offset,
+                                                  denoiser_state.device_pointer + scratch_offset,
                                                   scratch_size));
 
-        state_size = rect_size;
+        denoiser_state.data_width = rect_size.x;
+        denoiser_state.data_height = rect_size.y;
       }
 
       // Set up input and output layer information
@@ -926,94 +893,46 @@ class OptiXDevice : public Device {
-      // Finally run denonising
+      // Finally run denoising
       OptixDenoiserParams params = {};  // All parameters are disabled/zero
       check_result_optix_ret(optixDenoiserInvoke(denoiser,
-                                                 cuda_stream[thread_index],
+                                                 0,
                                                  &params,
-                                                 state,
+                                                 denoiser_state.device_pointer,
                                                  scratch_offset,
                                                  input_layers,
                                                  task.denoising.optix_input_passes,
                                                  overlap_offset.x,
                                                  overlap_offset.y,
                                                  output_layers,
-                                                 state + scratch_offset,
+                                                 denoiser_state.device_pointer + scratch_offset,
                                                  scratch_size));
 
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
-      {
-        CUfunction convert_from_rgb_func;
-        check_result_cuda_ret(cuModuleGetFunction(
-            &convert_from_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_from_rgb"));
-        check_result_cuda_ret(
-            cuFuncSetCacheConfig(convert_from_rgb_func, CU_FUNC_CACHE_PREFER_L1));
-
-        void *args[] = {&input_ptr,
-                        &rtiles[9].buffer,
-                        &output_offset.x,
-                        &output_offset.y,
-                        &rect_size.x,
-                        &rect_size.y,
-                        &rtiles[9].x,
-                        &rtiles[9].y,
-                        &rtiles[9].w,
-                        &rtiles[9].h,
-                        &rtiles[9].offset,
-                        &rtiles[9].stride,
-                        &task.pass_stride};
-        CUDA_GET_BLOCKSIZE(convert_from_rgb_func, rtiles[9].w, rtiles[9].h);
-        CUDA_LAUNCH_KERNEL(convert_from_rgb_func, args);
-      }
+      void *output_args[] = {&input_ptr,
+                             &rtiles[9].buffer,
+                             &output_offset.x,
+                             &output_offset.y,
+                             &rect_size.x,
+                             &rect_size.y,
+                             &rtiles[9].x,
+                             &rtiles[9].y,
+                             &rtiles[9].w,
+                             &rtiles[9].h,
+                             &rtiles[9].offset,
+                             &rtiles[9].stride,
+                             &task.pass_stride};
+      launch_filter_kernel(
+          "kernel_cuda_filter_convert_from_rgb", rtiles[9].w, rtiles[9].h, output_args);
 #  endif
 
-      check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
+      check_result_cuda_ret(cuStreamSynchronize(0));
 
       task.unmap_neighbor_tiles(rtiles, this);
     }
     else {
       // Run CUDA denoising kernels
       DenoisingTask denoising(this, task);
-      denoising.functions.construct_transform = function_bind(
-          &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
-      denoising.functions.accumulate = function_bind(
-          &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
-      denoising.functions.solve = function_bind(
-          &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
-      denoising.functions.divide_shadow = function_bind(&OptiXDevice::denoising_divide_shadow,
-                                                        this,
-                                                        _1,
-                                                        _2,
-                                                        _3,
-                                                        _4,
-                                                        _5,
-                                                        &denoising,
-                                                        thread_index);
-      denoising.functions.non_local_means = function_bind(
-          &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
-      denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
-                                                         this,
-                                                         _1,
-                                                         _2,
-                                                         _3,
-                                                         _4,
-                                                         _5,
-                                                         _6,
-                                                         &denoising,
-                                                         thread_index);
-      denoising.functions.get_feature = function_bind(
-          &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
-      denoising.functions.write_feature = function_bind(
-          &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
-      denoising.functions.detect_outliers = function_bind(
-          &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);
-
-      denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
-      denoising.render_buffer.samples = total_samples;
-      denoising.buffer.gpu_temporary_mem = true;
-
-      denoising.run_denoising(&rtile);
+      CUDADevice::denoise(rtile, denoising);
     }
 
-    // Update current sample, so it is displayed correctly
-    rtile.sample = total_samples;
     // Update task progress after the denoiser completed processing
     task.update_progress(&rtile, rtile.w * rtile.h);
 
@@ -1028,7 +947,7 @@ class OptiXDevice : public Device {
     if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
       rgen_index = PG_DISP;
 
-    const CUDAContextScope scope(cuda_context);
+    const CUDAContextScope scope(cuContext);
 
     device_ptr launch_params_ptr = launch_params.device_pointer +
                                    thread_index * launch_params.data_elements;
@@ -1075,62 +994,13 @@ class OptiXDevice : public Device {
     }
   }
 
-  void launch_film_convert(DeviceTask &task, int thread_index)
-  {
-    const CUDAContextScope scope(cuda_context);
-
-    CUfunction film_convert_func;
-    check_result_cuda(cuModuleGetFunction(&film_convert_func,
-                                          cuda_module,
-                                          task.rgba_byte ? "kernel_cuda_convert_to_byte" :
-                                                           "kernel_cuda_convert_to_half_float"));
-
-    float sample_scale = 1.0f / (task.sample + 1);
-    CUdeviceptr rgba = (task.rgba_byte ? task.rgba_byte : task.rgba_half);
-
-    void *args[] = {&rgba,
-                    &task.buffer,
-                    &sample_scale,
-                    &task.x,
-                    &task.y,
-                    &task.w,
-                    &task.h,
-                    &task.offset,
-                    &task.stride};
-
-    int threads_per_block;
-    check_result_cuda(cuFuncGetAttribute(
-        &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func));
-
-    const int num_threads_x = (int)sqrt(threads_per_block);
-    const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x;
-    const int num_threads_y = (int)sqrt(threads_per_block);
-    const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y;
-
-    check_result_cuda(cuLaunchKernel(film_convert_func,
-                                     num_blocks_x,
-                                     num_blocks_y,
-                                     1, /* blocks */
-                                     num_threads_x,
-                                     num_threads_y,
-                                     1, /* threads */
-                                     0,
-                                     cuda_stream[thread_index],
-                                     args,
-                                     0));
-
-    check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    task.update_progress(NULL);
-  }
-
   bool build_optix_bvh(const OptixBuildInput &build_input,
                        uint16_t num_motion_steps,
                        OptixTraversableHandle &out_handle)
   {
     out_handle = 0;
 
-    const CUDAContextScope scope(cuda_context);
+    const CUDAContextScope scope(cuContext);
 
     // Compute memory usage
     OptixAccelBufferSizes sizes = {};
@@ -1232,8 +1102,8 @@ class OptiXDevice : public Device {
     assert(bvh->params.top_level);
 
     unsigned int num_instances = 0;
-    unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
-    meshes.reserve(bvh->meshes.size());
+    unordered_map<Geometry *, OptixTraversableHandle> geometry;
+    geometry.reserve(bvh->geometry.size());
 
     // Free all previous acceleration structures
     for (CUdeviceptr mem : as_mem) {
@@ -1244,23 +1114,25 @@ class OptiXDevice : public Device {
     // Build bottom level acceleration structures (BLAS)
     // Note: Always keep this logic in sync with bvh_optix.cpp!
     for (Object *ob : bvh->objects) {
-      // Skip meshes for which acceleration structure already exists
-      if (meshes.find(ob->mesh) != meshes.end())
+      // Skip geometry for which acceleration structure already exists
+      Geometry *geom = ob->geometry;
+      if (geometry.find(geom) != geometry.end())
         continue;
 
-      Mesh *const mesh = ob->mesh;
-      vector<OptixTraversableHandle> handles;
-      handles.reserve(2);
+      if (geom->type == Geometry::HAIR) {
+        // Build BLAS for curve primitives
+        Hair *const hair = static_cast<Hair *const>(ob->geometry);
+        if (hair->num_curves() == 0) {
+          continue;
+        }
 
-      // Build BLAS for curve primitives
-      if (bvh->params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
-        const size_t num_curves = mesh->num_curves();
-        const size_t num_segments = mesh->num_segments();
+        const size_t num_curves = hair->num_curves();
+        const size_t num_segments = hair->num_segments();
 
         size_t num_motion_steps = 1;
-        Attribute *motion_keys = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-        if (motion_blur && mesh->use_motion_blur && motion_keys) {
-          num_motion_steps = mesh->motion_steps;
+        Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+        if (motion_blur && hair->use_motion_blur && motion_keys) {
+          num_motion_steps = hair->motion_steps;
         }
 
         device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
@@ -1269,21 +1141,21 @@ class OptiXDevice : public Device {
         // Get AABBs for each motion step
         for (size_t step = 0; step < num_motion_steps; ++step) {
           // The center step for motion vertices is not stored in the attribute
-          const float3 *keys = mesh->curve_keys.data();
+          const float3 *keys = hair->curve_keys.data();
           size_t center_step = (num_motion_steps - 1) / 2;
           if (step != center_step) {
             size_t attr_offset = (step > center_step) ? step - 1 : step;
             // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
-            keys = motion_keys->data_float3() + attr_offset * mesh->curve_keys.size();
+            keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size();
           }
 
           size_t i = step * num_segments;
           for (size_t j = 0; j < num_curves; ++j) {
-            const Mesh::Curve c = mesh->get_curve(j);
+            const Hair::Curve c = hair->get_curve(j);
 
             for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
               BoundBox bounds = BoundBox::empty;
-              c.bounds_grow(k, keys, mesh->curve_radius.data(), bounds);
+              c.bounds_grow(k, keys, hair->curve_radius.data(), bounds);
 
               aabb_data[i].minX = bounds.min.x;
               aabb_data[i].minY = bounds.min.y;
@@ -1314,16 +1186,24 @@ class OptiXDevice : public Device {
         build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
         build_input.aabbArray.flags = &build_flags;
         build_input.aabbArray.numSbtRecords = 1;
-        build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
+        build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
 
         // Allocate memory for new BLAS and build it
-        handles.emplace_back();
-        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
+        OptixTraversableHandle handle;
+        if (build_optix_bvh(build_input, num_motion_steps, handle)) {
+          geometry.insert({ob->geometry, handle});
+        }
+        else {
           return false;
+        }
       }
+      else if (geom->type == Geometry::MESH) {
+        // Build BLAS for triangle primitives
+        Mesh *const mesh = static_cast<Mesh *const>(ob->geometry);
+        if (mesh->num_triangles() == 0) {
+          continue;
+        }
 
-      // Build BLAS for triangle primitives
-      if (bvh->params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
         const size_t num_verts = mesh->verts.size();
 
         size_t num_motion_steps = 1;
@@ -1378,23 +1258,24 @@ class OptiXDevice : public Device {
         // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
         // one and rely on that having the same meaning in this case.
         build_input.triangleArray.numSbtRecords = 1;
-        // Triangle primitives are packed right after the curve primitives of this mesh
-        build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
+        build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
 
         // Allocate memory for new BLAS and build it
-        handles.emplace_back();
-        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
+        OptixTraversableHandle handle;
+        if (build_optix_bvh(build_input, num_motion_steps, handle)) {
+          geometry.insert({ob->geometry, handle});
+        }
+        else {
           return false;
+        }
       }
-
-      meshes.insert({mesh, handles});
     }
 
     // Fill instance descriptions
     device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
-    aabbs.alloc(bvh->objects.size() * 2);
+    aabbs.alloc(bvh->objects.size());
     device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
-    instances.alloc(bvh->objects.size() * 2);
+    instances.alloc(bvh->objects.size());
 
     for (Object *ob : bvh->objects) {
       // Skip non-traceable objects
@@ -1402,113 +1283,117 @@ class OptiXDevice : public Device {
         continue;
 
-      // Create separate instance for triangle/curve meshes of an object
+      // Create an instance for the BLAS of this object's geometry (skip if none was built)
-      for (OptixTraversableHandle handle : meshes[ob->mesh]) {
-        OptixAabb &aabb = aabbs[num_instances];
-        aabb.minX = ob->bounds.min.x;
-        aabb.minY = ob->bounds.min.y;
-        aabb.minZ = ob->bounds.min.z;
-        aabb.maxX = ob->bounds.max.x;
-        aabb.maxY = ob->bounds.max.y;
-        aabb.maxZ = ob->bounds.max.z;
-
-        OptixInstance &instance = instances[num_instances++];
-        memset(&instance, 0, sizeof(instance));
-
-        // Clear transform to identity matrix
-        instance.transform[0] = 1.0f;
-        instance.transform[5] = 1.0f;
-        instance.transform[10] = 1.0f;
-
-        // Set user instance ID to object index
-        instance.instanceId = ob->get_device_index();
-
-        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
-        // See 'scene_intersect_volume' in bvh.h
-        instance.visibilityMask = (ob->mesh->has_volume ? 3 : 1);
-
-        // Insert motion traversable if object has motion
-        if (motion_blur && ob->use_motion()) {
-          size_t motion_keys = max(ob->motion.size(), 2) - 2;
-          size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
-                                         motion_keys * sizeof(OptixSRTData);
-
-          const CUDAContextScope scope(cuda_context);
-
-          CUdeviceptr motion_transform_gpu = 0;
-          check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
-          as_mem.push_back(motion_transform_gpu);
-
-          // Allocate host side memory for motion transform and fill it with transform data
-          OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
-              new uint8_t[motion_transform_size]);
-          motion_transform.child = handle;
-          motion_transform.motionOptions.numKeys = ob->motion.size();
-          motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
-          motion_transform.motionOptions.timeBegin = 0.0f;
-          motion_transform.motionOptions.timeEnd = 1.0f;
-
-          OptixSRTData *const srt_data = motion_transform.srtData;
-          array<DecomposedTransform> decomp(ob->motion.size());
-          transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
-
-          for (size_t i = 0; i < ob->motion.size(); ++i) {
-            // Scale
-            srt_data[i].sx = decomp[i].y.w;  // scale.x.x
-            srt_data[i].sy = decomp[i].z.w;  // scale.y.y
-            srt_data[i].sz = decomp[i].w.w;  // scale.z.z
-
-            // Shear
-            srt_data[i].a = decomp[i].z.x;  // scale.x.y
-            srt_data[i].b = decomp[i].z.y;  // scale.x.z
-            srt_data[i].c = decomp[i].w.x;  // scale.y.z
-            assert(decomp[i].z.z == 0.0f);  // scale.y.x
-            assert(decomp[i].w.y == 0.0f);  // scale.z.x
-            assert(decomp[i].w.z == 0.0f);  // scale.z.y
-
-            // Pivot point
-            srt_data[i].pvx = 0.0f;
-            srt_data[i].pvy = 0.0f;
-            srt_data[i].pvz = 0.0f;
-
-            // Rotation
-            srt_data[i].qx = decomp[i].x.x;
-            srt_data[i].qy = decomp[i].x.y;
-            srt_data[i].qz = decomp[i].x.z;
-            srt_data[i].qw = decomp[i].x.w;
-
-            // Translation
-            srt_data[i].tx = decomp[i].y.x;
-            srt_data[i].ty = decomp[i].y.y;
-            srt_data[i].tz = decomp[i].y.z;
-          }
+      auto handle_it = geometry.find(ob->geometry);
+      if (handle_it == geometry.end()) {
+        continue;
+      }
+      OptixTraversableHandle handle = handle_it->second;
+
+      OptixAabb &aabb = aabbs[num_instances];
+      aabb.minX = ob->bounds.min.x;
+      aabb.minY = ob->bounds.min.y;
+      aabb.minZ = ob->bounds.min.z;
+      aabb.maxX = ob->bounds.max.x;
+      aabb.maxY = ob->bounds.max.y;
+      aabb.maxZ = ob->bounds.max.z;
+
+      OptixInstance &instance = instances[num_instances++];
+      memset(&instance, 0, sizeof(instance));
+
+      // Clear transform to identity matrix
+      instance.transform[0] = 1.0f;
+      instance.transform[5] = 1.0f;
+      instance.transform[10] = 1.0f;
+
+      // Set user instance ID to object index
+      instance.instanceId = ob->get_device_index();
+
+      // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+      // See 'scene_intersect_volume' in bvh.h
+      instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1);
+
+      // Insert motion traversable if object has motion
+      if (motion_blur && ob->use_motion()) {
+        size_t motion_keys = max(ob->motion.size(), 2) - 2;
+        size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+                                       motion_keys * sizeof(OptixSRTData);
+
+        const CUDAContextScope scope(cuContext);
+
+        CUdeviceptr motion_transform_gpu = 0;
+        check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
+        as_mem.push_back(motion_transform_gpu);
+
+        // Allocate host side memory for motion transform and fill it with transform data
+        OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+            new uint8_t[motion_transform_size]);
+        motion_transform.child = handle;
+        motion_transform.motionOptions.numKeys = ob->motion.size();
+        motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+        motion_transform.motionOptions.timeBegin = 0.0f;
+        motion_transform.motionOptions.timeEnd = 1.0f;
+
+        OptixSRTData *const srt_data = motion_transform.srtData;
+        array<DecomposedTransform> decomp(ob->motion.size());
+        transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
+
+        for (size_t i = 0; i < ob->motion.size(); ++i) {
+          // Scale
+          srt_data[i].sx = decomp[i].y.w;  // scale.x.x
+          srt_data[i].sy = decomp[i].z.w;  // scale.y.y
+          srt_data[i].sz = decomp[i].w.w;  // scale.z.z
+
+          // Shear
+          srt_data[i].a = decomp[i].z.x;  // scale.x.y
+          srt_data[i].b = decomp[i].z.y;  // scale.x.z
+          srt_data[i].c = decomp[i].w.x;  // scale.y.z
+          assert(decomp[i].z.z == 0.0f);  // scale.y.x
+          assert(decomp[i].w.y == 0.0f);  // scale.z.x
+          assert(decomp[i].w.z == 0.0f);  // scale.z.y
+
+          // Pivot point
+          srt_data[i].pvx = 0.0f;
+          srt_data[i].pvy = 0.0f;
+          srt_data[i].pvz = 0.0f;
+
+          // Rotation
+          srt_data[i].qx = decomp[i].x.x;
+          srt_data[i].qy = decomp[i].x.y;
+          srt_data[i].qz = decomp[i].x.z;
+          srt_data[i].qw = decomp[i].x.w;
+
+          // Translation
+          srt_data[i].tx = decomp[i].y.x;
+          srt_data[i].ty = decomp[i].y.y;
+          srt_data[i].tz = decomp[i].y.z;
+        }
 
-          // Upload motion transform to GPU
-          cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
-          delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+        // Upload motion transform to GPU
+        cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+        delete[] reinterpret_cast<uint8_t *>(&motion_transform);
 
-          // Disable instance transform if object uses motion transform already
-          instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+        // Disable instance transform if object uses motion transform already
+        instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+        // Get traversable handle to motion transform
+        optixConvertPointerToTraversableHandle(context,
+                                               motion_transform_gpu,
+                                               OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+                                               &instance.traversableHandle);
+      }
+      else {
+        instance.traversableHandle = handle;
 
-          // Get traversable handle to motion transform
-          optixConvertPointerToTraversableHandle(context,
-                                                 motion_transform_gpu,
-                                                 OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
-                                                 &instance.traversableHandle);
+        if (ob->geometry->is_instanced()) {
+          // Set transform matrix
+          memcpy(instance.transform, &ob->tfm, sizeof(instance.transform));
         }
         else {
-          instance.traversableHandle = handle;
-
-          if (ob->mesh->is_instanced()) {
-            // Set transform matrix
-            memcpy(instance.transform, &ob->tfm, sizeof(instance.transform));
-          }
-          else {
-            // Disable instance transform if mesh already has it applied to vertex data
-            instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-            // Non-instanced objects read ID from prim_object, so
-            // distinguish them from instanced objects with high bit set
-            instance.instanceId |= 0x800000;
-          }
+          // Disable instance transform if geometry already has it applied to vertex data
+          instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+          // Non-instanced objects read ID from prim_object, so
+          // distinguish them from instanced objects with high bit set
+          instance.instanceId |= 0x800000;
         }
       }
     }
@@ -1530,655 +1415,76 @@ class OptiXDevice : public Device {
     return build_optix_bvh(build_input, 0, tlas_handle);
   }
 
-  void update_texture_info()
-  {
-    if (need_texture_info) {
-      texture_info.copy_to_device();
-      need_texture_info = false;
-    }
-  }
-
-  void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
+  void const_copy_to(const char *name, void *host, size_t size) override
   {
-    const CUDAContextScope scope(cuda_context);
-
-    for (int i = 0; i < info.cpu_threads; ++i)
-      check_result_cuda(
-          cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
-                       data,
-                       data_size));
-
     // Set constant memory for CUDA module
-    // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert').
+    // TODO(pmours): This is only used for tonemapping (see 'film_convert').
     //               Could be removed by moving those functions to filter CUDA module.
-    size_t bytes = 0;
-    CUdeviceptr mem = 0;
-    check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name));
-    assert(mem != 0 && bytes == data_size);
-    check_result_cuda(cuMemcpyHtoD(mem, data, data_size));
-  }
-
-  void mem_alloc(device_memory &mem) override
-  {
-    if (mem.type == MEM_PIXELS && !background) {
-      // Always fall back to no interop for now
-      // TODO(pmours): Support OpenGL interop when moving CUDA memory management to common code
-      background = true;
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      assert(!"mem_alloc not supported for textures.");
-      return;
-    }
-
-    generic_alloc(mem);
-  }
-
-  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
-  {
-    CUDAContextScope scope(cuda_context);
-
-    CUdeviceptr device_pointer = 0;
-    size_t size = mem.memory_size() + pitch_padding;
-
-    CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-    const char *status = "";
-
-    /* First try allocating in device memory, respecting headroom. We make
-     * an exception for texture info. It is small and frequently accessed,
-     * so treat it as working memory.
-     *
-     * If there is not enough room for working memory, we will try to move
-     * textures to host memory, assuming the performance impact would have
-     * been worse for working memory. */
-    bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-    bool is_image = is_texture && (mem.data_height > 1);
-
-    size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-    size_t total = 0, free = 0;
-    cuMemGetInfo(&free, &total);
+    CUDADevice::const_copy_to(name, host, size);
 
-    /* Move textures to host memory if needed. */
-    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-      move_textures_to_host(size + headroom - free, is_texture);
-      cuMemGetInfo(&free, &total);
-    }
-
-    /* Allocate in device memory. */
-    if (!move_texture_to_host && (size + headroom) < free) {
-      mem_alloc_result = cuMemAlloc(&device_pointer, size);
-      if (mem_alloc_result == CUDA_SUCCESS) {
-        status = " in device memory";
-      }
-    }
-
-    /* Fall back to mapped host memory if needed and possible. */
-    void *shared_pointer = 0;
-
-    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
-      if (mem.shared_pointer) {
-        /* Another device already allocated host memory. */
-        mem_alloc_result = CUDA_SUCCESS;
-        shared_pointer = mem.shared_pointer;
-      }
-      else if (map_host_used + size < map_host_limit) {
-        /* Allocate host memory ourselves. */
-        mem_alloc_result = cuMemHostAlloc(
-            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
-        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
-               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
-      }
-
-      if (mem_alloc_result == CUDA_SUCCESS) {
-        cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0);
-        map_host_used += size;
-        status = " in host memory";
-      }
-      else {
-        status = " failed, out of host memory";
-      }
-    }
-    else if (mem_alloc_result != CUDA_SUCCESS) {
-      status = " failed, out of device and host memory";
-    }
-
-    if (mem.name) {
-      VLOG(1) << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-    }
-
-    if (mem_alloc_result != CUDA_SUCCESS) {
-      set_error(string_printf("Buffer allocate %s", status));
-      return NULL;
-    }
-
-    mem.device_pointer = (device_ptr)device_pointer;
-    mem.device_size = size;
-    stats.mem_alloc(size);
-
-    if (!mem.device_pointer) {
-      return NULL;
-    }
-
-    /* Insert into map of allocations. */
-    CUDAMem *cmem = &cuda_mem_map[&mem];
-    if (shared_pointer != 0) {
-      /* Replace host pointer with our host allocation. Only works if
-       * CUDA memory layout is the same and has no pitch padding. Also
-       * does not work if we move textures to host during a render,
-       * since other devices might be using the memory. */
-
-      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-          mem.host_pointer != shared_pointer) {
-        memcpy(shared_pointer, mem.host_pointer, size);
-
-        /* A call to device_memory::host_free() should be preceded by
-         * a call to device_memory::device_free() for host memory
-         * allocated by a device to be handled properly. Two exceptions
-         * are here and a call in CUDADevice::generic_alloc(), where
-         * the current host memory can be assumed to be allocated by
-         * device_memory::host_alloc(), not by a device */
-
-        mem.host_free();
-        mem.host_pointer = shared_pointer;
-      }
-      mem.shared_pointer = shared_pointer;
-      mem.shared_counter++;
-      cmem->use_mapped_host = true;
-    }
-    else {
-      cmem->use_mapped_host = false;
-    }
-
-    return cmem;
-  }
+    if (strcmp(name, "__data") == 0) {
+      assert(size <= sizeof(KernelData));
 
-  void tex_alloc(device_memory &mem)
-  {
-    CUDAContextScope scope(cuda_context);
-
-    /* General variables for both architectures */
-    string bind_name = mem.name;
-    size_t dsize = datatype_size(mem.data_type);
-    size_t size = mem.memory_size();
-
-    CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-    switch (mem.extension) {
-      case EXTENSION_REPEAT:
-        address_mode = CU_TR_ADDRESS_MODE_WRAP;
-        break;
-      case EXTENSION_EXTEND:
-        address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-        break;
-      case EXTENSION_CLIP:
-        address_mode = CU_TR_ADDRESS_MODE_BORDER;
-        break;
-      default:
-        assert(0);
-        break;
-    }
+      // Multi-device: overwrite the scene handle in the host data with this device's TLAS handle
+      KernelData *const data = (KernelData *)host;
+      *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
 
-    CUfilter_mode filter_mode;
-    if (mem.interpolation == INTERPOLATION_CLOSEST) {
-      filter_mode = CU_TR_FILTER_MODE_POINT;
-    }
-    else {
-      filter_mode = CU_TR_FILTER_MODE_LINEAR;
+      update_launch_params(name, offsetof(KernelParams, data), host, size);
+      return;
     }
 
-    /* Data Storage */
-    if (mem.interpolation == INTERPOLATION_NONE) {
-      generic_alloc(mem);
-      generic_copy_to(mem);
-
-      // Update data storage pointers in launch parameters
+    // Update data storage pointers in launch parameters
 #  define KERNEL_TEX(data_type, tex_name) \
-    if (strcmp(mem.name, #tex_name) == 0) \
-      update_launch_params( \
-          mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr));
+    if (strcmp(name, #tex_name) == 0) { \
+      update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \
+      return; \
+    }
 #  include "kernel/kernel_textures.h"
 #  undef KERNEL_TEX
-      return;
-    }
-
-    /* Image Texture Storage */
-    CUarray_format_enum format;
-    switch (mem.data_type) {
-      case TYPE_UCHAR:
-        format = CU_AD_FORMAT_UNSIGNED_INT8;
-        break;
-      case TYPE_UINT16:
-        format = CU_AD_FORMAT_UNSIGNED_INT16;
-        break;
-      case TYPE_UINT:
-        format = CU_AD_FORMAT_UNSIGNED_INT32;
-        break;
-      case TYPE_INT:
-        format = CU_AD_FORMAT_SIGNED_INT32;
-        break;
-      case TYPE_FLOAT:
-        format = CU_AD_FORMAT_FLOAT;
-        break;
-      case TYPE_HALF:
-        format = CU_AD_FORMAT_HALF;
-        break;
-      default:
-        assert(0);
-        return;
-    }
-
-    CUDAMem *cmem = NULL;
-    CUarray array_3d = NULL;
-    size_t src_pitch = mem.data_width * dsize * mem.data_elements;
-    size_t dst_pitch = src_pitch;
-
-    if (mem.data_depth > 1) {
-      /* 3D texture using array, there is no API for linear memory. */
-      CUDA_ARRAY3D_DESCRIPTOR desc;
-
-      desc.Width = mem.data_width;
-      desc.Height = mem.data_height;
-      desc.Depth = mem.data_depth;
-      desc.Format = format;
-      desc.NumChannels = mem.data_elements;
-      desc.Flags = 0;
-
-      VLOG(1) << "Array 3D allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")";
-
-      check_result_cuda(cuArray3DCreate(&array_3d, &desc));
-
-      if (!array_3d) {
-        return;
-      }
-
-      CUDA_MEMCPY3D param;
-      memset(&param, 0, sizeof(param));
-      param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-      param.dstArray = array_3d;
-      param.srcMemoryType = CU_MEMORYTYPE_HOST;
-      param.srcHost = mem.host_pointer;
-      param.srcPitch = src_pitch;
-      param.WidthInBytes = param.srcPitch;
-      param.Height = mem.data_height;
-      param.Depth = mem.data_depth;
-
-      check_result_cuda(cuMemcpy3D(&param));
-
-      mem.device_pointer = (device_ptr)array_3d;
-      mem.device_size = size;
-      stats.mem_alloc(size);
-
-      cmem = &cuda_mem_map[&mem];
-      cmem->texobject = 0;
-      cmem->array = array_3d;
-    }
-    else if (mem.data_height > 0) {
-      /* 2D texture, using pitch aligned linear memory. */
-      int alignment = 0;
-      check_result_cuda(cuDeviceGetAttribute(
-          &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device));
-      dst_pitch = align_up(src_pitch, alignment);
-      size_t dst_size = dst_pitch * mem.data_height;
-
-      cmem = generic_alloc(mem, dst_size - mem.memory_size());
-      if (!cmem) {
-        return;
-      }
-
-      CUDA_MEMCPY2D param;
-      memset(&param, 0, sizeof(param));
-      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-      param.dstDevice = mem.device_pointer;
-      param.dstPitch = dst_pitch;
-      param.srcMemoryType = CU_MEMORYTYPE_HOST;
-      param.srcHost = mem.host_pointer;
-      param.srcPitch = src_pitch;
-      param.WidthInBytes = param.srcPitch;
-      param.Height = mem.data_height;
-
-      check_result_cuda(cuMemcpy2DUnaligned(&param));
-    }
-    else {
-      /* 1D texture, using linear memory. */
-      cmem = generic_alloc(mem);
-      if (!cmem) {
-        return;
-      }
-
-      check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
-    }
-
-    /* Kepler+, bindless textures. */
-    int flat_slot = 0;
-    if (string_startswith(mem.name, "__tex_image")) {
-      int pos = string(mem.name).rfind("_");
-      flat_slot = atoi(mem.name + pos + 1);
-    }
-    else {
-      assert(0);
-    }
-
-    CUDA_RESOURCE_DESC resDesc;
-    memset(&resDesc, 0, sizeof(resDesc));
-
-    if (array_3d) {
-      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-      resDesc.res.array.hArray = array_3d;
-      resDesc.flags = 0;
-    }
-    else if (mem.data_height > 0) {
-      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
-      resDesc.res.pitch2D.devPtr = mem.device_pointer;
-      resDesc.res.pitch2D.format = format;
-      resDesc.res.pitch2D.numChannels = mem.data_elements;
-      resDesc.res.pitch2D.height = mem.data_height;
-      resDesc.res.pitch2D.width = mem.data_width;
-      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
-    }
-    else {
-      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
-      resDesc.res.linear.devPtr = mem.device_pointer;
-      resDesc.res.linear.format = format;
-      resDesc.res.linear.numChannels = mem.data_elements;
-      resDesc.res.linear.sizeInBytes = mem.device_size;
-    }
-
-    CUDA_TEXTURE_DESC texDesc;
-    memset(&texDesc, 0, sizeof(texDesc));
-    texDesc.addressMode[0] = address_mode;
-    texDesc.addressMode[1] = address_mode;
-    texDesc.addressMode[2] = address_mode;
-    texDesc.filterMode = filter_mode;
-    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
-    check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
-    /* Resize once */
-    if (flat_slot >= texture_info.size()) {
-      /* Allocate some slots in advance, to reduce amount
-       * of re-allocations. */
-      texture_info.resize(flat_slot + 128);
-    }
-
-    /* Set Mapping and tag that we need to (re-)upload to device */
-    TextureInfo &info = texture_info[flat_slot];
-    info.data = (uint64_t)cmem->texobject;
-    info.cl_buffer = 0;
-    info.interpolation = mem.interpolation;
-    info.extension = mem.extension;
-    info.width = mem.data_width;
-    info.height = mem.data_height;
-    info.depth = mem.data_depth;
-    need_texture_info = true;
-  }
-
-  void mem_copy_to(device_memory &mem) override
-  {
-    if (mem.type == MEM_PIXELS) {
-      assert(!"mem_copy_to not supported for pixels.");
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
-      tex_alloc(mem);
-    }
-    else {
-      if (!mem.device_pointer) {
-        generic_alloc(mem);
-      }
-
-      generic_copy_to(mem);
-    }
-  }
-
-  void generic_copy_to(device_memory &mem)
-  {
-    if (mem.host_pointer && mem.device_pointer) {
-      CUDAContextScope scope(cuda_context);
-
-      /* If use_mapped_host of mem is false, the current device only
-       * uses device memory allocated by cuMemAlloc regardless of
-       * mem.host_pointer and mem.shared_pointer, and should copy
-       * data from mem.host_pointer. */
-
-      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
-        check_result_cuda(
-            cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-      }
-    }
-  }
-
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
-  {
-    if (mem.type == MEM_PIXELS && !background) {
-      assert(!"mem_copy_from not supported for pixels.");
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      assert(!"mem_copy_from not supported for textures.");
-    }
-    else {
-      // Calculate linear memory offset and size
-      const size_t size = elem * w * h;
-      const size_t offset = elem * y * w;
-
-      if (mem.host_pointer && mem.device_pointer) {
-        const CUDAContextScope scope(cuda_context);
-        check_result_cuda(cuMemcpyDtoH(
-            (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
-      }
-      else if (mem.host_pointer) {
-        memset((char *)mem.host_pointer + offset, 0, size);
-      }
-    }
-  }
-
-  void mem_zero(device_memory &mem) override
-  {
-    if (mem.host_pointer)
-      memset(mem.host_pointer, 0, mem.memory_size());
-
-    if (!mem.device_pointer)
-      mem_alloc(mem);  // Need to allocate memory first if it does not exist yet
-
-    /* If use_mapped_host of mem is false, mem.device_pointer currently
-     * refers to device memory regardless of mem.host_pointer and
-     * mem.shared_pointer. */
-
-    if (mem.device_pointer &&
-        (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer)) {
-      const CUDAContextScope scope(cuda_context);
-      check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
-    }
-  }
-
-  void mem_free(device_memory &mem) override
-  {
-    if (mem.type == MEM_PIXELS && !background) {
-      assert(!"mem_free not supported for pixels.");
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
-    }
-    else {
-      generic_free(mem);
-    }
-  }
-
-  void generic_free(device_memory &mem)
-  {
-    if (mem.device_pointer) {
-      CUDAContextScope scope(cuda_context);
-      const CUDAMem &cmem = cuda_mem_map[&mem];
-
-      /* If cmem.use_mapped_host is true, reference counting is used
-       * to safely free a mapped host memory. */
-
-      if (cmem.use_mapped_host) {
-        assert(mem.shared_pointer);
-        if (mem.shared_pointer) {
-          assert(mem.shared_counter > 0);
-          if (--mem.shared_counter == 0) {
-            if (mem.host_pointer == mem.shared_pointer) {
-              mem.host_pointer = 0;
-            }
-            cuMemFreeHost(mem.shared_pointer);
-            mem.shared_pointer = 0;
-          }
-        }
-        map_host_used -= mem.device_size;
-      }
-      else {
-        /* Free device memory. */
-        cuMemFree(mem.device_pointer);
-      }
-
-      stats.mem_free(mem.device_size);
-      mem.device_pointer = 0;
-      mem.device_size = 0;
-
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
-    }
   }
 
-  void tex_free(device_memory &mem)
+  void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
   {
-    if (mem.device_pointer) {
-      CUDAContextScope scope(cuda_context);
-      const CUDAMem &cmem = cuda_mem_map[&mem];
-
-      if (cmem.texobject) {
-        /* Free bindless texture. */
-        cuTexObjectDestroy(cmem.texobject);
-      }
+    const CUDAContextScope scope(cuContext);  // NOTE: 'name' is not used in this function body
 
-      if (cmem.array) {
-        /* Free array. */
-        cuArrayDestroy(cmem.array);
-        stats.mem_free(mem.device_size);
-        mem.device_pointer = 0;
-        mem.device_size = 0;
-
-        cuda_mem_map.erase(cuda_mem_map.find(&mem));
-      }
-      else {
-        generic_free(mem);
-      }
-    }
+    for (int i = 0; i < info.cpu_threads; ++i)  // Write into each thread's slot of launch_params
+      check_result_cuda(
+          cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
+                       data,
+                       data_size));
   }
 
-  void move_textures_to_host(size_t size, bool for_texture)
+  void task_add(DeviceTask &task) override
   {
-    /* Signal to reallocate textures in host memory only. */
-    move_texture_to_host = true;
-
-    while (size > 0) {
-      /* Find suitable memory allocation to move. */
-      device_memory *max_mem = NULL;
-      size_t max_size = 0;
-      bool max_is_image = false;
-
-      foreach (auto &pair, cuda_mem_map) {
-        device_memory &mem = *pair.first;
-        CUDAMem *cmem = &pair.second;
-
-        bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-        bool is_image = is_texture && (mem.data_height > 1);
-
-        /* Can't move this type of memory. */
-        if (!is_texture || cmem->array) {
-          continue;
-        }
-
-        /* Already in host memory. */
-        if (cmem->use_mapped_host) {
-          continue;
-        }
-
-        /* For other textures, only move image textures. */
-        if (for_texture && !is_image) {
-          continue;
-        }
-
-        /* Try to move largest allocation, prefer moving images. */
-        if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-          max_is_image = is_image;
-          max_size = mem.device_size;
-          max_mem = &mem;
-        }
+    struct OptiXDeviceTask : public DeviceTask {  // Task whose 'run' invokes thread_run()
+      OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
+      {
+        // Bind the explicit task index (not a thread index) into the run callback, since the
+        // number of CUDA streams may differ from the number of threads
+        run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
+      }
+    };
 
-      /* Move to host memory. This part is mutex protected since
-       * multiple CUDA devices could be moving the memory. The
-       * first one will do it, and the rest will adopt the pointer. */
-      if (max_mem) {
-        VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
-        static thread_mutex move_mutex;
-        thread_scoped_lock lock(move_mutex);
-
-        /* Preserve the original device pointer, in case of multi device
-         * we can't change it because the pointer mapping would break. */
-        device_ptr prev_pointer = max_mem->device_pointer;
-        size_t prev_size = max_mem->device_size;
-
-        tex_free(*max_mem);
-        tex_alloc(*max_mem);
-        size = (max_size >= size) ? 0 : size - max_size;
+    // Upload texture information to device if it has changed since last launch
+    load_texture_info();
 
-        max_mem->device_pointer = prev_pointer;
-        max_mem->device_size = prev_size;
-      }
-      else {
-        break;
-      }
+    if (task.type == DeviceTask::FILM_CONVERT) {
+      // Execute in main thread because of OpenGL access
+      film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+      return;
     }
 
-    /* Update texture info array with new pointers. */
-    update_texture_info();
-
-    move_texture_to_host = false;
-  }
-
-  void const_copy_to(const char *name, void *host, size_t size) override
-  {
-    if (strcmp(name, "__data") == 0) {
-      assert(size <= sizeof(KernelData));
-
-      // Fix traversable handle on multi devices
-      KernelData *const data = (KernelData *)host;
-      *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
-
-      update_launch_params(name, offsetof(KernelParams, data), host, size);
+    if (task.type == DeviceTask::DENOISE_BUFFER) {
+      // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
+      task_pool.push(new OptiXDeviceTask(this, task, 0));
+      return;
     }
-  }
-
-  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
-  {
-    return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
-  }
-
-  void task_add(DeviceTask &task) override
-  {
-    // Upload texture information to device if it has changed since last launch
-    update_texture_info();
 
     // Split task into smaller ones
     list<DeviceTask> tasks;
     task.split(tasks, info.cpu_threads);
 
     // Queue tasks in internal task pool
-    struct OptiXDeviceTask : public DeviceTask {
-      OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
-      {
-        // Using task index parameter instead of thread index, since number of CUDA streams may
-        // differ from number of threads
-        run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
-      }
-    };
-
     int task_index = 0;
     for (DeviceTask &task : tasks)
       task_pool.push(new OptiXDeviceTask(this, task, task_index++));
@@ -2195,403 +1501,6 @@ class OptiXDevice : public Device {
     // Cancel any remaining tasks in the internal pool
     task_pool.cancel();
   }
-
-  bool denoising_non_local_means(device_ptr image_ptr,
-                                 device_ptr guide_ptr,
-                                 device_ptr variance_ptr,
-                                 device_ptr out_ptr,
-                                 DenoisingTask *task,
-                                 int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    int stride = task->buffer.stride;
-    int w = task->buffer.width;
-    int h = task->buffer.h;
-    int r = task->nlm_state.r;
-    int f = task->nlm_state.f;
-    float a = task->nlm_state.a;
-    float k_2 = task->nlm_state.k_2;
-
-    int pass_stride = task->buffer.pass_stride;
-    int num_shifts = (2 * r + 1) * (2 * r + 1);
-    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-    int frame_offset = 0;
-
-    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
-    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-    CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
-    CUdeviceptr scale_ptr = 0;
-
-    check_result_cuda_ret(
-        cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
-    check_result_cuda_ret(
-        cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
-
-    {
-      CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
-      check_result_cuda_ret(
-          cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output"));
-
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
-      CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
-      void *calc_difference_args[] = {&guide_ptr,
-                                      &variance_ptr,
-                                      &scale_ptr,
-                                      &difference,
-                                      &w,
-                                      &h,
-                                      &stride,
-                                      &pass_stride,
-                                      &r,
-                                      &channel_offset,
-                                      &frame_offset,
-                                      &a,
-                                      &k_2};
-      void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-      void *calc_weight_args[] = {
-          &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-      void *update_output_args[] = {&blurDifference,
-                                    &image_ptr,
-                                    &out_ptr,
-                                    &weightAccum,
-                                    &w,
-                                    &h,
-                                    &stride,
-                                    &pass_stride,
-                                    &channel_offset,
-                                    &r,
-                                    &f};
-
-      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
-    }
-
-    {
-      CUfunction cuNLMNormalize;
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize"));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
-      void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
-      CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
-      CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-      check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-    }
-
-    return !have_error();
-  }
-
-  bool denoising_construct_transform(DenoisingTask *task, int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterConstructTransform;
-    check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform,
-                                              cuda_filter_module,
-                                              "kernel_cuda_filter_construct_transform"));
-    check_result_cuda_ret(
-        cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
-    CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
-    void *args[] = {&task->buffer.mem.device_pointer,
-                    &task->tile_info_mem.device_pointer,
-                    &task->storage.transform.device_pointer,
-                    &task->storage.rank.device_pointer,
-                    &task->filter_area,
-                    &task->rect,
-                    &task->radius,
-                    &task->pca_threshold,
-                    &task->buffer.pass_stride,
-                    &task->buffer.frame_stride,
-                    &task->buffer.use_time};
-    CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
-    check_result_cuda_ret(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  bool denoising_accumulate(device_ptr color_ptr,
-                            device_ptr color_variance_ptr,
-                            device_ptr scale_ptr,
-                            int frame,
-                            DenoisingTask *task,
-                            int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    int r = task->radius;
-    int f = 4;
-    float a = 1.0f;
-    float k_2 = task->nlm_k_2;
-
-    int w = task->reconstruction_state.source_w;
-    int h = task->reconstruction_state.source_h;
-    int stride = task->buffer.stride;
-    int frame_offset = frame * task->buffer.frame_stride;
-    int t = task->tile_info->frames[frame];
-
-    int pass_stride = task->buffer.pass_stride;
-    int num_shifts = (2 * r + 1) * (2 * r + 1);
-
-    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
-    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
-    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
-    check_result_cuda_ret(
-        cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian"));
-
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
-    check_result_cuda_ret(
-        cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
-    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
-                          task->reconstruction_state.source_w *
-                              task->reconstruction_state.source_h,
-                          num_shifts);
-
-    void *calc_difference_args[] = {&color_ptr,
-                                    &color_variance_ptr,
-                                    &scale_ptr,
-                                    &difference,
-                                    &w,
-                                    &h,
-                                    &stride,
-                                    &pass_stride,
-                                    &r,
-                                    &pass_stride,
-                                    &frame_offset,
-                                    &a,
-                                    &k_2};
-    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-    void *calc_weight_args[] = {
-        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-    void *construct_gramian_args[] = {&t,
-                                      &blurDifference,
-                                      &task->buffer.mem.device_pointer,
-                                      &task->storage.transform.device_pointer,
-                                      &task->storage.rank.device_pointer,
-                                      &task->storage.XtWX.device_pointer,
-                                      &task->storage.XtWY.device_pointer,
-                                      &task->reconstruction_state.filter_window,
-                                      &w,
-                                      &h,
-                                      &stride,
-                                      &pass_stride,
-                                      &r,
-                                      &f,
-                                      &frame_offset,
-                                      &task->buffer.use_time};
-
-    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
-    check_result_cuda_ret(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFinalize;
-    check_result_cuda_ret(
-        cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
-    void *finalize_args[] = {&output_ptr,
-                             &task->storage.rank.device_pointer,
-                             &task->storage.XtWX.device_pointer,
-                             &task->storage.XtWY.device_pointer,
-                             &task->filter_area,
-                             &task->reconstruction_state.buffer_params.x,
-                             &task->render_buffer.samples};
-    CUDA_GET_BLOCKSIZE(
-        cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
-    CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_combine_halves(device_ptr a_ptr,
-                                device_ptr b_ptr,
-                                device_ptr mean_ptr,
-                                device_ptr variance_ptr,
-                                int r,
-                                int4 rect,
-                                DenoisingTask *task,
-                                int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterCombineHalves;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
-    CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_divide_shadow(device_ptr a_ptr,
-                               device_ptr b_ptr,
-                               device_ptr sample_variance_ptr,
-                               device_ptr sv_variance_ptr,
-                               device_ptr buffer_variance_ptr,
-                               DenoisingTask *task,
-                               int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterDivideShadow;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&task->render_buffer.samples,
-                    &task->tile_info_mem.device_pointer,
-                    &a_ptr,
-                    &b_ptr,
-                    &sample_variance_ptr,
-                    &sv_variance_ptr,
-                    &buffer_variance_ptr,
-                    &task->rect,
-                    &task->render_buffer.pass_stride,
-                    &task->render_buffer.offset};
-    CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_get_feature(int mean_offset,
-                             int variance_offset,
-                             device_ptr mean_ptr,
-                             device_ptr variance_ptr,
-                             float scale,
-                             DenoisingTask *task,
-                             int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterGetFeature;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&task->render_buffer.samples,
-                    &task->tile_info_mem.device_pointer,
-                    &mean_offset,
-                    &variance_offset,
-                    &mean_ptr,
-                    &variance_ptr,
-                    &scale,
-                    &task->rect,
-                    &task->render_buffer.pass_stride,
-                    &task->render_buffer.offset};
-    CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_write_feature(int out_offset,
-                               device_ptr from_ptr,
-                               device_ptr buffer_ptr,
-                               DenoisingTask *task,
-                               int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterWriteFeature;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
-    void *args[] = {&task->render_buffer.samples,
-                    &task->reconstruction_state.buffer_params,
-                    &task->filter_area,
-                    &from_ptr,
-                    &buffer_ptr,
-                    &out_offset,
-                    &task->rect};
-    CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_detect_outliers(device_ptr image_ptr,
-                                 device_ptr variance_ptr,
-                                 device_ptr depth_ptr,
-                                 device_ptr output_ptr,
-                                 DenoisingTask *task,
-                                 int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterDetectOutliers;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&image_ptr,
-                    &variance_ptr,
-                    &depth_ptr,
-                    &output_ptr,
-                    &task->rect,
-                    &task->buffer.pass_stride};
-
-    CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
 };
 
 bool device_optix_init()
@@ -2603,14 +1512,6 @@ bool device_optix_init()
   if (!device_cuda_init())
     return false;
 
-#  ifdef WITH_CUDA_DYNLOAD
-  // Load NVRTC function pointers for adaptive kernel compilation
-  if (DebugFlags().cuda.adaptive_compile && cuewInit(CUEW_INIT_NVRTC) != CUEW_SUCCESS) {
-    VLOG(1)
-        << "CUEW initialization failed for NVRTC. Adaptive kernel compilation won't be available.";
-  }
-#  endif
-
   const OptixResult result = optixInit();
 
   if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
@@ -2657,7 +1558,7 @@ void device_optix_info(vector<DeviceInfo> &devices)
     }
 
     // Only add devices with RTX support
-    if (rtcore_version == 0)
+    if (rtcore_version == 0 && !getenv("CYCLES_OPTIX_TEST"))
       it = cuda_devices.erase(it);
     else
       ++it;
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 42e597a34d7..f22d8761058 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device)
   kernel_next_iteration_setup = NULL;
   kernel_indirect_subsurface = NULL;
   kernel_buffer_update = NULL;
+  kernel_adaptive_stopping = NULL;
+  kernel_adaptive_filter_x = NULL;
+  kernel_adaptive_filter_y = NULL;
+  kernel_adaptive_adjust_samples = NULL;
 }
 
 DeviceSplitKernel::~DeviceSplitKernel()
@@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel()
   delete kernel_next_iteration_setup;
   delete kernel_indirect_subsurface;
   delete kernel_buffer_update;
+  delete kernel_adaptive_stopping;
+  delete kernel_adaptive_filter_x;
+  delete kernel_adaptive_filter_y;
+  delete kernel_adaptive_adjust_samples;
 }
 
 bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
@@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe
   LOAD_KERNEL(next_iteration_setup);
   LOAD_KERNEL(indirect_subsurface);
   LOAD_KERNEL(buffer_update);
+  LOAD_KERNEL(adaptive_stopping);
+  LOAD_KERNEL(adaptive_filter_x);
+  LOAD_KERNEL(adaptive_filter_y);
+  LOAD_KERNEL(adaptive_adjust_samples);
 
 #undef LOAD_KERNEL
 
@@ -202,13 +214,21 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
     /* initial guess to start rolling average */
     const int initial_num_samples = 1;
     /* approx number of samples per second */
-    int samples_per_second = (avg_time_per_sample > 0.0) ?
-                                 int(double(time_multiplier) / avg_time_per_sample) + 1 :
-                                 initial_num_samples;
+    const int samples_per_second = (avg_time_per_sample > 0.0) ?
+                                       int(double(time_multiplier) / avg_time_per_sample) + 1 :
+                                       initial_num_samples;
 
     RenderTile subtile = tile;
     subtile.start_sample = tile.sample;
-    subtile.num_samples = min(samples_per_second,
+    subtile.num_samples = samples_per_second;
+
+    if (task->adaptive_sampling.use) {
+      subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample,
+                                                                          subtile.num_samples);
+    }
+
+    /* Don't go beyond requested number of samples. */
+    subtile.num_samples = min(subtile.num_samples,
                               tile.start_sample + tile.num_samples - tile.sample);
 
     if (device->have_error()) {
@@ -302,6 +322,23 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
       }
     }
 
+    int filter_sample = tile.sample + subtile.num_samples - 1;
+    if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) {
+      size_t buffer_size[2];
+      buffer_size[0] = round_up(tile.w, local_size[0]);
+      buffer_size[1] = round_up(tile.h, local_size[1]);
+      kernel_adaptive_stopping->enqueue(
+          KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+      buffer_size[0] = round_up(tile.h, local_size[0]);
+      buffer_size[1] = round_up(1, local_size[1]);
+      kernel_adaptive_filter_x->enqueue(
+          KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+      buffer_size[0] = round_up(tile.w, local_size[0]);
+      buffer_size[1] = round_up(1, local_size[1]);
+      kernel_adaptive_filter_y->enqueue(
+          KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+    }
+
     double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
 
     if (avg_time_per_sample == 0.0) {
@@ -324,6 +361,28 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
     }
   }
 
+  if (task->adaptive_sampling.use) {
+    /* Reset the start samples. */
+    RenderTile subtile = tile;
+    subtile.start_sample = tile.start_sample;
+    subtile.num_samples = tile.sample - tile.start_sample;
+    enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+                                   subtile,
+                                   num_global_elements,
+                                   kgbuffer,
+                                   kernel_data,
+                                   split_data,
+                                   ray_state,
+                                   queue_index,
+                                   use_queues_flag,
+                                   work_pool_wgs);
+    size_t buffer_size[2];
+    buffer_size[0] = round_up(tile.w, local_size[0]);
+    buffer_size[1] = round_up(tile.h, local_size[1]);
+    kernel_adaptive_adjust_samples->enqueue(
+        KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+  }
+
   return true;
 }
 
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 6ff326bf214..9d6b9efdd62 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -75,6 +75,10 @@ class DeviceSplitKernel {
   SplitKernelFunction *kernel_next_iteration_setup;
   SplitKernelFunction *kernel_indirect_subsurface;
   SplitKernelFunction *kernel_buffer_update;
+  SplitKernelFunction *kernel_adaptive_stopping;
+  SplitKernelFunction *kernel_adaptive_filter_x;
+  SplitKernelFunction *kernel_adaptive_filter_y;
+  SplitKernelFunction *kernel_adaptive_adjust_samples;
 
   /* Global memory variables [porting]; These memory is used for
    * co-operation between different kernels; Data written by one
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 376ad06a734..c36b1344c3b 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -115,7 +115,7 @@ void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size)
 
 void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
 {
-  if ((type != RENDER) && (type != SHADER))
+  if (type == FILM_CONVERT)
     return;
 
   if (update_progress_sample) {
@@ -136,4 +136,59 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
   }
 }
 
+/* Adaptive Sampling */
+
+AdaptiveSampling::AdaptiveSampling()
+    : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0)
+{
+}
+
+/* Render samples in steps that align with the adaptive filtering. */
+int AdaptiveSampling::align_static_samples(int samples) const
+{
+  if (samples > adaptive_step) {
+    /* Make multiple of adaptive_step. */
+    while (samples % adaptive_step != 0) {
+      samples--;
+    }
+  }
+  else if (samples < adaptive_step) {
+    /* Make divisor of adaptive_step. */
+    while (adaptive_step % samples != 0) {
+      samples--;
+    }
+  }
+
+  return max(samples, 1);
+}
+
+/* Render samples in steps that align with the adaptive filtering, with the
+ * suggested number of samples dynamically changing. */
+int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
+{
+  /* Round so that we end up on multiples of adaptive_step. */
+  samples += offset;
+
+  if (samples > adaptive_step) {
+    /* Make multiple of adaptive_step. */
+    while (samples % adaptive_step != 0) {
+      samples--;
+    }
+  }
+
+  samples -= offset;
+
+  return max(samples, 1);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+  if (sample > min_samples) {
+    return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+  }
+  else {
+    return false;
+  }
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 1b1e97cdb10..8c4e682adb1 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -47,7 +47,7 @@ class DenoiseParams {
   int neighbor_frames;
   /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
   bool clamp_input;
-  /* Controls which passes the OptiX AI denoiser should use as input. */
+  /* Passes handed over to the OptiX denoiser (default to color + albedo). */
   int optix_input_passes;
 
   DenoiseParams()
@@ -58,13 +58,26 @@ class DenoiseParams {
     relative_pca = false;
     neighbor_frames = 2;
     clamp_input = true;
-    optix_input_passes = 1;
+    optix_input_passes = 2;
   }
 };
 
+class AdaptiveSampling {
+ public:
+  AdaptiveSampling();
+
+  int align_static_samples(int samples) const;
+  int align_dynamic_samples(int offset, int samples) const;
+  bool need_filter(int sample) const;
+
+  bool use;
+  int adaptive_step;
+  int min_samples;
+};
+
 class DeviceTask : public Task {
  public:
-  typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
+  typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
   Type type;
 
   int x, y, w, h;
@@ -81,7 +94,7 @@ class DeviceTask : public Task {
   int shader_filter;
   int shader_x, shader_w;
 
-  int passes_size;
+  RenderBuffers *buffers;
 
   explicit DeviceTask(Type type = RENDER);
 
@@ -90,7 +103,7 @@ class DeviceTask : public Task {
 
   void update_progress(RenderTile *rtile, int pixel_samples = -1);
 
-  function<bool(Device *device, RenderTile &)> acquire_tile;
+  function<bool(Device *device, RenderTile &, uint)> acquire_tile;
   function<void(long, int)> update_progress_sample;
   function<void(RenderTile &)> update_tile_sample;
   function<void(RenderTile &)> release_tile;
@@ -98,6 +111,7 @@ class DeviceTask : public Task {
   function<void(RenderTile *, Device *)> map_neighbor_tiles;
   function<void(RenderTile *, Device *)> unmap_neighbor_tiles;
 
+  uint tile_types;
   DenoiseParams denoising;
   bool denoising_from_render;
   vector<int> denoising_frames;
@@ -114,7 +128,7 @@ class DeviceTask : public Task {
 
   bool need_finish_queue;
   bool integrator_branched;
-  int2 requested_tile_size;
+  AdaptiveSampling adaptive_sampling;
 
  protected:
   double last_update_time;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 61b1e3e3b6b..d6f4fb43061 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
@@ -88,9 +88,12 @@ class OpenCLInfo {
   static bool device_supported(const string &platform_name, const cl_device_id device_id);
   static bool platform_version_check(cl_platform_id platform, string *error = NULL);
   static bool device_version_check(cl_device_id device, string *error = NULL);
+  static bool get_device_version(cl_device_id device,
+                                 int *r_major,
+                                 int *r_minor,
+                                 string *error = NULL);
   static string get_hardware_id(const string &platform_name, cl_device_id device_id);
-  static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
-                                 bool force_all = false);
+  static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
 
   /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
 
@@ -428,8 +431,10 @@ class OpenCLDevice : public Device {
   int mem_sub_ptr_alignment();
 
   void const_copy_to(const char *name, void *host, size_t size);
-  void tex_alloc(device_memory &mem);
-  void tex_free(device_memory &mem);
+  void global_alloc(device_memory &mem);
+  void global_free(device_memory &mem);
+  void tex_alloc(device_texture &mem);
+  void tex_free(device_texture &mem);
 
   size_t global_size_round_up(int group_size, int global_size);
   void enqueue_kernel(cl_kernel kernel,
@@ -445,6 +450,7 @@ class OpenCLDevice : public Device {
                     device_ptr rgba_byte,
                     device_ptr rgba_half);
   void shader(DeviceTask &task);
+  void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
 
   void denoise(RenderTile &tile, DenoisingTask &denoising);
 
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index 76f9ce7a18f..2766f85d17c 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -16,7 +16,7 @@
 
 #ifdef WITH_OPENCL
 
-#  include "device/opencl/opencl.h"
+#  include "device/opencl/device_opencl.h"
 
 #  include "kernel/kernel_types.h"
 #  include "kernel/split/kernel_split_data_types.h"
@@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS =
     "enqueue_inactive "
     "next_iteration_setup "
     "indirect_subsurface "
-    "buffer_update";
+    "buffer_update "
+    "adaptive_stopping "
+    "adaptive_filter_x "
+    "adaptive_filter_y "
+    "adaptive_adjust_samples";
 
 const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
 {
@@ -253,16 +257,16 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
 
     /* Ordered with most complex kernels first, to reduce overall compile time. */
     ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
+    ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
+    ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
     if (requested_features.use_volume || is_preview) {
       ADD_SPLIT_KERNEL_PROGRAM(do_volume);
     }
+    ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
+    ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
+    ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
     ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
     ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
-    ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
-    ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
-    ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
-    ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
-    ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
 
     /* Quick kernels bundled in a single program to reduce overhead of starting
      * Blender processes. */
@@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
+    ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
+    ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
+    ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
+    ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
     programs.push_back(&program_split);
 
 #  undef ADD_SPLIT_KERNEL_PROGRAM
@@ -605,7 +613,7 @@ OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, b
       kernel_programs(this),
       preview_programs(this),
       memory_manager(this),
-      texture_info(this, "__texture_info", MEM_TEXTURE)
+      texture_info(this, "__texture_info", MEM_GLOBAL)
 {
   cpPlatform = NULL;
   cdDevice = NULL;
@@ -937,7 +945,7 @@ void OpenCLDevice::mem_alloc(device_memory &mem)
   cl_mem_flags mem_flag;
   void *mem_ptr = NULL;
 
-  if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+  if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
     mem_flag = CL_MEM_READ_ONLY;
   else
     mem_flag = CL_MEM_READ_WRITE;
@@ -961,9 +969,13 @@ void OpenCLDevice::mem_alloc(device_memory &mem)
 
 void OpenCLDevice::mem_copy_to(device_memory &mem)
 {
-  if (mem.type == MEM_TEXTURE) {
-    tex_free(mem);
-    tex_alloc(mem);
+  if (mem.type == MEM_GLOBAL) {
+    global_free(mem);
+    global_alloc(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_free((device_texture &)mem);
+    tex_alloc((device_texture &)mem);
   }
   else {
     if (!mem.device_pointer) {
@@ -1069,8 +1081,11 @@ void OpenCLDevice::mem_zero(device_memory &mem)
 
 void OpenCLDevice::mem_free(device_memory &mem)
 {
-  if (mem.type == MEM_TEXTURE) {
-    tex_free(mem);
+  if (mem.type == MEM_GLOBAL) {
+    global_free(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_free((device_texture &)mem);
   }
   else {
     if (mem.device_pointer) {
@@ -1093,7 +1108,7 @@ int OpenCLDevice::mem_sub_ptr_alignment()
 device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
 {
   cl_mem_flags mem_flag;
-  if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+  if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
     mem_flag = CL_MEM_READ_ONLY;
   else
     mem_flag = CL_MEM_READ_WRITE;
@@ -1133,9 +1148,9 @@ void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
   data->copy_to_device();
 }
 
-void OpenCLDevice::tex_alloc(device_memory &mem)
+void OpenCLDevice::global_alloc(device_memory &mem)
 {
-  VLOG(1) << "Texture allocate: " << mem.name << ", "
+  VLOG(1) << "Global memory allocate: " << mem.name << ", "
           << string_human_readable_number(mem.memory_size()) << " bytes. ("
           << string_human_readable_size(mem.memory_size()) << ")";
 
@@ -1147,7 +1162,7 @@ void OpenCLDevice::tex_alloc(device_memory &mem)
   textures_need_update = true;
 }
 
-void OpenCLDevice::tex_free(device_memory &mem)
+void OpenCLDevice::global_free(device_memory &mem)
 {
   if (mem.device_pointer) {
     mem.device_pointer = 0;
@@ -1165,6 +1180,25 @@ void OpenCLDevice::tex_free(device_memory &mem)
   }
 }
 
+void OpenCLDevice::tex_alloc(device_texture &mem)
+{
+  VLOG(1) << "Texture allocate: " << mem.name << ", "
+          << string_human_readable_number(mem.memory_size()) << " bytes. ("
+          << string_human_readable_size(mem.memory_size()) << ")";
+
+  memory_manager.alloc(mem.name, mem);
+  /* Set the pointer to non-null to keep code that inspects its value from thinking it is
+   * unallocated. */
+  mem.device_pointer = 1;
+  textures[mem.name] = &mem;
+  textures_need_update = true;
+}
+
+void OpenCLDevice::tex_free(device_texture &mem)
+{
+  global_free(mem);
+}
+
 size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
 {
   int r = global_size % group_size;
@@ -1265,10 +1299,10 @@ void OpenCLDevice::flush_texture_buffers()
 
   foreach (TexturesMap::value_type &tex, textures) {
     string name = tex.first;
+    device_memory *mem = tex.second;
 
-    if (string_startswith(name, "__tex_image")) {
-      int pos = name.rfind("_");
-      int id = atoi(name.data() + pos + 1);
+    if (mem->type == MEM_TEXTURE) {
+      const uint id = ((device_texture *)mem)->slot;
       texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
       num_slots = max(num_slots, num_data_slots + id + 1);
     }
@@ -1281,22 +1315,20 @@ void OpenCLDevice::flush_texture_buffers()
 
   /* Fill in descriptors */
   foreach (texture_slot_t &slot, texture_slots) {
+    device_memory *mem = textures[slot.name];
     TextureInfo &info = texture_info[slot.slot];
 
     MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
-    info.data = desc.offset;
-    info.cl_buffer = desc.device_buffer;
-
-    if (string_startswith(slot.name, "__tex_image")) {
-      device_memory *mem = textures[slot.name];
 
-      info.width = mem->data_width;
-      info.height = mem->data_height;
-      info.depth = mem->data_depth;
-
-      info.interpolation = mem->interpolation;
-      info.extension = mem->extension;
+    if (mem->type == MEM_TEXTURE) {
+      info = ((device_texture *)mem)->info;
+    }
+    else {
+      memset(&info, 0, sizeof(TextureInfo));
     }
+
+    info.data = desc.offset;
+    info.cl_buffer = desc.device_buffer;
   }
 
   /* Force write of descriptors. */
@@ -1308,13 +1340,7 @@ void OpenCLDevice::thread_run(DeviceTask *task)
 {
   flush_texture_buffers();
 
-  if (task->type == DeviceTask::FILM_CONVERT) {
-    film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
-  }
-  else if (task->type == DeviceTask::SHADER) {
-    shader(*task);
-  }
-  else if (task->type == DeviceTask::RENDER) {
+  if (task->type == DeviceTask::RENDER) {
     RenderTile tile;
     DenoisingTask denoising(this, *task);
 
@@ -1323,7 +1349,7 @@ void OpenCLDevice::thread_run(DeviceTask *task)
     kgbuffer.alloc_to_device(1);
 
     /* Keep rendering tiles until done. */
-    while (task->acquire_tile(this, tile)) {
+    while (task->acquire_tile(this, tile, task->tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         assert(tile.task == RenderTile::PATH_TRACE);
         scoped_timer timer(&tile.buffers->render_time);
@@ -1352,6 +1378,30 @@ void OpenCLDevice::thread_run(DeviceTask *task)
 
     kgbuffer.free();
   }
+  else if (task->type == DeviceTask::SHADER) {
+    shader(*task);
+  }
+  else if (task->type == DeviceTask::FILM_CONVERT) {
+    film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+  }
+  else if (task->type == DeviceTask::DENOISE_BUFFER) {
+    RenderTile tile;
+    tile.x = task->x;
+    tile.y = task->y;
+    tile.w = task->w;
+    tile.h = task->h;
+    tile.buffer = task->buffer;
+    tile.sample = task->sample + task->num_samples;
+    tile.num_samples = task->num_samples;
+    tile.start_sample = task->sample;
+    tile.offset = task->offset;
+    tile.stride = task->stride;
+    tile.buffers = task->buffers;
+
+    DenoisingTask denoising(this, *task);
+    denoise(tile, denoising);
+    task->update_progress(&tile, tile.w * tile.h);
+  }
 }
 
 void OpenCLDevice::film_convert(DeviceTask &task,
@@ -1846,6 +1896,17 @@ string OpenCLDevice::kernel_build_options(const string *debug_src)
 {
   string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
 
+  /* Build with OpenCL 2.0 if available, this improves performance
+   * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
+   * Note that OpenCL selects the highest 1.x version by default,
+   * only for 2.0 do we need the explicit compiler flag. */
+  int version_major, version_minor;
+  if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
+    if (version_major >= 2) {
+      build_options += "-cl-std=CL2.0 ";
+    }
+  }
+
   if (platform_name == "NVIDIA CUDA") {
     build_options +=
         "-D__KERNEL_OPENCL_NVIDIA__ "
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
index 06d4746a86e..fedb3ea8c6a 100644
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -18,7 +18,7 @@
 
 #  include "util/util_foreach.h"
 
-#  include "device/opencl/opencl.h"
+#  include "device/opencl/device_opencl.h"
 #  include "device/opencl/memory_manager.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
index 2fbc97a0756..23624f837a6 100644
--- a/intern/cycles/device/opencl/memory_manager.h
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -19,8 +19,8 @@
 #include "device/device.h"
 
 #include "util/util_map.h"
-#include "util/util_vector.h"
 #include "util/util_string.h"
+#include "util/util_vector.h"
 
 #include "clew.h"
 
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 3eeff31f8c2..b8b07cf2947 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,15 +16,16 @@
 
 #ifdef WITH_OPENCL
 
-#  include "device/opencl/opencl.h"
 #  include "device/device_intern.h"
+#  include "device/opencl/device_opencl.h"
 
 #  include "util/util_debug.h"
 #  include "util/util_logging.h"
 #  include "util/util_md5.h"
 #  include "util/util_path.h"
-#  include "util/util_time.h"
+#  include "util/util_semaphore.h"
 #  include "util/util_system.h"
+#  include "util/util_time.h"
 
 using std::cerr;
 using std::endl;
@@ -390,8 +391,27 @@ static void escape_python_string(string &str)
   string_replace(str, "'", "\'");
 }
 
+static int opencl_compile_process_limit()
+{
+  /* Limit number of concurrent processes compiling, with a heuristic based
+   * on total physical RAM and estimate of memory usage needed when compiling
+   * with all Cycles features enabled.
+   *
+   * This is somewhat arbitrary as we don't know the actual available RAM or
+   * how much the kernel compilation will need depending on the features, but
+   * better than not limiting at all. */
+  static const int64_t GB = 1024LL * 1024LL * 1024LL;
+  static const int64_t process_memory = 2 * GB;
+  static const int64_t base_memory = 2 * GB;
+  static const int64_t system_memory = system_physical_ram();
+  static const int64_t process_limit = (system_memory - base_memory) / process_memory;
+
+  return max((int)process_limit, 1);
+}
+
 bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
 {
+  /* Construct arguments. */
   vector<string> args;
   args.push_back("--background");
   args.push_back("--factory-startup");
@@ -419,14 +439,23 @@ bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
       kernel_file_escaped.c_str(),
       clbin_escaped.c_str()));
 
-  double starttime = time_dt();
+  /* Limit number of concurrent processes compiling. */
+  static thread_counting_semaphore semaphore(opencl_compile_process_limit());
+  semaphore.acquire();
+
+  /* Compile. */
+  const double starttime = time_dt();
   add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
   add_log(string("Build flags: ") + kernel_build_options, true);
-  if (!system_call_self(args) || !path_exists(clbin)) {
+  const bool success = system_call_self(args);
+  const double elapsed = time_dt() - starttime;
+
+  semaphore.release();
+
+  if (!success || !path_exists(clbin)) {
     return false;
   }
 
-  double elapsed = time_dt() - starttime;
   add_log(
       string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
       false);
@@ -747,6 +776,10 @@ bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_i
   }
   VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
 
+  if (getenv("CYCLES_OPENCL_TEST")) {
+    return true;
+  }
+
   /* It is possible to have Iris GPU on AMD/Apple OpenCL framework
    * (aka, it will not be on Intel framework). This isn't supported
    * and needs an explicit blacklist.
@@ -806,18 +839,30 @@ bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
   return true;
 }
 
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
+bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
 {
-  const int req_major = 1, req_minor = 1;
-  int major, minor;
   char version[256];
   clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
-  if (sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+  if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
     if (error != NULL) {
       *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
     }
     return false;
   }
+  if (error != NULL) {
+    *error = "";
+  }
+  return true;
+}
+
+bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
+{
+  const int req_major = 1, req_minor = 1;
+  int major, minor;
+  if (!get_device_version(device, &major, &minor, error)) {
+    return false;
+  }
+
   if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
     if (error != NULL) {
       *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
@@ -858,7 +903,7 @@ string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id dev
   return "";
 }
 
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all)
+void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
 {
   const cl_device_type device_type = OpenCLInfo::device_type();
   static bool first_time = true;
@@ -924,7 +969,7 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
         FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
         continue;
       }
-      if (force_all || device_supported(platform_name, device_id)) {
+      if (device_supported(platform_name, device_id)) {
         cl_device_type device_type;
         if (!get_device_type(device_id, &device_type, &error)) {
           FIRST_VLOG(2) << "Ignoring device " << device_name
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 4f79a7518dc..1439fb5a407 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -669,4 +669,14 @@ size_t Node::get_total_size_in_bytes() const
   return total_size;
 }
 
+bool Node::is_a(const NodeType *type_)
+{
+  for (const NodeType *base = type; base; base = base->base) {
+    if (base == type_) {
+      return true;
+    }
+  }
+  return false;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index d35a1bb489c..4473b8aca28 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -94,6 +94,9 @@ struct Node {
   /* Get total size of this node. */
   size_t get_total_size_in_bytes() const;
 
+  /* Type testing, taking into account base classes. */
+  bool is_a(const NodeType *type);
+
   ustring name;
   const NodeType *type;
 };
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index f46d4e48026..0283ed7c817 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -135,8 +135,13 @@ bool SocketType::is_float3(Type type)
 
 /* Node Type */
 
-NodeType::NodeType(Type type_) : type(type_)
+NodeType::NodeType(Type type, const NodeType *base) : type(type), base(base)
 {
+  if (base) {
+    /* Inherit sockets. */
+    inputs = base->inputs;
+    outputs = base->outputs;
+  }
 }
 
 NodeType::~NodeType()
@@ -209,7 +214,7 @@ unordered_map<ustring, NodeType, ustringHash> &NodeType::types()
   return _types;
 }
 
-NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_)
+NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_, const NodeType *base_)
 {
   ustring name(name_);
 
@@ -219,7 +224,7 @@ NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_)
     return NULL;
   }
 
-  types()[name] = NodeType(type_);
+  types()[name] = NodeType(type_, base_);
 
   NodeType *type = &types()[name];
   type->name = name;
diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h
index e9496a42658..a79d44b82f3 100644
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -103,7 +103,7 @@ struct SocketType {
 struct NodeType {
   enum Type { NONE, SHADER };
 
-  explicit NodeType(Type type = NONE);
+  explicit NodeType(Type type = NONE, const NodeType *base = NULL);
   ~NodeType();
 
   void register_input(ustring name,
@@ -124,11 +124,15 @@ struct NodeType {
 
   ustring name;
   Type type;
+  const NodeType *base;
   vector<SocketType, std::allocator<SocketType>> inputs;
   vector<SocketType, std::allocator<SocketType>> outputs;
   CreateFunc create;
 
-  static NodeType *add(const char *name, CreateFunc create, Type type = NONE);
+  static NodeType *add(const char *name,
+                       CreateFunc create,
+                       Type type = NONE,
+                       const NodeType *base = NULL);
   static const NodeType *find(ustring name);
   static unordered_map<ustring, NodeType, ustringHash> &types();
 };
@@ -148,6 +152,14 @@ struct NodeType {
   } \
   template<typename T> const NodeType *structname::register_type()
 
+#define NODE_ABSTRACT_DECLARE \
+  template<typename T> static const NodeType *register_base_type(); \
+  static const NodeType *node_base_type;
+
+#define NODE_ABSTRACT_DEFINE(structname) \
+  const NodeType *structname::node_base_type = structname::register_base_type<structname>(); \
+  template<typename T> const NodeType *structname::register_base_type()
+
 /* Sock Definition Macros */
 
 #define SOCKET_OFFSETOF(T, name) (((char *)&(((T *)1)->name)) - (char *)1)
diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp
index a96970cc904..d333400cc4a 100644
--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -200,7 +200,7 @@ void xml_read_node(XMLReader &reader, Node *node, xml_node xml_node)
         map<ustring, Node *>::iterator it = reader.node_map.find(value);
         if (it != reader.node_map.end()) {
           Node *value_node = it->second;
-          if (value_node->type == *(socket.node_type))
+          if (value_node->is_a(*(socket.node_type)))
             node->set(socket, it->second);
         }
         break;
@@ -215,7 +215,7 @@ void xml_read_node(XMLReader &reader, Node *node, xml_node xml_node)
           map<ustring, Node *>::iterator it = reader.node_map.find(ustring(tokens[i]));
           if (it != reader.node_map.end()) {
             Node *value_node = it->second;
-            value[i] = (value_node->type == *(socket.node_type)) ? value_node : NULL;
+            value[i] = (value_node->is_a(*(socket.node_type))) ? value_node : NULL;
           }
           else {
             value[i] = NULL;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 99172f30b8b..3264b5afea2 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS
 )
 
 set(SRC_OPENCL_KERNELS
+  kernels/opencl/kernel_adaptive_stopping.cl
+  kernels/opencl/kernel_adaptive_filter_x.cl
+  kernels/opencl/kernel_adaptive_filter_y.cl
+  kernels/opencl/kernel_adaptive_adjust_samples.cl
   kernels/opencl/kernel_bake.cl
   kernels/opencl/kernel_base.cl
   kernels/opencl/kernel_displace.cl
@@ -94,6 +98,7 @@ set(SRC_BVH_HEADERS
 
 set(SRC_HEADERS
   kernel_accumulate.h
+  kernel_adaptive_sampling.h
   kernel_bake.h
   kernel_camera.h
   kernel_color.h
@@ -228,6 +233,7 @@ set(SRC_SVM_HEADERS
   svm/svm_fractal_noise.h
   svm/svm_types.h
   svm/svm_value.h
+  svm/svm_vector_rotate.h
   svm/svm_vector_transform.h
   svm/svm_voronoi.h
   svm/svm_voxel.h
@@ -323,6 +329,10 @@ set(SRC_UTIL_HEADERS
 )
 
 set(SRC_SPLIT_HEADERS
+  split/kernel_adaptive_adjust_samples.h
+  split/kernel_adaptive_filter_x.h
+  split/kernel_adaptive_filter_y.h
+  split/kernel_adaptive_stopping.h
   split/kernel_branched.h
   split/kernel_buffer_update.h
   split/kernel_data_init.h
@@ -442,7 +452,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
       endif()
 
       add_custom_command(
-        OUTPUT ${cuda_cubin}
+        OUTPUT ${cuda_file}
         COMMAND ${CUBIN_CC_ENV}
             "$<TARGET_FILE:cycles_cubin_cc>"
             -target ${CUDA_ARCH}
@@ -451,7 +461,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
             -v
             -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
         DEPENDS ${kernel_sources} cycles_cubin_cc)
-      set(cuda_file ${cuda_cubin})
     else()
       add_custom_command(
         OUTPUT ${cuda_file}
@@ -507,7 +516,6 @@ if(WITH_CYCLES_DEVICE_OPTIX)
       -I "${OPTIX_INCLUDE_DIR}"
       -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
       -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
-      -arch=sm_30
       --use_fast_math
       -o ${output})
 
@@ -515,25 +523,62 @@ if(WITH_CYCLES_DEVICE_OPTIX)
       set(cuda_flags ${cuda_flags}
         -D __KERNEL_DEBUG__)
     endif()
+    if(WITH_CYCLES_CUBIN_COMPILER)
 
-    add_custom_command(
-      OUTPUT
-        ${output}
-      DEPENDS
-        ${input}
-        ${SRC_HEADERS}
-        ${SRC_KERNELS_CUDA_HEADERS}
-        ${SRC_KERNELS_OPTIX_HEADERS}
-        ${SRC_BVH_HEADERS}
-        ${SRC_SVM_HEADERS}
-        ${SRC_GEOM_HEADERS}
-        ${SRC_CLOSURE_HEADERS}
-        ${SRC_UTIL_HEADERS}
-      COMMAND
-        ${CUDA_NVCC_EXECUTABLE} --ptx ${cuda_flags} ${input}
-      WORKING_DIRECTORY
-        "${CMAKE_CURRENT_SOURCE_DIR}")
+      # Needed to find libnvrtc-builtins.so. Can't do it from inside
+      # cycles_cubin_cc since the environment variable is read before main().
+      if(APPLE)
+        set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+          -E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
+      elseif(UNIX)
+        set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+          -E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+      endif()
 
+      add_custom_command(
+        OUTPUT ${output}
+        DEPENDS
+          ${input}
+          ${SRC_HEADERS}
+          ${SRC_KERNELS_CUDA_HEADERS}
+          ${SRC_KERNELS_OPTIX_HEADERS}
+          ${SRC_BVH_HEADERS}
+          ${SRC_SVM_HEADERS}
+          ${SRC_GEOM_HEADERS}
+          ${SRC_CLOSURE_HEADERS}
+          ${SRC_UTIL_HEADERS}
+        COMMAND ${CUBIN_CC_ENV}
+            "$<TARGET_FILE:cycles_cubin_cc>"
+            -target 30
+            -ptx
+            -i ${CMAKE_CURRENT_SOURCE_DIR}/${input}
+            ${cuda_flags}
+            -v
+            -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
+        DEPENDS ${kernel_sources} cycles_cubin_cc)
+    else()
+      add_custom_command(
+        OUTPUT
+          ${output}
+        DEPENDS
+          ${input}
+          ${SRC_HEADERS}
+          ${SRC_KERNELS_CUDA_HEADERS}
+          ${SRC_KERNELS_OPTIX_HEADERS}
+          ${SRC_BVH_HEADERS}
+          ${SRC_SVM_HEADERS}
+          ${SRC_GEOM_HEADERS}
+          ${SRC_CLOSURE_HEADERS}
+          ${SRC_UTIL_HEADERS}
+        COMMAND
+          ${CUDA_NVCC_EXECUTABLE}
+          --ptx
+          -arch=sm_30
+          ${cuda_flags}
+          ${input}
+        WORKING_DIRECTORY
+          "${CMAKE_CURRENT_SOURCE_DIR}")
+    endif()
     list(APPEND optix_ptx ${output})
 
     delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 8e17ab9af7a..b3992c03a9a 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -336,7 +336,9 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
     ctx.lcg_state = lcg_state;
     ctx.max_hits = max_hits;
     ctx.local_isect = local_isect;
-    local_isect->num_hits = 0;
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
     ctx.local_object_id = local_object;
     IntersectContext rtc_ctx(&ctx);
     RTCRay rtc_ray;
@@ -373,7 +375,9 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
       rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
     }
 
-    return local_isect->num_hits > 0;
+    /* rtcOccluded1 sets tfar to -inf if a hit was found. */
+    const bool recorded_hit = local_isect && local_isect->num_hits > 0;
+    return recorded_hit || (rtc_ray.tfar < 0);
   }
 #    endif /* __EMBREE__ */
 
@@ -439,7 +443,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
     ctx.num_hits = 0;
     IntersectContext rtc_ctx(&ctx);
     RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW);
+    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
     rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
 
     if (ctx.num_hits > max_hits) {
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
index ffea7d37440..ca637288bee 100644
--- a/intern/cycles/kernel/bvh/bvh_embree.h
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -17,9 +17,12 @@
 #include <embree3/rtcore_ray.h>
 #include <embree3/rtcore_scene.h>
 
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/split/kernel_split_data_types.h"
 #include "kernel/kernel_globals.h"
+// clang-format on
+
 #include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index b282bf5a350..0a9631ad931 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+// clang-format off
 #include "kernel/closure/bsdf_ashikhmin_velvet.h"
 #include "kernel/closure/bsdf_diffuse.h"
 #include "kernel/closure/bsdf_oren_nayar.h"
@@ -32,6 +33,7 @@
 #include "kernel/closure/bsdf_principled_sheen.h"
 #include "kernel/closure/bssrdf.h"
 #include "kernel/closure/volume.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index 4db5a6cc830..f78bbeb5d9d 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -493,6 +493,36 @@ ccl_device void bsdf_principled_hair_blur(ShaderClosure *sc, float roughness)
   bsdf->m0_roughness = fmaxf(roughness, bsdf->m0_roughness);
 }
 
+/* Hair Albedo */
+
+ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
+    const float azimuthal_roughness)
+{
+  const float x = azimuthal_roughness;
+  return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
+}
+
+ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc)
+{
+  PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc;
+  return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
+}
+
+ccl_device_inline float3
+bsdf_principled_hair_sigma_from_reflectance(const float3 color, const float azimuthal_roughness)
+{
+  const float3 sigma = log3(color) /
+                       bsdf_principled_hair_albedo_roughness_scale(azimuthal_roughness);
+  return sigma * sigma;
+}
+
+ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const float eumelanin,
+                                                                       const float pheomelanin)
+{
+  return eumelanin * make_float3(0.506f, 0.841f, 1.653f) +
+         pheomelanin * make_float3(0.343f, 0.733f, 1.924f);
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __BSDF_HAIR_PRINCIPLED_H__ */
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 7bbd17066fd..59d4ace2bef 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -109,7 +109,6 @@ ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
   scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
   if (use_time) {
     scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
-    ;
   }
   scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
   scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index e81c1b781c8..5ff4d5f7053 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+// clang-format off
 #include "kernel/geom/geom_attribute.h"
 #include "kernel/geom/geom_object.h"
 #ifdef __PATCH_EVAL__
@@ -30,3 +31,4 @@
 #include "kernel/geom/geom_curve_intersect.h"
 #include "kernel/geom/geom_volume.h"
 #include "kernel/geom/geom_primitive.h"
+// clang-format on
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 456608bfa22..e1b0e6fb81c 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -29,17 +29,11 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
 
 ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
 {
-#ifdef __HAIR__
-  if (sd->type & PRIMITIVE_ALL_CURVE) {
-    return ATTR_PRIM_CURVE;
-  }
-  else
-#endif
-      if (subd_triangle_patch(kg, sd) != ~0) {
+  if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
     return ATTR_PRIM_SUBD;
   }
   else {
-    return ATTR_PRIM_TRIANGLE;
+    return ATTR_PRIM_GEOMETRY;
   }
 }
 
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index e0aacb434eb..928cad58452 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -83,6 +83,16 @@ ccl_device float curve_attribute_float(
 
     return (1.0f - sd->u) * f0 + sd->u * f1;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = 0.0f;
+    if (dy)
+      *dy = 0.0f;
+#  endif
+
+    return kernel_tex_fetch(__attributes_float, desc.offset);
+  }
   else {
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -133,6 +143,16 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
 
     return (1.0f - sd->u) * f0 + sd->u * f1;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = make_float2(0.0f, 0.0f);
+    if (dy)
+      *dy = make_float2(0.0f, 0.0f);
+#  endif
+
+    return kernel_tex_fetch(__attributes_float2, desc.offset);
+  }
   else {
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -183,6 +203,16 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
 
     return (1.0f - sd->u) * f0 + sd->u * f1;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+#  ifdef __RAY_DIFFERENTIALS__
+    if (dx)
+      *dx = make_float3(0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float3(0.0f, 0.0f, 0.0f);
+#  endif
+
+    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+  }
   else {
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 7380c506bf4..0e2a00e9d2e 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -36,7 +36,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
    * zero iterations and rendering is really slow with motion curves. For until other
    * areas are speed up it's probably not so crucial to optimize this out.
    */
-  uint attr_offset = object_attribute_map_offset(kg, object) + ATTR_PRIM_CURVE;
+  uint attr_offset = object_attribute_map_offset(kg, object) + ATTR_PRIM_GEOMETRY;
   uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 
   while (attr_map.x != id) {
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index 49d4829af38..859d919f0bb 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -103,17 +103,21 @@ ccl_device_inline
                                  const Ray *ray,
                                  float3 verts[3])
 {
+#  ifdef __KERNEL_OPTIX__
+  /* isect->t is always in world space with OptiX. */
+  return motion_triangle_refine(kg, sd, isect, ray, verts);
+#  else
   float3 P = ray->P;
   float3 D = ray->D;
   float t = isect->t;
 
-#  ifdef __INTERSECTION_REFINE__
+#    ifdef __INTERSECTION_REFINE__
   if (isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
+#      ifdef __OBJECT_MOTION__
     Transform tfm = sd->ob_itfm;
-#    else
+#      else
     Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#    endif
+#      endif
 
     P = transform_point(&tfm, P);
     D = transform_direction(&tfm, D);
@@ -135,19 +139,20 @@ ccl_device_inline
   P = P + D * rt;
 
   if (isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
+#      ifdef __OBJECT_MOTION__
     Transform tfm = sd->ob_tfm;
-#    else
+#      else
     Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#    endif
+#      endif
 
     P = transform_point(&tfm, P);
   }
 
   return P;
-#  else  /* __INTERSECTION_REFINE__ */
+#    else  /* __INTERSECTION_REFINE__ */
   return P + D * t;
-#  endif /* __INTERSECTION_REFINE__ */
+#    endif /* __INTERSECTION_REFINE__ */
+#  endif
 }
 #endif /* __BVH_LOCAL__ */
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index af4e6fbd89b..3aa68e1f84e 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -81,13 +81,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
   const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1;
 
   Transform tfm;
-#  ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
-    transform_motion_array_interpolate_straight(&tfm, motion, num_steps, time);
-  }
-  else
-#  endif
-    transform_motion_array_interpolate(&tfm, motion, num_steps, time);
+  transform_motion_array_interpolate(&tfm, motion, num_steps, time);
 
   return tfm;
 }
@@ -326,6 +320,26 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
   return kernel_tex_fetch(__objects, object).patch_map_offset;
 }
 
+/* Volume step size */
+
+ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
+{
+  if (object == OBJECT_NONE) {
+    return 1.0f;
+  }
+
+  return kernel_tex_fetch(__objects, object).surface_area;
+}
+
+ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
+{
+  if (object == OBJECT_NONE) {
+    return kernel_data.background.volume_step_size;
+  }
+
+  return kernel_tex_fetch(__object_volume_step, object);
+}
+
 /* Pass ID for shader */
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 81bac6e6ee1..3eef9857ae3 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -217,6 +217,14 @@ ccl_device_noinline float subd_triangle_attribute_float(
 
     return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = 0.0f;
+    if (dy)
+      *dy = 0.0f;
+
+    return kernel_tex_fetch(__attributes_float, desc.offset);
+  }
   else {
     if (dx)
       *dx = 0.0f;
@@ -352,6 +360,14 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
 
     return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = make_float2(0.0f, 0.0f);
+    if (dy)
+      *dy = make_float2(0.0f, 0.0f);
+
+    return kernel_tex_fetch(__attributes_float2, desc.offset);
+  }
   else {
     if (dx)
       *dx = make_float2(0.0f, 0.0f);
@@ -486,6 +502,14 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
 
     return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = make_float3(0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+  }
   else {
     if (dx)
       *dx = make_float3(0.0f, 0.0f, 0.0f);
@@ -584,6 +608,14 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg,
 
     return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+    return color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, desc.offset));
+  }
   else {
     if (dx)
       *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index fdb7f655f64..a2731bf2bd0 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -153,6 +153,14 @@ ccl_device float triangle_attribute_float(
 
     return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = 0.0f;
+    if (dy)
+      *dy = 0.0f;
+
+    return kernel_tex_fetch(__attributes_float, desc.offset);
+  }
   else {
     if (dx)
       *dx = 0.0f;
@@ -212,6 +220,14 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
 
     return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = make_float2(0.0f, 0.0f);
+    if (dy)
+      *dy = make_float2(0.0f, 0.0f);
+
+    return kernel_tex_fetch(__attributes_float2, desc.offset);
+  }
   else {
     if (dx)
       *dx = make_float2(0.0f, 0.0f);
@@ -272,6 +288,14 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
 
     return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = make_float3(0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+  }
   else {
     if (dx)
       *dx = make_float3(0.0f, 0.0f, 0.0f);
@@ -304,6 +328,14 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
 
     return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
   }
+  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
+    if (dx)
+      *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    if (dy)
+      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+    return color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, desc.offset));
+  }
   else {
     if (dx)
       *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 68075199402..6604806f73b 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -690,16 +690,20 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
                                                const Intersection *isect,
                                                const Ray *ray)
 {
+#ifdef __KERNEL_OPTIX__
+  /* isect->t is always in world space with OptiX. */
+  return triangle_refine(kg, sd, isect, ray);
+#else
   float3 P = ray->P;
   float3 D = ray->D;
   float t = isect->t;
 
   if (isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
+#  ifdef __OBJECT_MOTION__
     Transform tfm = sd->ob_itfm;
-#else
+#  else
     Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
+#  endif
 
     P = transform_point(&tfm, P);
     D = transform_direction(&tfm, D);
@@ -708,7 +712,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
 
   P = P + D * t;
 
-#ifdef __INTERSECTION_REFINE__
+#  ifdef __INTERSECTION_REFINE__
   const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
   const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
                tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
@@ -728,19 +732,20 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
     float rt = dot(edge2, qvec) / det;
     P = P + D * rt;
   }
-#endif /* __INTERSECTION_REFINE__ */
+#  endif /* __INTERSECTION_REFINE__ */
 
   if (isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
+#  ifdef __OBJECT_MOTION__
     Transform tfm = sd->ob_tfm;
-#else
+#  else
     Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
+#  endif
 
     P = transform_point(&tfm, P);
   }
 
   return P;
+#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 96cf35a40dc..f43a7841b46 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -51,10 +51,14 @@ ccl_device float volume_attribute_float(KernelGlobals *kg,
                                         const ShaderData *sd,
                                         const AttributeDescriptor desc)
 {
-  float3 P = volume_normalized_position(kg, sd, sd->P);
+  /* TODO: optimize this so the position is not transformed both here and in
+   * kernel_tex_image_interp_3d when possible. This could also be optimized for
+   * the common case where the transform is translation/scale only. */
+  float3 P = sd->P;
+  object_inverse_position_transform(kg, sd, &P);
   InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
                                                             INTERPOLATION_NONE;
-  float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp);
+  float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P, interp);
   return average(float4_to_float3(r));
 }
 
@@ -62,10 +66,11 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg,
                                           const ShaderData *sd,
                                           const AttributeDescriptor desc)
 {
-  float3 P = volume_normalized_position(kg, sd, sd->P);
+  float3 P = sd->P;
+  object_inverse_position_transform(kg, sd, &P);
   InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC) ? INTERPOLATION_CUBIC :
                                                             INTERPOLATION_NONE;
-  float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp);
+  float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P, interp);
 
   if (r.w > 1e-6f && r.w != 1.0f) {
     /* For RGBA colors, unpremultiply after interpolation. */
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index dfdd8843f29..b907c6a2bac 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -19,8 +19,8 @@
 
 /* CPU Kernel Interface */
 
-#include "util/util_types.h"
 #include "kernel/kernel_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,7 +38,7 @@ void *kernel_osl_memory(KernelGlobals *kg);
 bool kernel_osl_use(KernelGlobals *kg);
 
 void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
-void kernel_tex_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
 
 #define KERNEL_ARCH cpu
 #include "kernel/kernels/cpu/kernel_cpu.h"
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 606c288649a..79ea03f4f6f 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -36,21 +36,18 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval,
     eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
     eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
     eval->transparent = make_float3(0.0f, 0.0f, 0.0f);
-    eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
-    eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
+    eval->volume = make_float3(0.0f, 0.0f, 0.0f);
 
     if (type == CLOSURE_BSDF_TRANSPARENT_ID)
       eval->transparent = value;
-    else if (CLOSURE_IS_BSDF_DIFFUSE(type))
+    else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
       eval->diffuse = value;
     else if (CLOSURE_IS_BSDF_GLOSSY(type))
       eval->glossy = value;
     else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
       eval->transmission = value;
-    else if (CLOSURE_IS_BSDF_BSSRDF(type))
-      eval->subsurface = value;
     else if (CLOSURE_IS_PHASE(type))
-      eval->scatter = value;
+      eval->volume = value;
   }
   else
 #endif
@@ -73,16 +70,14 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval,
   value *= mis_weight;
 #ifdef __PASSES__
   if (eval->use_light_pass) {
-    if (CLOSURE_IS_BSDF_DIFFUSE(type))
+    if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
       eval->diffuse += value;
     else if (CLOSURE_IS_BSDF_GLOSSY(type))
       eval->glossy += value;
     else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
       eval->transmission += value;
-    else if (CLOSURE_IS_BSDF_BSSRDF(type))
-      eval->subsurface += value;
     else if (CLOSURE_IS_PHASE(type))
-      eval->scatter += value;
+      eval->volume += value;
 
     /* skipping transparent, this function is used by for eval(), will be zero then */
   }
@@ -98,7 +93,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 #ifdef __PASSES__
   if (eval->use_light_pass) {
     return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) &&
-           is_zero(eval->transparent) && is_zero(eval->subsurface) && is_zero(eval->scatter);
+           is_zero(eval->transparent) && is_zero(eval->volume);
   }
   else
 #endif
@@ -114,8 +109,7 @@ ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
     eval->diffuse *= value;
     eval->glossy *= value;
     eval->transmission *= value;
-    eval->subsurface *= value;
-    eval->scatter *= value;
+    eval->volume *= value;
 
     /* skipping transparent, this function is used by for eval(), will be zero then */
   }
@@ -144,8 +138,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
     eval->diffuse *= value;
     eval->glossy *= value;
     eval->transmission *= value;
-    eval->subsurface *= value;
-    eval->scatter *= value;
+    eval->volume *= value;
 
     /* skipping transparent, this function is used by for eval(), will be zero then */
   }
@@ -160,7 +153,7 @@ ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
 {
 #ifdef __PASSES__
   if (eval->use_light_pass) {
-    return eval->diffuse + eval->glossy + eval->transmission + eval->subsurface + eval->scatter;
+    return eval->diffuse + eval->glossy + eval->transmission + eval->volume;
   }
   else
 #endif
@@ -187,19 +180,16 @@ ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
     L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f);
     L->color_glossy = make_float3(0.0f, 0.0f, 0.0f);
     L->color_transmission = make_float3(0.0f, 0.0f, 0.0f);
-    L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f);
 
     L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
     L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
     L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
-    L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-    L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
+    L->direct_volume = make_float3(0.0f, 0.0f, 0.0f);
 
     L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
     L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
     L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
-    L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-    L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
+    L->indirect_volume = make_float3(0.0f, 0.0f, 0.0f);
 
     L->transparent = 0.0f;
     L->emission = make_float3(0.0f, 0.0f, 0.0f);
@@ -211,8 +201,7 @@ ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
     L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
     L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
     L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
-    L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
-    L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+    L->state.volume = make_float3(0.0f, 0.0f, 0.0f);
     L->state.direct = make_float3(0.0f, 0.0f, 0.0f);
   }
   else
@@ -264,11 +253,9 @@ ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
       L_state->diffuse = bsdf_eval->diffuse * value;
       L_state->glossy = bsdf_eval->glossy * value;
       L_state->transmission = bsdf_eval->transmission * value;
-      L_state->subsurface = bsdf_eval->subsurface * value;
-      L_state->scatter = bsdf_eval->scatter * value;
+      L_state->volume = bsdf_eval->volume * value;
 
-      *throughput = L_state->diffuse + L_state->glossy + L_state->transmission +
-                    L_state->subsurface + L_state->scatter;
+      *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume;
 
       L_state->direct = *throughput;
     }
@@ -449,8 +436,7 @@ ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg,
       L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse;
       L->direct_glossy += shaded_throughput * bsdf_eval->glossy;
       L->direct_transmission += shaded_throughput * bsdf_eval->transmission;
-      L->direct_subsurface += shaded_throughput * bsdf_eval->subsurface;
-      L->direct_scatter += shaded_throughput * bsdf_eval->scatter;
+      L->direct_volume += shaded_throughput * bsdf_eval->volume;
 
       if (is_lamp) {
         L->shadow.x += shadow.x * shadow_fac;
@@ -528,7 +514,8 @@ ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg,
   }
 
 #ifdef __DENOISING_FEATURES__
-  L->denoising_albedo += state->denoising_feature_weight * value;
+  L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput *
+                         value;
 #endif /* __DENOISING_FEATURES__ */
 }
 
@@ -561,15 +548,13 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
     L->direct_diffuse += L->state.diffuse * L->direct_emission;
     L->direct_glossy += L->state.glossy * L->direct_emission;
     L->direct_transmission += L->state.transmission * L->direct_emission;
-    L->direct_subsurface += L->state.subsurface * L->direct_emission;
-    L->direct_scatter += L->state.scatter * L->direct_emission;
+    L->direct_volume += L->state.volume * L->direct_emission;
 
     L->indirect = safe_divide_color(L->indirect, L->state.direct);
     L->indirect_diffuse += L->state.diffuse * L->indirect;
     L->indirect_glossy += L->state.glossy * L->indirect;
     L->indirect_transmission += L->state.transmission * L->indirect;
-    L->indirect_subsurface += L->state.subsurface * L->indirect;
-    L->indirect_scatter += L->state.scatter * L->indirect;
+    L->indirect_volume += L->state.volume * L->indirect;
   }
 #endif
 }
@@ -581,8 +566,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
     L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
     L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
     L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
-    L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
-    L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+    L->state.volume = make_float3(0.0f, 0.0f, 0.0f);
 
     L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
     L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -646,10 +630,10 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
   if (L->use_light_pass) {
     path_radiance_sum_indirect(L);
 
-    L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission +
-               L->direct_subsurface + L->direct_scatter + L->emission;
+    L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume +
+               L->emission;
     L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission +
-                 L->indirect_subsurface + L->indirect_scatter;
+                 L->indirect_volume;
 
     if (!kernel_data.background.transparent)
       L_direct += L->background;
@@ -665,14 +649,12 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
       L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
       L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
       L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
-      L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-      L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
+      L->direct_volume = make_float3(0.0f, 0.0f, 0.0f);
 
       L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
       L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
       L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
-      L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-      L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
+      L->indirect_volume = make_float3(0.0f, 0.0f, 0.0f);
 
       L->emission = make_float3(0.0f, 0.0f, 0.0f);
     }
@@ -714,7 +696,7 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
   kernel_assert(L->use_light_pass);
 
   *clean = L->emission + L->background;
-  *noisy = L->direct_scatter + L->indirect_scatter;
+  *noisy = L->direct_volume + L->indirect_volume;
 
 #  define ADD_COMPONENT(flag, component) \
     if (kernel_data.film.denoising_flags & flag) \
@@ -728,8 +710,6 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
   ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
   ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
   ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
-  ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
-  ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
 #  undef ADD_COMPONENT
 #else
   *noisy = L->emission;
@@ -766,14 +746,12 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
   safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
   safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
   safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
-  safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface);
-  safe_float3_add(L->direct_scatter, L_sample->direct_scatter);
+  safe_float3_add(L->direct_volume, L_sample->direct_volume);
 
   safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
   safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
   safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
-  safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface);
-  safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter);
+  safe_float3_add(L->indirect_volume, L_sample->indirect_volume);
 
   safe_float3_add(L->background, L_sample->background);
   safe_float3_add(L->ao, L_sample->ao);
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
new file mode 100644
index 00000000000..047fe8c92ec
--- /dev/null
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
+#define __KERNEL_ADAPTIVE_SAMPLING_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+
+ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
+                                            ccl_global float *buffer,
+                                            int sample)
+{
+  /* TODO Stefan: Is this better in linear, sRGB or something else? */
+  float4 I = *((ccl_global float4 *)buffer);
+  float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+  /* The per-pixel error as described in section 2.1 of
+   * "A hierarchical automatic stopping condition for Monte Carlo global illumination".
+   * A small sample-scaled epsilon is added to the divisor to prevent division by zero. */
+  float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
+                (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
+  if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
+    /* Set the fourth component to a non-zero value to indicate that this pixel has converged. */
+    buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+  }
+}
+
+/* Adjust the values of an adaptively sampled pixel. */
+
+ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
+                                            ccl_global float *buffer,
+                                            float sample_multiplier)
+{
+  *(ccl_global float4 *)(buffer) *= sample_multiplier;
+
+  /* Scale the aux pass too; this is necessary for progressive rendering to work properly. */
+  kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
+  *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+
+#ifdef __PASSES__
+  int flag = kernel_data.film.pass_flag;
+
+  if (flag & PASSMASK(NORMAL))
+    *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+
+  if (flag & PASSMASK(UV))
+    *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+
+  if (flag & PASSMASK(MOTION)) {
+    *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
+    *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+  }
+
+  if (kernel_data.film.use_light_pass) {
+    int light_flag = kernel_data.film.light_pass_flag;
+
+    if (light_flag & PASSMASK(MIST))
+      *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
+
+    /* Shadow pass omitted on purpose. It has its own scale parameter. */
+
+    if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
+    if (light_flag & PASSMASK(GLOSSY_INDIRECT))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
+    if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
+      *(ccl_global float3 *)(buffer +
+                             kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
+    if (light_flag & PASSMASK(VOLUME_INDIRECT))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
+    if (light_flag & PASSMASK(DIFFUSE_DIRECT))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
+    if (light_flag & PASSMASK(GLOSSY_DIRECT))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
+    if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
+      *(ccl_global float3 *)(buffer +
+                             kernel_data.film.pass_transmission_direct) *= sample_multiplier;
+    if (light_flag & PASSMASK(VOLUME_DIRECT))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
+
+    if (light_flag & PASSMASK(EMISSION))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
+    if (light_flag & PASSMASK(BACKGROUND))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
+    if (light_flag & PASSMASK(AO))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
+
+    if (light_flag & PASSMASK(DIFFUSE_COLOR))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
+    if (light_flag & PASSMASK(GLOSSY_COLOR))
+      *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
+    if (light_flag & PASSMASK(TRANSMISSION_COLOR))
+      *(ccl_global float3 *)(buffer +
+                             kernel_data.film.pass_transmission_color) *= sample_multiplier;
+  }
+#endif
+
+#ifdef __DENOISING_FEATURES__
+
+#  define scale_float3_variance(buffer, offset, scale) \
+    *(buffer + offset) *= scale; \
+    *(buffer + offset + 1) *= scale; \
+    *(buffer + offset + 2) *= scale; \
+    *(buffer + offset + 3) *= scale * scale; \
+    *(buffer + offset + 4) *= scale * scale; \
+    *(buffer + offset + 5) *= scale * scale;
+
+#  define scale_shadow_variance(buffer, offset, scale) \
+    *(buffer + offset) *= scale; \
+    *(buffer + offset + 1) *= scale; \
+    *(buffer + offset + 2) *= scale * scale;
+
+  if (kernel_data.film.pass_denoising_data) {
+    scale_shadow_variance(
+        buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
+    scale_shadow_variance(
+        buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
+    if (kernel_data.film.pass_denoising_clean) {
+      scale_float3_variance(
+          buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+      *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
+      *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
+      *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
+    }
+    else {
+      scale_float3_variance(
+          buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+    }
+    scale_float3_variance(
+        buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
+    scale_float3_variance(
+        buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
+    *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
+    *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
+      1) *= sample_multiplier * sample_multiplier;
+  }
+#endif /* __DENOISING_FEATURES__ */
+
+  if (kernel_data.film.cryptomatte_passes) {
+    int num_slots = 0;
+    num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
+    num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
+    num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
+    num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
+    ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
+                                                         kernel_data.film.pass_cryptomatte);
+    for (int slot = 0; slot < num_slots; slot++) {
+      id_buffer[slot].y *= sample_multiplier;
+    }
+  }
+}
+
+/* This is a simple box filter in two passes.
+ * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
+
+ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+{
+  bool any = false;
+  bool prev = false;
+  for (int x = tile->x; x < tile->x + tile->w; ++x) {
+    int index = tile->offset + x + y * tile->stride;
+    ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+    ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+                                                   kernel_data.film.pass_adaptive_aux_buffer);
+    if (aux->w == 0.0f) {
+      any = true;
+      if (x > tile->x && !prev) {
+        index = index - 1;
+        buffer = tile->buffer + index * kernel_data.film.pass_stride;
+        aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+        aux->w = 0.0f;
+      }
+      prev = true;
+    }
+    else {
+      if (prev) {
+        aux->w = 0.0f;
+      }
+      prev = false;
+    }
+  }
+  return any;
+}
+
+ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+{
+  bool prev = false;
+  bool any = false;
+  for (int y = tile->y; y < tile->y + tile->h; ++y) {
+    int index = tile->offset + x + y * tile->stride;
+    ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+    ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+                                                   kernel_data.film.pass_adaptive_aux_buffer);
+    if (aux->w == 0.0f) {
+      any = true;
+      if (y > tile->y && !prev) {
+        index = index - tile->stride;
+        buffer = tile->buffer + index * kernel_data.film.pass_stride;
+        aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+        aux->w = 0.0f;
+      }
+      prev = true;
+    }
+    else {
+      if (prev) {
+        aux->w = 0.0f;
+      }
+      prev = false;
+    }
+  }
+  return any;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index a349b225abb..f1fc697553a 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -71,7 +71,7 @@ ccl_device_inline void compute_light_pass(
 
 #  ifdef __SUBSURFACE__
     /* sample subsurface scattering */
-    if ((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
+    if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
       /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
        * if scattering was successful. */
       SubsurfaceIndirectRays ss_indirect;
@@ -123,7 +123,7 @@ ccl_device_inline void compute_light_pass(
 
 #    ifdef __SUBSURFACE__
     /* sample subsurface scattering */
-    if ((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
+    if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
       /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
        * if scattering was successful. */
       kernel_branched_path_subsurface_scatter(
@@ -178,10 +178,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
       return shader_bsdf_glossy(kg, sd);
     case SHADER_EVAL_TRANSMISSION:
       return shader_bsdf_transmission(kg, sd);
-#  ifdef __SUBSURFACE__
-    case SHADER_EVAL_SUBSURFACE:
-      return shader_bsdf_subsurface(kg, sd);
-#  endif
     default:
       kernel_assert(!"Unknown bake type passed to BSDF evaluate");
       return make_float3(0.0f, 0.0f, 0.0f);
@@ -385,11 +381,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
       if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT)
         out += L.indirect_transmission;
 
-      if ((pass_filter & BAKE_FILTER_SUBSURFACE_DIRECT) == BAKE_FILTER_SUBSURFACE_DIRECT)
-        out += L.direct_subsurface;
-      if ((pass_filter & BAKE_FILTER_SUBSURFACE_INDIRECT) == BAKE_FILTER_SUBSURFACE_INDIRECT)
-        out += L.indirect_subsurface;
-
       if ((pass_filter & BAKE_FILTER_EMISSION) != 0)
         out += L.emission;
 
@@ -414,13 +405,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
           kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter);
       break;
     }
-    case SHADER_EVAL_SUBSURFACE: {
-#    ifdef __SUBSURFACE__
-      out = kernel_bake_evaluate_direct_indirect(
-          kg, &sd, &state, L.direct_subsurface, L.indirect_subsurface, type, pass_filter);
-#    endif
-      break;
-    }
 #  endif
 
     /* extra */
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 006dd00dd73..88f6a264a5a 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -35,11 +35,11 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
+#include "util/util_half.h"
 #include "util/util_math.h"
 #include "util/util_simd.h"
-#include "util/util_half.h"
-#include "util/util_types.h"
 #include "util/util_texture.h"
+#include "util/util_types.h"
 
 #define ccl_addr_space
 
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 4f508d7cdaa..3c5a10540d5 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -37,8 +37,11 @@ typedef unsigned long long uint64_t;
 typedef unsigned short half;
 typedef unsigned long long CUtexObject;
 
-#define FLT_MIN 1.175494350822287507969e-38f
-#define FLT_MAX 340282346638528859811704183484516925440.0f
+#ifdef CYCLES_CUBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
 
 __device__ half __float2half(const float f)
 {
diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/kernel_compat_optix.h
index 61b9d87a020..7068acc3a32 100644
--- a/intern/cycles/kernel/kernel_compat_optix.h
+++ b/intern/cycles/kernel/kernel_compat_optix.h
@@ -35,9 +35,11 @@ typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 typedef unsigned short half;
 typedef unsigned long long CUtexObject;
-
-#define FLT_MIN 1.175494350822287507969e-38f
-#define FLT_MAX 340282346638528859811704183484516925440.0f
+#ifdef CYCLES_CUBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
 
 __device__ half __float2half(const float f)
 {
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index c63d1149d03..71b176a0a8f 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -145,16 +145,14 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
 #ifdef __PASSES__
   /* use visibility flag to skip lights */
   if (ls->shader & SHADER_EXCLUDE_ANY) {
-    if (ls->shader & SHADER_EXCLUDE_DIFFUSE) {
+    if (ls->shader & SHADER_EXCLUDE_DIFFUSE)
       eval->diffuse = make_float3(0.0f, 0.0f, 0.0f);
-      eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
-    }
     if (ls->shader & SHADER_EXCLUDE_GLOSSY)
       eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
     if (ls->shader & SHADER_EXCLUDE_TRANSMIT)
       eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
     if (ls->shader & SHADER_EXCLUDE_SCATTER)
-      eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
+      eval->volume = make_float3(0.0f, 0.0f, 0.0f);
   }
 #endif
 
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index fc3a6152b79..3829426f261 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -40,15 +40,9 @@ ccl_device float4 film_get_pass_result(KernelGlobals *kg,
     if (display_divide_pass_stride != -1) {
       ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride +
                                                            index * kernel_data.film.pass_stride);
-      if (divide_in->x != 0.0f) {
-        pass_result.x /= divide_in->x;
-      }
-      if (divide_in->y != 0.0f) {
-        pass_result.y /= divide_in->y;
-      }
-      if (divide_in->z != 0.0f) {
-        pass_result.z /= divide_in->z;
-      }
+      float3 divided = safe_divide_even_color(float4_to_float3(pass_result),
+                                              float4_to_float3(*divide_in));
+      pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w);
     }
 
     if (kernel_data.film.use_display_exposure) {
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index a440021b6b9..c186e8560eb 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -22,8 +22,8 @@
 #include "kernel/kernel_profiling.h"
 
 #ifdef __KERNEL_CPU__
-#  include "util/util_vector.h"
 #  include "util/util_map.h"
+#  include "util/util_vector.h"
 #endif
 
 #ifdef __KERNEL_OPENCL__
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index e59d8946950..5b6e3bbf501 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -195,4 +195,36 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 }
 #endif
 
+ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
+{
+  /* Fallback to random once the sample index is past the precomputed PMJ table. */
+  if (sample >= NUM_PMJ_SAMPLES) {
+    int p = rng_hash + dimension;
+    return cmj_randfloat(sample, p);
+  }
+  uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
+  int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
+  return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ (tmp_rng & 0x007fffff)) -
+         1.0f;
+}
+
+ccl_device void pmj_sample_2D(
+    KernelGlobals *kg, int sample, int rng_hash, int dimension, float *fx, float *fy)
+{
+  if (sample >= NUM_PMJ_SAMPLES) { /* Fallback to random, same as pmj_sample_1D(). */
+    int p = rng_hash + dimension;
+    *fx = cmj_randfloat(sample, p);
+    *fy = cmj_randfloat(sample, p + 1);
+    return;
+  }
+  uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
+  int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
+  *fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ (tmp_rng & 0x007fffff)) -
+        1.0f;
+  tmp_rng = cmj_hash_simple(dimension + 1, rng_hash); /* Re-hash with the next dimension for fy. */
+  *fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^
+                        (tmp_rng & 0x007fffff)) -
+        1.0f;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7345e9ee5bb..98136bc7047 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -29,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
   if (kernel_data.film.pass_denoising_data == 0)
     return;
 
-  buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+  buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
+                DENOISING_PASS_SHADOW_B :
+                DENOISING_PASS_SHADOW_A;
 
   path_total = ensure_finite(path_total);
   path_total_shaded = ensure_finite(path_total_shaded);
@@ -58,7 +60,8 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
   }
 
   float3 normal = make_float3(0.0f, 0.0f, 0.0f);
-  float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+  float3 diffuse_albedo = make_float3(0.0f, 0.0f, 0.0f);
+  float3 specular_albedo = make_float3(0.0f, 0.0f, 0.0f);
   float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
 
   for (int i = 0; i < sd->num_closure; i++) {
@@ -70,24 +73,31 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
     /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
     normal += sc->N * sc->sample_weight;
     sum_weight += sc->sample_weight;
-    if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
-      float3 closure_albedo = sc->weight;
-      /* Closures that include a Fresnel term typically have weights close to 1 even though their
-       * actual contribution is significantly lower.
-       * To account for this, we scale their weight by the average fresnel factor (the same is also
-       * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
-      if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
-        MicrofacetBsdf *bsdf = (MicrofacetBsdf *)sc;
-        closure_albedo *= bsdf->extra->fresnel_color;
-      }
-      else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
-        PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)sc;
-        closure_albedo *= bsdf->avg_value;
-      }
 
-      albedo += closure_albedo;
+    float3 closure_albedo = sc->weight;
+    /* Closures that include a Fresnel term typically have weights close to 1 even though their
+     * actual contribution is significantly lower.
+     * To account for this, we scale their weight by the average fresnel factor (the same is also
+     * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
+    if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
+      MicrofacetBsdf *bsdf = (MicrofacetBsdf *)sc;
+      closure_albedo *= bsdf->extra->fresnel_color;
+    }
+    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
+      PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)sc;
+      closure_albedo *= bsdf->avg_value;
+    }
+    else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
+      closure_albedo *= bsdf_principled_hair_albedo(sc);
+    }
+
+    if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
+      diffuse_albedo += closure_albedo;
       sum_nonspecular_weight += sc->sample_weight;
     }
+    else {
+      specular_albedo += closure_albedo;
+    }
   }
 
   /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
@@ -101,10 +111,14 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
     normal = transform_direction(&worldtocamera, normal);
 
     L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
-    L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+    L->denoising_albedo += ensure_finite3(state->denoising_feature_weight *
+                                          state->denoising_feature_throughput * diffuse_albedo);
 
     state->denoising_feature_weight = 0.0f;
   }
+  else {
+    state->denoising_feature_throughput *= specular_albedo;
+  }
 }
 #endif /* __DENOISING_FEATURES__ */
 
@@ -240,8 +254,6 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
     L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput;
   if (light_flag & PASSMASK_COMPONENT(TRANSMISSION))
     L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput;
-  if (light_flag & PASSMASK_COMPONENT(SUBSURFACE))
-    L->color_subsurface += shader_bsdf_subsurface(kg, sd) * throughput;
 
   if (light_flag & PASSMASK(MIST)) {
     /* bring depth into 0..1 range */
@@ -287,11 +299,8 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
   if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
     kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect,
                              L->indirect_transmission);
-  if (light_flag & PASSMASK(SUBSURFACE_INDIRECT))
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect,
-                             L->indirect_subsurface);
   if (light_flag & PASSMASK(VOLUME_INDIRECT))
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_scatter);
+    kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume);
   if (light_flag & PASSMASK(DIFFUSE_DIRECT))
     kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
   if (light_flag & PASSMASK(GLOSSY_DIRECT))
@@ -299,11 +308,8 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
   if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
     kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct,
                              L->direct_transmission);
-  if (light_flag & PASSMASK(SUBSURFACE_DIRECT))
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct,
-                             L->direct_subsurface);
   if (light_flag & PASSMASK(VOLUME_DIRECT))
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_scatter);
+    kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume);
 
   if (light_flag & PASSMASK(EMISSION))
     kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
@@ -319,8 +325,6 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
   if (light_flag & PASSMASK(TRANSMISSION_COLOR))
     kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
                              L->color_transmission);
-  if (light_flag & PASSMASK(SUBSURFACE_COLOR))
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, L->color_subsurface);
   if (light_flag & PASSMASK(SHADOW)) {
     float4 shadow = L->shadow;
     shadow.w = kernel_data.film.pass_shadow_scale;
@@ -387,6 +391,41 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
 #ifdef __KERNEL_DEBUG__
   kernel_write_debug_passes(kg, buffer, L);
 #endif
+
+  /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
+     criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+     Carlo global illumination" except that here it is applied per pixel and not in hierarchical
+     tiles. */
+  if (kernel_data.film.pass_adaptive_aux_buffer &&
+      kernel_data.integrator.adaptive_threshold > 0.0f) {
+    if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+      kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
+                               make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
+    }
+#ifdef __KERNEL_CPU__
+    if (sample > kernel_data.integrator.adaptive_min_samples &&
+        (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) {
+      kernel_do_adaptive_stopping(kg, buffer, sample);
+    }
+#endif
+  }
+
+  /* Write the sample count as negative numbers initially to mark the samples as in progress.
+   * Once the tile has finished rendering, the sign gets flipped and all the pixel values
+   * are scaled as if they were taken at a uniform sample count. */
+  if (kernel_data.film.pass_sample_count) {
+    /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
+     * passes. */
+#ifdef __ATOMIC_PASS_WRITE__
+    atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
+                               0x80000000);
+#else
+    if (buffer[kernel_data.film.pass_sample_count] > 0) {
+      buffer[kernel_data.film.pass_sample_count] *= -1.0f;
+    }
+#endif
+    kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
+  }
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 1a0b67275a7..db35303e3f1 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -18,6 +18,7 @@
 #  include "kernel/osl/osl_shader.h"
 #endif
 
+// clang-format off
 #include "kernel/kernel_random.h"
 #include "kernel/kernel_projection.h"
 #include "kernel/kernel_montecarlo.h"
@@ -31,6 +32,7 @@
 #include "kernel/kernel_accumulate.h"
 #include "kernel/kernel_shader.h"
 #include "kernel/kernel_light.h"
+#include "kernel/kernel_adaptive_sampling.h"
 #include "kernel/kernel_passes.h"
 
 #if defined(__VOLUME__) || defined(__SUBSURFACE__)
@@ -48,6 +50,7 @@
 #include "kernel/kernel_path_surface.h"
 #include "kernel/kernel_path_volume.h"
 #include "kernel/kernel_path_subsurface.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
@@ -168,19 +171,19 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *k
   Ray volume_ray = *ray;
   volume_ray.t = (hit) ? isect->t : FLT_MAX;
 
-  bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+  float step_size = volume_stack_step_size(kg, state->volume_stack);
 
 #    ifdef __VOLUME_DECOUPLED__
   int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
   bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
-  bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+  bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method);
 
   if (decoupled) {
     /* cache steps along volume for repeated sampling */
     VolumeSegment volume_segment;
 
     shader_setup_from_volume(kg, sd, &volume_ray);
-    kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, heterogeneous);
+    kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
 
     volume_segment.sampling_method = sampling_method;
 
@@ -226,7 +229,7 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *k
   {
     /* integrate along volume segment with distance sampling */
     VolumeIntegrateResult result = kernel_volume_integrate(
-        kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+        kg, state, sd, &volume_ray, L, throughput, step_size);
 
 #    ifdef __VOLUME_SCATTER__
     if (result == VOLUME_PATH_SCATTERED) {
@@ -656,6 +659,14 @@ ccl_device void kernel_path_trace(
 
   buffer += index * pass_stride;
 
+  if (kernel_data.film.pass_adaptive_aux_buffer) {
+    ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+                                                   kernel_data.film.pass_adaptive_aux_buffer);
+    if (aux->w > 0.0f) {
+      return;
+    }
+  }
+
   /* Initialize random numbers and sample ray. */
   uint rng_hash;
   Ray ray;
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index f75e4ab4c97..337c4fb1d10 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -91,7 +91,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
   Ray volume_ray = *ray;
   volume_ray.t = (hit) ? isect->t : FLT_MAX;
 
-  bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+  float step_size = volume_stack_step_size(kg, state->volume_stack);
 
 #      ifdef __VOLUME_DECOUPLED__
   /* decoupled ray marching only supported on CPU */
@@ -100,7 +100,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
     VolumeSegment volume_segment;
 
     shader_setup_from_volume(kg, sd, &volume_ray);
-    kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, heterogeneous);
+    kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
 
     /* direct light sampling */
     if (volume_segment.closure_flag & SD_SCATTER) {
@@ -171,7 +171,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
       path_state_branch(&ps, j, num_samples);
 
       VolumeIntegrateResult result = kernel_volume_integrate(
-          kg, &ps, sd, &volume_ray, L, &tp, heterogeneous);
+          kg, &ps, sd, &volume_ray, L, &tp, step_size);
 
 #      ifdef __VOLUME_SCATTER__
       if (result == VOLUME_PATH_SCATTERED) {
@@ -523,6 +523,14 @@ ccl_device void kernel_branched_path_trace(
 
   buffer += index * pass_stride;
 
+  if (kernel_data.film.pass_adaptive_aux_buffer) {
+    ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+                                                   kernel_data.film.pass_adaptive_aux_buffer);
+    if (aux->w > 0.0f) {
+      return;
+    }
+  }
+
   /* initialize random numbers and ray */
   uint rng_hash;
   Ray ray;
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 8735e3208db..c389c815ae2 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -41,9 +41,11 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
   if (kernel_data.film.pass_denoising_data) {
     state->flag |= PATH_RAY_STORE_SHADOW_INFO;
     state->denoising_feature_weight = 1.0f;
+    state->denoising_feature_throughput = make_float3(1.0f, 1.0f, 1.0f);
   }
   else {
     state->denoising_feature_weight = 0.0f;
+    state->denoising_feature_throughput = make_float3(0.0f, 0.0f, 0.0f);
   }
 #endif /* __DENOISING_FEATURES__ */
 
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 80738213d2a..f4c3b36e778 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -43,7 +43,7 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
   uint i = index + SOBOL_SKIP;
   for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
     j += x;
-    result ^= kernel_tex_fetch(__sobol_directions, 32 * dimension + j - 1);
+    result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1);
   }
   return result;
 }
@@ -56,7 +56,9 @@ ccl_device_forceinline float path_rng_1D(
 #ifdef __DEBUG_CORRELATION__
   return (float)drand48();
 #endif
-
+  if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
+    return pmj_sample_1D(kg, sample, rng_hash, dimension);
+  }
 #ifdef __CMJ__
 #  ifdef __SOBOL__
   if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
@@ -99,7 +101,10 @@ ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
   *fy = (float)drand48();
   return;
 #endif
-
+  if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
+    pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy);
+    return;
+  }
 #ifdef __CMJ__
 #  ifdef __SOBOL__
   if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
@@ -284,4 +289,31 @@ ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
   return (float)*rng * (1.0f / (float)0xFFFFFFFF);
 }
 
+ccl_device_inline bool sample_is_even(int pattern, int sample)
+{
+  if (pattern == SAMPLING_PATTERN_PMJ) {
+    /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
+     * We can use this to divide the sample sequence into two classes for easier variance
+     * estimation. */
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
+    return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
+#elif defined(__NVCC__)
+    return __popc(sample & 0xaaaaaaaa) & 1;
+#elif defined(__KERNEL_OPENCL__)
+    return popcount(sample & 0xaaaaaaaa) & 1;
+#else
+    /* TODO(Stefan): popcnt intrinsic for Windows with fallback for older CPUs. */
+    uint i = sample & 0xaaaaaaaa; /* Unsigned: the multiply below must wrap, not overflow. */
+    i = i - ((i >> 1) & 0x55555555);
+    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+    i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+    return i & 1;
+#endif
+  }
+  else {
+    /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
+    return sample & 0x1; /* NOTE: despite the name, true for odd sample indices. */
+  }
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index d03faff4242..9700aaba80f 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -23,10 +23,12 @@
  * Release.
  */
 
+// clang-format off
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/bsdf_util.h"
 #include "kernel/closure/bsdf.h"
 #include "kernel/closure/emissive.h"
+// clang-format on
 
 #include "kernel/svm/svm.h"
 
@@ -901,7 +903,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
   for (int i = 0; i < sd->num_closure; i++) {
     ShaderClosure *sc = &sd->closure[i];
 
-    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) ||
+        CLOSURE_IS_BSDF_BSSRDF(sc->type))
       eval += sc->weight;
   }
 
@@ -936,20 +939,6 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
   return eval;
 }
 
-ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
-{
-  float3 eval = make_float3(0.0f, 0.0f, 0.0f);
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
 ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
 {
   float3 N = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 23e30db1b08..ed8572467ea 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -428,12 +428,17 @@ ccl_device_noinline
     hit = (ss_isect->num_hits > 0);
 
     if (hit) {
+#ifdef __KERNEL_OPTIX__
+      /* t is always in world space with OptiX. */
+      t = ss_isect->hits[0].t;
+#else
       /* Compute world space distance to surface hit. */
       float3 D = ray->D;
       object_inverse_dir_transform(kg, sd, &D);
       D = normalize(D) * ss_isect->hits[0].t;
       object_dir_transform(kg, sd, &D);
       t = len(D);
+#endif
     }
 
     /* Advance to new scatter location. */
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 9eaa6b5516e..c8e01677d09 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -35,6 +35,7 @@ KERNEL_TEX(KernelObject, __objects)
 KERNEL_TEX(Transform, __object_motion_pass)
 KERNEL_TEX(DecomposedTransform, __object_motion)
 KERNEL_TEX(uint, __object_flag)
+KERNEL_TEX(float, __object_volume_step)
 
 /* cameras */
 KERNEL_TEX(DecomposedTransform, __camera_motion)
@@ -77,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders)
 KERNEL_TEX(float, __lookup_table)
 
 /* sobol */
-KERNEL_TEX(uint, __sobol_directions)
+KERNEL_TEX(uint, __sample_pattern_lut)
 
 /* image textures */
 KERNEL_TEX(TextureInfo, __texture_info)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index c35e345763a..b6d319311a1 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -63,6 +63,11 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE 32
 
+/* Adaptive sampling constants */
+#define ADAPTIVE_SAMPLE_STEP 4
+static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0,
+              "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work");
+
 /* Split kernel constants */
 #define WORK_POOL_SIZE_GPU 64
 #define WORK_POOL_SIZE_CPU 1
@@ -106,8 +111,6 @@ CCL_NAMESPACE_BEGIN
 #ifndef __KERNEL_AO_PREVIEW__
 #  define __SVM__
 #  define __EMISSION__
-#  define __TEXTURES__
-#  define __EXTRA_NODES__
 #  define __HOLDOUT__
 #  define __MULTI_CLOSURE__
 #  define __TRANSPARENT_SHADOWS__
@@ -220,7 +223,6 @@ typedef enum ShaderEvalType {
   SHADER_EVAL_DIFFUSE_COLOR,
   SHADER_EVAL_GLOSSY_COLOR,
   SHADER_EVAL_TRANSMISSION_COLOR,
-  SHADER_EVAL_SUBSURFACE_COLOR,
   SHADER_EVAL_EMISSION,
   SHADER_EVAL_AOV_COLOR,
   SHADER_EVAL_AOV_VALUE,
@@ -232,7 +234,6 @@ typedef enum ShaderEvalType {
   SHADER_EVAL_DIFFUSE,
   SHADER_EVAL_GLOSSY,
   SHADER_EVAL_TRANSMISSION,
-  SHADER_EVAL_SUBSURFACE,
 
   /* extra */
   SHADER_EVAL_ENVIRONMENT,
@@ -269,6 +270,7 @@ enum PathTraceDimension {
 enum SamplingPattern {
   SAMPLING_PATTERN_SOBOL = 0,
   SAMPLING_PATTERN_CMJ = 1,
+  SAMPLING_PATTERN_PMJ = 2,
 
   SAMPLING_NUM_PATTERNS,
 };
@@ -375,6 +377,8 @@ typedef enum PassType {
   PASS_CRYPTOMATTE,
   PASS_AOV_COLOR,
   PASS_AOV_VALUE,
+  PASS_ADAPTIVE_AUX_BUFFER,
+  PASS_SAMPLE_COUNT,
   PASS_CATEGORY_MAIN_END = 31,
 
   PASS_MIST = 32,
@@ -392,10 +396,7 @@ typedef enum PassType {
   PASS_TRANSMISSION_DIRECT,
   PASS_TRANSMISSION_INDIRECT,
   PASS_TRANSMISSION_COLOR,
-  PASS_SUBSURFACE_DIRECT,
-  PASS_SUBSURFACE_INDIRECT,
-  PASS_SUBSURFACE_COLOR,
-  PASS_VOLUME_DIRECT,
+  PASS_VOLUME_DIRECT = 50,
   PASS_VOLUME_INDIRECT,
   /* No Scatter color since it's tricky to define what it would even mean. */
   PASS_CATEGORY_LIGHT_END = 63,
@@ -445,23 +446,20 @@ typedef enum eBakePassFilter {
   BAKE_FILTER_DIFFUSE = (1 << 3),
   BAKE_FILTER_GLOSSY = (1 << 4),
   BAKE_FILTER_TRANSMISSION = (1 << 5),
-  BAKE_FILTER_SUBSURFACE = (1 << 6),
-  BAKE_FILTER_EMISSION = (1 << 7),
-  BAKE_FILTER_AO = (1 << 8),
+  BAKE_FILTER_EMISSION = (1 << 6),
+  BAKE_FILTER_AO = (1 << 7),
 } eBakePassFilter;
 
 typedef enum BakePassFilterCombos {
   BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE |
-                          BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_SUBSURFACE |
-                          BAKE_FILTER_EMISSION | BAKE_FILTER_AO),
+                          BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION |
+                          BAKE_FILTER_AO),
   BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE),
   BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY),
   BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION),
-  BAKE_FILTER_SUBSURFACE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_SUBSURFACE),
   BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE),
   BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY),
   BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION),
-  BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
 } BakePassFilterCombos;
 
 typedef enum DenoiseFlag {
@@ -471,9 +469,7 @@ typedef enum DenoiseFlag {
   DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
   DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
   DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
-  DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
-  DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
-  DENOISING_CLEAN_ALL_PASSES = (1 << 8) - 1,
+  DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1,
 } DenoiseFlag;
 
 #ifdef __KERNEL_DEBUG__
@@ -493,8 +489,7 @@ typedef ccl_addr_space struct PathRadianceState {
   float3 diffuse;
   float3 glossy;
   float3 transmission;
-  float3 subsurface;
-  float3 scatter;
+  float3 volume;
 
   float3 direct;
 #endif
@@ -517,19 +512,16 @@ typedef ccl_addr_space struct PathRadiance {
   float3 color_diffuse;
   float3 color_glossy;
   float3 color_transmission;
-  float3 color_subsurface;
 
   float3 direct_diffuse;
   float3 direct_glossy;
   float3 direct_transmission;
-  float3 direct_subsurface;
-  float3 direct_scatter;
+  float3 direct_volume;
 
   float3 indirect_diffuse;
   float3 indirect_glossy;
   float3 indirect_transmission;
-  float3 indirect_subsurface;
-  float3 indirect_scatter;
+  float3 indirect_volume;
 
   float4 shadow;
   float mist;
@@ -583,8 +575,7 @@ typedef struct BsdfEval {
   float3 glossy;
   float3 transmission;
   float3 transparent;
-  float3 subsurface;
-  float3 scatter;
+  float3 volume;
 #endif
 #ifdef __SHADOW_TRICKS__
   float3 sum_no_mis;
@@ -725,8 +716,7 @@ typedef enum PrimitiveType {
 /* Attributes */
 
 typedef enum AttributePrimitive {
-  ATTR_PRIM_TRIANGLE = 0,
-  ATTR_PRIM_CURVE,
+  ATTR_PRIM_GEOMETRY = 0,
   ATTR_PRIM_SUBD,
 
   ATTR_PRIM_TYPES
@@ -754,6 +744,7 @@ typedef enum AttributeStandard {
   ATTR_STD_UV,
   ATTR_STD_UV_TANGENT,
   ATTR_STD_UV_TANGENT_SIGN,
+  ATTR_STD_VERTEX_COLOR,
   ATTR_STD_GENERATED,
   ATTR_STD_GENERATED_TRANSFORM,
   ATTR_STD_POSITION_UNDEFORMED,
@@ -894,13 +885,13 @@ enum ShaderDataFlag {
   SD_HAS_DISPLACEMENT = (1 << 26),
   /* Has constant emission (value stored in __shaders) */
   SD_HAS_CONSTANT_EMISSION = (1 << 27),
-  /* Needs to access attributes */
-  SD_NEED_ATTRIBUTES = (1 << 28),
+  /* Needs to access attributes for volume rendering */
+  SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
 
   SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
                      SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
                      SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT |
-                     SD_HAS_CONSTANT_EMISSION | SD_NEED_ATTRIBUTES)
+                     SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES)
 };
 
 /* Object flags. */
@@ -1057,6 +1048,7 @@ typedef struct PathState {
 
 #ifdef __DENOISING_FEATURES__
   float denoising_feature_weight;
+  float3 denoising_feature_throughput;
 #endif /* __DENOISING_FEATURES__ */
 
   /* multiple importance sampling */
@@ -1213,18 +1205,15 @@ typedef struct KernelFilm {
   int pass_diffuse_color;
   int pass_glossy_color;
   int pass_transmission_color;
-  int pass_subsurface_color;
 
   int pass_diffuse_indirect;
   int pass_glossy_indirect;
   int pass_transmission_indirect;
-  int pass_subsurface_indirect;
   int pass_volume_indirect;
 
   int pass_diffuse_direct;
   int pass_glossy_direct;
   int pass_transmission_direct;
-  int pass_subsurface_direct;
   int pass_volume_direct;
 
   int pass_emission;
@@ -1239,6 +1228,9 @@ typedef struct KernelFilm {
   int cryptomatte_depth;
   int pass_cryptomatte;
 
+  int pass_adaptive_aux_buffer;
+  int pass_sample_count;
+
   int pass_mist;
   float mist_start;
   float mist_inv_depth;
@@ -1251,7 +1243,6 @@ typedef struct KernelFilm {
   int pass_aov_color;
   int pass_aov_value;
   int pad1;
-  int pad2;
 
   /* XYZ to rendering color space transform. float4 instead of float3 to
    * ensure consistent padding/alignment across devices. */
@@ -1273,6 +1264,8 @@ typedef struct KernelFilm {
   int display_divide_pass_stride;
   int use_display_exposure;
   int use_display_pass_alpha;
+
+  int pad3, pad4, pad5;
 } KernelFilm;
 static_assert_align(KernelFilm, 16);
 
@@ -1280,6 +1273,7 @@ typedef struct KernelBackground {
   /* only shader index */
   int surface_shader;
   int volume_shader;
+  float volume_step_size;
   int transparent;
   float transparent_roughness_squared_threshold;
 
@@ -1287,7 +1281,6 @@ typedef struct KernelBackground {
   float ao_factor;
   float ao_distance;
   float ao_bounces_factor;
-  float ao_pad;
 } KernelBackground;
 static_assert_align(KernelBackground, 16);
 
@@ -1354,18 +1347,20 @@ typedef struct KernelIntegrator {
   /* sampler */
   int sampling_pattern;
   int aa_samples;
+  int adaptive_min_samples;
+  float adaptive_threshold;
 
   /* volume render */
   int use_volumes;
   int volume_max_steps;
-  float volume_step_size;
+  float volume_step_rate;
   int volume_samples;
 
   int start_sample;
 
   int max_closures;
 
-  int pad1;
+  int pad1, pad2, pad3;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1679,12 +1674,16 @@ typedef struct WorkTile {
   uint start_sample;
   uint num_samples;
 
-  uint offset;
+  int offset;
   uint stride;
 
   ccl_global float *buffer;
 } WorkTile;
 
+/* Precomputed sample table sizes for PMJ02 sampler. */
+#define NUM_PMJ_SAMPLES (64 * 64)
+#define NUM_PMJ_PATTERNS 48
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index f443bb88463..b4f9d2186f4 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -48,7 +48,8 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
   shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
 
   if (sd->flag & SD_EXTINCTION) {
-    *extinction = sd->closure_transparent_extinction;
+    const float density = object_volume_density(kg, sd->object);
+    *extinction = sd->closure_transparent_extinction * density;
     return true;
   }
   else {
@@ -84,6 +85,11 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
     }
   }
 
+  const float density = object_volume_density(kg, sd->object);
+  coeff->sigma_s *= density;
+  coeff->sigma_t *= density;
+  coeff->emission *= density;
+
   return true;
 }
 
@@ -101,15 +107,19 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
 
 #ifdef __VOLUME__
 
-ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
+ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
 {
+  float step_size = FLT_MAX;
+
   for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
     int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
 
+    bool heterogeneous = false;
+
     if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
-      return true;
+      heterogeneous = true;
     }
-    else if (shader_flag & SD_NEED_ATTRIBUTES) {
+    else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
       /* We want to render world or objects without any volume grids
        * as homogeneous, but can only verify this at run-time since other
        * heterogeneous volume objects may be using the same shader. */
@@ -117,13 +127,19 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space
       if (object != OBJECT_NONE) {
         int object_flag = kernel_tex_fetch(__object_flag, object);
         if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
-          return true;
+          heterogeneous = true;
         }
       }
     }
+
+    if (heterogeneous) {
+      float object_step_size = object_volume_step_size(kg, stack[i].object);
+      object_step_size *= kernel_data.integrator.volume_step_rate;
+      step_size = fminf(object_step_size, step_size);
+    }
   }
 
-  return false;
+  return step_size;
 }
 
 ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
@@ -158,12 +174,13 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 
 ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
                                                ccl_addr_space PathState *state,
+                                               const float object_step_size,
                                                float t,
                                                float *step_size,
                                                float *step_offset)
 {
   const int max_steps = kernel_data.integrator.volume_max_steps;
-  float step = min(kernel_data.integrator.volume_step_size, t);
+  float step = min(object_step_size, t);
 
   /* compute exact steps in advance for malloc */
   if (t > max_steps * step) {
@@ -199,7 +216,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
                                                    ccl_addr_space PathState *state,
                                                    Ray *ray,
                                                    ShaderData *sd,
-                                                   float3 *throughput)
+                                                   float3 *throughput,
+                                                   const float object_step_size)
 {
   float3 tp = *throughput;
   const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -207,7 +225,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
   /* prepare for stepping */
   int max_steps = kernel_data.integrator.volume_max_steps;
   float step_offset, step_size;
-  kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+  kernel_volume_step_init(kg, state, object_step_size, ray->t, &step_size, &step_offset);
 
   /* compute extinction at the start */
   float t = 0.0f;
@@ -264,8 +282,9 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
 {
   shader_setup_from_volume(kg, shadow_sd, ray);
 
-  if (volume_stack_is_heterogeneous(kg, state->volume_stack))
-    kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput);
+  float step_size = volume_stack_step_size(kg, state->volume_stack);
+  if (step_size != FLT_MAX)
+    kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size);
   else
     kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
 }
@@ -533,7 +552,8 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
                                                Ray *ray,
                                                ShaderData *sd,
                                                PathRadiance *L,
-                                               ccl_addr_space float3 *throughput)
+                                               ccl_addr_space float3 *throughput,
+                                               const float object_step_size)
 {
   float3 tp = *throughput;
   const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -541,7 +561,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
   /* prepare for stepping */
   int max_steps = kernel_data.integrator.volume_max_steps;
   float step_offset, step_size;
-  kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+  kernel_volume_step_init(kg, state, object_step_size, ray->t, &step_size, &step_offset);
 
   /* compute coefficients at the start */
   float t = 0.0f;
@@ -679,12 +699,13 @@ kernel_volume_integrate(KernelGlobals *kg,
                         Ray *ray,
                         PathRadiance *L,
                         ccl_addr_space float3 *throughput,
-                        bool heterogeneous)
+                        float step_size)
 {
   shader_setup_from_volume(kg, sd, ray);
 
-  if (heterogeneous)
-    return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput);
+  if (step_size != FLT_MAX)
+    return kernel_volume_integrate_heterogeneous_distance(
+        kg, state, ray, sd, L, throughput, step_size);
   else
     return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
 }
@@ -735,7 +756,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
                                                Ray *ray,
                                                ShaderData *sd,
                                                VolumeSegment *segment,
-                                               bool heterogeneous)
+                                               const float object_step_size)
 {
   const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
@@ -743,9 +764,9 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
   int max_steps;
   float step_size, step_offset;
 
-  if (heterogeneous) {
+  if (object_step_size != FLT_MAX) {
     max_steps = kernel_data.integrator.volume_max_steps;
-    kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset);
+    kernel_volume_step_init(kg, state, object_step_size, ray->t, &step_size, &step_offset);
 
 #      ifdef __KERNEL_CPU__
     /* NOTE: For the branched path tracing it's possible to have direct
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 799561a7466..c642d227e4b 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN
  * Utility functions for work stealing
  */
 
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
+{
+#ifdef __KERNEL_CUDA__
+  /* Keeping threads for the same pixel together improves performance on CUDA. */
+  uint sample_offset = global_work_index % tile->num_samples;
+  uint pixel_offset = global_work_index / tile->num_samples;
+#else  /* __KERNEL_CUDA__ */
+  uint tile_pixels = tile->w * tile->h;
+  uint sample_offset = global_work_index / tile_pixels;
+  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+#endif /* __KERNEL_CUDA__ */
+  uint y_offset = pixel_offset / tile->w;
+  uint x_offset = pixel_offset - y_offset * tile->w;
+
+  *x = tile->x + x_offset;
+  *y = tile->y + y_offset;
+  *sample = tile->start_sample + sample_offset;
+}
+
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
 #ifdef __SPLIT_KERNEL__
 /* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg,
-                              ccl_global uint *work_pools,
-                              uint total_work_size,
-                              uint ray_index,
-                              ccl_private uint *global_work_index)
+ccl_device bool get_next_work_item(KernelGlobals *kg,
+                                   ccl_global uint *work_pools,
+                                   uint total_work_size,
+                                   uint ray_index,
+                                   ccl_private uint *global_work_index)
 {
   /* With a small amount of work there may be more threads than work due to
    * rounding up of global size, stop such threads immediately. */
@@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg,
   /* Test if all work for this pool is done. */
   return (*global_work_index < total_work_size);
 }
-#endif
 
-/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
-                                      uint global_work_index,
-                                      ccl_private uint *x,
-                                      ccl_private uint *y,
-                                      ccl_private uint *sample)
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              ccl_global uint *work_pools,
+                              uint total_work_size,
+                              uint ray_index,
+                              ccl_private uint *global_work_index)
 {
-#ifdef __KERNEL_CUDA__
-  /* Keeping threads for the same pixel together improves performance on CUDA. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#else  /* __KERNEL_CUDA__ */
-  uint tile_pixels = tile->w * tile->h;
-  uint sample_offset = global_work_index / tile_pixels;
-  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
-  uint y_offset = pixel_offset / tile->w;
-  uint x_offset = pixel_offset - y_offset * tile->w;
-
-  *x = tile->x + x_offset;
-  *y = tile->y + y_offset;
-  *sample = tile->start_sample + sample_offset;
+  bool got_work = false;
+  if (kernel_data.film.pass_adaptive_aux_buffer) {
+    do {
+      got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+      if (got_work) {
+        ccl_global WorkTile *tile = &kernel_split_params.tile;
+        uint x, y, sample;
+        get_work_pixel(tile, *global_work_index, &x, &y, &sample);
+        uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
+        ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+        ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+                                                       kernel_data.film.pass_adaptive_aux_buffer);
+        if (aux->w == 0.0f) {
+          break;
+        }
+      }
+    } while (got_work);
+  }
+  else {
+    got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+  }
+  return got_work;
 }
+#endif
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index f2146302a27..8829a14ead5 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -72,7 +72,7 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t s
     assert(0);
 }
 
-void kernel_tex_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
 {
   if (0) {
   }
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index f5d981fb71a..683f4b88d79 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,5 +89,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
 DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
 DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
 
 #undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index 8f311baf010..f87501db258 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -474,7 +474,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
 {
   const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
-  switch (kernel_tex_type(id)) {
+  switch (info.data_type) {
     case IMAGE_DATA_TYPE_HALF:
       return TextureInterpolator<half>::interp(info, x, y);
     case IMAGE_DATA_TYPE_BYTE:
@@ -498,28 +498,34 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
   }
 }
 
-ccl_device float4 kernel_tex_image_interp_3d(
-    KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+                                             int id,
+                                             float3 P,
+                                             InterpolationType interp)
 {
   const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
-  switch (kernel_tex_type(id)) {
+  if (info.use_transform_3d) {
+    P = transform_point(&info.transform_3d, P);
+  }
+
+  switch (info.data_type) {
     case IMAGE_DATA_TYPE_HALF:
-      return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_BYTE:
-      return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_USHORT:
-      return TextureInterpolator<uint16_t>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_FLOAT:
-      return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_HALF4:
-      return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_BYTE4:
-      return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_USHORT4:
-      return TextureInterpolator<ushort4>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp);
     case IMAGE_DATA_TYPE_FLOAT4:
-      return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
+      return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp);
     default:
       assert(0);
       return make_float4(
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 9ca3f46b5b6..091e53cfd83 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -20,6 +20,7 @@
  * simply includes this file without worry of copying actual implementation over.
  */
 
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 
 #ifndef KERNEL_STUB
@@ -58,6 +59,10 @@
 #    include "kernel/split/kernel_next_iteration_setup.h"
 #    include "kernel/split/kernel_indirect_subsurface.h"
 #    include "kernel/split/kernel_buffer_update.h"
+#    include "kernel/split/kernel_adaptive_stopping.h"
+#    include "kernel/split/kernel_adaptive_filter_x.h"
+#    include "kernel/split/kernel_adaptive_filter_y.h"
+#    include "kernel/split/kernel_adaptive_adjust_samples.h"
 #  endif /* __SPLIT_KERNEL__ */
 #else
 #  define STUB_ASSERT(arch, name) \
@@ -67,6 +72,7 @@
 #    include "kernel/split/kernel_data_init.h"
 #  endif /* __SPLIT_KERNEL__ */
 #endif   /* KERNEL_STUB */
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
@@ -204,6 +210,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
 #endif   /* __SPLIT_KERNEL__ */
 
 #undef KERNEL_STUB
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index fbb773533ce..22fd5ea5634 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -57,9 +57,9 @@ kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int st
 		if (num_inputs > 0) {
 			float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
 			float *out = rgb + (x + y * sw) * 3;
-			out[0] = in[0];
-			out[1] = in[1];
-			out[2] = in[2];
+			out[0] = clamp(in[0], 0.0f, 10000.0f);
+			out[1] = clamp(in[1], 0.0f, 10000.0f);
+			out[2] = clamp(in[2], 0.0f, 10000.0f);
 		}
 		if (num_inputs > 1) {
 			float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index af311027f78..c4c810c6a82 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -33,6 +33,7 @@
 #include "kernel/kernel_path_branched.h"
 #include "kernel/kernel_bake.h"
 #include "kernel/kernel_work_stealing.h"
+#include "kernel/kernel_adaptive_sampling.h"
 
 /* kernels */
 extern "C" __global__ void
@@ -83,6 +84,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
+{
+	int work_index = ccl_global_id(0);
+	bool thread_is_active = work_index < total_work_size;
+	KernelGlobals kg;
+	if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
+		uint x = tile->x + work_index % tile->w;
+		uint y = tile->y + work_index / tile->w;
+		int index = tile->offset + x + y * tile->stride;
+		ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+		kernel_do_adaptive_stopping(&kg, buffer, sample);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
+{
+	KernelGlobals kg;
+	if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+		if(ccl_global_id(0) < tile->h) {
+			int y = tile->y + ccl_global_id(0);
+			kernel_do_adaptive_filter_x(&kg, y, tile);
+		}
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
+{
+	KernelGlobals kg;
+	if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+		if(ccl_global_id(0) < tile->w) {
+			int x = tile->x + ccl_global_id(0);
+			kernel_do_adaptive_filter_y(&kg, x, tile);
+		}
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
+{
+	if(kernel_data.film.pass_adaptive_aux_buffer) {
+		int work_index = ccl_global_id(0);
+		bool thread_is_active = work_index < total_work_size;
+		KernelGlobals kg;
+		if(thread_is_active) {
+			uint x = tile->x + work_index % tile->w;
+			uint y = tile->y + work_index / tile->w;
+			int index = tile->offset + x + y * tile->stride;
+			ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+			if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+				buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+				float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
+				if(sample_multiplier != 1.0f) {
+					kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
+				}
+			}
+			else {
+				kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
+			}
+		}
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index 7c68f08ea10..1d425d132a1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -124,7 +124,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
   CUtexObject tex = (CUtexObject)info.data;
 
   /* float4, byte4, ushort4 and half4 */
-  const int texture_type = kernel_tex_type(id);
+  const int texture_type = info.data_type;
   if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
       texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
     if (info.interpolation == INTERPOLATION_CUBIC) {
@@ -149,14 +149,25 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
   }
 }
 
-ccl_device float4 kernel_tex_image_interp_3d(
-    KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+                                             int id,
+                                             float3 P,
+                                             InterpolationType interp)
 {
   const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+  if (info.use_transform_3d) {
+    P = transform_point(&info.transform_3d, P);
+  }
+
+  const float x = P.x;
+  const float y = P.y;
+  const float z = P.z;
+
   CUtexObject tex = (CUtexObject)info.data;
   uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
 
-  const int texture_type = kernel_tex_type(id);
+  const int texture_type = info.data_type;
   if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
       texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
     if (interpolation == INTERPOLATION_CUBIC) {
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 43b3d0aa0e6..95ad7599cf1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -43,6 +43,10 @@
 #include "kernel/split/kernel_next_iteration_setup.h"
 #include "kernel/split/kernel_indirect_subsurface.h"
 #include "kernel/split/kernel_buffer_update.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
 
 #include "kernel/kernel_film.h"
 
@@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
new file mode 100644
index 00000000000..ebdb99d4730
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
+
+#define KERNEL_NAME adaptive_adjust_samples
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
new file mode 100644
index 00000000000..76d82d4184e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+
+#define KERNEL_NAME adaptive_filter_x
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
new file mode 100644
index 00000000000..1e6d15ba0f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+
+#define KERNEL_NAME adaptive_filter_y
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
new file mode 100644
index 00000000000..51de0059667
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+
+#define KERNEL_NAME adaptive_stopping
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index b6390679331..89fcb0ae60f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -47,7 +47,7 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg,
                                                 int id,
                                                 int offset)
 {
-  const int texture_type = kernel_tex_type(id);
+  const int texture_type = info->data_type;
 
   /* Float4 */
   if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
@@ -202,11 +202,19 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
   }
 }
 
-ccl_device float4
-kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp)
 {
   const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
 
+  if (info->use_transform_3d) {
+    Transform tfm = info->transform_3d;
+    P = transform_point(&tfm, P);
+  }
+
+  const float x = P.x;
+  const float y = P.y;
+  const float z = P.z;
+
   if (info->extension == EXTENSION_CLIP) {
     if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
index 6041f13b52b..c3b7b09460a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
@@ -28,3 +28,7 @@
 #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
 #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
 #include "kernel/kernels/opencl/kernel_buffer_update.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 5be5bd181ec..fc0c845fd4f 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -33,6 +33,9 @@ set(LIB
   ${LLVM_LIBRARY}
 )
 
+# OSL and LLVM are built without RTTI
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index b395227845d..3f9de5ab33d 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -36,9 +36,11 @@
 
 #include "kernel/osl/osl_closures.h"
 
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/emissive.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index c5edc7c9be3..76a2e41abfa 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -37,10 +37,12 @@
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/osl/osl_closures.h"
 
+// clang-format off
 #include "kernel/kernel_types.h"
 #include "kernel/kernel_montecarlo.h"
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/bsdf_diffuse_ramp.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index 4b7e59ff932..b78dc8a3a67 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -37,9 +37,11 @@
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/osl/osl_closures.h"
 
+// clang-format off
 #include "kernel/kernel_types.h"
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/bsdf_phong_ramp.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index c29ddb13e2e..d656723bac2 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -36,10 +36,12 @@
 
 #include "kernel/osl/osl_closures.h"
 
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/kernel_types.h"
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/emissive.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index dd52c33071c..c5ca8616fbd 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -35,6 +35,7 @@
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/osl/osl_closures.h"
 
+// clang-format off
 #include "kernel/kernel_types.h"
 #include "kernel/kernel_montecarlo.h"
 
@@ -43,6 +44,7 @@
 #include "kernel/closure/bsdf_diffuse.h"
 #include "kernel/closure/bsdf_principled_diffuse.h"
 #include "kernel/closure/bssrdf.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 463a65f21a0..ea5e00ec23c 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -39,6 +39,7 @@
 #include "util/util_math.h"
 #include "util/util_param.h"
 
+// clang-format off
 #include "kernel/kernel_types.h"
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/split/kernel_split_data_types.h"
@@ -63,6 +64,7 @@
 #include "kernel/closure/bsdf_principled_diffuse.h"
 #include "kernel/closure/bsdf_principled_sheen.h"
 #include "kernel/closure/volume.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index d3db6b71f5c..d12afdb80dd 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -33,12 +33,12 @@
 #ifndef __OSL_CLOSURES_H__
 #define __OSL_CLOSURES_H__
 
-#include "util/util_types.h"
 #include "kernel/kernel_types.h"
+#include "util/util_types.h"
 
+#include <OSL/genclosure.h>
 #include <OSL/oslclosure.h>
 #include <OSL/oslexec.h>
-#include <OSL/genclosure.h>
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 0e6c8d21534..c06c9abd4c1 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -27,8 +27,8 @@
 #  include "util/util_map.h"
 #  include "util/util_param.h"
 #  include "util/util_thread.h"
-#  include "util/util_vector.h"
 #  include "util/util_unique_ptr.h"
+#  include "util/util_vector.h"
 
 #  ifndef WIN32
 using std::isfinite;
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 767bd7702ae..2857de533f3 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -39,6 +39,7 @@
 #include "util/util_logging.h"
 #include "util/util_string.h"
 
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/split/kernel_split_data_types.h"
 #include "kernel/kernel_globals.h"
@@ -56,6 +57,7 @@
 #include "kernel/kernel_projection.h"
 #include "kernel/kernel_accumulate.h"
 #include "kernel/kernel_shader.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
@@ -1220,8 +1222,8 @@ bool OSLRenderServices::texture3d(ustring filename,
       ShaderData *sd = (ShaderData *)(sg->renderstate);
       KernelGlobals *kernel_globals = sd->osl_globals;
       int slot = handle->svm_slot;
-      float4 rgba = kernel_tex_image_interp_3d(
-          kernel_globals, slot, P.x, P.y, P.z, INTERPOLATION_NONE);
+      float3 P_float3 = make_float3(P.x, P.y, P.z);
+      float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
 
       result[0] = rgba[0];
       if (nchannels > 1)
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 469c5188730..d32dace23bf 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -25,8 +25,8 @@
  * attributes.
  */
 
-#include <OSL/oslexec.h>
 #include <OSL/oslclosure.h>
+#include <OSL/oslexec.h>
 
 #ifdef WITH_PTEX
 class PtexCache;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index db5ad06d3fc..2318813949e 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -16,6 +16,7 @@
 
 #include <OSL/oslexec.h>
 
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/kernel_montecarlo.h"
 #include "kernel/kernel_types.h"
@@ -28,6 +29,7 @@
 #include "kernel/osl/osl_globals.h"
 #include "kernel/osl/osl_services.h"
 #include "kernel/osl/osl_shader.h"
+// clang-format on
 
 #include "util/util_foreach.h"
 
@@ -382,10 +384,6 @@ int OSLShader::find_attribute(KernelGlobals *kg,
 {
   /* for OSL, a hash map is used to lookup the attribute by name. */
   int object = sd->object * ATTR_PRIM_TYPES;
-#ifdef __HAIR__
-  if (sd->type & PRIMITIVE_ALL_CURVE)
-    object += ATTR_PRIM_CURVE;
-#endif
 
   OSLGlobals::AttributeMap &attr_map = kg->osl->attribute_map[object];
   ustring stdname(std::string("geom:") +
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index f4258da70d3..9dcedc9ba19 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -78,6 +78,7 @@ set(SRC_OSL
   node_value.osl
   node_vector_curves.osl
   node_vector_math.osl
+  node_vector_rotate.osl
   node_vector_transform.osl
   node_velvet_bsdf.osl
   node_vertex_color.osl
@@ -95,16 +96,19 @@ set(SRC_OSL
   node_rgb_to_bw.osl
 )
 
+# The set of headers that OSL ships differs per release, so we cannot
+# hardcode it.
+file(GLOB SRC_OSL_HEADER_DIST ${OSL_SHADER_DIR}/*.h)
+
 set(SRC_OSL_HEADERS
   node_color.h
   node_fresnel.h
   node_hash.h
+  node_math.h
   node_noise.h
   node_ramp_util.h
-  stdosl.h
-  oslutil.h
-  vector2.h
-  vector4.h
+  stdcycles.h
+  ${SRC_OSL_HEADER_DIST}
 )
 
 set(SRC_OSO
@@ -119,7 +123,7 @@ foreach(_file ${SRC_OSL})
   string(REPLACE ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} _OSO_FILE ${_OSO_FILE})
   add_custom_command(
     OUTPUT ${_OSO_FILE}
-    COMMAND ${OSL_COMPILER} -q -O2  -I"${CMAKE_CURRENT_SOURCE_DIR}" -o ${_OSO_FILE} ${_OSL_FILE}
+    COMMAND ${OSL_COMPILER} -q -O2  -I"${CMAKE_CURRENT_SOURCE_DIR}" -I"${OSL_SHADER_DIR}" -o ${_OSO_FILE} ${_OSL_FILE}
     DEPENDS ${_OSL_FILE} ${SRC_OSL_HEADERS} ${OSL_COMPILER})
   list(APPEND SRC_OSO
     ${_OSO_FILE}
diff --git a/intern/cycles/kernel/shaders/node_absorption_volume.osl b/intern/cycles/kernel/shaders/node_absorption_volume.osl
index e99bd254666..37ccc4c969f 100644
--- a/intern/cycles/kernel/shaders/node_absorption_volume.osl
+++ b/intern/cycles/kernel/shaders/node_absorption_volume.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_absorption_volume(color Color = color(0.8, 0.8, 0.8),
                               float Density = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_add_closure.osl b/intern/cycles/kernel/shaders/node_add_closure.osl
index 077e2735e61..27ecc9ef0c2 100644
--- a/intern/cycles/kernel/shaders/node_add_closure.osl
+++ b/intern/cycles/kernel/shaders/node_add_closure.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_add_closure(closure color Closure1 = 0,
                         closure color Closure2 = 0,
diff --git a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
index 7bf28719e78..22d245d0698 100644
--- a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
+++ b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_ambient_occlusion(color ColorIn = color(1.0, 1.0, 1.0),
                               int samples = 16,
diff --git a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index 165c09eb8e0..739cd375ab2 100644
--- a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -13,8 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_anisotropic_bsdf(color Color = 0.0,
                              string distribution = "GGX",
diff --git a/intern/cycles/kernel/shaders/node_attribute.osl b/intern/cycles/kernel/shaders/node_attribute.osl
index 336543cc130..abec8ebfbf0 100644
--- a/intern/cycles/kernel/shaders/node_attribute.osl
+++ b/intern/cycles/kernel/shaders/node_attribute.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_attribute(string bump_offset = "center",
                       string name = "",
diff --git a/intern/cycles/kernel/shaders/node_background.osl b/intern/cycles/kernel/shaders/node_background.osl
index 6ded0d2c65c..3f45db751b3 100644
--- a/intern/cycles/kernel/shaders/node_background.osl
+++ b/intern/cycles/kernel/shaders/node_background.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_background(color Color = 0.8,
                        float Strength = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_bevel.osl b/intern/cycles/kernel/shaders/node_bevel.osl
index 189c20c52e7..e87ddab716d 100644
--- a/intern/cycles/kernel/shaders/node_bevel.osl
+++ b/intern/cycles/kernel/shaders/node_bevel.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_bevel(int samples = 4,
                   float Radius = 0.05,
diff --git a/intern/cycles/kernel/shaders/node_blackbody.osl b/intern/cycles/kernel/shaders/node_blackbody.osl
index 8a24bf1e28b..741efae755d 100644
--- a/intern/cycles/kernel/shaders/node_blackbody.osl
+++ b/intern/cycles/kernel/shaders/node_blackbody.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_blackbody(float Temperature = 1200.0, output color Color = 0.0)
 {
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index 30644ef2ff3..075a324c730 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 /* Brick */
 
diff --git a/intern/cycles/kernel/shaders/node_brightness.osl b/intern/cycles/kernel/shaders/node_brightness.osl
index 2defbc4b1db..019edfb79a3 100644
--- a/intern/cycles/kernel/shaders/node_brightness.osl
+++ b/intern/cycles/kernel/shaders/node_brightness.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_brightness(color ColorIn = 0.8,
                        float Bright = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_bump.osl b/intern/cycles/kernel/shaders/node_bump.osl
index 3697bb37fd9..811182f40b5 100644
--- a/intern/cycles/kernel/shaders/node_bump.osl
+++ b/intern/cycles/kernel/shaders/node_bump.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 /* "Bump Mapping Unparameterized Surfaces on the GPU"
  * Morten S. Mikkelsen, 2010 */
diff --git a/intern/cycles/kernel/shaders/node_camera.osl b/intern/cycles/kernel/shaders/node_camera.osl
index 833e9e775fe..45ca50c6e1e 100644
--- a/intern/cycles/kernel/shaders/node_camera.osl
+++ b/intern/cycles/kernel/shaders/node_camera.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_camera(output vector ViewVector = vector(0.0, 0.0, 0.0),
                    output float ViewZDepth = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl
index e05cf20099f..d6a30dbdb40 100644
--- a/intern/cycles/kernel/shaders/node_checker_texture.osl
+++ b/intern/cycles/kernel/shaders/node_checker_texture.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 /* Checker */
 
diff --git a/intern/cycles/kernel/shaders/node_clamp.osl b/intern/cycles/kernel/shaders/node_clamp.osl
index d689ba7f809..ce9392a0d98 100644
--- a/intern/cycles/kernel/shaders/node_clamp.osl
+++ b/intern/cycles/kernel/shaders/node_clamp.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_clamp(string type = "minmax",
                   float Value = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_combine_hsv.osl b/intern/cycles/kernel/shaders/node_combine_hsv.osl
index 1658cf3d774..05e502b5bc1 100644
--- a/intern/cycles/kernel/shaders/node_combine_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_combine_hsv.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_combine_hsv(float H = 0.0, float S = 0.0, float V = 0.0, output color Color = 0.8)
 {
diff --git a/intern/cycles/kernel/shaders/node_combine_rgb.osl b/intern/cycles/kernel/shaders/node_combine_rgb.osl
index aaa95e9c5af..036f371eb5c 100644
--- a/intern/cycles/kernel/shaders/node_combine_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_combine_rgb.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_combine_rgb(float R = 0.0, float G = 0.0, float B = 0.0, output color Image = 0.8)
 {
diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl
index 4ab49168704..4ebd86b605c 100644
--- a/intern/cycles/kernel/shaders/node_combine_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_combine_xyz(float X = 0.0, float Y = 0.0, float Z = 0.0, output vector Vector = 0.8)
 {
diff --git a/intern/cycles/kernel/shaders/node_convert_from_color.osl b/intern/cycles/kernel/shaders/node_convert_from_color.osl
index 7ea9a1e4fb3..c3f0e118844 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_color.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_color.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_color(color value_color = 0.0,
                                output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_float.osl b/intern/cycles/kernel/shaders/node_convert_from_float.osl
index 13b5dea0838..61a15a1c2b0 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_float.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_float.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_float(float value_float = 0.0,
                                output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_int.osl b/intern/cycles/kernel/shaders/node_convert_from_int.osl
index a59e025d822..2e6a99b2765 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_int.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_int.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_int(int value_int = 0,
                              output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_normal.osl b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
index 7bdd94d1941..64201d63190 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_normal.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_normal(normal value_normal = normal(0.0, 0.0, 0.0),
                                 output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_point.osl b/intern/cycles/kernel/shaders/node_convert_from_point.osl
index 79c1719e7a7..11d64f76d6f 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_point.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_point.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_point(point value_point = point(0.0, 0.0, 0.0),
                                output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_convert_from_string.osl b/intern/cycles/kernel/shaders/node_convert_from_string.osl
index 48d894a6b3e..b496c4e6d05 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_string.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_string.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_string(string value_string = "",
                                 output color value_color = color(0.0, 0.0, 0.0),
diff --git a/intern/cycles/kernel/shaders/node_convert_from_vector.osl b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
index 92ab2313bcb..820faabd32b 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_vector.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_convert_from_vector(vector value_vector = vector(0.0, 0.0, 0.0),
                                 output string value_string = "",
diff --git a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
index bd5554b838a..f5886f534eb 100644
--- a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_diffuse_bsdf(color Color = 0.8,
                          float Roughness = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_displacement.osl b/intern/cycles/kernel/shaders/node_displacement.osl
index a1f3b7b7737..44a4828d511 100644
--- a/intern/cycles/kernel/shaders/node_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_displacement.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_displacement(string space = "object",
                          float Height = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl
index 57973f57ac6..f289a9711d9 100644
--- a/intern/cycles/kernel/shaders/node_emission.osl
+++ b/intern/cycles/kernel/shaders/node_emission.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_emission(color Color = 0.8, float Strength = 1.0, output closure color Emission = 0)
 {
diff --git a/intern/cycles/kernel/shaders/node_environment_texture.osl b/intern/cycles/kernel/shaders/node_environment_texture.osl
index 43f607f7cb0..d04743eb368 100644
--- a/intern/cycles/kernel/shaders/node_environment_texture.osl
+++ b/intern/cycles/kernel/shaders/node_environment_texture.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_color.h"
+#include "stdcycles.h"
 
 vector environment_texture_direction_to_equirectangular(vector dir)
 {
diff --git a/intern/cycles/kernel/shaders/node_fresnel.osl b/intern/cycles/kernel/shaders/node_fresnel.osl
index 89250db40f3..cff084c344d 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.osl
+++ b/intern/cycles/kernel/shaders/node_fresnel.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_fresnel.h"
+#include "stdcycles.h"
 
 shader node_fresnel(float IOR = 1.45, normal Normal = N, output float Fac = 0.0)
 {
diff --git a/intern/cycles/kernel/shaders/node_gamma.osl b/intern/cycles/kernel/shaders/node_gamma.osl
index 9b9c17dc8af..0816df64fe8 100644
--- a/intern/cycles/kernel/shaders/node_gamma.osl
+++ b/intern/cycles/kernel/shaders/node_gamma.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_gamma(color ColorIn = 0.8, float Gamma = 1.0, output color ColorOut = 0.0)
 {
diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl
index 3cf2e974022..55cda71db1b 100644
--- a/intern/cycles/kernel/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/shaders/node_geometry.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_geometry(normal NormalIn = N,
                      string bump_offset = "center",
diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
index c0b8a002536..0042d573f8d 100644
--- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_fresnel.h"
+#include "stdcycles.h"
 
 shader node_glass_bsdf(color Color = 0.8,
                        string distribution = "sharp",
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index 2d40ee8d3f6..c73604d3650 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_fresnel.h"
+#include "stdcycles.h"
 
 shader node_glossy_bsdf(color Color = 0.8,
                         string distribution = "GGX",
diff --git a/intern/cycles/kernel/shaders/node_gradient_texture.osl b/intern/cycles/kernel/shaders/node_gradient_texture.osl
index 6cb181aee72..e9acebc0572 100644
--- a/intern/cycles/kernel/shaders/node_gradient_texture.osl
+++ b/intern/cycles/kernel/shaders/node_gradient_texture.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 /* Gradient */
 
diff --git a/intern/cycles/kernel/shaders/node_hair_bsdf.osl b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
index bc912087666..3e0ac7af2e0 100644
--- a/intern/cycles/kernel/shaders/node_hair_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_hair_bsdf.osl
@@ -16,7 +16,7 @@
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_hair_bsdf(color Color = 0.8,
                       string component = "reflection",
diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl
index 991a27c4103..ee08ea57e68 100644
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_hair_info(output float IsStrand = 0.0,
                       output float Intercept = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_hash.h b/intern/cycles/kernel/shaders/node_hash.h
index 7affe432bf2..b42e42ff910 100644
--- a/intern/cycles/kernel/shaders/node_hash.h
+++ b/intern/cycles/kernel/shaders/node_hash.h
@@ -1,4 +1,4 @@
-#include "stdosl.h"
+#include "stdcycles.h"
 #include "vector2.h"
 #include "vector4.h"
 
diff --git a/intern/cycles/kernel/shaders/node_holdout.osl b/intern/cycles/kernel/shaders/node_holdout.osl
index b51bc0543a5..92e41c92f72 100644
--- a/intern/cycles/kernel/shaders/node_holdout.osl
+++ b/intern/cycles/kernel/shaders/node_holdout.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_holdout(output closure color Holdout = holdout())
 {
diff --git a/intern/cycles/kernel/shaders/node_hsv.osl b/intern/cycles/kernel/shaders/node_hsv.osl
index 30c56a20a92..4417057b10f 100644
--- a/intern/cycles/kernel/shaders/node_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_hsv.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_color.h"
+#include "stdcycles.h"
 
 shader node_hsv(float Hue = 0.5,
                 float Saturation = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_ies_light.osl b/intern/cycles/kernel/shaders/node_ies_light.osl
index 4d881eb3b65..76348b4d758 100644
--- a/intern/cycles/kernel/shaders/node_ies_light.osl
+++ b/intern/cycles/kernel/shaders/node_ies_light.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 /* IES Light */
 
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index 9a0f2d054ea..22d34a1082c 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_color.h"
+#include "stdcycles.h"
 
 point texco_remap_square(point co)
 {
diff --git a/intern/cycles/kernel/shaders/node_invert.osl b/intern/cycles/kernel/shaders/node_invert.osl
index c7d41e4e129..23c16935ca1 100644
--- a/intern/cycles/kernel/shaders/node_invert.osl
+++ b/intern/cycles/kernel/shaders/node_invert.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_invert(float Fac = 1.0, color ColorIn = 0.8, output color ColorOut = 0.8)
 {
diff --git a/intern/cycles/kernel/shaders/node_layer_weight.osl b/intern/cycles/kernel/shaders/node_layer_weight.osl
index 7c46f28b41b..1662be2cad1 100644
--- a/intern/cycles/kernel/shaders/node_layer_weight.osl
+++ b/intern/cycles/kernel/shaders/node_layer_weight.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_fresnel.h"
+#include "stdcycles.h"
 
 shader node_layer_weight(float Blend = 0.5,
                          normal Normal = N,
diff --git a/intern/cycles/kernel/shaders/node_light_falloff.osl b/intern/cycles/kernel/shaders/node_light_falloff.osl
index d0d7dd9c5aa..3f3c9444a5a 100644
--- a/intern/cycles/kernel/shaders/node_light_falloff.osl
+++ b/intern/cycles/kernel/shaders/node_light_falloff.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_light_falloff(float Strength = 0.0,
                           float Smooth = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index c4a3624a67f..4ff06915771 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_light_path(output float IsCameraRay = 0.0,
                        output float IsShadowRay = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_magic_texture.osl b/intern/cycles/kernel/shaders/node_magic_texture.osl
index 26e7d57278b..476c6895f05 100644
--- a/intern/cycles/kernel/shaders/node_magic_texture.osl
+++ b/intern/cycles/kernel/shaders/node_magic_texture.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 /* Magic */
 
diff --git a/intern/cycles/kernel/shaders/node_map_range.osl b/intern/cycles/kernel/shaders/node_map_range.osl
index 242ec4271ed..1c49027e6dd 100644
--- a/intern/cycles/kernel/shaders/node_map_range.osl
+++ b/intern/cycles/kernel/shaders/node_map_range.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 float safe_divide(float a, float b)
 {
diff --git a/intern/cycles/kernel/shaders/node_mapping.osl b/intern/cycles/kernel/shaders/node_mapping.osl
index e8a9d940eda..8d204999630 100644
--- a/intern/cycles/kernel/shaders/node_mapping.osl
+++ b/intern/cycles/kernel/shaders/node_mapping.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 point safe_divide(point a, point b)
 {
diff --git a/intern/cycles/kernel/shaders/node_math.h b/intern/cycles/kernel/shaders/node_math.h
new file mode 100644
index 00000000000..4b1a6c5bc16
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_math.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Divide a by b, returning 0 instead of dividing when b is exactly 0. */
+float safe_divide(float a, float b)
+{
+  return (b != 0.0) ? a / b : 0.0;
+}
+
+/* Component-wise safe division: a component whose divisor is exactly 0 yields 0. */
+vector safe_divide(vector a, vector b)
+{
+  return vector((b[0] != 0.0) ? a[0] / b[0] : 0.0,
+                (b[1] != 0.0) ? a[1] / b[1] : 0.0,
+                (b[2] != 0.0) ? a[2] / b[2] : 0.0);
+}
+
+/* fmod(a, b), or 0 when b is exactly 0. */
+float safe_modulo(float a, float b)
+{
+  return (b != 0.0) ? fmod(a, b) : 0.0;
+}
+
+/* Part of a above floor(a); non-negative for negative a as well. */
+float fract(float a)
+{
+  return a - floor(a);
+}
+
+/* Smooth minimum of a and b: inputs closer together than c are blended, and c == 0 falls
+ * back to a plain min(). See: https://www.iquilezles.org/www/articles/smin/smin.htm. */
+float smoothmin(float a, float b, float c)
+{
+  if (c != 0.0) {
+    float h = max(c - abs(a - b), 0.0) / c;
+    return min(a, b) - h * h * h * c * (1.0 / 6.0);
+  }
+  else {
+    return min(a, b);
+  }
+}
+
+/* Fold a back and forth in a triangle-wave pattern scaled by b; 0 when b is exactly 0. */
+float pingpong(float a, float b)
+{
+  return (b != 0.0) ? abs(fract((a - b) / (b * 2.0)) * b * 2.0 - b) : 0.0;
+}
+
+/* sqrt(a) for a > 0, otherwise 0. */
+float safe_sqrt(float a)
+{
+  return (a > 0.0) ? sqrt(a) : 0.0;
+}
+
+/* log(a) / log(b), i.e. the base-b logarithm of a; 0 unless a and b are both positive. */
+float safe_log(float a, float b)
+{
+  return (a > 0.0 && b > 0.0) ? log(a) / log(b) : 0.0;
+}
+
+/* Projection of v onto v_proj; the zero vector when v_proj has zero length. */
+vector project(vector v, vector v_proj)
+{
+  float lenSquared = dot(v_proj, v_proj);
+  return (lenSquared != 0.0) ? (dot(v, v_proj) / lenSquared) * v_proj : vector(0.0);
+}
+
+/* Snap each component of a to a multiple of the matching component of b: floor(a / b) * b. */
+vector snap(vector a, vector b)
+{
+  return floor(safe_divide(a, b)) * b;
+}
+
+/* Wrap value into the interval between min and max; returns min when max == min.
+ * Note that max is passed before min.
+ * Adapted from godotengine math_funcs.h. */
+float wrap(float value, float max, float min)
+{
+  float range = max - min;
+  return (range != 0.0) ? value - (range * floor((value - min) / range)) : min;
+}
+
+/* Component-wise wrap(). */
+point wrap(point value, point max, point min)
+{
+  return point(wrap(value[0], max[0], min[0]),
+               wrap(value[1], max[1], min[1]),
+               wrap(value[2], max[2], min[2]));
+}
+
+/* Rotation matrix for the Euler angles in euler[0..2], which are fed to cos()/sin().
+ * Only the upper-left 3x3 block of an identity matrix is overwritten. */
+matrix euler_to_mat(point euler)
+{
+  float cx = cos(euler[0]);
+  float cy = cos(euler[1]);
+  float cz = cos(euler[2]);
+  float sx = sin(euler[0]);
+  float sy = sin(euler[1]);
+  float sz = sin(euler[2]);
+  matrix mat = matrix(1.0);
+  mat[0][0] = cy * cz;
+  mat[0][1] = cy * sz;
+  mat[0][2] = -sy;
+  mat[1][0] = sy * sx * cz - cx * sz;
+  mat[1][1] = sy * sx * sz + cx * cz;
+  mat[1][2] = cy * sx;
+  mat[2][0] = sy * cx * cz + sx * sz;
+  mat[2][1] = sy * cx * sz - sx * cz;
+  mat[2][2] = cy * cx;
+  return mat;
+}
diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl
index 1eccb56405b..dbaa7ccb60e 100644
--- a/intern/cycles/kernel/shaders/node_math.osl
+++ b/intern/cycles/kernel/shaders/node_math.osl
@@ -14,56 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
-
-float safe_divide(float a, float b)
-{
-  return (b != 0.0) ? a / b : 0.0;
-}
-
-float safe_modulo(float a, float b)
-{
-  return (b != 0.0) ? fmod(a, b) : 0.0;
-}
-
-float fract(float a)
-{
-  return a - floor(a);
-}
-
-/* Adapted from godotengine math_funcs.h. */
-float wrap(float value, float max, float min)
-{
-  float range = max - min;
-  return (range != 0.0) ? value - (range * floor((value - min) / range)) : min;
-}
-
-/* See: https://www.iquilezles.org/www/articles/smin/smin.htm. */
-float smoothmin(float a, float b, float c)
-{
-  if (c != 0.0) {
-    float h = max(c - abs(a - b), 0.0) / c;
-    return min(a, b) - h * h * h * c * (1.0 / 6.0);
-  }
-  else {
-    return min(a, b);
-  }
-}
-
-float pingpong(float a, float b)
-{
-  return (b != 0.0) ? abs(fract((a - b) / (b * 2.0)) * b * 2.0 - b) : 0.0;
-}
-
-float safe_sqrt(float a)
-{
-  return (a > 0.0) ? sqrt(a) : 0.0;
-}
-
-float safe_log(float a, float b)
-{
-  return (a > 0.0 && b > 0.0) ? log(a) / log(b) : 0.0;
-}
+#include "node_math.h"
+#include "stdcycles.h"
 
 /* OSL asin, acos, and pow functions are safe by default. */
 shader node_math(string type = "add",
diff --git a/intern/cycles/kernel/shaders/node_mix.osl b/intern/cycles/kernel/shaders/node_mix.osl
index 9fbd3391ade..a13b4bb7b96 100644
--- a/intern/cycles/kernel/shaders/node_mix.osl
+++ b/intern/cycles/kernel/shaders/node_mix.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_color.h"
+#include "stdcycles.h"
 
 color node_mix_blend(float t, color col1, color col2)
 {
diff --git a/intern/cycles/kernel/shaders/node_mix_closure.osl b/intern/cycles/kernel/shaders/node_mix_closure.osl
index 517c59c8786..94fc2171c44 100644
--- a/intern/cycles/kernel/shaders/node_mix_closure.osl
+++ b/intern/cycles/kernel/shaders/node_mix_closure.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_mix_closure(float Fac = 0.5,
                         closure color Closure1 = 0,
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index 8861f9a671a..d03b84c1ab4 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_noise.h"
+#include "stdcycles.h"
 #include "vector2.h"
 #include "vector4.h"
 
@@ -691,7 +691,7 @@ shader node_musgrave_texture(
     float Dimension = 2.0,
     float Scale = 5.0,
     float Detail = 2.0,
-    float Lacunarity = 1.0,
+    float Lacunarity = 2.0,
     float Offset = 0.0,
     float Gain = 1.0,
     output float Fac = 0.0)
diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl
index 6cff1cdab2c..4121b415673 100644
--- a/intern/cycles/kernel/shaders/node_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_noise_texture.osl
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "node_noise.h"
+#include "stdcycles.h"
 #include "vector2.h"
 #include "vector4.h"
-#include "node_noise.h"
 
 #define vector3 point
 
diff --git a/intern/cycles/kernel/shaders/node_normal.osl b/intern/cycles/kernel/shaders/node_normal.osl
index 1d20c3e7cac..a0a88445427 100644
--- a/intern/cycles/kernel/shaders/node_normal.osl
+++ b/intern/cycles/kernel/shaders/node_normal.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_normal(normal direction = normal(0.0, 0.0, 0.0),
                    normal NormalIn = normal(0.0, 0.0, 0.0),
diff --git a/intern/cycles/kernel/shaders/node_normal_map.osl b/intern/cycles/kernel/shaders/node_normal_map.osl
index 90b593d00bc..912960f13ab 100644
--- a/intern/cycles/kernel/shaders/node_normal_map.osl
+++ b/intern/cycles/kernel/shaders/node_normal_map.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_normal_map(normal NormalIn = N,
                        float Strength = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_object_info.osl b/intern/cycles/kernel/shaders/node_object_info.osl
index 350404bb747..44513d9a1ba 100644
--- a/intern/cycles/kernel/shaders/node_object_info.osl
+++ b/intern/cycles/kernel/shaders/node_object_info.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_object_info(output point Location = point(0.0, 0.0, 0.0),
                         output color Color = color(1.0, 1.0, 1.0),
diff --git a/intern/cycles/kernel/shaders/node_output_displacement.osl b/intern/cycles/kernel/shaders/node_output_displacement.osl
index fa7f603980b..bd60fc2b7e1 100644
--- a/intern/cycles/kernel/shaders/node_output_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_output_displacement.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 displacement node_output_displacement(vector Displacement = 0.0)
 {
diff --git a/intern/cycles/kernel/shaders/node_output_surface.osl b/intern/cycles/kernel/shaders/node_output_surface.osl
index 013666145da..cd746f79c4a 100644
--- a/intern/cycles/kernel/shaders/node_output_surface.osl
+++ b/intern/cycles/kernel/shaders/node_output_surface.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 surface node_output_surface(closure color Surface = 0)
 {
diff --git a/intern/cycles/kernel/shaders/node_output_volume.osl b/intern/cycles/kernel/shaders/node_output_volume.osl
index dd479e751b3..4cc14cd6699 100644
--- a/intern/cycles/kernel/shaders/node_output_volume.osl
+++ b/intern/cycles/kernel/shaders/node_output_volume.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 volume node_output_volume(closure color Volume = 0)
 {
diff --git a/intern/cycles/kernel/shaders/node_particle_info.osl b/intern/cycles/kernel/shaders/node_particle_info.osl
index e286c33a1ff..2dcdf3d0f3c 100644
--- a/intern/cycles/kernel/shaders/node_particle_info.osl
+++ b/intern/cycles/kernel/shaders/node_particle_info.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_particle_info(output float Index = 0.0,
                           output float Random = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 657ced9b6e6..1711811ac65 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_fresnel.h"
+#include "stdcycles.h"
 
 shader node_principled_bsdf(string distribution = "Multiscatter GGX",
                             string subsurface_method = "burley",
diff --git a/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl
index bf986438fca..4cf17e0e703 100644
--- a/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_hair_bsdf.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 color log3(color a)
 {
diff --git a/intern/cycles/kernel/shaders/node_principled_volume.osl b/intern/cycles/kernel/shaders/node_principled_volume.osl
index 39cf6837eb2..0cb4cdebdaa 100644
--- a/intern/cycles/kernel/shaders/node_principled_volume.osl
+++ b/intern/cycles/kernel/shaders/node_principled_volume.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_principled_volume(color Color = color(0.5, 0.5, 0.5),
                               float Density = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
index 941d99dd44d..9e9b31d9a87 100644
--- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_refraction_bsdf(color Color = 0.8,
                             string distribution = "sharp",
diff --git a/intern/cycles/kernel/shaders/node_rgb_curves.osl b/intern/cycles/kernel/shaders/node_rgb_curves.osl
index e34eb027cc3..8850040d580 100644
--- a/intern/cycles/kernel/shaders/node_rgb_curves.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_curves.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_ramp_util.h"
+#include "stdcycles.h"
 
 shader node_rgb_curves(color ramp[] = {0.0},
                        float min_x = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_rgb_ramp.osl b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
index c9f9746a4fb..2131edb2688 100644
--- a/intern/cycles/kernel/shaders/node_rgb_ramp.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_ramp_util.h"
+#include "stdcycles.h"
 
 shader node_rgb_ramp(color ramp_color[] = {0.0},
                      float ramp_alpha[] = {0.0},
diff --git a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl
index 837d6caf5fc..f0a094d5b57 100644
--- a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_rgb_to_bw(color Color = 0.0, output float Val = 0.0)
 {
diff --git a/intern/cycles/kernel/shaders/node_scatter_volume.osl b/intern/cycles/kernel/shaders/node_scatter_volume.osl
index fce5716f372..36ad952dee6 100644
--- a/intern/cycles/kernel/shaders/node_scatter_volume.osl
+++ b/intern/cycles/kernel/shaders/node_scatter_volume.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_scatter_volume(color Color = color(0.8, 0.8, 0.8),
                            float Density = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_separate_hsv.osl b/intern/cycles/kernel/shaders/node_separate_hsv.osl
index c77ed1f3755..2f902b72dbc 100644
--- a/intern/cycles/kernel/shaders/node_separate_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_separate_hsv.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_color.h"
+#include "stdcycles.h"
 
 shader node_separate_hsv(color Color = 0.8,
                          output float H = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_separate_rgb.osl b/intern/cycles/kernel/shaders/node_separate_rgb.osl
index ee64add27e2..62e4aedb879 100644
--- a/intern/cycles/kernel/shaders/node_separate_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_separate_rgb.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_separate_rgb(color Image = 0.8,
                          output float R = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl
index 8a563f5e920..acaf3942b6f 100644
--- a/intern/cycles/kernel/shaders/node_separate_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_separate_xyz(vector Vector = 0.8,
                          output float X = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_set_normal.osl b/intern/cycles/kernel/shaders/node_set_normal.osl
index 9541b829ef7..26a97e2b5d1 100644
--- a/intern/cycles/kernel/shaders/node_set_normal.osl
+++ b/intern/cycles/kernel/shaders/node_set_normal.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 surface node_set_normal(normal Direction = N, output normal Normal = N)
 {
diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl
index 9b29e5489c2..4def237a2e0 100644
--- a/intern/cycles/kernel/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/shaders/node_sky_texture.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_color.h"
+#include "stdcycles.h"
 
 float sky_angle_between(float thetav, float phiv, float theta, float phi)
 {
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index e12199d8c3d..b1e854150ab 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_subsurface_scattering(color Color = 0.8,
                                   float Scale = 1.0,
diff --git a/intern/cycles/kernel/shaders/node_tangent.osl b/intern/cycles/kernel/shaders/node_tangent.osl
index 44eb9973f3d..83f19a4610b 100644
--- a/intern/cycles/kernel/shaders/node_tangent.osl
+++ b/intern/cycles/kernel/shaders/node_tangent.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_tangent(normal NormalIn = N,
                     string attr_name = "geom:tangent",
diff --git a/intern/cycles/kernel/shaders/node_texture_coordinate.osl b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
index 13861653d04..ac05e984af2 100644
--- a/intern/cycles/kernel/shaders/node_texture_coordinate.osl
+++ b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_texture_coordinate(
     normal NormalIn = N,
diff --git a/intern/cycles/kernel/shaders/node_toon_bsdf.osl b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
index ed3a0b25c60..4a44730c70c 100644
--- a/intern/cycles/kernel/shaders/node_toon_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_toon_bsdf(color Color = 0.8,
                       string component = "diffuse",
diff --git a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
index 7ce1ab08c59..23a562bf34d 100644
--- a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_translucent_bsdf(color Color = 0.8, normal Normal = N, output closure color BSDF = 0)
 {
diff --git a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
index a735513ba89..eb737a05c41 100644
--- a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_transparent_bsdf(color Color = 0.8, normal Normal = N, output closure color BSDF = 0)
 {
diff --git a/intern/cycles/kernel/shaders/node_uv_map.osl b/intern/cycles/kernel/shaders/node_uv_map.osl
index 6f2887be63c..88d8c5ba394 100644
--- a/intern/cycles/kernel/shaders/node_uv_map.osl
+++ b/intern/cycles/kernel/shaders/node_uv_map.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_uv_map(int from_dupli = 0,
                    string attribute = "",
diff --git a/intern/cycles/kernel/shaders/node_value.osl b/intern/cycles/kernel/shaders/node_value.osl
index 398e2c0e392..13197b9a27a 100644
--- a/intern/cycles/kernel/shaders/node_value.osl
+++ b/intern/cycles/kernel/shaders/node_value.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_value(float value_value = 0.0,
                   vector vector_value = vector(0.0, 0.0, 0.0),
diff --git a/intern/cycles/kernel/shaders/node_vector_curves.osl b/intern/cycles/kernel/shaders/node_vector_curves.osl
index e8c8036b550..9d3a2b82b0a 100644
--- a/intern/cycles/kernel/shaders/node_vector_curves.osl
+++ b/intern/cycles/kernel/shaders/node_vector_curves.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_ramp_util.h"
+#include "stdcycles.h"
 
 shader node_vector_curves(color ramp[] = {0.0},
                           float min_x = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_vector_displacement.osl b/intern/cycles/kernel/shaders/node_vector_displacement.osl
index e9bd336347f..7cd9c2a37f2 100644
--- a/intern/cycles/kernel/shaders/node_vector_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_vector_displacement.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_vector_displacement(color Vector = color(0.0, 0.0, 0.0),
                                 float Midlevel = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_vector_math.osl b/intern/cycles/kernel/shaders/node_vector_math.osl
index 4fa9b3bb57b..218851598b4 100644
--- a/intern/cycles/kernel/shaders/node_vector_math.osl
+++ b/intern/cycles/kernel/shaders/node_vector_math.osl
@@ -14,34 +14,13 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
-
-float safe_divide(float a, float b)
-{
-  return (b != 0.0) ? a / b : 0.0;
-}
-
-vector safe_divide(vector a, vector b)
-{
-  return vector((b[0] != 0.0) ? a[0] / b[0] : 0.0,
-                (b[1] != 0.0) ? a[1] / b[1] : 0.0,
-                (b[2] != 0.0) ? a[2] / b[2] : 0.0);
-}
-
-vector project(vector v, vector v_proj)
-{
-  float lenSquared = dot(v_proj, v_proj);
-  return (lenSquared != 0.0) ? (dot(v, v_proj) / lenSquared) * v_proj : vector(0.0);
-}
-
-vector snap(vector a, vector b)
-{
-  return floor(safe_divide(a, b)) * b;
-}
+#include "node_math.h"
+#include "stdcycles.h"
 
 shader node_vector_math(string type = "add",
                         vector Vector1 = vector(0.0, 0.0, 0.0),
                         vector Vector2 = vector(0.0, 0.0, 0.0),
+                        vector Vector3 = vector(0.0, 0.0, 0.0),
                         float Scale = 1.0,
                         output float Value = 0.0,
                         output vector Vector = vector(0.0, 0.0, 0.0))
@@ -94,6 +73,9 @@ shader node_vector_math(string type = "add",
   else if (type == "modulo") {
     Vector = fmod(Vector1, Vector2);
   }
+  else if (type == "wrap") {
+    Vector = wrap(Vector1, Vector2, Vector3);
+  }
   else if (type == "fraction") {
     Vector = Vector1 - floor(Vector1);
   }
@@ -106,6 +88,15 @@ shader node_vector_math(string type = "add",
   else if (type == "maximum") {
     Vector = max(Vector1, Vector2);
   }
+  else if (type == "sine") {
+    Vector = sin(Vector1);
+  }
+  else if (type == "cosine") {
+    Vector = cos(Vector1);
+  }
+  else if (type == "tangent") {
+    Vector = tan(Vector1);
+  }
   else {
     warning("%s", "Unknown vector math operator!");
   }
diff --git a/intern/cycles/kernel/shaders/node_vector_rotate.osl b/intern/cycles/kernel/shaders/node_vector_rotate.osl
new file mode 100644
index 00000000000..2efe3470ae2
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_vector_rotate.osl
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node_math.h"
+#include "stdcycles.h"
+
+/* Rotate VectorIn around Center. Type "euler_xyz" uses the Rotation angles; every other
+ * type rotates by Angle around the X, Y or Z axis, or around the Axis input. With invert
+ * set, the transposed Euler matrix or the negated Angle is used instead. */
+shader node_vector_rotate(int invert = 0,
+                          string type = "axis",
+                          vector VectorIn = vector(0.0, 0.0, 0.0),
+                          point Center = point(0.0, 0.0, 0.0),
+                          point Rotation = point(0.0, 0.0, 0.0),
+                          vector Axis = vector(0.0, 0.0, 1.0),
+                          float Angle = 0.0,
+                          output vector VectorOut = vector(0.0, 0.0, 0.0))
+{
+  if (type == "euler_xyz") {
+    /* Inverting uses the transpose of the Euler matrix. */
+    matrix rotation = euler_to_mat(Rotation);
+    if (invert) {
+      rotation = transpose(rotation);
+    }
+    VectorOut = transform(rotation, VectorIn - Center) + Center;
+  }
+  else {
+    /* Unit axes for the fixed modes; any other type falls back to the Axis input. */
+    vector rotation_axis = Axis;
+    if (type == "x_axis") {
+      rotation_axis = vector(1.0, 0.0, 0.0);
+    }
+    else if (type == "y_axis") {
+      rotation_axis = vector(0.0, 1.0, 0.0);
+    }
+    else if (type == "z_axis") {
+      rotation_axis = vector(0.0, 0.0, 1.0);
+    }
+
+    float signed_angle = Angle;
+    if (invert) {
+      signed_angle = -Angle;
+    }
+
+    /* A zero-length axis cannot define a rotation: pass the input through unchanged. */
+    if (length(rotation_axis) != 0.0) {
+      VectorOut = rotate(VectorIn - Center, signed_angle, point(0.0), rotation_axis) + Center;
+    }
+    else {
+      VectorOut = VectorIn;
+    }
+  }
+}
diff --git a/intern/cycles/kernel/shaders/node_vector_transform.osl b/intern/cycles/kernel/shaders/node_vector_transform.osl
index 22939577be0..1db799cfc9e 100644
--- a/intern/cycles/kernel/shaders/node_vector_transform.osl
+++ b/intern/cycles/kernel/shaders/node_vector_transform.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_vector_transform(string type = "vector",
                              string convert_from = "world",
diff --git a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
index 9290b845325..299acef35ee 100644
--- a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_fresnel.h"
+#include "stdcycles.h"
 
 shader node_velvet_bsdf(color Color = 0.8,
                         float Sigma = 0.0,
diff --git a/intern/cycles/kernel/shaders/node_vertex_color.osl b/intern/cycles/kernel/shaders/node_vertex_color.osl
index 16bf3dd146e..ffaf7a2f720 100644
--- a/intern/cycles/kernel/shaders/node_vertex_color.osl
+++ b/intern/cycles/kernel/shaders/node_vertex_color.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_vertex_color(string bump_offset = "center",
                          string layer_name = "",
@@ -22,7 +22,16 @@ shader node_vertex_color(string bump_offset = "center",
                          output float Alpha = 0.0)
 {
   float vertex_color[4];
-  if (getattribute(layer_name, vertex_color)) {
+  string vertex_color_layer;
+
+  if (layer_name == "") {
+    vertex_color_layer = "geom:vertex_color";
+  }
+  else {
+    vertex_color_layer = layer_name;
+  }
+
+  if (getattribute(vertex_color_layer, vertex_color)) {
     Color = color(vertex_color[0], vertex_color[1], vertex_color[2]);
     Alpha = vertex_color[3];
 
diff --git a/intern/cycles/kernel/shaders/node_voronoi_texture.osl b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
index 10a9f7a6329..04d61c32f8a 100644
--- a/intern/cycles/kernel/shaders/node_voronoi_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "node_hash.h"
+#include "stdcycles.h"
 #include "vector2.h"
 #include "vector4.h"
-#include "node_hash.h"
 
 #define vector3 point
 
diff --git a/intern/cycles/kernel/shaders/node_voxel_texture.osl b/intern/cycles/kernel/shaders/node_voxel_texture.osl
index 0e4484561d8..14489298367 100644
--- a/intern/cycles/kernel/shaders/node_voxel_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voxel_texture.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_voxel_texture(string filename = "",
                           string interpolation = "linear",
diff --git a/intern/cycles/kernel/shaders/node_wave_texture.osl b/intern/cycles/kernel/shaders/node_wave_texture.osl
index a706c442368..f17397be243 100644
--- a/intern/cycles/kernel/shaders/node_wave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_wave_texture.osl
@@ -14,45 +14,86 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "node_noise.h"
+#include "stdcycles.h"
 
 /* Wave */
 
-float wave(point p, string type, string profile, float detail, float distortion, float dscale)
+float wave(point p_input,
+           string type,
+           string bands_direction, /* "x", "y" or "z"; anything else means diagonal. */
+           string rings_direction, /* "x", "y" or "z"; anything else means spherical. */
+           string profile,         /* "sine" or "saw"; anything else means triangle. */
+           float detail,
+           float distortion,
+           float dscale,
+           float phase) /* Added to n before the distortion term. */
 {
+  /* Prevent precision issues on unit coordinates. */
+  point p = (p_input + 0.000001) * 0.999999;
+
   float n = 0.0;
 
   if (type == "bands") {
-    n = (p[0] + p[1] + p[2]) * 10.0;
+    if (bands_direction == "x") {
+      n = p[0] * 20.0;
+    }
+    else if (bands_direction == "y") {
+      n = p[1] * 20.0;
+    }
+    else if (bands_direction == "z") {
+      n = p[2] * 20.0;
+    }
+    else { /* diagonal */
+      n = (p[0] + p[1] + p[2]) * 10.0;
+    }
   }
   else if (type == "rings") {
-    n = length(p) * 20.0;
+    point rp = p; /* Copy of p; the component along the ring axis is zeroed below. */
+    if (rings_direction == "x") {
+      rp *= point(0.0, 1.0, 1.0);
+    }
+    else if (rings_direction == "y") {
+      rp *= point(1.0, 0.0, 1.0);
+    }
+    else if (rings_direction == "z") {
+      rp *= point(1.0, 1.0, 0.0);
+    }
+    /* else: "spherical", all components are kept. */
+
+    n = length(rp) * 20.0;
   }
 
+  n += phase;
+
   if (distortion != 0.0) {
     n = n + (distortion * (fractal_noise(p * dscale, detail) * 2.0 - 1.0));
   }
 
   if (profile == "sine") {
-    return 0.5 + 0.5 * sin(n);
+    return 0.5 + 0.5 * sin(n - M_PI_2); /* Quarter-period shift: n == 0 gives 0. */
+  }
+  else if (profile == "saw") {
+    n /= M_2PI;
+    return n - floor(n); /* Fractional part of n. */
   }
-  else {
-    /* Saw profile */
+  else { /* profile tri */
     n /= M_2PI;
-    n -= (int)n;
-    return (n < 0.0) ? n + 1.0 : n;
+    return abs(n - floor(n + 0.5)) * 2.0; /* Distance to nearest integer, scaled to 0..1. */
   }
 }
 
 shader node_wave_texture(int use_mapping = 0,
                          matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                          string type = "bands",
+                         string bands_direction = "x",
+                         string rings_direction = "x",
                          string profile = "sine",
                          float Scale = 5.0,
                          float Distortion = 0.0,
                          float Detail = 2.0,
                          float DetailScale = 1.0,
+                         float PhaseOffset = 0.0,
                          point Vector = P,
                          output float Fac = 0.0,
                          output color Color = 0.0)
@@ -62,6 +103,14 @@ shader node_wave_texture(int use_mapping = 0,
   if (use_mapping)
     p = transform(mapping, p);
 
-  Fac = wave(p * Scale, type, profile, Detail, Distortion, DetailScale);
+  Fac = wave(p * Scale,
+             type,
+             bands_direction,
+             rings_direction,
+             profile,
+             Detail,
+             Distortion,
+             DetailScale,
+             PhaseOffset);
   Color = Fac;
 }
diff --git a/intern/cycles/kernel/shaders/node_wavelength.osl b/intern/cycles/kernel/shaders/node_wavelength.osl
index c8c6eecb171..f484c4b4788 100644
--- a/intern/cycles/kernel/shaders/node_wavelength.osl
+++ b/intern/cycles/kernel/shaders/node_wavelength.osl
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "stdcycles.h"
 
 shader node_wavelength(float Wavelength = 500.0, output color Color = 0.0)
 {
diff --git a/intern/cycles/kernel/shaders/node_white_noise_texture.osl b/intern/cycles/kernel/shaders/node_white_noise_texture.osl
index 95f91d25e5e..94735a019d5 100644
--- a/intern/cycles/kernel/shaders/node_white_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_white_noise_texture.osl
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
+#include "node_hash.h"
+#include "stdcycles.h"
 #include "vector2.h"
 #include "vector4.h"
-#include "node_hash.h"
 
 #define vector3 point
 
diff --git a/intern/cycles/kernel/shaders/node_wireframe.osl b/intern/cycles/kernel/shaders/node_wireframe.osl
index ea4bd3a4c87..673a451c928 100644
--- a/intern/cycles/kernel/shaders/node_wireframe.osl
+++ b/intern/cycles/kernel/shaders/node_wireframe.osl
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "stdosl.h"
 #include "oslutil.h"
+#include "stdcycles.h"
 
 shader node_wireframe(string bump_offset = "center",
                       int use_pixel_size = 0,
diff --git a/intern/cycles/kernel/shaders/oslutil.h b/intern/cycles/kernel/shaders/oslutil.h
deleted file mode 100644
index d48bfa4a665..00000000000
--- a/intern/cycles/kernel/shaders/oslutil.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef CCL_OSLUTIL_H
-#define CCL_OSLUTIL_H
-
-// Return wireframe opacity factor [0, 1] given a geometry type in
-// ("triangles", "polygons" or "patches"), and a line_width in raster
-// or world space depending on the last (raster) boolean argument.
-//
-float wireframe(string edge_type, float line_width, int raster)
-{
-  // ray differentials are so big in diffuse context that this function would always return "wire"
-  if (raytype("path:diffuse"))
-    return 0.0;
-
-  int np = 0;
-  point p[64];
-  float pixelWidth = 1;
-
-  if (edge_type == "triangles") {
-    np = 3;
-    if (!getattribute("geom:trianglevertices", p))
-      return 0.0;
-  }
-  else if (edge_type == "polygons" || edge_type == "patches") {
-    getattribute("geom:numpolyvertices", np);
-    if (np < 3 || !getattribute("geom:polyvertices", p))
-      return 0.0;
-  }
-
-  if (raster) {
-    // Project the derivatives of P to the viewing plane defined
-    // by I so we have a measure of how big is a pixel at this point
-    float pixelWidthX = length(Dx(P) - dot(Dx(P), I) * I);
-    float pixelWidthY = length(Dy(P) - dot(Dy(P), I) * I);
-    // Take the average of both axis' length
-    pixelWidth = (pixelWidthX + pixelWidthY) / 2;
-  }
-
-  // Use half the width as the neighbor face will render the
-  // other half. And take the square for fast comparison
-  pixelWidth *= 0.5 * line_width;
-  pixelWidth *= pixelWidth;
-  for (int i = 0; i < np; i++) {
-    int i2 = i ? i - 1 : np - 1;
-    vector dir = P - p[i];
-    vector edge = p[i] - p[i2];
-    vector crs = cross(edge, dir);
-    // At this point dot(crs, crs) / dot(edge, edge) is
-    // the square of area / length(edge) == square of the
-    // distance to the edge.
-    if (dot(crs, crs) < (dot(edge, edge) * pixelWidth))
-      return 1;
-  }
-  return 0;
-}
-
-float wireframe(string edge_type, float line_width)
-{
-  return wireframe(edge_type, line_width, 1);
-}
-float wireframe(string edge_type)
-{
-  return wireframe(edge_type, 1.0, 1);
-}
-float wireframe()
-{
-  return wireframe("polygons", 1.0, 1);
-}
-
-#endif /* CCL_OSLUTIL_H */
diff --git a/intern/cycles/kernel/shaders/stdcycles.h b/intern/cycles/kernel/shaders/stdcycles.h
new file mode 100644
index 00000000000..dd604da68ce
--- /dev/null
+++ b/intern/cycles/kernel/shaders/stdcycles.h
@@ -0,0 +1,150 @@
+/////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.  All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+// * Neither the name of Sony Pictures Imageworks nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef CCL_STDCYCLESOSL_H
+#define CCL_STDCYCLESOSL_H
+
+#include "stdosl.h"
+
+// Declaration of built-in functions and closures. stdosl.h does not make
+// these available, so we have to redefine them.
+#define BUILTIN [[int builtin = 1]]
+#define BUILTIN_DERIV [[ int builtin = 1, int deriv = 1 ]]
+
+closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
+closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
+closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
+closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
+closure color microfacet_ggx(normal N, float ag) BUILTIN;
+closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN;
+closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
+closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
+closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
+closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
+closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_ggx_aniso_fresnel(
+    normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color
+microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_aniso_fresnel(
+    normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color
+microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_beckmann(normal N, float ab) BUILTIN;
+closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
+closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
+closure color ashikhmin_shirley(normal N, vector T, float ax, float ay) BUILTIN;
+closure color ashikhmin_velvet(normal N, float sigma) BUILTIN;
+closure color ambient_occlusion() BUILTIN;
+closure color principled_diffuse(normal N, float roughness) BUILTIN;
+closure color principled_sheen(normal N) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
+
+// BSSRDF
+closure color bssrdf(string method, normal N, vector radius, color albedo) BUILTIN;
+
+// Hair
+closure color
+hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
+closure color
+hair_transmission(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
+closure color principled_hair(normal N,
+                              color sigma,
+                              float roughnessu,
+                              float roughnessv,
+                              float coat,
+                              float alpha,
+                              float eta) BUILTIN;
+
+// Volume
+closure color henyey_greenstein(float g) BUILTIN;
+closure color absorption() BUILTIN;
+
+normal ensure_valid_reflection(normal Ng, vector I, normal N)
+{
+  /* Returns N unchanged, a normal rebuilt from X and Ng, or Ng (see below).
+   * The implementation here mirrors the one in kernel_montecarlo.h,
+   * check there for an explanation of the algorithm. */
+  float sqr(float x)  // local helper: x squared
+  {
+    return x * x;
+  }
+
+  vector R = 2 * dot(N, I) * N - I;  // I mirrored about N (assumes unit N - TODO confirm)
+  // The cutoff is never above 0.01 and follows 0.9 * dot(Ng, I) when that is smaller.
+  float threshold = min(0.9 * dot(Ng, I), 0.01);
+  if (dot(Ng, R) >= threshold) {
+    return N;  // R already meets the cutoff, so N is kept as-is
+  }
+
+  float NdotNg = dot(N, Ng);
+  vector X = normalize(N - NdotNg * Ng);  // N minus its Ng component, normalized
+
+  float Ix = dot(I, X), Iz = dot(I, Ng);  // components of I along X and Ng
+  float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+  float a = Ix2 + Iz2;
+
+  float b = sqrt(Ix2 * (a - sqr(threshold)));
+  float c = Iz * threshold + a;
+  // N1_z2 and N2_z2 are the two candidate squared Ng-components of the new normal.
+  float fac = 0.5 / a;
+  float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c);
+  int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5));  // in (1e-5, 1 + 1e-5]
+  int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
+
+  float N_new_x, N_new_z;
+  if (valid1 && valid2) {
+    float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
+    float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
+    // R1/R2: Ng-component of I reflected about each candidate normal.
+    float R1 = 2 * (N1_x * Ix + N1_z * Iz) * N1_z - Iz;
+    float R2 = 2 * (N2_x * Ix + N2_z * Iz) * N2_z - Iz;
+    // Re-test each candidate: its reflection must reach at least 1e-5 along Ng.
+    valid1 = (R1 >= 1e-5);
+    valid2 = (R2 >= 1e-5);
+    if (valid1 && valid2) {  // both pass: take the one with the smaller R
+      N_new_x = (R1 < R2) ? N1_x : N2_x;
+      N_new_z = (R1 < R2) ? N1_z : N2_z;
+    }
+    else {  // otherwise take the one with the larger R
+      N_new_x = (R1 > R2) ? N1_x : N2_x;
+      N_new_z = (R1 > R2) ? N1_z : N2_z;
+    }
+  }
+  else if (valid1 || valid2) {  // exactly one candidate is in range: use it
+    float Nz2 = valid1 ? N1_z2 : N2_z2;
+    N_new_x = sqrt(1.0 - Nz2);
+    N_new_z = sqrt(Nz2);
+  }
+  else {  // no usable candidate: fall back to Ng
+    return Ng;
+  }
+  // Rebuild the normal from its X and Ng components.
+  return N_new_x * X + N_new_z * Ng;
+}
+
+#endif /* CCL_STDCYCLESOSL_H */
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
deleted file mode 100644
index 6515d914909..00000000000
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ /dev/null
@@ -1,880 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.  All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// * Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-// * Neither the name of Sony Pictures Imageworks nor the names of its
-//   contributors may be used to endorse or promote products derived from
-//   this software without specific prior written permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-/////////////////////////////////////////////////////////////////////////////
-
-#ifndef CCL_STDOSL_H
-#define CCL_STDOSL_H
-
-#ifndef M_PI
-#  define M_PI 3.1415926535897932       /* pi */
-#  define M_PI_2 1.5707963267948966     /* pi/2 */
-#  define M_PI_4 0.7853981633974483     /* pi/4 */
-#  define M_2_PI 0.6366197723675813     /* 2/pi */
-#  define M_2PI 6.2831853071795865      /* 2*pi */
-#  define M_4PI 12.566370614359173      /* 4*pi */
-#  define M_2_SQRTPI 1.1283791670955126 /* 2/sqrt(pi) */
-#  define M_E 2.7182818284590452        /* e (Euler's number) */
-#  define M_LN2 0.6931471805599453      /* ln(2) */
-#  define M_LN10 2.3025850929940457     /* ln(10) */
-#  define M_LOG2E 1.4426950408889634    /* log_2(e) */
-#  define M_LOG10E 0.4342944819032518   /* log_10(e) */
-#  define M_SQRT2 1.4142135623730950    /* sqrt(2) */
-#  define M_SQRT1_2 0.7071067811865475  /* 1/sqrt(2) */
-#endif
-
-// Declaration of built-in functions and closures
-#define BUILTIN [[int builtin = 1]]
-#define BUILTIN_DERIV [[ int builtin = 1, int deriv = 1 ]]
-
-#define PERCOMP1(name) \
-  normal name(normal x) BUILTIN; \
-  vector name(vector x) BUILTIN; \
-  point name(point x) BUILTIN; \
-  color name(color x) BUILTIN; \
-  float name(float x) BUILTIN;
-
-#define PERCOMP2(name) \
-  normal name(normal x, normal y) BUILTIN; \
-  vector name(vector x, vector y) BUILTIN; \
-  point name(point x, point y) BUILTIN; \
-  color name(color x, color y) BUILTIN; \
-  float name(float x, float y) BUILTIN;
-
-#define PERCOMP2F(name) \
-  normal name(normal x, float y) BUILTIN; \
-  vector name(vector x, float y) BUILTIN; \
-  point name(point x, float y) BUILTIN; \
-  color name(color x, float y) BUILTIN; \
-  float name(float x, float y) BUILTIN;
-
-// Basic math
-normal degrees(normal x)
-{
-  return x * (180.0 / M_PI);
-}
-vector degrees(vector x)
-{
-  return x * (180.0 / M_PI);
-}
-point degrees(point x)
-{
-  return x * (180.0 / M_PI);
-}
-color degrees(color x)
-{
-  return x * (180.0 / M_PI);
-}
-float degrees(float x)
-{
-  return x * (180.0 / M_PI);
-}
-normal radians(normal x)
-{
-  return x * (M_PI / 180.0);
-}
-vector radians(vector x)
-{
-  return x * (M_PI / 180.0);
-}
-point radians(point x)
-{
-  return x * (M_PI / 180.0);
-}
-color radians(color x)
-{
-  return x * (M_PI / 180.0);
-}
-float radians(float x)
-{
-  return x * (M_PI / 180.0);
-}
-PERCOMP1(cos)
-PERCOMP1(sin)
-PERCOMP1(tan)
-PERCOMP1(acos)
-PERCOMP1(asin)
-PERCOMP1(atan)
-PERCOMP2(atan2)
-PERCOMP1(cosh)
-PERCOMP1(sinh)
-PERCOMP1(tanh)
-PERCOMP2F(pow)
-PERCOMP1(exp)
-PERCOMP1(exp2)
-PERCOMP1(expm1)
-PERCOMP1(log)
-point log(point a, float b)
-{
-  return log(a) / log(b);
-}
-vector log(vector a, float b)
-{
-  return log(a) / log(b);
-}
-color log(color a, float b)
-{
-  return log(a) / log(b);
-}
-float log(float a, float b)
-{
-  return log(a) / log(b);
-}
-PERCOMP1(log2)
-PERCOMP1(log10)
-PERCOMP1(logb)
-PERCOMP1(sqrt)
-PERCOMP1(inversesqrt)
-float hypot(float a, float b)
-{
-  return sqrt(a * a + b * b);
-}
-float hypot(float a, float b, float c)
-{
-  return sqrt(a * a + b * b + c * c);
-}
-PERCOMP1(abs)
-int abs(int x) BUILTIN;
-PERCOMP1(fabs)
-int fabs(int x) BUILTIN;
-PERCOMP1(sign)
-PERCOMP1(floor)
-PERCOMP1(ceil)
-PERCOMP1(round)
-PERCOMP1(trunc)
-PERCOMP2(fmod)
-PERCOMP2F(fmod)
-int mod(int a, int b)
-{
-  return a - b * (int)floor(a / b);
-}
-point mod(point a, point b)
-{
-  return a - b * floor(a / b);
-}
-vector mod(vector a, vector b)
-{
-  return a - b * floor(a / b);
-}
-normal mod(normal a, normal b)
-{
-  return a - b * floor(a / b);
-}
-color mod(color a, color b)
-{
-  return a - b * floor(a / b);
-}
-point mod(point a, float b)
-{
-  return a - b * floor(a / b);
-}
-vector mod(vector a, float b)
-{
-  return a - b * floor(a / b);
-}
-normal mod(normal a, float b)
-{
-  return a - b * floor(a / b);
-}
-color mod(color a, float b)
-{
-  return a - b * floor(a / b);
-}
-float mod(float a, float b)
-{
-  return a - b * floor(a / b);
-}
-PERCOMP2(min)
-int min(int a, int b) BUILTIN;
-PERCOMP2(max)
-int max(int a, int b) BUILTIN;
-normal clamp(normal x, normal minval, normal maxval)
-{
-  return max(min(x, maxval), minval);
-}
-vector clamp(vector x, vector minval, vector maxval)
-{
-  return max(min(x, maxval), minval);
-}
-point clamp(point x, point minval, point maxval)
-{
-  return max(min(x, maxval), minval);
-}
-color clamp(color x, color minval, color maxval)
-{
-  return max(min(x, maxval), minval);
-}
-float clamp(float x, float minval, float maxval)
-{
-  return max(min(x, maxval), minval);
-}
-int clamp(int x, int minval, int maxval)
-{
-  return max(min(x, maxval), minval);
-}
-#if 0
-normal mix(normal x, normal y, normal a)
-{
-  return x * (1 - a) + y * a;
-}
-normal mix(normal x, normal y, float a)
-{
-  return x * (1 - a) + y * a;
-}
-vector mix(vector x, vector y, vector a)
-{
-  return x * (1 - a) + y * a;
-}
-vector mix(vector x, vector y, float a)
-{
-  return x * (1 - a) + y * a;
-}
-point mix(point x, point y, point a)
-{
-  return x * (1 - a) + y * a;
-}
-point mix(point x, point y, float a)
-{
-  return x * (1 - a) + y * a;
-}
-color mix(color x, color y, color a)
-{
-  return x * (1 - a) + y * a;
-}
-color mix(color x, color y, float a)
-{
-  return x * (1 - a) + y * a;
-}
-float mix(float x, float y, float a)
-{
-  return x * (1 - a) + y * a;
-}
-#else
-normal mix(normal x, normal y, normal a) BUILTIN;
-normal mix(normal x, normal y, float a) BUILTIN;
-vector mix(vector x, vector y, vector a) BUILTIN;
-vector mix(vector x, vector y, float a) BUILTIN;
-point mix(point x, point y, point a) BUILTIN;
-point mix(point x, point y, float a) BUILTIN;
-color mix(color x, color y, color a) BUILTIN;
-color mix(color x, color y, float a) BUILTIN;
-float mix(float x, float y, float a) BUILTIN;
-#endif
-int isnan(float x) BUILTIN;
-int isinf(float x) BUILTIN;
-int isfinite(float x) BUILTIN;
-float erf(float x) BUILTIN;
-float erfc(float x) BUILTIN;
-
-// Vector functions
-
-vector cross(vector a, vector b) BUILTIN;
-float dot(vector a, vector b) BUILTIN;
-float length(vector v) BUILTIN;
-float distance(point a, point b) BUILTIN;
-float distance(point a, point b, point q)
-{
-  vector d = b - a;
-  float dd = dot(d, d);
-  if (dd == 0.0)
-    return distance(q, a);
-  float t = dot(q - a, d) / dd;
-  return distance(q, a + clamp(t, 0.0, 1.0) * d);
-}
-normal normalize(normal v) BUILTIN;
-vector normalize(vector v) BUILTIN;
-vector faceforward(vector N, vector I, vector Nref) BUILTIN;
-vector faceforward(vector N, vector I) BUILTIN;
-vector reflect(vector I, vector N)
-{
-  return I - 2 * dot(N, I) * N;
-}
-vector refract(vector I, vector N, float eta)
-{
-  float IdotN = dot(I, N);
-  float k = 1 - eta * eta * (1 - IdotN * IdotN);
-  return (k < 0) ? vector(0, 0, 0) : (eta * I - N * (eta * IdotN + sqrt(k)));
-}
-void fresnel(vector I,
-             normal N,
-             float eta,
-             output float Kr,
-             output float Kt,
-             output vector R,
-             output vector T)
-{
-  float sqr(float x)
-  {
-    return x * x;
-  }
-  float c = dot(I, N);
-  if (c < 0)
-    c = -c;
-  R = reflect(I, N);
-  float g = 1.0 / sqr(eta) - 1.0 + c * c;
-  if (g >= 0.0) {
-    g = sqrt(g);
-    float beta = g - c;
-    float F = (c * (g + c) - 1.0) / (c * beta + 1.0);
-    F = 0.5 * (1.0 + sqr(F));
-    F *= sqr(beta / (g + c));
-    Kr = F;
-    Kt = (1.0 - Kr) * eta * eta;
-    // OPT: the following recomputes some of the above values, but it
-    // gives us the same result as if the shader-writer called refract()
-    T = refract(I, N, eta);
-  }
-  else {
-    // total internal reflection
-    Kr = 1.0;
-    Kt = 0.0;
-    T = vector(0, 0, 0);
-  }
-}
-
-void fresnel(vector I, normal N, float eta, output float Kr, output float Kt)
-{
-  vector R, T;
-  fresnel(I, N, eta, Kr, Kt, R, T);
-}
-
-normal transform(matrix Mto, normal p) BUILTIN;
-vector transform(matrix Mto, vector p) BUILTIN;
-point transform(matrix Mto, point p) BUILTIN;
-normal transform(string from, string to, normal p) BUILTIN;
-vector transform(string from, string to, vector p) BUILTIN;
-point transform(string from, string to, point p) BUILTIN;
-normal transform(string to, normal p)
-{
-  return transform("common", to, p);
-}
-vector transform(string to, vector p)
-{
-  return transform("common", to, p);
-}
-point transform(string to, point p)
-{
-  return transform("common", to, p);
-}
-
-float transformu(string tounits, float x) BUILTIN;
-float transformu(string fromunits, string tounits, float x) BUILTIN;
-
-point rotate(point p, float angle, point a, point b)
-{
-  vector axis = normalize(b - a);
-  float cosang, sinang;
-  /* Older OSX has major issues with sincos() function,
-   * it's likely a big in OSL or LLVM. For until we've
-   * updated to new versions of this libraries we'll
-   * use a workaround to prevent possible crashes on all
-   * the platforms.
-   *
-   * Shouldn't be that bad because it's mainly used for
-   * anisotropic shader where angle is usually constant.
-   */
-#if 0
-  sincos(angle, sinang, cosang);
-#else
-  sinang = sin(angle);
-  cosang = cos(angle);
-#endif
-  float cosang1 = 1.0 - cosang;
-  float x = axis[0], y = axis[1], z = axis[2];
-  matrix M = matrix(x * x + (1.0 - x * x) * cosang,
-                    x * y * cosang1 + z * sinang,
-                    x * z * cosang1 - y * sinang,
-                    0.0,
-                    x * y * cosang1 - z * sinang,
-                    y * y + (1.0 - y * y) * cosang,
-                    y * z * cosang1 + x * sinang,
-                    0.0,
-                    x * z * cosang1 + y * sinang,
-                    y * z * cosang1 - x * sinang,
-                    z * z + (1.0 - z * z) * cosang,
-                    0.0,
-                    0.0,
-                    0.0,
-                    0.0,
-                    1.0);
-  return transform(M, p - a) + a;
-}
-
-normal ensure_valid_reflection(normal Ng, vector I, normal N)
-{
-  /* The implementation here mirrors the one in kernel_montecarlo.h,
-   * check there for an explanation of the algorithm. */
-
-  float sqr(float x)
-  {
-    return x * x;
-  }
-
-  vector R = 2 * dot(N, I) * N - I;
-
-  float threshold = min(0.9 * dot(Ng, I), 0.01);
-  if (dot(Ng, R) >= threshold) {
-    return N;
-  }
-
-  float NdotNg = dot(N, Ng);
-  vector X = normalize(N - NdotNg * Ng);
-
-  float Ix = dot(I, X), Iz = dot(I, Ng);
-  float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
-  float a = Ix2 + Iz2;
-
-  float b = sqrt(Ix2 * (a - sqr(threshold)));
-  float c = Iz * threshold + a;
-
-  float fac = 0.5 / a;
-  float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c);
-  int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5));
-  int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
-
-  float N_new_x, N_new_z;
-  if (valid1 && valid2) {
-    float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
-    float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
-
-    float R1 = 2 * (N1_x * Ix + N1_z * Iz) * N1_z - Iz;
-    float R2 = 2 * (N2_x * Ix + N2_z * Iz) * N2_z - Iz;
-
-    valid1 = (R1 >= 1e-5);
-    valid2 = (R2 >= 1e-5);
-    if (valid1 && valid2) {
-      N_new_x = (R1 < R2) ? N1_x : N2_x;
-      N_new_z = (R1 < R2) ? N1_z : N2_z;
-    }
-    else {
-      N_new_x = (R1 > R2) ? N1_x : N2_x;
-      N_new_z = (R1 > R2) ? N1_z : N2_z;
-    }
-  }
-  else if (valid1 || valid2) {
-    float Nz2 = valid1 ? N1_z2 : N2_z2;
-    N_new_x = sqrt(1.0 - Nz2);
-    N_new_z = sqrt(Nz2);
-  }
-  else {
-    return Ng;
-  }
-
-  return N_new_x * X + N_new_z * Ng;
-}
-
-// Color functions
-
-float luminance(color c) BUILTIN;
-color blackbody(float temperatureK) BUILTIN;
-color wavelength_color(float wavelength_nm) BUILTIN;
-
-color transformc(string to, color x)
-{
-  color rgb_to_hsv(color rgb)
-  {  // See Foley & van Dam
-    float r = rgb[0], g = rgb[1], b = rgb[2];
-    float mincomp = min(r, min(g, b));
-    float maxcomp = max(r, max(g, b));
-    float delta = maxcomp - mincomp;  // chroma
-    float h, s, v;
-    v = maxcomp;
-    if (maxcomp > 0)
-      s = delta / maxcomp;
-    else
-      s = 0;
-    if (s <= 0)
-      h = 0;
-    else {
-      if (r >= maxcomp)
-        h = (g - b) / delta;
-      else if (g >= maxcomp)
-        h = 2 + (b - r) / delta;
-      else
-        h = 4 + (r - g) / delta;
-      h /= 6;
-      if (h < 0)
-        h += 1;
-    }
-    return color(h, s, v);
-  }
-
-  color rgb_to_hsl(color rgb)
-  {  // See Foley & van Dam
-    // First convert rgb to hsv, then to hsl
-    float minval = min(rgb[0], min(rgb[1], rgb[2]));
-    color hsv = rgb_to_hsv(rgb);
-    float maxval = hsv[2];  // v == maxval
-    float h = hsv[0], s, l = (minval + maxval) / 2;
-    if (minval == maxval)
-      s = 0;  // special 'achromatic' case, hue is 0
-    else if (l <= 0.5)
-      s = (maxval - minval) / (maxval + minval);
-    else
-      s = (maxval - minval) / (2 - maxval - minval);
-    return color(h, s, l);
-  }
-
-  color r;
-  if (to == "rgb" || to == "RGB")
-    r = x;
-  else if (to == "hsv")
-    r = rgb_to_hsv(x);
-  else if (to == "hsl")
-    r = rgb_to_hsl(x);
-  else if (to == "YIQ")
-    r = color(dot(vector(0.299, 0.587, 0.114), (vector)x),
-              dot(vector(0.596, -0.275, -0.321), (vector)x),
-              dot(vector(0.212, -0.523, 0.311), (vector)x));
-  else if (to == "XYZ")
-    r = color(dot(vector(0.412453, 0.357580, 0.180423), (vector)x),
-              dot(vector(0.212671, 0.715160, 0.072169), (vector)x),
-              dot(vector(0.019334, 0.119193, 0.950227), (vector)x));
-  else {
-    error("Unknown color space \"%s\"", to);
-    r = x;
-  }
-  return r;
-}
-
-color transformc(string from, string to, color x)
-{
-  color hsv_to_rgb(color c)
-  {  // Reference: Foley & van Dam
-    float h = c[0], s = c[1], v = c[2];
-    color r;
-    if (s < 0.0001) {
-      r = v;
-    }
-    else {
-      h = 6 * (h - floor(h));  // expand to [0..6)
-      int hi = (int)h;
-      float f = h - hi;
-      float p = v * (1 - s);
-      float q = v * (1 - s * f);
-      float t = v * (1 - s * (1 - f));
-      if (hi == 0)
-        r = color(v, t, p);
-      else if (hi == 1)
-        r = color(q, v, p);
-      else if (hi == 2)
-        r = color(p, v, t);
-      else if (hi == 3)
-        r = color(p, q, v);
-      else if (hi == 4)
-        r = color(t, p, v);
-      else
-        r = color(v, p, q);
-    }
-    return r;
-  }
-
-  color hsl_to_rgb(color c)
-  {
-    float h = c[0], s = c[1], l = c[2];
-    // Easiest to convert hsl -> hsv, then hsv -> RGB (per Foley & van Dam)
-    float v = (l <= 0.5) ? (l * (1 + s)) : (l * (1 - s) + s);
-    color r;
-    if (v <= 0) {
-      r = 0;
-    }
-    else {
-      float min = 2 * l - v;
-      s = (v - min) / v;
-      r = hsv_to_rgb(color(h, s, v));
-    }
-    return r;
-  }
-
-  color r;
-  if (from == "rgb" || from == "RGB")
-    r = x;
-  else if (from == "hsv")
-    r = hsv_to_rgb(x);
-  else if (from == "hsl")
-    r = hsl_to_rgb(x);
-  else if (from == "YIQ")
-    r = color(dot(vector(1, 0.9557, 0.6199), (vector)x),
-              dot(vector(1, -0.2716, -0.6469), (vector)x),
-              dot(vector(1, -1.1082, 1.7051), (vector)x));
-  else if (from == "XYZ")
-    r = color(dot(vector(3.240479, -1.537150, -0.498535), (vector)x),
-              dot(vector(-0.969256, 1.875991, 0.041556), (vector)x),
-              dot(vector(0.055648, -0.204043, 1.057311), (vector)x));
-  else {
-    error("Unknown color space \"%s\"", to);
-    r = x;
-  }
-  return transformc(to, r);
-}
-
-// Matrix functions
-
-float determinant(matrix m) BUILTIN;
-matrix transpose(matrix m) BUILTIN;
-
-// Pattern generation
-
-color step(color edge, color x) BUILTIN;
-point step(point edge, point x) BUILTIN;
-vector step(vector edge, vector x) BUILTIN;
-normal step(normal edge, normal x) BUILTIN;
-float step(float edge, float x) BUILTIN;
-float smoothstep(float edge0, float edge1, float x) BUILTIN;
-
-float linearstep(float edge0, float edge1, float x)
-{
-  float result;
-  if (edge0 != edge1) {
-    float xclamped = clamp(x, edge0, edge1);
-    result = (xclamped - edge0) / (edge1 - edge0);
-  }
-  else {  // special case: edges coincide
-    result = step(edge0, x);
-  }
-  return result;
-}
-
-float smooth_linearstep(float edge0, float edge1, float x_, float eps_)
-{
-  float result;
-  if (edge0 != edge1) {
-    float rampup(float x, float r)
-    {
-      return 0.5 / r * x * x;
-    }
-    float width_inv = 1.0 / (edge1 - edge0);
-    float eps = eps_ * width_inv;
-    float x = (x_ - edge0) * width_inv;
-    if (x <= -eps)
-      result = 0;
-    else if (x >= eps && x <= 1.0 - eps)
-      result = x;
-    else if (x >= 1.0 + eps)
-      result = 1;
-    else if (x < eps)
-      result = rampup(x + eps, 2.0 * eps);
-    else /* if (x < 1.0+eps) */
-      result = 1.0 - rampup(1.0 + eps - x, 2.0 * eps);
-  }
-  else {
-    result = step(edge0, x_);
-  }
-  return result;
-}
-
-float aastep(float edge, float s, float dedge, float ds)
-{
-  // Box filtered AA step
-  float width = fabs(dedge) + fabs(ds);
-  float halfwidth = 0.5 * width;
-  float e1 = edge - halfwidth;
-  return (s <= e1) ? 0.0 : ((s >= (edge + halfwidth)) ? 1.0 : (s - e1) / width);
-}
-float aastep(float edge, float s, float ds)
-{
-  return aastep(edge, s, filterwidth(edge), ds);
-}
-float aastep(float edge, float s)
-{
-  return aastep(edge, s, filterwidth(edge), filterwidth(s));
-}
-
-// Derivatives and area operators
-
-// Displacement functions
-
-// String functions
-int strlen(string s) BUILTIN;
-int hash(string s) BUILTIN;
-int getchar(string s, int index) BUILTIN;
-int startswith(string s, string prefix) BUILTIN;
-int endswith(string s, string suffix) BUILTIN;
-string substr(string s, int start, int len) BUILTIN;
-string substr(string s, int start)
-{
-  return substr(s, start, strlen(s));
-}
-float stof(string str) BUILTIN;
-int stoi(string str) BUILTIN;
-
-// Define concat in terms of shorter concat
-string concat(string a, string b, string c)
-{
-  return concat(concat(a, b), c);
-}
-string concat(string a, string b, string c, string d)
-{
-  return concat(concat(a, b, c), d);
-}
-string concat(string a, string b, string c, string d, string e)
-{
-  return concat(concat(a, b, c, d), e);
-}
-string concat(string a, string b, string c, string d, string e, string f)
-{
-  return concat(concat(a, b, c, d, e), f);
-}
-
-// Texture
-
-// Closures
-
-closure color diffuse(normal N) BUILTIN;
-closure color oren_nayar(normal N, float sigma) BUILTIN;
-closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
-closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
-closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
-closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
-closure color translucent(normal N) BUILTIN;
-closure color reflection(normal N) BUILTIN;
-closure color refraction(normal N, float eta) BUILTIN;
-closure color transparent() BUILTIN;
-closure color microfacet_ggx(normal N, float ag) BUILTIN;
-closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN;
-closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
-closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
-closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
-closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
-closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
-closure color microfacet_ggx_aniso_fresnel(
-    normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
-closure color
-microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
-closure color microfacet_multi_ggx_aniso_fresnel(
-    normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
-closure color
-microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
-closure color microfacet_beckmann(normal N, float ab) BUILTIN;
-closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
-closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
-closure color ashikhmin_shirley(normal N, vector T, float ax, float ay) BUILTIN;
-closure color ashikhmin_velvet(normal N, float sigma) BUILTIN;
-closure color emission() BUILTIN;
-closure color background() BUILTIN;
-closure color holdout() BUILTIN;
-closure color ambient_occlusion() BUILTIN;
-closure color principled_diffuse(normal N, float roughness) BUILTIN;
-closure color principled_sheen(normal N) BUILTIN;
-closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
-
-// BSSRDF
-closure color bssrdf(string method, normal N, vector radius, color albedo) BUILTIN;
-
-// Hair
-closure color
-hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
-closure color
-hair_transmission(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
-closure color principled_hair(normal N,
-                              color sigma,
-                              float roughnessu,
-                              float roughnessv,
-                              float coat,
-                              float alpha,
-                              float eta) BUILTIN;
-
-// Volume
-closure color henyey_greenstein(float g) BUILTIN;
-closure color absorption() BUILTIN;
-
-// OSL 1.5 Microfacet functions
-closure color microfacet(
-    string distribution, normal N, vector U, float xalpha, float yalpha, float eta, int refract)
-{
-  /* GGX */
-  if (distribution == "ggx" || distribution == "default") {
-    if (!refract) {
-      if (xalpha == yalpha) {
-        /* Isotropic */
-        return microfacet_ggx(N, xalpha);
-      }
-      else {
-        /* Anisotropic */
-        return microfacet_ggx_aniso(N, U, xalpha, yalpha);
-      }
-    }
-    else {
-      return microfacet_ggx_refraction(N, xalpha, eta);
-    }
-  }
-  /* Beckmann */
-  else {
-    if (!refract) {
-      if (xalpha == yalpha) {
-        /* Isotropic */
-        return microfacet_beckmann(N, xalpha);
-      }
-      else {
-        /* Anisotropic */
-        return microfacet_beckmann_aniso(N, U, xalpha, yalpha);
-      }
-    }
-    else {
-      return microfacet_beckmann_refraction(N, xalpha, eta);
-    }
-  }
-}
-
-closure color microfacet(string distribution, normal N, float alpha, float eta, int refract)
-{
-  return microfacet(distribution, N, vector(0), alpha, alpha, eta, refract);
-}
-
-// Renderer state
-int backfacing() BUILTIN;
-int raytype(string typename) BUILTIN;
-// the individual 'isFOOray' functions are deprecated
-int iscameraray()
-{
-  return raytype("camera");
-}
-int isdiffuseray()
-{
-  return raytype("diffuse");
-}
-int isglossyray()
-{
-  return raytype("glossy");
-}
-int isshadowray()
-{
-  return raytype("shadow");
-}
-int getmatrix(string fromspace, string tospace, output matrix M) BUILTIN;
-int getmatrix(string fromspace, output matrix M)
-{
-  return getmatrix(fromspace, "common", M);
-}
-
-// Miscellaneous
-
-#undef BUILTIN
-#undef BUILTIN_DERIV
-#undef PERCOMP1
-#undef PERCOMP2
-#undef PERCOMP2F
-
-#endif /* CCL_STDOSL_H */
diff --git a/intern/cycles/kernel/shaders/vector2.h b/intern/cycles/kernel/shaders/vector2.h
deleted file mode 100644
index c524735d892..00000000000
--- a/intern/cycles/kernel/shaders/vector2.h
+++ /dev/null
@@ -1,291 +0,0 @@
-// Open Shading Language : Copyright (c) 2009-2017 Sony Pictures Imageworks Inc., et al.
-// https://github.com/imageworks/OpenShadingLanguage/blob/master/LICENSE
-
-#pragma once
-#define VECTOR2_H
-
-// vector2 is a 2D vector
-struct vector2 {
-  float x;
-  float y;
-};
-
-//
-// For vector2, define math operators to match vector
-//
-
-vector2 __operator__neg__(vector2 a)
-{
-  return vector2(-a.x, -a.y);
-}
-
-vector2 __operator__add__(vector2 a, vector2 b)
-{
-  return vector2(a.x + b.x, a.y + b.y);
-}
-
-vector2 __operator__add__(vector2 a, int b)
-{
-  return a + vector2(b, b);
-}
-
-vector2 __operator__add__(vector2 a, float b)
-{
-  return a + vector2(b, b);
-}
-
-vector2 __operator__add__(int a, vector2 b)
-{
-  return vector2(a, a) + b;
-}
-
-vector2 __operator__add__(float a, vector2 b)
-{
-  return vector2(a, a) + b;
-}
-
-vector2 __operator__sub__(vector2 a, vector2 b)
-{
-  return vector2(a.x - b.x, a.y - b.y);
-}
-
-vector2 __operator__sub__(vector2 a, int b)
-{
-  return a - vector2(b, b);
-}
-
-vector2 __operator__sub__(vector2 a, float b)
-{
-  return a - vector2(b, b);
-}
-
-vector2 __operator__sub__(int a, vector2 b)
-{
-  return vector2(a, a) - b;
-}
-
-vector2 __operator__sub__(float a, vector2 b)
-{
-  return vector2(a, a) - b;
-}
-
-vector2 __operator__mul__(vector2 a, vector2 b)
-{
-  return vector2(a.x * b.x, a.y * b.y);
-}
-
-vector2 __operator__mul__(vector2 a, int b)
-{
-  return a * vector2(b, b);
-}
-
-vector2 __operator__mul__(vector2 a, float b)
-{
-  return a * vector2(b, b);
-}
-
-vector2 __operator__mul__(int a, vector2 b)
-{
-  return b * vector2(a, a);
-}
-
-vector2 __operator__mul__(float a, vector2 b)
-{
-  return b * vector2(a, a);
-}
-
-vector2 __operator__div__(vector2 a, vector2 b)
-{
-  return vector2(a.x / b.x, a.y / b.y);
-}
-
-vector2 __operator__div__(vector2 a, int b)
-{
-  float b_inv = 1 / b;
-  return a * vector2(b_inv, b_inv);
-}
-
-vector2 __operator__div__(vector2 a, float b)
-{
-  float b_inv = 1 / b;
-  return a * vector2(b_inv, b_inv);
-}
-
-vector2 __operator__div__(int a, vector2 b)
-{
-  return vector2(a, a) / b;
-}
-
-vector2 __operator__div__(float a, vector2 b)
-{
-  return vector2(a, a) / b;
-}
-
-int __operator__eq__(vector2 a, vector2 b)
-{
-  return (a.x == b.x) && (a.y == b.y);
-}
-
-int __operator__ne__(vector2 a, vector2 b)
-{
-  return (a.x != b.x) || (a.y != b.y);
-}
-
-//
-// For vector2, define most of the stdosl functions to match vector
-//
-
-vector2 abs(vector2 a)
-{
-  return vector2(abs(a.x), abs(a.y));
-}
-
-vector2 ceil(vector2 a)
-{
-  return vector2(ceil(a.x), ceil(a.y));
-}
-
-vector2 floor(vector2 a)
-{
-  return vector2(floor(a.x), floor(a.y));
-}
-
-vector2 sqrt(vector2 a)
-{
-  return vector2(sqrt(a.x), sqrt(a.y));
-}
-
-vector2 exp(vector2 a)
-{
-  return vector2(exp(a.x), exp(a.y));
-}
-
-vector2 log(vector2 a)
-{
-  return vector2(log(a.x), log(a.y));
-}
-
-vector2 log2(vector2 a)
-{
-  return vector2(log2(a.x), log2(a.y));
-}
-
-vector2 mix(vector2 a, vector2 b, float x)
-{
-  return vector2(mix(a.x, b.x, x), mix(a.y, b.y, x));
-}
-
-float dot(vector2 a, vector2 b)
-{
-  return (a.x * b.x + a.y * b.y);
-}
-
-float length(vector2 a)
-{
-  return hypot(a.x, a.y);
-}
-
-vector2 smoothstep(vector2 low, vector2 high, vector2 in)
-{
-  return vector2(smoothstep(low.x, high.x, in.x), smoothstep(low.y, high.y, in.y));
-}
-
-vector2 smoothstep(float low, float high, vector2 in)
-{
-  return vector2(smoothstep(low, high, in.x), smoothstep(low, high, in.y));
-}
-
-vector2 clamp(vector2 in, vector2 low, vector2 high)
-{
-  return vector2(clamp(in.x, low.x, high.x), clamp(in.y, low.y, high.y));
-}
-
-vector2 clamp(vector2 in, float low, float high)
-{
-  return clamp(in, vector2(low, low), vector2(high, high));
-}
-
-vector2 max(vector2 a, vector2 b)
-{
-  return vector2(max(a.x, b.x), max(a.y, b.y));
-}
-
-vector2 max(vector2 a, float b)
-{
-  return max(a, vector2(b, b));
-}
-
-vector2 normalize(vector2 a)
-{
-  return a / length(a);
-}
-
-vector2 min(vector2 a, vector2 b)
-{
-  return vector2(min(a.x, a.x), min(b.y, b.y));
-}
-
-vector2 min(vector2 a, float b)
-{
-  return min(a, vector2(b, b));
-}
-
-vector2 fmod(vector2 a, vector2 b)
-{
-  return vector2(fmod(a.x, b.x), fmod(a.y, b.y));
-}
-
-vector2 fmod(vector2 a, float b)
-{
-  return fmod(a, vector2(b, b));
-}
-
-vector2 pow(vector2 in, vector2 amount)
-{
-  return vector2(pow(in.x, amount.x), pow(in.y, amount.y));
-}
-
-vector2 pow(vector2 in, float amount)
-{
-  return pow(in, vector2(amount, amount));
-}
-
-vector2 sign(vector2 a)
-{
-  return vector2(sign(a.x), sign(a.y));
-}
-
-vector2 sin(vector2 a)
-{
-  return vector2(sin(a.x), sin(a.y));
-}
-
-vector2 cos(vector2 a)
-{
-  return vector2(cos(a.x), cos(a.y));
-}
-
-vector2 tan(vector2 a)
-{
-  return vector2(tan(a.x), tan(a.y));
-}
-
-vector2 asin(vector2 a)
-{
-  return vector2(asin(a.x), asin(a.y));
-}
-
-vector2 acos(vector2 a)
-{
-  return vector2(acos(a.x), acos(a.y));
-}
-
-vector2 atan2(vector2 a, float f)
-{
-  return vector2(atan2(a.x, f), atan2(a.y, f));
-}
-
-vector2 atan2(vector2 a, vector2 b)
-{
-  return vector2(atan2(a.x, b.x), atan2(a.y, b.y));
-}
diff --git a/intern/cycles/kernel/shaders/vector4.h b/intern/cycles/kernel/shaders/vector4.h
deleted file mode 100644
index 58e1b3c2e23..00000000000
--- a/intern/cycles/kernel/shaders/vector4.h
+++ /dev/null
@@ -1,327 +0,0 @@
-// Open Shading Language : Copyright (c) 2009-2017 Sony Pictures Imageworks Inc., et al.
-// https://github.com/imageworks/OpenShadingLanguage/blob/master/LICENSE
-
-#pragma once
-#define VECTOR4_H
-
-// vector4 is a 4D vector
-struct vector4 {
-  float x;
-  float y;
-  float z;
-  float w;
-};
-
-//
-// For vector4, define math operators to match vector
-//
-
-vector4 __operator__neg__(vector4 a)
-{
-  return vector4(-a.x, -a.y, -a.z, -a.w);
-}
-
-vector4 __operator__add__(vector4 a, vector4 b)
-{
-  return vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-
-vector4 __operator__add__(vector4 a, int b)
-{
-  return a + vector4(b, b, b, b);
-}
-
-vector4 __operator__add__(vector4 a, float b)
-{
-  return a + vector4(b, b, b, b);
-}
-
-vector4 __operator__add__(int a, vector4 b)
-{
-  return vector4(a, a, a, a) + b;
-}
-
-vector4 __operator__add__(float a, vector4 b)
-{
-  return vector4(a, a, a, a) + b;
-}
-
-vector4 __operator__sub__(vector4 a, vector4 b)
-{
-  return vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-}
-
-vector4 __operator__sub__(vector4 a, int b)
-{
-  return a - vector4(b, b, b, b);
-}
-
-vector4 __operator__sub__(vector4 a, float b)
-{
-  return a - vector4(b, b, b, b);
-}
-
-vector4 __operator__sub__(int a, vector4 b)
-{
-  return vector4(a, a, a, a) - b;
-}
-
-vector4 __operator__sub__(float a, vector4 b)
-{
-  return vector4(a, a, a, a) - b;
-}
-
-vector4 __operator__mul__(vector4 a, vector4 b)
-{
-  return vector4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-}
-
-vector4 __operator__mul__(vector4 a, int b)
-{
-  return a * vector4(b, b, b, b);
-}
-
-vector4 __operator__mul__(vector4 a, float b)
-{
-  return a * vector4(b, b, b, b);
-}
-
-vector4 __operator__mul__(int a, vector4 b)
-{
-  return vector4(a, a, a, a) * b;
-}
-
-vector4 __operator__mul__(float a, vector4 b)
-{
-  return vector4(a, a, a, a) * b;
-}
-
-vector4 __operator__div__(vector4 a, vector4 b)
-{
-  return vector4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-}
-
-vector4 __operator__div__(vector4 a, int b)
-{
-  float b_inv = 1 / b;
-  return a * vector4(b_inv, b_inv, b_inv, b_inv);
-}
-
-vector4 __operator__div__(vector4 a, float b)
-{
-  float b_inv = 1 / b;
-  return a * vector4(b_inv, b_inv, b_inv, b_inv);
-}
-
-vector4 __operator__div__(int a, vector4 b)
-{
-  return vector4(a, a, a, a) / b;
-}
-
-vector4 __operator__div__(float a, vector4 b)
-{
-  return vector4(a, a, a, a) / b;
-}
-
-int __operator__eq__(vector4 a, vector4 b)
-{
-  return (a.x == b.x) && (a.y == b.y) && (a.z == b.z) && (a.w == b.w);
-}
-
-int __operator__ne__(vector4 a, vector4 b)
-{
-  return (a.x != b.x) || (a.y != b.y) || (a.z != b.z) || (a.w != b.w);
-}
-
-//
-// For vector4, define most of the stdosl functions to match vector
-//
-
-vector4 abs(vector4 in)
-{
-  return vector4(abs(in.x), abs(in.y), abs(in.z), abs(in.w));
-}
-
-vector4 ceil(vector4 in)
-{
-  return vector4(ceil(in.x), ceil(in.y), ceil(in.z), ceil(in.w));
-}
-
-vector4 floor(vector4 in)
-{
-  return vector4(floor(in.x), floor(in.y), floor(in.z), floor(in.w));
-}
-
-vector4 sqrt(vector4 in)
-{
-  return vector4(sqrt(in.x), sqrt(in.y), sqrt(in.z), sqrt(in.w));
-}
-
-vector4 exp(vector4 in)
-{
-  return vector4(exp(in.x), exp(in.y), exp(in.z), exp(in.w));
-}
-
-vector4 log(vector4 in)
-{
-  return vector4(log(in.x), log(in.y), log(in.z), log(in.w));
-}
-
-vector4 log2(vector4 in)
-{
-  return vector4(log2(in.x), log2(in.y), log2(in.z), log2(in.w));
-}
-
-vector4 mix(vector4 value1, vector4 value2, float x)
-{
-  return vector4(mix(value1.x, value2.x, x),
-                 mix(value1.y, value2.y, x),
-                 mix(value1.z, value2.z, x),
-                 mix(value1.w, value2.w, x));
-}
-
-vector vec4ToVec3(vector4 v)
-{
-  return vector(v.x, v.y, v.z) / v.w;
-}
-
-float dot(vector4 a, vector4 b)
-{
-  return ((a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w));
-}
-
-float length(vector4 a)
-{
-  return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w);
-}
-
-vector4 smoothstep(vector4 low, vector4 high, vector4 in)
-{
-  return vector4(smoothstep(low.x, high.x, in.x),
-                 smoothstep(low.y, high.y, in.y),
-                 smoothstep(low.z, high.z, in.z),
-                 smoothstep(low.w, high.w, in.w));
-}
-
-vector4 smoothstep(float low, float high, vector4 in)
-{
-  return vector4(smoothstep(low, high, in.x),
-                 smoothstep(low, high, in.y),
-                 smoothstep(low, high, in.z),
-                 smoothstep(low, high, in.w));
-}
-
-vector4 clamp(vector4 in, vector4 low, vector4 high)
-{
-  return vector4(clamp(in.x, low.x, high.x),
-                 clamp(in.y, low.y, high.y),
-                 clamp(in.z, low.z, high.z),
-                 clamp(in.w, low.w, high.w));
-}
-
-vector4 clamp(vector4 in, float low, float high)
-{
-  return vector4(clamp(in.x, low, high),
-                 clamp(in.y, low, high),
-                 clamp(in.z, low, high),
-                 clamp(in.w, low, high));
-}
-
-vector4 max(vector4 a, vector4 b)
-{
-  return vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-}
-
-vector4 max(vector4 a, float b)
-{
-  return max(a, vector4(b, b, b, b));
-}
-
-vector4 normalize(vector4 a)
-{
-  return a / length(a);
-}
-
-vector4 min(vector4 a, vector4 b)
-{
-  return vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-}
-
-vector4 min(vector4 a, float b)
-{
-  return min(a, vector4(b, b, b, b));
-}
-
-vector4 fmod(vector4 a, vector4 b)
-{
-  return vector4(fmod(a.x, b.x), fmod(a.y, b.y), fmod(a.z, b.z), fmod(a.w, b.w));
-}
-
-vector4 fmod(vector4 a, float b)
-{
-  return fmod(a, vector4(b, b, b, b));
-}
-
-vector4 pow(vector4 in, vector4 amount)
-{
-  return vector4(
-      pow(in.x, amount.x), pow(in.y, amount.y), pow(in.z, amount.z), pow(in.w, amount.w));
-}
-
-vector4 pow(vector4 in, float amount)
-{
-  return vector4(pow(in.x, amount), pow(in.y, amount), pow(in.z, amount), pow(in.w, amount));
-}
-
-vector4 sign(vector4 a)
-{
-  return vector4(sign(a.x), sign(a.y), sign(a.z), sign(a.w));
-}
-
-vector4 sin(vector4 a)
-{
-  return vector4(sin(a.x), sin(a.y), sin(a.z), sin(a.w));
-}
-
-vector4 cos(vector4 a)
-{
-  return vector4(cos(a.x), cos(a.y), cos(a.z), cos(a.w));
-}
-
-vector4 tan(vector4 a)
-{
-  return vector4(tan(a.x), tan(a.y), tan(a.z), tan(a.w));
-}
-
-vector4 asin(vector4 a)
-{
-  return vector4(asin(a.x), asin(a.y), asin(a.z), asin(a.w));
-}
-
-vector4 acos(vector4 a)
-{
-  return vector4(acos(a.x), acos(a.y), acos(a.z), acos(a.w));
-}
-
-vector4 atan2(vector4 a, float f)
-{
-  return vector4(atan2(a.x, f), atan2(a.y, f), atan2(a.z, f), atan2(a.w, f));
-}
-
-vector4 atan2(vector4 a, vector4 b)
-{
-  return vector4(atan2(a.x, b.x), atan2(a.y, b.y), atan2(a.z, b.z), atan2(a.w, b.w));
-}
-
-vector4 transform(matrix M, vector4 p)
-{
-  return vector4(M[0][0] * p.x + M[0][1] * p.y + M[0][2] * p.z + M[0][2] * p.w,
-                 M[1][0] * p.x + M[1][1] * p.y + M[1][2] * p.z + M[1][2] * p.w,
-                 M[2][0] * p.x + M[2][1] * p.y + M[2][2] * p.z + M[2][2] * p.w,
-                 M[3][0] * p.x + M[3][1] * p.y + M[3][2] * p.z + M[3][2] * p.w);
-}
-
-vector4 transform(string fromspace, string tospace, vector4 p)
-{
-  return transform(matrix(fromspace, tospace), p);
-}
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
new file mode 100644
index 00000000000..60ebf415970
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* One work item per tile pixel: passes a sample-count ratio to kernel_adaptive_post_adjust(). */
+ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
+    int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+    int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+    int buffer_offset = (kernel_split_params.tile.offset + x +
+                         y * kernel_split_params.tile.stride) *
+                        kernel_data.film.pass_stride;
+    ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+    int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
+    if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { /* Sign is cleared below. */
+      buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+      float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
+                                             buffer[kernel_data.film.pass_sample_count]);
+      if (sample_multiplier != 1.0f) { /* Skip the call when the ratio is exactly 1. */
+        kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+      }
+    }
+    else { /* Stored count is not negative: use the ratio sample / (sample - 1). */
+      kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
+    }
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
new file mode 100644
index 00000000000..93f41f7ced4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* One work item per tile row: runs kernel_do_adaptive_filter_x() on row tile.y + index. */
+ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.h && /* Index is bounded by the tile height. */
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) { /* Needs the minimum sample count. */
+    int y = kernel_split_params.tile.y + pixel_index;
+    kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
new file mode 100644
index 00000000000..eca53d079ec
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* One work item per tile column: runs kernel_do_adaptive_filter_y() on column tile.x + index. */
+ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w && /* Index is bounded by the tile width. */
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) { /* Needs the minimum sample count. */
+    int x = kernel_split_params.tile.x + pixel_index;
+    kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
+  }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
new file mode 100644
index 00000000000..c8eb1ebd705
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* One work item per tile pixel: runs kernel_do_adaptive_stopping() on that pixel's buffer. */
+ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) { /* Needs the minimum sample count. */
+    int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+    int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+    int buffer_offset = (kernel_split_params.tile.offset + x +
+                         y * kernel_split_params.tile.stride) *
+                        kernel_data.film.pass_stride;
+    ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+    kernel_do_adaptive_stopping(kg, /* Last argument is start_sample + num_samples - 1. */
+                                buffer,
+                                kernel_split_params.tile.start_sample +
+                                    kernel_split_params.tile.num_samples - 1);
+  }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 45b839db05f..b24699ec39c 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -44,7 +44,7 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
                      branched_state->isect.t :
                      FLT_MAX;
 
-  bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+  float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack);
 
   for (int j = branched_state->next_sample; j < num_samples; j++) {
     ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
@@ -61,7 +61,7 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 
     /* integrate along volume segment with distance sampling */
     VolumeIntegrateResult result = kernel_volume_integrate(
-        kg, ps, sd, &volume_ray, L, tp, heterogeneous);
+        kg, ps, sd, &volume_ray, L, tp, step_size);
 
 #  ifdef __VOLUME_SCATTER__
     if (result == VOLUME_PATH_SCATTERED) {
@@ -164,12 +164,12 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
       if (!kernel_data.integrator.branched ||
           IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
 #  endif /* __BRANCHED_PATH__ */
-        bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+        float step_size = volume_stack_step_size(kg, state->volume_stack);
 
         {
           /* integrate along volume segment with distance sampling */
           VolumeIntegrateResult result = kernel_volume_integrate(
-              kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+              kg, state, sd, &volume_ray, L, throughput, step_size);
 
 #  ifdef __VOLUME_SCATTER__
           if (result == VOLUME_PATH_SCATTERED) {
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 384bc952460..5114f2b03e5 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,6 +17,7 @@
 #ifndef __KERNEL_SPLIT_H__
 #define __KERNEL_SPLIT_H__
 
+// clang-format off
 #include "kernel/kernel_math.h"
 #include "kernel/kernel_types.h"
 
@@ -52,6 +53,7 @@
 #ifdef __BRANCHED_PATH__
 #  include "kernel/split/kernel_branched.h"
 #endif
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 433b1221a37..decc537b39b 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -18,6 +18,7 @@
 #define __KERNEL_SPLIT_DATA_H__
 
 #include "kernel/split/kernel_split_data_types.h"
+
 #include "kernel/kernel_globals.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index fd2833ee687..abeb8fa7457 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -161,52 +161,53 @@ CCL_NAMESPACE_END
 #include "svm_fractal_noise.h"
 
 #include "kernel/svm/svm_color_util.h"
-#include "kernel/svm/svm_math_util.h"
 #include "kernel/svm/svm_mapping_util.h"
+#include "kernel/svm/svm_math_util.h"
 
 #include "kernel/svm/svm_aov.h"
 #include "kernel/svm/svm_attribute.h"
-#include "kernel/svm/svm_gradient.h"
 #include "kernel/svm/svm_blackbody.h"
+#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_brightness.h"
+#include "kernel/svm/svm_bump.h"
+#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_checker.h"
+#include "kernel/svm/svm_clamp.h"
 #include "kernel/svm/svm_closure.h"
-#include "kernel/svm/svm_noisetex.h"
 #include "kernel/svm/svm_convert.h"
 #include "kernel/svm/svm_displace.h"
 #include "kernel/svm/svm_fresnel.h"
-#include "kernel/svm/svm_wireframe.h"
-#include "kernel/svm/svm_wavelength.h"
-#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_gamma.h"
 #include "kernel/svm/svm_geometry.h"
+#include "kernel/svm/svm_gradient.h"
 #include "kernel/svm/svm_hsv.h"
 #include "kernel/svm/svm_ies.h"
 #include "kernel/svm/svm_image.h"
-#include "kernel/svm/svm_gamma.h"
-#include "kernel/svm/svm_brightness.h"
 #include "kernel/svm/svm_invert.h"
 #include "kernel/svm/svm_light_path.h"
 #include "kernel/svm/svm_magic.h"
+#include "kernel/svm/svm_map_range.h"
 #include "kernel/svm/svm_mapping.h"
-#include "kernel/svm/svm_normal.h"
-#include "kernel/svm/svm_wave.h"
 #include "kernel/svm/svm_math.h"
 #include "kernel/svm/svm_mix.h"
+#include "kernel/svm/svm_musgrave.h"
+#include "kernel/svm/svm_noisetex.h"
+#include "kernel/svm/svm_normal.h"
 #include "kernel/svm/svm_ramp.h"
 #include "kernel/svm/svm_sepcomb_hsv.h"
 #include "kernel/svm/svm_sepcomb_vector.h"
-#include "kernel/svm/svm_musgrave.h"
 #include "kernel/svm/svm_sky.h"
 #include "kernel/svm/svm_tex_coord.h"
 #include "kernel/svm/svm_value.h"
-#include "kernel/svm/svm_voronoi.h"
-#include "kernel/svm/svm_checker.h"
-#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_vector_rotate.h"
 #include "kernel/svm/svm_vector_transform.h"
+#include "kernel/svm/svm_vertex_color.h"
+#include "kernel/svm/svm_voronoi.h"
 #include "kernel/svm/svm_voxel.h"
-#include "kernel/svm/svm_bump.h"
-#include "kernel/svm/svm_map_range.h"
-#include "kernel/svm/svm_clamp.h"
+#include "kernel/svm/svm_wave.h"
+#include "kernel/svm/svm_wavelength.h"
 #include "kernel/svm/svm_white_noise.h"
-#include "kernel/svm/svm_vertex_color.h"
+#include "kernel/svm/svm_wireframe.h"
 
 #ifdef __SHADER_RAYTRACE__
 #  include "kernel/svm/svm_ao.h"
@@ -230,6 +231,8 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
     uint4 node = read_node(kg, &offset);
 
     switch (node.x) {
+      case NODE_END:
+        return;
 #if NODES_GROUP(NODE_GROUP_LEVEL_0)
       case NODE_SHADER_JUMP: {
         if (type == SHADER_TYPE_SURFACE)
@@ -309,7 +312,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
         svm_node_vector_displacement(kg, sd, stack, node, &offset);
         break;
 #  endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
-#  ifdef __TEXTURES__
       case NODE_TEX_IMAGE:
         svm_node_tex_image(kg, sd, stack, node, &offset);
         break;
@@ -319,9 +321,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_TEX_NOISE:
         svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
         break;
-#  endif /* __TEXTURES__ */
-#  ifdef __EXTRA_NODES__
-#    if NODES_FEATURE(NODE_FEATURE_BUMP)
+#  if NODES_FEATURE(NODE_FEATURE_BUMP)
       case NODE_SET_BUMP:
         svm_node_set_bump(kg, sd, stack, node);
         break;
@@ -346,20 +346,19 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_CLOSURE_SET_NORMAL:
         svm_node_set_normal(kg, sd, stack, node.y, node.z);
         break;
-#      if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
+#    if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
       case NODE_ENTER_BUMP_EVAL:
         svm_node_enter_bump_eval(kg, sd, stack, node.y);
         break;
       case NODE_LEAVE_BUMP_EVAL:
         svm_node_leave_bump_eval(kg, sd, stack, node.y);
         break;
-#      endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
-#    endif   /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+#    endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
+#  endif   /* NODES_FEATURE(NODE_FEATURE_BUMP) */
       case NODE_HSV:
         svm_node_hsv(kg, sd, stack, node, &offset);
         break;
-#  endif /* __EXTRA_NODES__ */
-#endif   /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
 
 #if NODES_GROUP(NODE_GROUP_LEVEL_1)
       case NODE_CLOSURE_HOLDOUT:
@@ -379,7 +378,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
         svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
         break;
 #  endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
-#  ifdef __EXTRA_NODES__
       case NODE_MATH:
         svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
         break;
@@ -404,15 +402,12 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_PARTICLE_INFO:
         svm_node_particle_info(kg, sd, stack, node.y, node.z);
         break;
-#    ifdef __HAIR__
-#      if NODES_FEATURE(NODE_FEATURE_HAIR)
+#  if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR)
       case NODE_HAIR_INFO:
         svm_node_hair_info(kg, sd, stack, node.y, node.z);
         break;
-#      endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
-#    endif   /* __HAIR__ */
-#  endif     /* __EXTRA_NODES__ */
-#endif       /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+#  endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
+#endif   /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
 
 #if NODES_GROUP(NODE_GROUP_LEVEL_2)
       case NODE_TEXTURE_MAPPING:
@@ -427,7 +422,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_CAMERA:
         svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
         break;
-#  ifdef __TEXTURES__
       case NODE_TEX_ENVIRONMENT:
         svm_node_tex_environment(kg, sd, stack, node);
         break;
@@ -458,8 +452,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_TEX_WHITE_NOISE:
         svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
         break;
-#  endif /* __TEXTURES__ */
-#  ifdef __EXTRA_NODES__
       case NODE_NORMAL:
         svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
         break;
@@ -469,19 +461,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_IES:
         svm_node_ies(kg, sd, stack, node, &offset);
         break;
-      case NODE_AOV_START:
-        if (!svm_node_aov_check(state, buffer)) {
-          return;
-        }
-        break;
-      case NODE_AOV_COLOR:
-        svm_node_aov_color(kg, sd, stack, node, buffer);
-        break;
-      case NODE_AOV_VALUE:
-        svm_node_aov_value(kg, sd, stack, node, buffer);
-        break;
-#  endif /* __EXTRA_NODES__ */
-#endif   /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
 
 #if NODES_GROUP(NODE_GROUP_LEVEL_3)
       case NODE_RGB_CURVES:
@@ -494,7 +474,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_NORMAL_MAP:
         svm_node_normal_map(kg, sd, stack, node);
         break;
-#  ifdef __EXTRA_NODES__
       case NODE_INVERT:
         svm_node_invert(sd, stack, node.y, node.z, node.w);
         break;
@@ -513,6 +492,9 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_COMBINE_HSV:
         svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
         break;
+      case NODE_VECTOR_ROTATE:
+        svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
+        break;
       case NODE_VECTOR_TRANSFORM:
         svm_node_vector_transform(kg, sd, stack, node);
         break;
@@ -531,12 +513,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
       case NODE_CLAMP:
         svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset);
         break;
-#  endif /* __EXTRA_NODES__ */
-#  if NODES_FEATURE(NODE_FEATURE_VOLUME)
-      case NODE_TEX_VOXEL:
-        svm_node_tex_voxel(kg, sd, stack, node, &offset);
-        break;
-#  endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
 #  ifdef __SHADER_RAYTRACE__
       case NODE_BEVEL:
         svm_node_bevel(kg, sd, state, stack, node);
@@ -546,8 +522,25 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg,
         break;
 #  endif /* __SHADER_RAYTRACE__ */
 #endif   /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
-      case NODE_END:
-        return;
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_4)
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME)
+      case NODE_TEX_VOXEL:
+        svm_node_tex_voxel(kg, sd, stack, node, &offset);
+        break;
+#  endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
+      case NODE_AOV_START:
+        if (!svm_node_aov_check(state, buffer)) {
+          return;
+        }
+        break;
+      case NODE_AOV_COLOR:
+        svm_node_aov_color(kg, sd, stack, node, buffer);
+        break;
+      case NODE_AOV_VALUE:
+        svm_node_aov_value(kg, sd, stack, node, buffer);
+        break;
+#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */
       default:
         kernel_assert(!"Unknown node type was passed to the SVM machine");
         return;
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index bf2d3f4fbff..cb1b521c585 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -16,23 +16,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Hair Melanin */
-
-ccl_device_inline float3 sigma_from_concentration(float eumelanin, float pheomelanin)
-{
-  return eumelanin * make_float3(0.506f, 0.841f, 1.653f) +
-         pheomelanin * make_float3(0.343f, 0.733f, 1.924f);
-}
-
-ccl_device_inline float3 sigma_from_reflectance(float3 color, float azimuthal_roughness)
-{
-  float x = azimuthal_roughness;
-  float roughness_fac = (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x +
-                        5.969f;
-  float3 sigma = log3(color) / roughness_fac;
-  return sigma * sigma;
-}
-
 /* Closure Nodes */
 
 ccl_device void svm_node_glass_setup(
@@ -868,24 +851,26 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
             /* Benedikt Bitterli's melanin ratio remapping. */
             float eumelanin = melanin * (1.0f - melanin_redness);
             float pheomelanin = melanin * melanin_redness;
-            float3 melanin_sigma = sigma_from_concentration(eumelanin, pheomelanin);
+            float3 melanin_sigma = bsdf_principled_hair_sigma_from_concentration(eumelanin,
+                                                                                 pheomelanin);
 
             /* Optional tint. */
             float3 tint = stack_load_float3(stack, tint_ofs);
-            float3 tint_sigma = sigma_from_reflectance(tint, radial_roughness);
+            float3 tint_sigma = bsdf_principled_hair_sigma_from_reflectance(tint,
+                                                                            radial_roughness);
 
             bsdf->sigma = melanin_sigma + tint_sigma;
             break;
           }
           case NODE_PRINCIPLED_HAIR_REFLECTANCE: {
             float3 color = stack_load_float3(stack, color_ofs);
-            bsdf->sigma = sigma_from_reflectance(color, radial_roughness);
+            bsdf->sigma = bsdf_principled_hair_sigma_from_reflectance(color, radial_roughness);
             break;
           }
           default: {
             /* Fallback to brownish hair, same as defaults for melanin. */
             kernel_assert(!"Invalid Principled Hair parametrization!");
-            bsdf->sigma = sigma_from_concentration(0.0f, 0.8054375f);
+            bsdf->sigma = bsdf_principled_hair_sigma_from_concentration(0.0f, 0.8054375f);
             break;
           }
         }
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 90f1a7845c7..f57c85fc23e 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,8 +16,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __TEXTURES__
-
 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags)
 {
   if (id == -1) {
@@ -30,10 +28,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
   if ((flags & NODE_IMAGE_ALPHA_UNASSOCIATE) && alpha != 1.0f && alpha != 0.0f) {
     r /= alpha;
-    const int texture_type = kernel_tex_type(id);
-    if (texture_type == IMAGE_DATA_TYPE_BYTE4 || texture_type == IMAGE_DATA_TYPE_BYTE) {
-      r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
-    }
     r.w = alpha;
   }
 
@@ -250,6 +244,4 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg,
     stack_store_float(stack, alpha_offset, f.w);
 }
 
-#endif /* __TEXTURES__ */
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 82cae7bbacf..01e01c399ea 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -51,11 +51,19 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
 
   float3 a = stack_load_float3(stack, a_stack_offset);
   float3 b = stack_load_float3(stack, b_stack_offset);
+  float3 c = make_float3(0.0f, 0.0f, 0.0f); /* Only read by three-vector operators. */
   float scale = stack_load_float(stack, scale_stack_offset);
 
   float value;
   float3 vector;
-  svm_vector_math(&value, &vector, (NodeVectorMathType)type, a, b, scale);
+
+  /* Three-vector operators read their third operand from an extra node. */
+  if (type == NODE_VECTOR_MATH_WRAP) {
+    uint4 extra_node = read_node(kg, offset);
+    c = stack_load_float3(stack, extra_node.x);
+  }
+
+  svm_vector_math(&value, &vector, (NodeVectorMathType)type, a, b, c, scale);
 
   if (stack_valid(value_stack_offset))
     stack_store_float(stack, value_stack_offset, value);
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 7b9eaaeb710..d1e1fa87e53 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -16,8 +16,13 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void svm_vector_math(
-    float *value, float3 *vector, NodeVectorMathType type, float3 a, float3 b, float scale)
+ccl_device void svm_vector_math(float *value,
+                                float3 *vector,
+                                NodeVectorMathType type,
+                                float3 a,
+                                float3 b,
+                                float3 c,
+                                float scale)
 {
   switch (type) {
     case NODE_VECTOR_MATH_ADD:
@@ -68,6 +73,9 @@ ccl_device void svm_vector_math(
     case NODE_VECTOR_MATH_MODULO:
       *vector = make_float3(safe_modulo(a.x, b.x), safe_modulo(a.y, b.y), safe_modulo(a.z, b.z));
       break;
+    case NODE_VECTOR_MATH_WRAP:
+      *vector = make_float3(wrapf(a.x, b.x, c.x), wrapf(a.y, b.y, c.y), wrapf(a.z, b.z, c.z));
+      break;
     case NODE_VECTOR_MATH_FRACTION:
       *vector = a - floor(a);
       break;
@@ -80,6 +88,15 @@ ccl_device void svm_vector_math(
     case NODE_VECTOR_MATH_MAXIMUM:
       *vector = max(a, b);
       break;
+    case NODE_VECTOR_MATH_SINE:
+      *vector = make_float3(sinf(a.x), sinf(a.y), sinf(a.z));
+      break;
+    case NODE_VECTOR_MATH_COSINE:
+      *vector = make_float3(cosf(a.x), cosf(a.y), cosf(a.z));
+      break;
+    case NODE_VECTOR_MATH_TANGENT:
+      *vector = make_float3(tanf(a.x), tanf(a.y), tanf(a.z));
+      break;
     default:
       *vector = make_float3(0.0f, 0.0f, 0.0f);
       *value = 0.0f;
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 8dbb147e76a..85ede7770e9 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -42,7 +42,8 @@ CCL_NAMESPACE_BEGIN
 #define NODE_GROUP_LEVEL_1 1
 #define NODE_GROUP_LEVEL_2 2
 #define NODE_GROUP_LEVEL_3 3
-#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_3
+#define NODE_GROUP_LEVEL_4 4
+#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4
 
 #define NODE_FEATURE_VOLUME (1 << 0)
 #define NODE_FEATURE_HAIR (1 << 1)
@@ -62,97 +63,98 @@ CCL_NAMESPACE_BEGIN
 
 typedef enum ShaderNodeType {
   NODE_END = 0,
+  NODE_SHADER_JUMP,
   NODE_CLOSURE_BSDF,
   NODE_CLOSURE_EMISSION,
   NODE_CLOSURE_BACKGROUND,
   NODE_CLOSURE_SET_WEIGHT,
   NODE_CLOSURE_WEIGHT,
+  NODE_EMISSION_WEIGHT,
   NODE_MIX_CLOSURE,
   NODE_JUMP_IF_ZERO,
   NODE_JUMP_IF_ONE,
-  NODE_TEX_IMAGE,
-  NODE_TEX_IMAGE_BOX,
-  NODE_TEX_SKY,
   NODE_GEOMETRY,
-  NODE_GEOMETRY_DUPLI,
-  NODE_LIGHT_PATH,
+  NODE_CONVERT,
+  NODE_TEX_COORD,
   NODE_VALUE_F,
   NODE_VALUE_V,
-  NODE_MIX,
   NODE_ATTR,
-  NODE_CONVERT,
-  NODE_FRESNEL,
-  NODE_WIREFRAME,
-  NODE_WAVELENGTH,
-  NODE_BLACKBODY,
-  NODE_EMISSION_WEIGHT,
-  NODE_TEX_GRADIENT,
-  NODE_TEX_VORONOI,
-  NODE_TEX_MUSGRAVE,
-  NODE_TEX_WAVE,
-  NODE_TEX_MAGIC,
-  NODE_TEX_NOISE,
-  NODE_SHADER_JUMP,
-  NODE_SET_DISPLACEMENT,
+  NODE_VERTEX_COLOR,
   NODE_GEOMETRY_BUMP_DX,
   NODE_GEOMETRY_BUMP_DY,
+  NODE_SET_DISPLACEMENT,
+  NODE_DISPLACEMENT,
+  NODE_VECTOR_DISPLACEMENT,
+  NODE_TEX_IMAGE,
+  NODE_TEX_IMAGE_BOX,
+  NODE_TEX_NOISE,
   NODE_SET_BUMP,
-  NODE_MATH,
-  NODE_VECTOR_MATH,
-  NODE_VECTOR_TRANSFORM,
-  NODE_MAPPING,
-  NODE_TEX_COORD,
-  NODE_TEX_COORD_BUMP_DX,
-  NODE_TEX_COORD_BUMP_DY,
   NODE_ATTR_BUMP_DX,
   NODE_ATTR_BUMP_DY,
-  NODE_TEX_ENVIRONMENT,
+  NODE_VERTEX_COLOR_BUMP_DX,
+  NODE_VERTEX_COLOR_BUMP_DY,
+  NODE_TEX_COORD_BUMP_DX,
+  NODE_TEX_COORD_BUMP_DY,
+  NODE_CLOSURE_SET_NORMAL,
+  NODE_ENTER_BUMP_EVAL,
+  NODE_LEAVE_BUMP_EVAL,
+  NODE_HSV,
   NODE_CLOSURE_HOLDOUT,
+  NODE_FRESNEL,
   NODE_LAYER_WEIGHT,
   NODE_CLOSURE_VOLUME,
-  NODE_SEPARATE_VECTOR,
-  NODE_COMBINE_VECTOR,
-  NODE_SEPARATE_HSV,
-  NODE_COMBINE_HSV,
-  NODE_HSV,
-  NODE_CAMERA,
-  NODE_INVERT,
-  NODE_NORMAL,
+  NODE_PRINCIPLED_VOLUME,
+  NODE_MATH,
+  NODE_VECTOR_MATH,
+  NODE_RGB_RAMP,
   NODE_GAMMA,
-  NODE_TEX_CHECKER,
   NODE_BRIGHTCONTRAST,
-  NODE_RGB_RAMP,
-  NODE_RGB_CURVES,
-  NODE_VECTOR_CURVES,
-  NODE_MIN_MAX,
-  NODE_LIGHT_FALLOFF,
+  NODE_LIGHT_PATH,
   NODE_OBJECT_INFO,
   NODE_PARTICLE_INFO,
+  NODE_HAIR_INFO,
+  NODE_TEXTURE_MAPPING,
+  NODE_MAPPING,
+  NODE_MIN_MAX,
+  NODE_CAMERA,
+  NODE_TEX_ENVIRONMENT,
+  NODE_TEX_SKY,
+  NODE_TEX_GRADIENT,
+  NODE_TEX_VORONOI,
+  NODE_TEX_MUSGRAVE,
+  NODE_TEX_WAVE,
+  NODE_TEX_MAGIC,
+  NODE_TEX_CHECKER,
   NODE_TEX_BRICK,
-  NODE_CLOSURE_SET_NORMAL,
-  NODE_AMBIENT_OCCLUSION,
+  NODE_TEX_WHITE_NOISE,
+  NODE_NORMAL,
+  NODE_LIGHT_FALLOFF,
+  NODE_IES,
+  NODE_RGB_CURVES,
+  NODE_VECTOR_CURVES,
   NODE_TANGENT,
   NODE_NORMAL_MAP,
-  NODE_HAIR_INFO,
-  NODE_UVMAP,
-  NODE_TEX_VOXEL,
-  NODE_ENTER_BUMP_EVAL,
-  NODE_LEAVE_BUMP_EVAL,
-  NODE_BEVEL,
-  NODE_DISPLACEMENT,
-  NODE_VECTOR_DISPLACEMENT,
-  NODE_PRINCIPLED_VOLUME,
-  NODE_IES,
+  NODE_INVERT,
+  NODE_MIX,
+  NODE_SEPARATE_VECTOR,
+  NODE_COMBINE_VECTOR,
+  NODE_SEPARATE_HSV,
+  NODE_COMBINE_HSV,
+  NODE_VECTOR_ROTATE,
+  NODE_VECTOR_TRANSFORM,
+  NODE_WIREFRAME,
+  NODE_WAVELENGTH,
+  NODE_BLACKBODY,
   NODE_MAP_RANGE,
   NODE_CLAMP,
-  NODE_TEXTURE_MAPPING,
-  NODE_TEX_WHITE_NOISE,
-  NODE_VERTEX_COLOR,
-  NODE_VERTEX_COLOR_BUMP_DX,
-  NODE_VERTEX_COLOR_BUMP_DY,
+  NODE_BEVEL,
+  NODE_AMBIENT_OCCLUSION,
+  NODE_TEX_VOXEL,
   NODE_AOV_START,
-  NODE_AOV_VALUE,
   NODE_AOV_COLOR,
+  NODE_AOV_VALUE,
+  /* NOTE: for best OpenCL performance, the order of the items in this enum must
+   * match the order of the switch cases in svm.h. */
 } ShaderNodeType;
 
 typedef enum NodeAttributeType {
@@ -326,6 +328,10 @@ typedef enum NodeVectorMathType {
   NODE_VECTOR_MATH_ABSOLUTE,
   NODE_VECTOR_MATH_MINIMUM,
   NODE_VECTOR_MATH_MAXIMUM,
+  NODE_VECTOR_MATH_WRAP,
+  NODE_VECTOR_MATH_SINE,
+  NODE_VECTOR_MATH_COSINE,
+  NODE_VECTOR_MATH_TANGENT,
 } NodeVectorMathType;
 
 typedef enum NodeClampType {
@@ -347,6 +353,14 @@ typedef enum NodeMappingType {
   NODE_MAPPING_TYPE_NORMAL
 } NodeMappingType;
 
+typedef enum NodeVectorRotateType {
+  NODE_VECTOR_ROTATE_TYPE_AXIS,      /* Axis is loaded from the stack (default switch case). */
+  NODE_VECTOR_ROTATE_TYPE_AXIS_X,    /* Fixed axis (1, 0, 0). */
+  NODE_VECTOR_ROTATE_TYPE_AXIS_Y,    /* Fixed axis (0, 1, 0). */
+  NODE_VECTOR_ROTATE_TYPE_AXIS_Z,    /* Fixed axis (0, 0, 1). */
+  NODE_VECTOR_ROTATE_TYPE_EULER_XYZ, /* Rotation vector converted with euler_to_transform(). */
+} NodeVectorRotateType;
+
 typedef enum NodeVectorTransformType {
   NODE_VECTOR_TRANSFORM_TYPE_VECTOR,
   NODE_VECTOR_TRANSFORM_TYPE_POINT,
@@ -380,9 +394,24 @@ typedef enum NodeMusgraveType {
 
 typedef enum NodeWaveType { NODE_WAVE_BANDS, NODE_WAVE_RINGS } NodeWaveType;
 
-typedef enum NodeWaveProfiles {
+typedef enum NodeWaveBandsDirection {
+  NODE_WAVE_BANDS_DIRECTION_X,       /* svm_wave(): n = p.x * 20. */
+  NODE_WAVE_BANDS_DIRECTION_Y,       /* svm_wave(): n = p.y * 20. */
+  NODE_WAVE_BANDS_DIRECTION_Z,       /* svm_wave(): n = p.z * 20. */
+  NODE_WAVE_BANDS_DIRECTION_DIAGONAL /* svm_wave(): n = (p.x + p.y + p.z) * 10. */
+} NodeWaveBandsDirection;
+
+typedef enum NodeWaveRingsDirection {
+  NODE_WAVE_RINGS_DIRECTION_X,        /* X component zeroed before taking the length. */
+  NODE_WAVE_RINGS_DIRECTION_Y,        /* Y component zeroed before taking the length. */
+  NODE_WAVE_RINGS_DIRECTION_Z,        /* Z component zeroed before taking the length. */
+  NODE_WAVE_RINGS_DIRECTION_SPHERICAL /* Length of the full vector is used. */
+} NodeWaveRingsDirection;
+
+typedef enum NodeWaveProfile {
   NODE_WAVE_PROFILE_SIN,
   NODE_WAVE_PROFILE_SAW,
+  NODE_WAVE_PROFILE_TRI, /* Handled by the final else branch of svm_wave(). */
 } NodeWaveProfile;
 
 typedef enum NodeSkyType { NODE_SKY_OLD, NODE_SKY_NEW } NodeSkyType;
@@ -499,6 +528,7 @@ typedef enum ClosureType {
   CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID,
   CLOSURE_BSDF_PRINCIPLED_SHEEN_ID,
   CLOSURE_BSDF_DIFFUSE_TOON_ID,
+  CLOSURE_BSDF_TRANSLUCENT_ID,
 
   /* Glossy */
   CLOSURE_BSDF_REFLECTION_ID,
@@ -521,7 +551,6 @@ typedef enum ClosureType {
   CLOSURE_BSDF_HAIR_REFLECTION_ID,
 
   /* Transmission */
-  CLOSURE_BSDF_TRANSLUCENT_ID,
   CLOSURE_BSDF_REFRACTION_ID,
   CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
   CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
@@ -562,12 +591,12 @@ typedef enum ClosureType {
 /* watch this, being lazy with memory usage */
 #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_DIFFUSE(type) \
-  (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
+  (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_TRANSLUCENT_ID)
 #define CLOSURE_IS_BSDF_GLOSSY(type) \
   ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) || \
    (type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID))
 #define CLOSURE_IS_BSDF_TRANSMISSION(type) \
-  (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+  (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
 #define CLOSURE_IS_BSDF_BSSRDF(type) \
   (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
 #define CLOSURE_IS_BSDF_SINGULAR(type) \
diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h
new file mode 100644
index 00000000000..79a4ec2c40e
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_vector_rotate.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Vector Rotate */
+
+ccl_device void svm_node_vector_rotate(ShaderData *sd,
+                                       float *stack,
+                                       uint input_stack_offsets,
+                                       uint axis_stack_offsets,
+                                       uint result_stack_offset)
+{
+  uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset,
+      angle_stack_offset, invert;
+
+  svm_unpack_node_uchar4(
+      input_stack_offsets, &type, &vector_stack_offset, &rotation_stack_offset, &invert);
+  svm_unpack_node_uchar3(
+      axis_stack_offsets, &center_stack_offset, &axis_stack_offset, &angle_stack_offset);
+
+  if (stack_valid(result_stack_offset)) {
+    /* Rotate `vector` about `center`: translate to it, rotate, translate back. */
+    float3 vector = stack_load_float3(stack, vector_stack_offset);
+    float3 center = stack_load_float3(stack, center_stack_offset);
+    float3 result = make_float3(0.0f, 0.0f, 0.0f);
+
+    if (type == NODE_VECTOR_ROTATE_TYPE_EULER_XYZ) {
+      float3 rotation = stack_load_float3(stack, rotation_stack_offset); /* Euler angles, XYZ. */
+      Transform rotationTransform = euler_to_transform(rotation);
+      if (invert) { /* Transpose == inverse if this is a pure rotation; confirm. */
+        result = transform_direction_transposed(&rotationTransform, vector - center) + center;
+      }
+      else {
+        result = transform_direction(&rotationTransform, vector - center) + center;
+      }
+    }
+    else {
+      float3 axis;
+      switch (type) {
+        case NODE_VECTOR_ROTATE_TYPE_AXIS_X:
+          axis = make_float3(1.0f, 0.0f, 0.0f);
+          break;
+        case NODE_VECTOR_ROTATE_TYPE_AXIS_Y:
+          axis = make_float3(0.0f, 1.0f, 0.0f);
+          break;
+        case NODE_VECTOR_ROTATE_TYPE_AXIS_Z:
+          axis = make_float3(0.0f, 0.0f, 1.0f);
+          break;
+        default: /* Raw axis input; the zero-length test below must see it un-normalized. */
+          axis = stack_load_float3(stack, axis_stack_offset);
+          break;
+      }
+      float angle = stack_load_float(stack, angle_stack_offset);
+      angle = invert ? -angle : angle;
+      result = (len_squared(axis) != 0.0f) ?
+                   rotate_around_axis(vector - center, normalize(axis), angle) + center :
+                   vector; /* Zero-length axis: rotation is undefined, pass the input through. */
+    }
+
+    stack_store_float3(stack, result_stack_offset, result);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index b79be8e5bde..4bc14f82382 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -39,7 +39,7 @@ ccl_device void svm_node_tex_voxel(
     co = transform_point(&tfm, co);
   }
 
-  float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z, INTERPOLATION_NONE);
+  float4 r = kernel_tex_image_interp_3d(kg, id, co, INTERPOLATION_NONE);
 #else
   float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 50c868c0f82..64102535f7d 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -19,52 +19,101 @@ CCL_NAMESPACE_BEGIN
 /* Wave */
 
 ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
+                                       NodeWaveBandsDirection bands_dir,
+                                       NodeWaveRingsDirection rings_dir,
                                        NodeWaveProfile profile,
                                        float3 p,
                                        float detail,
                                        float distortion,
-                                       float dscale)
+                                       float dscale,
+                                       float phase)
 {
+  /* Nudge p by a tiny offset and scale to prevent precision issues on unit coordinates. */
+  p = (p + 0.000001f) * 0.999999f;
+
   float n;
 
-  if (type == NODE_WAVE_BANDS)
-    n = (p.x + p.y + p.z) * 10.0f;
-  else /* NODE_WAVE_RINGS */
-    n = len(p) * 20.0f;
+  if (type == NODE_WAVE_BANDS) {
+    if (bands_dir == NODE_WAVE_BANDS_DIRECTION_X) {
+      n = p.x * 20.0f;
+    }
+    else if (bands_dir == NODE_WAVE_BANDS_DIRECTION_Y) {
+      n = p.y * 20.0f;
+    }
+    else if (bands_dir == NODE_WAVE_BANDS_DIRECTION_Z) {
+      n = p.z * 20.0f;
+    }
+    else { /* NODE_WAVE_BANDS_DIRECTION_DIAGONAL */
+      n = (p.x + p.y + p.z) * 10.0f;
+    }
+  }
+  else { /* NODE_WAVE_RINGS */
+    float3 rp = p; /* Axis modes zero one component so the length is taken in a plane. */
+    if (rings_dir == NODE_WAVE_RINGS_DIRECTION_X) {
+      rp *= make_float3(0.0f, 1.0f, 1.0f);
+    }
+    else if (rings_dir == NODE_WAVE_RINGS_DIRECTION_Y) {
+      rp *= make_float3(1.0f, 0.0f, 1.0f);
+    }
+    else if (rings_dir == NODE_WAVE_RINGS_DIRECTION_Z) {
+      rp *= make_float3(1.0f, 1.0f, 0.0f);
+    }
+    /* else: NODE_WAVE_RINGS_DIRECTION_SPHERICAL, all three components are kept. */
+
+    n = len(rp) * 20.0f;
+  }
+
+  n += phase; /* Phase offset is applied before the noise distortion. */
 
   if (distortion != 0.0f)
     n += distortion * (fractal_noise_3d(p * dscale, detail) * 2.0f - 1.0f);
 
   if (profile == NODE_WAVE_PROFILE_SIN) {
-    return 0.5f + 0.5f * sinf(n);
+    return 0.5f + 0.5f * sinf(n - M_PI_2_F); /* Assuming M_PI_2_F is pi/2: 0 at n == 0. */
+  }
+  else if (profile == NODE_WAVE_PROFILE_SAW) {
+    n /= M_2PI_F;
+    return n - floorf(n); /* Fractional part in [0, 1), also for negative n. */
   }
-  else { /* NODE_WAVE_PROFILE_SAW */
+  else { /* NODE_WAVE_PROFILE_TRI */
     n /= M_2PI_F;
-    n -= (int)n;
-    return (n < 0.0f) ? n + 1.0f : n;
+    return fabsf(n - floorf(n + 0.5f)) * 2.0f; /* Distance to nearest integer, scaled to [0, 1]. */
   }
 }
 
 ccl_device void svm_node_tex_wave(
     KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
-  uint4 node2 = read_node(kg, offset);
+  uint4 defaults1 = read_node(kg, offset);
+  uint4 defaults2 = read_node(kg, offset);
 
-  uint type;
-  uint co_offset, scale_offset, detail_offset, dscale_offset, distortion_offset, color_offset,
-      fac_offset;
+  /* RNA properties */
+  uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset;
+  /* Inputs, Outputs */
+  uint co_offset, scale_offset, distortion_offset, detail_offset, dscale_offset, phase_offset;
+  uint color_offset, fac_offset;
 
-  svm_unpack_node_uchar4(node.y, &type, &color_offset, &fac_offset, &dscale_offset);
-  svm_unpack_node_uchar4(node.z, &co_offset, &scale_offset, &detail_offset, &distortion_offset);
+  svm_unpack_node_uchar4(
+      node.y, &type_offset, &bands_dir_offset, &rings_dir_offset, &profile_offset);
+  svm_unpack_node_uchar4(node.z, &co_offset, &scale_offset, &distortion_offset, &detail_offset);
+  svm_unpack_node_uchar4(node.w, &dscale_offset, &phase_offset, &color_offset, &fac_offset);
 
   float3 co = stack_load_float3(stack, co_offset);
-  float scale = stack_load_float_default(stack, scale_offset, node2.x);
-  float detail = stack_load_float_default(stack, detail_offset, node2.y);
-  float distortion = stack_load_float_default(stack, distortion_offset, node2.z);
-  float dscale = stack_load_float_default(stack, dscale_offset, node2.w);
+  float scale = stack_load_float_default(stack, scale_offset, defaults1.x);
+  float detail = stack_load_float_default(stack, detail_offset, defaults1.y);
+  float distortion = stack_load_float_default(stack, distortion_offset, defaults1.z);
+  float dscale = stack_load_float_default(stack, dscale_offset, defaults1.w);
+  float phase = stack_load_float_default(stack, phase_offset, defaults2.x);
 
-  float f = svm_wave(
-      (NodeWaveType)type, (NodeWaveProfile)node.w, co * scale, detail, distortion, dscale);
+  float f = svm_wave((NodeWaveType)type_offset,
+                     (NodeWaveBandsDirection)bands_dir_offset,
+                     (NodeWaveRingsDirection)rings_dir_offset,
+                     (NodeWaveProfile)profile_offset,
+                     co * scale,
+                     detail,
+                     distortion,
+                     dscale,
+                     phase);
 
   if (stack_valid(fac_offset))
     stack_store_float(stack, fac_offset, f);
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 92578b888a6..472b5a0c101 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -19,9 +19,14 @@ set(SRC
   coverage.cpp
   denoising.cpp
   film.cpp
+  geometry.cpp
   graph.cpp
+  hair.cpp
   image.cpp
+  image_oiio.cpp
+  image_vdb.cpp
   integrator.cpp
+  jitter.cpp
   light.cpp
   merge.cpp
   mesh.cpp
@@ -54,10 +59,15 @@ set(SRC_HEADERS
   coverage.h
   denoising.h
   film.h
+  geometry.h
   graph.h
+  hair.h
   image.h
+  image_oiio.h
+  image_vdb.h
   integrator.h
   light.h
+  jitter.h
   merge.h
   mesh.h
   nodes.h
@@ -86,6 +96,29 @@ if(WITH_CYCLES_OSL)
   list(APPEND LIB
     cycles_kernel_osl
   )
+
+  set_property(SOURCE osl.cpp PROPERTY COMPILE_FLAGS "${RTTI_DISABLE_FLAGS}")  # osl.cpp only
+endif()
+
+if(WITH_OPENCOLORIO)  # Define WITH_OCIO and add the OpenColorIO headers as system includes.
+  add_definitions(-DWITH_OCIO)
+  include_directories(
+    SYSTEM
+    ${OPENCOLORIO_INCLUDE_DIRS}
+  )
+  if(WIN32)  # Presumably OpenColorIO is linked statically on Windows -- confirm.
+    add_definitions(-DOpenColorIO_STATIC)
+  endif()
+endif()
+
+if(WITH_OPENVDB)  # Define WITH_OPENVDB and append the OpenVDB includes and libraries.
+  add_definitions(-DWITH_OPENVDB ${OPENVDB_DEFINITIONS})
+  list(APPEND INC_SYS
+    ${OPENVDB_INCLUDE_DIRS}
+  )
+  list(APPEND LIB
+    ${OPENVDB_LIBRARIES}
+  )
 endif()
 
 include_directories(${INC})
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index b65c2faa788..4c26d5e8365 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
+#include "render/attribute.h"
+#include "render/hair.h"
 #include "render/image.h"
 #include "render/mesh.h"
-#include "render/attribute.h"
 
 #include "util/util_foreach.h"
 #include "util/util_transform.h"
@@ -25,46 +26,51 @@ CCL_NAMESPACE_BEGIN
 
 /* Attribute */
 
-Attribute::~Attribute()
-{
-  /* for voxel data, we need to remove the image from the image manager */
-  if (element == ATTR_ELEMENT_VOXEL) {
-    VoxelAttribute *voxel_data = data_voxel();
-
-    if (voxel_data && voxel_data->slot != -1) {
-      voxel_data->manager->remove_image(voxel_data->slot);
-    }
-  }
-}
-
-void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
+Attribute::Attribute(
+    ustring name, TypeDesc type, AttributeElement element, Geometry *geom, AttributePrimitive prim)
+    : name(name), std(ATTR_STD_NONE), type(type), element(element), flags(0)
 {
-  name = name_;
-  type = type_;
-  element = element_;
-  std = ATTR_STD_NONE;
-  flags = 0;
-
   /* string and matrix not supported! */
   assert(type == TypeDesc::TypeFloat || type == TypeDesc::TypeColor ||
          type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
          type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix || type == TypeFloat2 ||
          type == TypeRGBA);
+  /* NOTE(review): the voxel handle lives in raw bytes; confirm Attribute is never copied. */
+  if (element == ATTR_ELEMENT_VOXEL) {
+    buffer.resize(sizeof(ImageHandle)); /* Room for exactly one ImageHandle. */
+    new (buffer.data()) ImageHandle();  /* Placement-new; destroyed explicitly in ~Attribute(). */
+  }
+  else {
+    resize(geom, prim, false); /* reserve_only == false: size and zero-fill the buffer now. */
+  }
 }
 
-void Attribute::resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only)
+Attribute::~Attribute()
 {
-  if (reserve_only) {
-    buffer.reserve(buffer_size(mesh, prim));
+  /* Voxel attributes hold an ImageHandle constructed in place in the buffer; destroy it here. */
+  if (element == ATTR_ELEMENT_VOXEL && buffer.size()) {
+    ImageHandle &handle = data_voxel();
+    handle.~ImageHandle(); /* Explicit call: the byte buffer will not run it for us. */
   }
-  else {
-    buffer.resize(buffer_size(mesh, prim), 0);
+}
+
+void Attribute::resize(Geometry *geom, AttributePrimitive prim, bool reserve_only)
+{
+  if (element != ATTR_ELEMENT_VOXEL) { /* Voxel buffers hold an ImageHandle; leave them alone. */
+    if (reserve_only) {
+      buffer.reserve(buffer_size(geom, prim)); /* Capacity only, size is unchanged. */
+    }
+    else {
+      buffer.resize(buffer_size(geom, prim), 0); /* New bytes are zero-filled. */
+    }
   }
 }
 
 void Attribute::resize(size_t num_elements)
 {
-  buffer.resize(num_elements * data_sizeof(), 0);
+  if (element != ATTR_ELEMENT_VOXEL) { /* Voxel buffers hold an ImageHandle; leave them alone. */
+    buffer.resize(num_elements * data_sizeof(), 0); /* Size in bytes; new bytes are zeroed. */
+  }
 }
 
 void Attribute::add(const float &f)
@@ -122,17 +128,6 @@ void Attribute::add(const Transform &f)
     buffer.push_back(data[i]);
 }
 
-void Attribute::add(const VoxelAttribute &f)
-{
-  assert(data_sizeof() == sizeof(VoxelAttribute));
-
-  char *data = (char *)&f;
-  size_t size = sizeof(f);
-
-  for (size_t i = 0; i < size; i++)
-    buffer.push_back(data[i]);
-}
-
 void Attribute::add(const char *data)
 {
   size_t size = data_sizeof();
@@ -144,7 +139,7 @@ void Attribute::add(const char *data)
 size_t Attribute::data_sizeof() const
 {
   if (element == ATTR_ELEMENT_VOXEL)
-    return sizeof(VoxelAttribute);
+    return sizeof(ImageHandle);
   else if (element == ATTR_ELEMENT_CORNER_BYTE)
     return sizeof(uchar4);
   else if (type == TypeDesc::TypeFloat)
@@ -157,13 +152,13 @@ size_t Attribute::data_sizeof() const
     return sizeof(float3);
 }
 
-size_t Attribute::element_size(Mesh *mesh, AttributePrimitive prim) const
+size_t Attribute::element_size(Geometry *geom, AttributePrimitive prim) const
 {
   if (flags & ATTR_FINAL_SIZE) {
     return buffer.size() / data_sizeof();
   }
 
-  size_t size;
+  size_t size = 0;
 
   switch (element) {
     case ATTR_ELEMENT_OBJECT:
@@ -172,54 +167,74 @@ size_t Attribute::element_size(Mesh *mesh, AttributePrimitive prim) const
       size = 1;
       break;
     case ATTR_ELEMENT_VERTEX:
-      size = mesh->verts.size() + mesh->num_ngons;
-      if (prim == ATTR_PRIM_SUBD) {
-        size -= mesh->num_subd_verts;
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        size = mesh->verts.size() + mesh->num_ngons;
+        if (prim == ATTR_PRIM_SUBD) {
+          size -= mesh->num_subd_verts;
+        }
       }
       break;
     case ATTR_ELEMENT_VERTEX_MOTION:
-      size = (mesh->verts.size() + mesh->num_ngons) * (mesh->motion_steps - 1);
-      if (prim == ATTR_PRIM_SUBD) {
-        size -= mesh->num_subd_verts * (mesh->motion_steps - 1);
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        size = (mesh->verts.size() + mesh->num_ngons) * (mesh->motion_steps - 1);
+        if (prim == ATTR_PRIM_SUBD) {
+          size -= mesh->num_subd_verts * (mesh->motion_steps - 1);
+        }
       }
       break;
     case ATTR_ELEMENT_FACE:
-      if (prim == ATTR_PRIM_TRIANGLE) {
-        size = mesh->num_triangles();
-      }
-      else {
-        size = mesh->subd_faces.size() + mesh->num_ngons;
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        if (prim == ATTR_PRIM_GEOMETRY) {
+          size = mesh->num_triangles();
+        }
+        else {
+          size = mesh->subd_faces.size() + mesh->num_ngons;
+        }
       }
       break;
     case ATTR_ELEMENT_CORNER:
     case ATTR_ELEMENT_CORNER_BYTE:
-      if (prim == ATTR_PRIM_TRIANGLE) {
-        size = mesh->num_triangles() * 3;
-      }
-      else {
-        size = mesh->subd_face_corners.size() + mesh->num_ngons;
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        if (prim == ATTR_PRIM_GEOMETRY) {
+          size = mesh->num_triangles() * 3;
+        }
+        else {
+          size = mesh->subd_face_corners.size() + mesh->num_ngons;
+        }
       }
       break;
     case ATTR_ELEMENT_CURVE:
-      size = mesh->num_curves();
+      if (geom->type == Geometry::HAIR) {
+        Hair *hair = static_cast<Hair *>(geom);
+        size = hair->num_curves();
+      }
       break;
     case ATTR_ELEMENT_CURVE_KEY:
-      size = mesh->curve_keys.size();
+      if (geom->type == Geometry::HAIR) {
+        Hair *hair = static_cast<Hair *>(geom);
+        size = hair->curve_keys.size();
+      }
       break;
     case ATTR_ELEMENT_CURVE_KEY_MOTION:
-      size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
+      if (geom->type == Geometry::HAIR) {
+        Hair *hair = static_cast<Hair *>(geom);
+        size = hair->curve_keys.size() * (hair->motion_steps - 1);
+      }
       break;
     default:
-      size = 0;
       break;
   }
 
   return size;
 }
 
-size_t Attribute::buffer_size(Mesh *mesh, AttributePrimitive prim) const
+size_t Attribute::buffer_size(Geometry *geom, AttributePrimitive prim) const
 {
-  return element_size(mesh, prim) * data_sizeof();
+  return element_size(geom, prim) * data_sizeof();
 }
 
 bool Attribute::same_storage(TypeDesc a, TypeDesc b)
@@ -280,6 +295,8 @@ const char *Attribute::standard_name(AttributeStandard std)
       return "tangent";
     case ATTR_STD_UV_TANGENT_SIGN:
       return "tangent_sign";
+    case ATTR_STD_VERTEX_COLOR:
+      return "vertex_color";
     case ATTR_STD_POSITION_UNDEFORMED:
       return "undeformed";
     case ATTR_STD_POSITION_UNDISPLACED:
@@ -336,13 +353,42 @@ AttributeStandard Attribute::name_standard(const char *name)
   return ATTR_STD_NONE;
 }
 
+void Attribute::get_uv_tiles(Geometry *geom,
+                             AttributePrimitive prim,
+                             unordered_set<int> &tiles) const
+{
+  /* Gather into tiles the tile numbers (1001 + 10 * row + column) that the UV coordinates
+   * stored in this attribute fall on. Attributes that are not float2 contribute nothing. */
+  if (type != TypeFloat2) {
+    return;
+  }
+
+  const int num = element_size(geom, prim);
+  const float2 *uv = data_float2();
+  for (int i = 0; i < num; i++) {
+    const float u = uv[i].x;
+    const float v = uv[i].y;
+    int x = (int)u, y = (int)v;
+
+    /* Ignore coordinates whose truncated column or row is negative, or whose column is 10+. */
+    if (x < 0 || y < 0 || x >= 10) {
+      continue;
+    }
+
+    /* Be conservative on borders: a coordinate sitting exactly on the right or upper edge of
+     * a tile belongs to that tile and must not load its right/upper neighbor as well. */
+    x -= (x > 0 && (u < x + 1e-6f)) ? 1 : 0;
+    y -= (y > 0 && (v < y + 1e-6f)) ? 1 : 0;
+
+    tiles.insert(1001 + 10 * y + x);
+  }
+}
+
 /* Attribute Set */
 
-AttributeSet::AttributeSet()
+AttributeSet::AttributeSet(Geometry *geometry, AttributePrimitive prim)
+    : geometry(geometry), prim(prim)
 {
-  triangle_mesh = NULL;
-  curve_mesh = NULL;
-  subd_mesh = NULL;
 }
 
 AttributeSet::~AttributeSet()
@@ -362,28 +408,9 @@ Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement eleme
     remove(name);
   }
 
-#if __cplusplus >= 201103L
-  attributes.emplace_back();
-  attr = &attributes.back();
-  attr->set(name, type, element);
-#else
-  {
-    Attribute attr_temp;
-    attr_temp.set(name, type, element);
-    attributes.push_back(attr_temp);
-    attr = &attributes.back();
-  }
-#endif
-
-  /* this is weak .. */
-  if (triangle_mesh)
-    attr->resize(triangle_mesh, ATTR_PRIM_TRIANGLE, false);
-  if (curve_mesh)
-    attr->resize(curve_mesh, ATTR_PRIM_CURVE, false);
-  if (subd_mesh)
-    attr->resize(subd_mesh, ATTR_PRIM_SUBD, false);
-
-  return attr;
+  Attribute new_attr(name, type, element, geometry, prim);
+  attributes.emplace_back(std::move(new_attr));
+  return &attributes.back();
 }
 
 Attribute *AttributeSet::find(ustring name) const
@@ -418,7 +445,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
   if (name == ustring())
     name = Attribute::standard_name(std);
 
-  if (triangle_mesh || subd_mesh) {
+  if (geometry->type == Geometry::MESH) {
     switch (std) {
       case ATTR_STD_VERTEX_NORMAL:
         attr = add(name, TypeDesc::TypeNormal, ATTR_ELEMENT_VERTEX);
@@ -435,6 +462,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
       case ATTR_STD_UV_TANGENT_SIGN:
         attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CORNER);
         break;
+      case ATTR_STD_VERTEX_COLOR:
+        attr = add(name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
+        break;
       case ATTR_STD_GENERATED:
       case ATTR_STD_POSITION_UNDEFORMED:
       case ATTR_STD_POSITION_UNDISPLACED:
@@ -478,7 +508,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
         break;
     }
   }
-  else if (curve_mesh) {
+  else if (geometry->type == Geometry::HAIR) {
     switch (std) {
       case ATTR_STD_UV:
         attr = add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
@@ -561,12 +591,7 @@ void AttributeSet::remove(Attribute *attribute)
 void AttributeSet::resize(bool reserve_only)
 {
   foreach (Attribute &attr, attributes) {
-    if (triangle_mesh)
-      attr.resize(triangle_mesh, ATTR_PRIM_TRIANGLE, reserve_only);
-    if (curve_mesh)
-      attr.resize(curve_mesh, ATTR_PRIM_CURVE, reserve_only);
-    if (subd_mesh)
-      attr.resize(subd_mesh, ATTR_PRIM_SUBD, reserve_only);
+    attr.resize(geometry, prim, reserve_only);
   }
 }
 
@@ -596,15 +621,10 @@ AttributeRequest::AttributeRequest(ustring name_)
   name = name_;
   std = ATTR_STD_NONE;
 
-  triangle_type = TypeDesc::TypeFloat;
-  triangle_desc.element = ATTR_ELEMENT_NONE;
-  triangle_desc.offset = 0;
-  triangle_desc.type = NODE_ATTR_FLOAT;
-
-  curve_type = TypeDesc::TypeFloat;
-  curve_desc.element = ATTR_ELEMENT_NONE;
-  curve_desc.offset = 0;
-  curve_desc.type = NODE_ATTR_FLOAT;
+  type = TypeDesc::TypeFloat;
+  desc.element = ATTR_ELEMENT_NONE;
+  desc.offset = 0;
+  desc.type = NODE_ATTR_FLOAT;
 
   subd_type = TypeDesc::TypeFloat;
   subd_desc.element = ATTR_ELEMENT_NONE;
@@ -617,15 +637,10 @@ AttributeRequest::AttributeRequest(AttributeStandard std_)
   name = ustring();
   std = std_;
 
-  triangle_type = TypeDesc::TypeFloat;
-  triangle_desc.element = ATTR_ELEMENT_NONE;
-  triangle_desc.offset = 0;
-  triangle_desc.type = NODE_ATTR_FLOAT;
-
-  curve_type = TypeDesc::TypeFloat;
-  curve_desc.element = ATTR_ELEMENT_NONE;
-  curve_desc.offset = 0;
-  curve_desc.type = NODE_ATTR_FLOAT;
+  type = TypeDesc::TypeFloat;
+  desc.element = ATTR_ELEMENT_NONE;
+  desc.offset = 0;
+  desc.type = NODE_ATTR_FLOAT;
 
   subd_type = TypeDesc::TypeFloat;
   subd_desc.element = ATTR_ELEMENT_NONE;
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index ebab0fe7f88..5871fa04a31 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -17,10 +17,13 @@
 #ifndef __ATTRIBUTE_H__
 #define __ATTRIBUTE_H__
 
+#include "render/image.h"
+
 #include "kernel/kernel_types.h"
 
 #include "util/util_list.h"
 #include "util/util_param.h"
+#include "util/util_set.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
 
@@ -30,17 +33,12 @@ class Attribute;
 class AttributeRequest;
 class AttributeRequestSet;
 class AttributeSet;
-class ImageManager;
+class ImageHandle;
+class Geometry;
+class Hair;
 class Mesh;
 struct Transform;
 
-/* Attributes for voxels are images */
-
-struct VoxelAttribute {
-  ImageManager *manager;
-  int slot;
-};
-
 /* Attribute
  *
  * Arbitrary data layers on meshes.
@@ -56,17 +54,23 @@ class Attribute {
   AttributeElement element;
   uint flags; /* enum AttributeFlag */
 
-  Attribute()
-  {
-  }
+  Attribute(ustring name,
+            TypeDesc type,
+            AttributeElement element,
+            Geometry *geom,
+            AttributePrimitive prim);
+  Attribute(Attribute &&other) = default;
+  Attribute(const Attribute &other) = delete;
+  Attribute &operator=(const Attribute &other) = delete;
   ~Attribute();
+
   void set(ustring name, TypeDesc type, AttributeElement element);
-  void resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only);
+  void resize(Geometry *geom, AttributePrimitive prim, bool reserve_only);
   void resize(size_t num_elements);
 
   size_t data_sizeof() const;
-  size_t element_size(Mesh *mesh, AttributePrimitive prim) const;
-  size_t buffer_size(Mesh *mesh, AttributePrimitive prim) const;
+  size_t element_size(Geometry *geom, AttributePrimitive prim) const;
+  size_t buffer_size(Geometry *geom, AttributePrimitive prim) const;
 
   char *data()
   {
@@ -102,10 +106,12 @@ class Attribute {
     assert(data_sizeof() == sizeof(Transform));
     return (Transform *)data();
   }
-  VoxelAttribute *data_voxel()
+
+  /* Attributes for voxels are images */
+  ImageHandle &data_voxel()
   {
-    assert(data_sizeof() == sizeof(VoxelAttribute));
-    return (VoxelAttribute *)data();
+    assert(data_sizeof() == sizeof(ImageHandle));
+    return *(ImageHandle *)data();
   }
 
   const char *data() const
@@ -137,10 +143,10 @@ class Attribute {
     assert(data_sizeof() == sizeof(Transform));
     return (const Transform *)data();
   }
-  const VoxelAttribute *data_voxel() const
+  const ImageHandle &data_voxel() const
   {
-    assert(data_sizeof() == sizeof(VoxelAttribute));
-    return (const VoxelAttribute *)data();
+    assert(data_sizeof() == sizeof(ImageHandle));
+    return *(const ImageHandle *)data();
   }
 
   void zero_data(void *dst);
@@ -150,13 +156,14 @@ class Attribute {
   void add(const float2 &f);
   void add(const float3 &f);
   void add(const uchar4 &f);
-  void add(const Transform &f);
-  void add(const VoxelAttribute &f);
+  void add(const Transform &tfm);
   void add(const char *data);
 
   static bool same_storage(TypeDesc a, TypeDesc b);
   static const char *standard_name(AttributeStandard std);
   static AttributeStandard name_standard(const char *name);
+
+  void get_uv_tiles(Geometry *geom, AttributePrimitive prim, unordered_set<int> &tiles) const;
 };
 
 /* Attribute Set
@@ -165,12 +172,11 @@ class Attribute {
 
 class AttributeSet {
  public:
-  Mesh *triangle_mesh;
-  Mesh *curve_mesh;
-  Mesh *subd_mesh;
+  Geometry *geometry;
+  AttributePrimitive prim;
   list<Attribute> attributes;
 
-  AttributeSet();
+  AttributeSet(Geometry *geometry, AttributePrimitive prim);
   ~AttributeSet();
 
   Attribute *add(ustring name, TypeDesc type, AttributeElement element);
@@ -200,9 +206,9 @@ class AttributeRequest {
   ustring name;
   AttributeStandard std;
 
-  /* temporary variables used by MeshManager */
-  TypeDesc triangle_type, curve_type, subd_type;
-  AttributeDescriptor triangle_desc, curve_desc, subd_desc;
+  /* temporary variables used by GeometryManager */
+  TypeDesc type, subd_type;
+  AttributeDescriptor desc, subd_desc;
 
   explicit AttributeRequest(ustring name_);
   explicit AttributeRequest(AttributeStandard std);
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index 6553ca735e4..694bb640995 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -16,8 +16,8 @@
 
 #include "render/background.h"
 #include "device/device.h"
-#include "render/integrator.h"
 #include "render/graph.h"
+#include "render/integrator.h"
 #include "render/nodes.h"
 #include "render/scene.h"
 #include "render/shader.h"
@@ -43,6 +43,8 @@ NODE_DEFINE(Background)
   SOCKET_BOOLEAN(transparent_glass, "Transparent Glass", false);
   SOCKET_FLOAT(transparent_roughness_threshold, "Transparent Roughness Threshold", 0.0f);
 
+  SOCKET_FLOAT(volume_step_size, "Volume Step Size", 0.1f);
+
   SOCKET_NODE(shader, "Shader", &Shader::node_type);
 
   return type;
@@ -51,6 +53,7 @@ NODE_DEFINE(Background)
 Background::Background() : Node(node_type)
 {
   need_update = true;
+  shader = NULL;
 }
 
 Background::~Background()
@@ -91,6 +94,8 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
   else
     kbackground->volume_shader = SHADER_NONE;
 
+  kbackground->volume_step_size = volume_step_size * scene->integrator->volume_step_rate;
+
   /* No background node, make world shader invisible to all rays, to skip evaluation in kernel. */
   if (bg_shader->graph->nodes.size() <= 1) {
     kbackground->surface_shader |= SHADER_EXCLUDE_ANY;
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index fb27430f9a3..c2ca1f75179 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -45,6 +45,8 @@ class Background : public Node {
   bool transparent_glass;
   float transparent_roughness_threshold;
 
+  float volume_step_size;
+
   bool need_update;
 
   Background();
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index b906357b7b5..35f942b3e9b 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -15,10 +15,10 @@
  */
 
 #include "render/bake.h"
+#include "render/integrator.h"
 #include "render/mesh.h"
 #include "render/object.h"
 #include "render/shader.h"
-#include "render/integrator.h"
 
 #include "util/util_foreach.h"
 
@@ -253,8 +253,8 @@ int BakeManager::aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType ty
     /* Only antialias normal if mesh has bump mapping. */
     Object *object = scene->objects[bake_data->object()];
 
-    if (object->mesh) {
-      foreach (Shader *shader, object->mesh->used_shaders) {
+    if (object->geometry) {
+      foreach (Shader *shader, object->geometry->used_shaders) {
         if (shader->has_bump) {
           return scene->integrator->aa_samples;
         }
@@ -285,8 +285,6 @@ int BakeManager::shader_type_to_pass_filter(ShaderEvalType type, const int pass_
       return BAKE_FILTER_GLOSSY | component_flags;
     case SHADER_EVAL_TRANSMISSION:
       return BAKE_FILTER_TRANSMISSION | component_flags;
-    case SHADER_EVAL_SUBSURFACE:
-      return BAKE_FILTER_SUBSURFACE | component_flags;
     case SHADER_EVAL_COMBINED:
       return pass_filter;
     default:
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 50308d0d377..2d89fb9ffba 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -16,8 +16,8 @@
 
 #include <stdlib.h>
 
-#include "render/buffers.h"
 #include "device/device.h"
+#include "render/buffers.h"
 
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
@@ -43,6 +43,8 @@ BufferParams::BufferParams()
   denoising_data_pass = false;
   denoising_clean_pass = false;
   denoising_prefiltered_pass = false;
+
+  Pass::add(PASS_COMBINED, passes);
 }
 
 void BufferParams::get_offset_stride(int &offset, int &stride)
@@ -144,7 +146,7 @@ void RenderBuffers::reset(BufferParams &params_)
   params = params_;
 
   /* re-allocate buffer */
-  buffer.alloc(params.width * params.height * params.get_passes_size());
+  buffer.alloc(params.width * params.get_passes_size(), params.height);
   buffer.zero_to_device();
 }
 
@@ -258,6 +260,22 @@ bool RenderBuffers::get_pass_rect(
     return false;
   }
 
+  float *sample_count = NULL;
+  if (name == "Combined") {
+    int sample_offset = 0;
+    for (size_t j = 0; j < params.passes.size(); j++) {
+      Pass &pass = params.passes[j];
+      if (pass.type != PASS_SAMPLE_COUNT) {
+        sample_offset += pass.components;
+        continue;
+      }
+      else {
+        sample_count = buffer.data() + sample_offset;
+        break;
+      }
+    }
+  }
+
   int pass_offset = 0;
 
   for (size_t j = 0; j < params.passes.size(); j++) {
@@ -418,6 +436,11 @@ bool RenderBuffers::get_pass_rect(
       }
       else {
         for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+          if (sample_count && sample_count[i * pass_stride] < 0.0f) {
+            scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
+            scale_exposure = (pass.exposure) ? scale * exposure : scale;
+          }
+
           float4 f = make_float4(in[0], in[1], in[2], in[3]);
 
           pixels[0] = f.x * scale_exposure;
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 1042b42810f..42efb031843 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -130,7 +130,7 @@ class DisplayBuffer {
 
 class RenderTile {
  public:
-  typedef enum { PATH_TRACE, DENOISE } Task;
+  typedef enum { PATH_TRACE = (1 << 0), DENOISE = (1 << 1) } Task;
 
   Task task;
   int x, y, w, h;
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 38306a63c74..74953afae9d 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -29,6 +29,7 @@
 #include "util/util_vector.h"
 
 /* needed for calculating differentials */
+// clang-format off
 #include "kernel/kernel_compat_cpu.h"
 #include "kernel/split/kernel_split_data.h"
 #include "kernel/kernel_globals.h"
@@ -36,6 +37,7 @@
 #include "kernel/kernel_differential.h"
 #include "kernel/kernel_montecarlo.h"
 #include "kernel/kernel_camera.h"
+// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
@@ -498,7 +500,7 @@ void Camera::device_update_volume(Device * /*device*/, DeviceScene *dscene, Scen
   BoundBox viewplane_boundbox = viewplane_bounds_get();
   for (size_t i = 0; i < scene->objects.size(); ++i) {
     Object *object = scene->objects[i];
-    if (object->mesh->has_volume && viewplane_boundbox.intersects(object->bounds)) {
+    if (object->geometry->has_volume && viewplane_boundbox.intersects(object->bounds)) {
       /* TODO(sergey): Consider adding more grained check. */
       VLOG(1) << "Detected camera inside volume.";
       kcam->is_inside_volume = 1;
diff --git a/intern/cycles/render/colorspace.cpp b/intern/cycles/render/colorspace.cpp
index 2e5b53057c0..7605fcaf5ff 100644
--- a/intern/cycles/render/colorspace.cpp
+++ b/intern/cycles/render/colorspace.cpp
@@ -17,8 +17,8 @@
 #include "render/colorspace.h"
 
 #include "util/util_color.h"
-#include "util/util_image.h"
 #include "util/util_half.h"
+#include "util/util_image.h"
 #include "util/util_logging.h"
 #include "util/util_math.h"
 #include "util/util_thread.h"
@@ -262,56 +262,49 @@ template<typename T> inline void cast_from_float4(T *data, float4 value)
 
 /* Slower versions for other all data types, which needs to convert to float and back. */
 template<typename T, bool compress_as_srgb = false>
-inline void processor_apply_pixels(const OCIO::Processor *processor,
-                                   T *pixels,
-                                   size_t width,
-                                   size_t height)
+inline void processor_apply_pixels(const OCIO::Processor *processor, T *pixels, size_t num_pixels)
 {
   /* TODO: implement faster version for when we know the conversion
    * is a simple matrix transform between linear spaces. In that case
    * unpremultiply is not needed. */
 
   /* Process large images in chunks to keep temporary memory requirement down. */
-  size_t y_chunk_size = max(1, 16 * 1024 * 1024 / (sizeof(float4) * width));
-  vector<float4> float_pixels(y_chunk_size * width);
-
-  for (size_t y0 = 0; y0 < height; y0 += y_chunk_size) {
-    size_t y1 = std::min(y0 + y_chunk_size, height);
-    size_t i = 0;
+  const size_t chunk_size = std::min((size_t)(16 * 1024 * 1024), num_pixels);
+  vector<float4> float_pixels(chunk_size);
 
-    for (size_t y = y0; y < y1; y++) {
-      for (size_t x = 0; x < width; x++, i++) {
-        float4 value = cast_to_float4(pixels + 4 * (y * width + x));
+  for (size_t j = 0; j < num_pixels; j += chunk_size) {
+    size_t width = std::min(chunk_size, num_pixels - j);
 
-        if (!(value.w == 0.0f || value.w == 1.0f)) {
-          float inv_alpha = 1.0f / value.w;
-          value.x *= inv_alpha;
-          value.y *= inv_alpha;
-          value.z *= inv_alpha;
-        }
+    for (size_t i = 0; i < width; i++) {
+      float4 value = cast_to_float4(pixels + 4 * (j + i));
 
-        float_pixels[i] = value;
+      if (!(value.w <= 0.0f || value.w == 1.0f)) {
+        float inv_alpha = 1.0f / value.w;
+        value.x *= inv_alpha;
+        value.y *= inv_alpha;
+        value.z *= inv_alpha;
       }
+
+      float_pixels[i] = value;
     }
 
-    OCIO::PackedImageDesc desc((float *)float_pixels.data(), width, y_chunk_size, 4);
+    OCIO::PackedImageDesc desc((float *)float_pixels.data(), width, 1, 4);
     processor->apply(desc);
 
-    i = 0;
-    for (size_t y = y0; y < y1; y++) {
-      for (size_t x = 0; x < width; x++, i++) {
-        float4 value = float_pixels[i];
+    for (size_t i = 0; i < width; i++) {
+      float4 value = float_pixels[i];
+
+      if (compress_as_srgb) {
+        value = color_linear_to_srgb_v4(value);
+      }
 
+      if (!(value.w <= 0.0f || value.w == 1.0f)) {
         value.x *= value.w;
         value.y *= value.w;
         value.z *= value.w;
-
-        if (compress_as_srgb) {
-          value = color_linear_to_srgb_v4(value);
-        }
-
-        cast_from_float4(pixels + 4 * (y * width + x), value);
       }
+
+      cast_from_float4(pixels + 4 * (j + i), value);
     }
   }
 }
@@ -320,9 +313,7 @@ inline void processor_apply_pixels(const OCIO::Processor *processor,
 template<typename T>
 void ColorSpaceManager::to_scene_linear(ustring colorspace,
                                         T *pixels,
-                                        size_t width,
-                                        size_t height,
-                                        size_t depth,
+                                        size_t num_pixels,
                                         bool compress_as_srgb)
 {
 #ifdef WITH_OCIO
@@ -331,23 +322,17 @@ void ColorSpaceManager::to_scene_linear(ustring colorspace,
   if (processor) {
     if (compress_as_srgb) {
       /* Compress output as sRGB. */
-      for (size_t z = 0; z < depth; z++) {
-        processor_apply_pixels<T, true>(processor, &pixels[z * width * height], width, height);
-      }
+      processor_apply_pixels<T, true>(processor, pixels, num_pixels);
     }
     else {
       /* Write output as scene linear directly. */
-      for (size_t z = 0; z < depth; z++) {
-        processor_apply_pixels<T>(processor, &pixels[z * width * height], width, height);
-      }
+      processor_apply_pixels<T>(processor, pixels, num_pixels);
     }
   }
 #else
   (void)colorspace;
   (void)pixels;
-  (void)width;
-  (void)height;
-  (void)depth;
+  (void)num_pixels;
   (void)compress_as_srgb;
 #endif
 }
@@ -402,9 +387,9 @@ void ColorSpaceManager::free_memory()
 }
 
 /* Template instanstations so we don't have to inline functions. */
-template void ColorSpaceManager::to_scene_linear(ustring, uchar *, size_t, size_t, size_t, bool);
-template void ColorSpaceManager::to_scene_linear(ustring, ushort *, size_t, size_t, size_t, bool);
-template void ColorSpaceManager::to_scene_linear(ustring, half *, size_t, size_t, size_t, bool);
-template void ColorSpaceManager::to_scene_linear(ustring, float *, size_t, size_t, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, uchar *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, ushort *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, half *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, float *, size_t, bool);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/colorspace.h b/intern/cycles/render/colorspace.h
index 9fea2d6efc6..51d0b121cc0 100644
--- a/intern/cycles/render/colorspace.h
+++ b/intern/cycles/render/colorspace.h
@@ -45,9 +45,7 @@ class ColorSpaceManager {
   template<typename T>
   static void to_scene_linear(ustring colorspace,
                               T *pixels,
-                              size_t width,
-                              size_t height,
-                              size_t depth,
+                              size_t num_pixels,
                               bool compress_as_srgb);
 
   /* Efficiently convert pixels to scene linear colorspace at render time,
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 7f622488a88..fec4123c361 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -17,8 +17,8 @@
 #ifndef __CONSTANT_FOLD_H__
 #define __CONSTANT_FOLD_H__
 
-#include "util/util_types.h"
 #include "kernel/svm/svm_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp
index 0a29903728a..99d4daa6961 100644
--- a/intern/cycles/render/coverage.cpp
+++ b/intern/cycles/render/coverage.cpp
@@ -15,13 +15,16 @@
  */
 
 #include "render/coverage.h"
+#include "render/buffers.h"
+
 #include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
 #include "kernel/split/kernel_split_data.h"
+
 #include "kernel/kernel_globals.h"
 #include "kernel/kernel_id_passes.h"
-#include "kernel/kernel_types.h"
+
 #include "util/util_map.h"
-#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h
index 3d1f6a2b040..12182c614da 100644
--- a/intern/cycles/render/coverage.h
+++ b/intern/cycles/render/coverage.h
@@ -14,18 +14,19 @@
  * limitations under the License.
  */
 
-#include "render/buffers.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
+#ifndef __COVERAGE_H__
+#define __COVERAGE_H__
+
 #include "util/util_map.h"
 #include "util/util_vector.h"
 
-#ifndef __COVERAGE_H__
-#  define __COVERAGE_H__
-
 CCL_NAMESPACE_BEGIN
 
+struct KernelGlobals;
+class RenderTile;
+
+typedef unordered_map<float, float> CoverageMap;
+
 class Coverage {
  public:
   Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_)
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index 66fbc9eb4a8..1907bb33d06 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "device/device.h"
 #include "render/curves.h"
+#include "device/device.h"
 #include "render/mesh.h"
 #include "render/object.h"
 #include "render/scene.h"
diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h
index c234d00eb49..5c6f913cb38 100644
--- a/intern/cycles/render/denoising.h
+++ b/intern/cycles/render/denoising.h
@@ -23,8 +23,8 @@
 #include "render/buffers.h"
 
 #include "util/util_string.h"
-#include "util/util_vector.h"
 #include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
 
 #include <OpenImageIO/imageio.h>
 
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index bd274844b52..baf02901123 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "render/camera.h"
-#include "device/device.h"
 #include "render/film.h"
+#include "device/device.h"
+#include "render/camera.h"
 #include "render/integrator.h"
 #include "render/mesh.h"
 #include "render/scene.h"
@@ -155,7 +155,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
     case PASS_DIFFUSE_COLOR:
     case PASS_GLOSSY_COLOR:
     case PASS_TRANSMISSION_COLOR:
-    case PASS_SUBSURFACE_COLOR:
       pass.components = 4;
       break;
     case PASS_DIFFUSE_DIRECT:
@@ -176,12 +175,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
       pass.exposure = true;
       pass.divide_type = PASS_TRANSMISSION_COLOR;
       break;
-    case PASS_SUBSURFACE_DIRECT:
-    case PASS_SUBSURFACE_INDIRECT:
-      pass.components = 4;
-      pass.exposure = true;
-      pass.divide_type = PASS_SUBSURFACE_COLOR;
-      break;
     case PASS_VOLUME_DIRECT:
     case PASS_VOLUME_INDIRECT:
       pass.components = 4;
@@ -190,6 +183,13 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
     case PASS_CRYPTOMATTE:
       pass.components = 4;
       break;
+    case PASS_ADAPTIVE_AUX_BUFFER:
+      pass.components = 4;
+      break;
+    case PASS_SAMPLE_COUNT:
+      pass.components = 1;
+      pass.exposure = false;
+      break;
     case PASS_AOV_COLOR:
       pass.components = 4;
       break;
@@ -203,9 +203,10 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
 
   passes.push_back(pass);
 
-  /* order from by components, to ensure alignment so passes with size 4
-   * come first and then passes with size 1 */
-  sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
+  /* Order by components, to ensure alignment so passes with size 4
+   * come first and then passes with size 1. Note this must use stable sort
+   * so cryptomatte passes remain in the right order. */
+  stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
 
   if (pass.divide_type != PASS_NONE)
     Pass::add(pass.divide_type, passes);
@@ -318,15 +319,19 @@ NODE_DEFINE(Film)
   SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
   SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
   SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+  SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
 
   return type;
 }
 
 Film::Film() : Node(node_type)
 {
+  Pass::add(PASS_COMBINED, passes);
+
   use_light_visibility = false;
   filter_table_offset = TABLE_OFFSET_INVALID;
   cryptomatte_passes = CRYPT_NONE;
+  display_pass = PASS_COMBINED;
 
   need_update = true;
 }
@@ -439,9 +444,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
       case PASS_TRANSMISSION_COLOR:
         kfilm->pass_transmission_color = kfilm->pass_stride;
         break;
-      case PASS_SUBSURFACE_COLOR:
-        kfilm->pass_subsurface_color = kfilm->pass_stride;
-        break;
       case PASS_DIFFUSE_INDIRECT:
         kfilm->pass_diffuse_indirect = kfilm->pass_stride;
         break;
@@ -451,9 +453,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
       case PASS_TRANSMISSION_INDIRECT:
         kfilm->pass_transmission_indirect = kfilm->pass_stride;
         break;
-      case PASS_SUBSURFACE_INDIRECT:
-        kfilm->pass_subsurface_indirect = kfilm->pass_stride;
-        break;
       case PASS_VOLUME_INDIRECT:
         kfilm->pass_volume_indirect = kfilm->pass_stride;
         break;
@@ -466,9 +465,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
       case PASS_TRANSMISSION_DIRECT:
         kfilm->pass_transmission_direct = kfilm->pass_stride;
         break;
-      case PASS_SUBSURFACE_DIRECT:
-        kfilm->pass_subsurface_direct = kfilm->pass_stride;
-        break;
       case PASS_VOLUME_DIRECT:
         kfilm->pass_volume_direct = kfilm->pass_stride;
         break;
@@ -495,6 +491,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
                                       kfilm->pass_stride;
         have_cryptomatte = true;
         break;
+      case PASS_ADAPTIVE_AUX_BUFFER:
+        kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
+        break;
+      case PASS_SAMPLE_COUNT:
+        kfilm->pass_sample_count = kfilm->pass_stride;
+        break;
       case PASS_AOV_COLOR:
         if (!have_aov_color) {
           kfilm->pass_aov_color = kfilm->pass_stride;
@@ -518,7 +520,7 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
       kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f);
     }
     else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR ||
-             pass.type == PASS_GLOSSY_COLOR || pass.type == PASS_SUBSURFACE_COLOR) {
+             pass.type == PASS_GLOSSY_COLOR) {
       kfilm->display_divide_pass_stride = kfilm->pass_stride;
     }
 
@@ -590,13 +592,13 @@ bool Film::modified(const Film &film)
 void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes)
 {
   if (Pass::contains(passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) {
-    scene->mesh_manager->tag_update(scene);
+    scene->geometry_manager->tag_update(scene);
 
     foreach (Shader *shader, scene->shaders)
-      shader->need_update_mesh = true;
+      shader->need_update_geometry = true;
   }
   else if (Pass::contains(passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) {
-    scene->mesh_manager->tag_update(scene);
+    scene->geometry_manager->tag_update(scene);
   }
   else if (Pass::contains(passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) {
     scene->integrator->tag_update(scene);
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 95e54cb54d8..aae8fb404b0 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -81,6 +81,8 @@ class Film : public Node {
   CryptomatteType cryptomatte_passes;
   int cryptomatte_depth;
 
+  bool use_adaptive_sampling;
+
   bool need_update;
 
   Film();
diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp
new file mode 100644
index 00000000000..d46ed430c4f
--- /dev/null
+++ b/intern/cycles/render/geometry.cpp
@@ -0,0 +1,1470 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+
+#ifdef WITH_EMBREE
+#  include "bvh/bvh_embree.h"
+#endif
+
+#include "render/attribute.h"
+#include "render/camera.h"
+#include "render/geometry.h"
+#include "render/hair.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/stats.h"
+
+#include "subd/subd_patch_table.h"
+#include "subd/subd_split.h"
+
+#include "kernel/osl/osl_globals.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Geometry */
+
+NODE_ABSTRACT_DEFINE(Geometry)
+{
+  NodeType *type = NodeType::add("geometry_base", NULL);
+  /* Motion blur sockets registered on the "geometry_base" type. */
+  SOCKET_UINT(motion_steps, "Motion Steps", 3);
+  SOCKET_BOOLEAN(use_motion_blur, "Use Motion Blur", false);
+
+  return type;
+}
+
+Geometry::Geometry(const NodeType *node_type, const Type type)
+    : Node(node_type), type(type), attributes(this, ATTR_PRIM_GEOMETRY)
+{
+  need_update = true; /* Start out tagged for update. */
+  need_update_rebuild = false;
+
+  transform_applied = false;
+  transform_negative_scaled = false;
+  transform_normal = transform_identity();
+  bounds = BoundBox::empty;
+
+  has_volume = false;
+  has_surface_bssrdf = false;
+
+  bvh = NULL; /* Allocated on demand by compute_bvh(). */
+  attr_map_offset = 0;
+  optix_prim_offset = 0;
+  prim_offset = 0;
+}
+
+Geometry::~Geometry()
+{
+  delete bvh; /* Still NULL if compute_bvh() never created one. */
+}
+
+void Geometry::clear()
+{
+  used_shaders.clear(); /* Note: attributes and the BVH are not touched here. */
+  transform_applied = false;
+  transform_negative_scaled = false;
+  transform_normal = transform_identity();
+}
+
+bool Geometry::need_attribute(Scene *scene, AttributeStandard std)
+{
+  /* ATTR_STD_NONE is rejected up front. */
+  if (std == ATTR_STD_NONE) {
+    return false;
+  }
+  /* Needed when requested scene-wide, or by any shader used on this geometry. */
+  bool needed = scene->need_global_attribute(std);
+  foreach (Shader *used_shader, used_shaders) {
+    needed = needed || used_shader->attributes.find(std);
+  }
+
+  return needed;
+}
+
+bool Geometry::need_attribute(Scene * /*scene*/, ustring name)
+{
+  if (name == ustring()) {
+    return false;
+  }
+  bool needed = false; /* Set once any used shader requests `name`. */
+  foreach (Shader *used_shader, used_shaders) {
+    needed = needed || used_shader->attributes.find(name);
+  }
+  return needed;
+}
+
+float Geometry::motion_time(int step) const
+{
+  return (motion_steps > 1) ? 2.0f * step / (motion_steps - 1) - 1.0f : 0.0f; /* In [-1, 1]. */
+}
+
+int Geometry::motion_step(float time) const
+{
+  /* No lookup is possible with fewer than two motion steps. */
+  if (motion_steps <= 1) {
+    return -1;
+  }
+
+  /* Walk all steps in order, counting only those kept in the motion attribute. */
+  const auto center_step = motion_steps / 2;
+  int attr_index = 0;
+  for (int step = 0; step < motion_steps; step++) {
+    if (motion_time(step) == time) {
+      return attr_index;
+    }
+    /* Center step is stored in a separate attribute. */
+    attr_index += (step != center_step) ? 1 : 0;
+  }
+
+  return -1;
+}
+
+bool Geometry::need_build_bvh(BVHLayout layout) const
+{ /* True for the OptiX layout, and otherwise under the same conditions as is_instanced(). */
+  return !transform_applied || has_surface_bssrdf || layout == BVH_LAYOUT_OPTIX;
+}
+
+bool Geometry::is_instanced() const
+{
+  /* Geometry with subsurface scattering is currently treated as instanced too.
+   *
+   * This may not be optimal for ray traversal, but it avoids keeping a
+   * duplicated BVH in memory, which saves quite some space.
+   */
+  return !transform_applied || has_surface_bssrdf;
+}
+
+bool Geometry::has_true_displacement() const
+{
+  /* A shader counts only if it displaces with a method other than bump. */
+  bool found = false;
+  foreach (Shader *used_shader, used_shaders) {
+    found = found || (used_shader->has_displacement &&
+                      used_shader->displacement_method != DISPLACE_BUMP);
+  }
+  return found;
+}
+
+void Geometry::compute_bvh(
+    Device *device, DeviceScene *dscene, SceneParams *params, Progress *progress, int n, int total)
+{
+  if (progress->get_cancel())
+    return;
+
+  compute_bounds();
+  /* Pick the layout from the requested one and what the device supports. */
+  const BVHLayout bvh_layout = BVHParams::best_bvh_layout(params->bvh_layout,
+                                                          device->get_bvh_layout_mask());
+  if (need_build_bvh(bvh_layout)) {
+    string msg = "Updating Geometry BVH "; /* Status text: optional name, then "n/total". */
+    if (name.empty())
+      msg += string_printf("%u/%u", (uint)(n + 1), (uint)total);
+    else
+      msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
+    /* Temporary object wrapping just this geometry, used as BVH input. */
+    Object object;
+    object.geometry = this;
+
+    vector<Geometry *> geometry;
+    geometry.push_back(this);
+    vector<Object *> objects;
+    objects.push_back(&object); /* Address of a local: only valid during this call. */
+
+    if (bvh && !need_update_rebuild) {
+      progress->set_status(msg, "Refitting BVH");
+      /* Re-point the existing BVH at the current inputs before refitting. */
+      bvh->geometry = geometry;
+      bvh->objects = objects;
+
+      bvh->refit(*progress);
+    }
+    else {
+      progress->set_status(msg, "Building BVH");
+
+      BVHParams bparams;
+      bparams.use_spatial_split = params->use_bvh_spatial_split;
+      bparams.bvh_layout = bvh_layout;
+      bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
+                                    params->use_bvh_unaligned_nodes;
+      bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
+      bparams.num_motion_curve_steps = params->num_bvh_time_steps;
+      bparams.bvh_type = params->bvh_type;
+      bparams.curve_flags = dscene->data.curve.curveflags;
+      bparams.curve_subdivisions = dscene->data.curve.subdivisions;
+      /* Replace any previous BVH with a freshly built one. */
+      delete bvh;
+      bvh = BVH::create(bparams, geometry, objects);
+      MEM_GUARDED_CALL(progress, bvh->build, *progress);
+    }
+  }
+  /* Both update tags are cleared even when no BVH was needed. */
+  need_update = false;
+  need_update_rebuild = false;
+}
+
+bool Geometry::has_motion_blur() const
+{ /* Requires both the flag and a stored ATTR_STD_MOTION_VERTEX_POSITION attribute. */
+  return (use_motion_blur && attributes.find(ATTR_STD_MOTION_VERTEX_POSITION));
+}
+
+bool Geometry::has_voxel_attributes() const
+{
+  /* True as soon as one attribute uses the voxel element type. */
+  bool found = false;
+  foreach (const Attribute &attribute, attributes.attributes) {
+    found = found || (attribute.element == ATTR_ELEMENT_VOXEL);
+  }
+
+  return found;
+}
+
+void Geometry::tag_update(Scene *scene, bool rebuild)
+{
+  need_update = true;
+  need_update_rebuild = need_update_rebuild || rebuild;
+
+  /* Tag lights on a rebuild, or when any shader used here emits from its surface. */
+  bool tag_lights = rebuild;
+  foreach (Shader *used_shader, used_shaders) {
+    tag_lights = tag_lights || used_shader->has_surface_emission;
+  }
+  if (tag_lights) {
+    scene->light_manager->need_update = true;
+  }
+  /* The geometry and object managers are tagged unconditionally. */
+  scene->geometry_manager->need_update = true;
+  scene->object_manager->need_update = true;
+}
+
+/* Geometry Manager */
+
+GeometryManager::GeometryManager()
+{
+  need_update = true; /* Start out tagged for update. */
+  need_flags_update = true;
+}
+
+GeometryManager::~GeometryManager()
+{ /* Intentionally empty. */
+}
+
+void GeometryManager::update_osl_attributes(Device *device,
+                                            Scene *scene,
+                                            vector<AttributeRequestSet> &geom_attributes)
+{
+#ifdef WITH_OSL
+  /* for OSL, a hash map is used to lookup the attribute by name. */
+  OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+
+  og->object_name_map.clear();
+  og->attribute_map.clear();
+  og->object_names.clear();
+  /* one attribute map per object and per primitive type */
+  og->attribute_map.resize(scene->objects.size() * ATTR_PRIM_TYPES);
+
+  for (size_t i = 0; i < scene->objects.size(); i++) {
+    /* set object name to object index map */
+    Object *object = scene->objects[i];
+    og->object_name_map[object->name] = i;
+    og->object_names.push_back(object->name);
+
+    /* set object attributes, visible through both primitive type maps */
+    foreach (ParamValue &attr, object->attributes) {
+      OSLGlobals::Attribute osl_attr;
+
+      osl_attr.type = attr.type();
+      osl_attr.desc.element = ATTR_ELEMENT_OBJECT;
+      osl_attr.value = attr;
+      osl_attr.desc.offset = 0;
+      osl_attr.desc.flags = 0;
+
+      og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_GEOMETRY][attr.name()] = osl_attr;
+      og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][attr.name()] = osl_attr;
+    }
+
+    /* find the index of this object's geometry in scene->geometry (linear search) */
+    size_t j;
+
+    for (j = 0; j < scene->geometry.size(); j++)
+      if (scene->geometry[j] == object->geometry)
+        break;
+    /* NOTE(review): assumes the geometry was found; otherwise j is out of range. */
+    AttributeRequestSet &attributes = geom_attributes[j];
+
+    /* set geometry attributes */
+    foreach (AttributeRequest &req, attributes.requests) {
+      OSLGlobals::Attribute osl_attr;
+
+      if (req.desc.element != ATTR_ELEMENT_NONE) {
+        osl_attr.desc = req.desc;
+
+        if (req.type == TypeDesc::TypeFloat)
+          osl_attr.type = TypeDesc::TypeFloat;
+        else if (req.type == TypeDesc::TypeMatrix)
+          osl_attr.type = TypeDesc::TypeMatrix;
+        else if (req.type == TypeFloat2)
+          osl_attr.type = TypeFloat2;
+        else if (req.type == TypeRGBA)
+          osl_attr.type = TypeRGBA;
+        else
+          osl_attr.type = TypeDesc::TypeColor;
+
+        if (req.std != ATTR_STD_NONE) {
+          /* if standard attribute, add lookup by geom: name convention */
+          ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
+          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_GEOMETRY][stdname] = osl_attr;
+        }
+        else if (req.name != ustring()) {
+          /* add lookup by geometry attribute name */
+          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_GEOMETRY][req.name] = osl_attr;
+        }
+      }
+      /* same again for the subdivision variant of the request */
+      if (req.subd_desc.element != ATTR_ELEMENT_NONE) {
+        osl_attr.desc = req.subd_desc;
+
+        if (req.subd_type == TypeDesc::TypeFloat)
+          osl_attr.type = TypeDesc::TypeFloat;
+        else if (req.subd_type == TypeDesc::TypeMatrix)
+          osl_attr.type = TypeDesc::TypeMatrix;
+        else if (req.subd_type == TypeFloat2)
+          osl_attr.type = TypeFloat2;
+        else if (req.subd_type == TypeRGBA)
+          osl_attr.type = TypeRGBA;
+        else
+          osl_attr.type = TypeDesc::TypeColor;
+
+        if (req.std != ATTR_STD_NONE) {
+          /* if standard attribute, add lookup by geom: name convention */
+          ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
+          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][stdname] = osl_attr;
+        }
+        else if (req.name != ustring()) {
+          /* add lookup by geometry attribute name */
+          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][req.name] = osl_attr;
+        }
+      }
+    }
+  }
+#else
+  (void)device;
+  (void)scene;
+  (void)geom_attributes;
+#endif
+}
+
+void GeometryManager::update_svm_attributes(Device *,
+                                            DeviceScene *dscene,
+                                            Scene *scene,
+                                            vector<AttributeRequestSet> &geom_attributes)
+{
+  /* for SVM, the attributes_map table is used to lookup the offset of an
+   * attribute, based on a unique shader attribute id. */
+
+  /* compute array stride: ATTR_PRIM_TYPES entries per request, plus a terminator group */
+  int attr_map_size = 0;
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    geom->attr_map_offset = attr_map_size;
+    attr_map_size += (geom_attributes[i].size() + 1) * ATTR_PRIM_TYPES;
+  }
+
+  if (attr_map_size == 0)
+    return;
+
+  /* create attribute map; zero all of it so entries never written below stay defined */
+  uint4 *attr_map = dscene->attributes_map.alloc(attr_map_size);
+  memset(attr_map, 0, dscene->attributes_map.size() * sizeof(*attr_map));
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    AttributeRequestSet &attributes = geom_attributes[i];
+
+    /* write this geometry's entries starting at its offset */
+    int index = geom->attr_map_offset;
+
+    foreach (AttributeRequest &req, attributes.requests) {
+      uint id;
+
+      if (req.std == ATTR_STD_NONE)
+        id = scene->shader_manager->get_attribute_id(req.name);
+      else
+        id = scene->shader_manager->get_attribute_id(req.std);
+
+      attr_map[index].x = id;
+      attr_map[index].y = req.desc.element;
+      attr_map[index].z = as_uint(req.desc.offset);
+
+      if (req.type == TypeDesc::TypeFloat)
+        attr_map[index].w = NODE_ATTR_FLOAT;
+      else if (req.type == TypeDesc::TypeMatrix)
+        attr_map[index].w = NODE_ATTR_MATRIX;
+      else if (req.type == TypeFloat2)
+        attr_map[index].w = NODE_ATTR_FLOAT2;
+      else if (req.type == TypeRGBA)
+        attr_map[index].w = NODE_ATTR_RGBA;
+      else
+        attr_map[index].w = NODE_ATTR_FLOAT3;
+
+      attr_map[index].w |= req.desc.flags << 8;
+
+      index++;
+      /* the second entry of the pair is only filled for meshes with subdivision faces */
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        if (mesh->subd_faces.size()) {
+          attr_map[index].x = id;
+          attr_map[index].y = req.subd_desc.element;
+          attr_map[index].z = as_uint(req.subd_desc.offset);
+
+          if (req.subd_type == TypeDesc::TypeFloat)
+            attr_map[index].w = NODE_ATTR_FLOAT;
+          else if (req.subd_type == TypeDesc::TypeMatrix)
+            attr_map[index].w = NODE_ATTR_MATRIX;
+          else if (req.subd_type == TypeFloat2)
+            attr_map[index].w = NODE_ATTR_FLOAT2;
+          else if (req.subd_type == TypeRGBA)
+            attr_map[index].w = NODE_ATTR_RGBA;
+          else
+            attr_map[index].w = NODE_ATTR_FLOAT3;
+
+          attr_map[index].w |= req.subd_desc.flags << 8;
+        }
+      }
+
+      index++; /* step past the subd entry even when it was left zeroed */
+    }
+
+    /* terminator */
+    for (int j = 0; j < ATTR_PRIM_TYPES; j++) {
+      attr_map[index].x = ATTR_STD_NONE;
+      attr_map[index].y = 0;
+      attr_map[index].z = 0;
+      attr_map[index].w = 0;
+
+      index++;
+    }
+  }
+
+  /* copy to device */
+  dscene->attributes_map.copy_to_device();
+}
+
+static void update_attribute_element_size(Geometry *geom,
+                                          Attribute *mattr,
+                                          AttributePrimitive prim,
+                                          size_t *attr_float_size,
+                                          size_t *attr_float2_size,
+                                          size_t *attr_float3_size,
+                                          size_t *attr_uchar4_size)
+{
+  if (mattr) { /* a missing attribute adds nothing to any counter */
+    size_t size = mattr->element_size(geom, prim);
+
+    if (mattr->element == ATTR_ELEMENT_VOXEL) {
+      /* voxel attributes take no space in any of these arrays */
+    }
+    else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+      *attr_uchar4_size += size;
+    }
+    else if (mattr->type == TypeDesc::TypeFloat) {
+      *attr_float_size += size;
+    }
+    else if (mattr->type == TypeFloat2) {
+      *attr_float2_size += size;
+    }
+    else if (mattr->type == TypeDesc::TypeMatrix) {
+      *attr_float3_size += size * 4; /* NOTE(review): fill code uses size * 3; confirm. */
+    }
+    else {
+      *attr_float3_size += size; /* every remaining type shares the float3 array */
+    }
+  }
+}
+
+static void update_attribute_element_offset(Geometry *geom,
+                                            device_vector<float> &attr_float,
+                                            size_t &attr_float_offset,
+                                            device_vector<float2> &attr_float2,
+                                            size_t &attr_float2_offset,
+                                            device_vector<float4> &attr_float3,
+                                            size_t &attr_float3_offset,
+                                            device_vector<uchar4> &attr_uchar4,
+                                            size_t &attr_uchar4_offset,
+                                            Attribute *mattr,
+                                            AttributePrimitive prim,
+                                            TypeDesc &type,
+                                            AttributeDescriptor &desc)
+{
+  if (mattr) {
+    /* store element and type */
+    desc.element = mattr->element;
+    desc.flags = mattr->flags;
+    type = mattr->type;
+
+    /* store attribute data in arrays, advancing the matching running offset */
+    size_t size = mattr->element_size(geom, prim);
+
+    AttributeElement &element = desc.element;
+    int &offset = desc.offset;
+
+    if (mattr->element == ATTR_ELEMENT_VOXEL) {
+      /* store slot in offset value */
+      ImageHandle &handle = mattr->data_voxel();
+      offset = handle.svm_slot();
+    }
+    else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+      uchar4 *data = mattr->data_uchar4();
+      offset = attr_uchar4_offset;
+
+      assert(attr_uchar4.size() >= offset + size);
+      for (size_t k = 0; k < size; k++) {
+        attr_uchar4[offset + k] = data[k];
+      }
+      attr_uchar4_offset += size;
+    }
+    else if (mattr->type == TypeDesc::TypeFloat) {
+      float *data = mattr->data_float();
+      offset = attr_float_offset;
+
+      assert(attr_float.size() >= offset + size);
+      for (size_t k = 0; k < size; k++) {
+        attr_float[offset + k] = data[k];
+      }
+      attr_float_offset += size;
+    }
+    else if (mattr->type == TypeFloat2) {
+      float2 *data = mattr->data_float2();
+      offset = attr_float2_offset;
+
+      assert(attr_float2.size() >= offset + size);
+      for (size_t k = 0; k < size; k++) {
+        attr_float2[offset + k] = data[k];
+      }
+      attr_float2_offset += size;
+    }
+    else if (mattr->type == TypeDesc::TypeMatrix) {
+      Transform *tfm = mattr->data_transform();
+      offset = attr_float3_offset;
+      /* each transform is copied as 3 consecutive float4 values */
+      assert(attr_float3.size() >= offset + size * 3);
+      for (size_t k = 0; k < size * 3; k++) {
+        attr_float3[offset + k] = (&tfm->x)[k];
+      }
+      attr_float3_offset += size * 3;
+    }
+    else {
+      float4 *data = mattr->data_float4();
+      offset = attr_float3_offset;
+
+      assert(attr_float3.size() >= offset + size);
+      for (size_t k = 0; k < size; k++) {
+        attr_float3[offset + k] = data[k];
+      }
+      attr_float3_offset += size;
+    }
+
+    /* mesh vertex/curve index is global, not per object, so we sneak
+     * a correction for that in here */
+    if (geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+      if (mesh->subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK &&
+          desc.flags & ATTR_SUBDIVIDED) {
+        /* indices for subdivided attributes are retrieved
+         * from patch table so no need for correction here */
+      }
+      else if (element == ATTR_ELEMENT_VERTEX)
+        offset -= mesh->vert_offset;
+      else if (element == ATTR_ELEMENT_VERTEX_MOTION)
+        offset -= mesh->vert_offset;
+      else if (element == ATTR_ELEMENT_FACE) {
+        if (prim == ATTR_PRIM_GEOMETRY)
+          offset -= mesh->prim_offset;
+        else
+          offset -= mesh->face_offset;
+      }
+      else if (element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) {
+        if (prim == ATTR_PRIM_GEOMETRY)
+          offset -= 3 * mesh->prim_offset;
+        else
+          offset -= mesh->corner_offset;
+      }
+    }
+    else if (geom->type == Geometry::HAIR) {
+      Hair *hair = static_cast<Hair *>(geom);
+      if (element == ATTR_ELEMENT_CURVE)
+        offset -= hair->prim_offset;
+      else if (element == ATTR_ELEMENT_CURVE_KEY)
+        offset -= hair->curvekey_offset;
+      else if (element == ATTR_ELEMENT_CURVE_KEY_MOTION)
+        offset -= hair->curvekey_offset;
+    }
+  }
+  else {
+    /* attribute not found */
+    desc.element = ATTR_ELEMENT_NONE;
+    desc.offset = 0;
+  }
+}
+
+void GeometryManager::device_update_attributes(Device *device,
+                                               DeviceScene *dscene,
+                                               Scene *scene,
+                                               Progress &progress)
+{
+  progress.set_status("Updating Mesh", "Computing attributes");
+
+  /* gather requested attributes per geometry. as a geometry may have multiple
+   * shaders assigned, this merges the scene-wide requests with the requests
+   * stored on each shader it uses */
+  vector<AttributeRequestSet> geom_attributes(scene->geometry.size());
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+
+    scene->need_global_attributes(geom_attributes[i]);
+
+    foreach (Shader *shader, geom->used_shaders) {
+      geom_attributes[i].add(shader->attributes);
+    }
+  }
+
+  /* geometry attributes are stored in a single array per data type. here we
+   * fill those arrays, and set the offset and element type to create attribute
+   * maps next */
+
+  /* Pre-allocate attributes to avoid arrays re-allocation which would
+   * take 2x of overall attribute memory usage.
+   */
+  size_t attr_float_size = 0;
+  size_t attr_float2_size = 0;
+  size_t attr_float3_size = 0;
+  size_t attr_uchar4_size = 0;
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    AttributeRequestSet &attributes = geom_attributes[i];
+    foreach (AttributeRequest &req, attributes.requests) {
+      Attribute *attr = geom->attributes.find(req);
+
+      update_attribute_element_size(geom,
+                                    attr,
+                                    ATTR_PRIM_GEOMETRY,
+                                    &attr_float_size,
+                                    &attr_float2_size,
+                                    &attr_float3_size,
+                                    &attr_uchar4_size);
+      /* only meshes carry a second, subdivision attribute set */
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        Attribute *subd_attr = mesh->subd_attributes.find(req);
+
+        update_attribute_element_size(mesh,
+                                      subd_attr,
+                                      ATTR_PRIM_SUBD,
+                                      &attr_float_size,
+                                      &attr_float2_size,
+                                      &attr_float3_size,
+                                      &attr_uchar4_size);
+      }
+    }
+  }
+
+  dscene->attributes_float.alloc(attr_float_size);
+  dscene->attributes_float2.alloc(attr_float2_size);
+  dscene->attributes_float3.alloc(attr_float3_size);
+  dscene->attributes_uchar4.alloc(attr_uchar4_size);
+
+  size_t attr_float_offset = 0;
+  size_t attr_float2_offset = 0;
+  size_t attr_float3_offset = 0;
+  size_t attr_uchar4_offset = 0;
+
+  /* Fill in attributes. */
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    AttributeRequestSet &attributes = geom_attributes[i];
+
+    /* todo: we now store std and name attributes from requests even if
+     * they actually refer to the same mesh attributes, optimize */
+    foreach (AttributeRequest &req, attributes.requests) {
+      Attribute *attr = geom->attributes.find(req);
+      update_attribute_element_offset(geom,
+                                      dscene->attributes_float,
+                                      attr_float_offset,
+                                      dscene->attributes_float2,
+                                      attr_float2_offset,
+                                      dscene->attributes_float3,
+                                      attr_float3_offset,
+                                      dscene->attributes_uchar4,
+                                      attr_uchar4_offset,
+                                      attr,
+                                      ATTR_PRIM_GEOMETRY,
+                                      req.type,
+                                      req.desc);
+      /* the subdivision variant is written to req.subd_type / req.subd_desc */
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        Attribute *subd_attr = mesh->subd_attributes.find(req);
+
+        update_attribute_element_offset(mesh,
+                                        dscene->attributes_float,
+                                        attr_float_offset,
+                                        dscene->attributes_float2,
+                                        attr_float2_offset,
+                                        dscene->attributes_float3,
+                                        attr_float3_offset,
+                                        dscene->attributes_uchar4,
+                                        attr_uchar4_offset,
+                                        subd_attr,
+                                        ATTR_PRIM_SUBD,
+                                        req.subd_type,
+                                        req.subd_desc);
+      }
+
+      if (progress.get_cancel())
+        return;
+    }
+  }
+
+  /* create attribute lookup maps: the OSL one only when OSL is in use, SVM always */
+  if (scene->shader_manager->use_osl())
+    update_osl_attributes(device, scene, geom_attributes);
+
+  update_svm_attributes(device, dscene, scene, geom_attributes);
+
+  if (progress.get_cancel())
+    return;
+
+  /* copy to device, skipping arrays that ended up empty */
+  progress.set_status("Updating Mesh", "Copying Attributes to device");
+
+  if (dscene->attributes_float.size()) {
+    dscene->attributes_float.copy_to_device();
+  }
+  if (dscene->attributes_float2.size()) {
+    dscene->attributes_float2.copy_to_device();
+  }
+  if (dscene->attributes_float3.size()) {
+    dscene->attributes_float3.copy_to_device();
+  }
+  if (dscene->attributes_uchar4.size()) {
+    dscene->attributes_uchar4.copy_to_device();
+  }
+
+  if (progress.get_cancel())
+    return;
+
+  /* After mesh attributes and patch tables have been copied to device memory,
+   * we need to update offsets in the objects. */
+  scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
+}
+
+void GeometryManager::mesh_calc_offset(Scene *scene)
+{
+  size_t vert_size = 0; /* Running totals: each offset is the total before that geometry. */
+  size_t tri_size = 0;
+
+  size_t curve_key_size = 0;
+  size_t curve_size = 0;
+
+  size_t patch_size = 0;
+  size_t face_size = 0;
+  size_t corner_size = 0;
+
+  size_t optix_prim_size = 0; /* Shared by mesh triangles and hair segments. */
+
+  foreach (Geometry *geom, scene->geometry) {
+    if (geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+
+      mesh->vert_offset = vert_size;
+      mesh->prim_offset = tri_size;
+
+      mesh->patch_offset = patch_size;
+      mesh->face_offset = face_size;
+      mesh->corner_offset = corner_size;
+
+      vert_size += mesh->verts.size();
+      tri_size += mesh->num_triangles();
+
+      if (mesh->subd_faces.size()) {
+        Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
+        patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+        /* patch tables are stored in same array so include them in patch_size */
+        if (mesh->patch_table) {
+          mesh->patch_table_offset = patch_size;
+          patch_size += mesh->patch_table->total_size();
+        }
+      }
+
+      face_size += mesh->subd_faces.size();
+      corner_size += mesh->subd_face_corners.size();
+
+      mesh->optix_prim_offset = optix_prim_size;
+      optix_prim_size += mesh->num_triangles();
+    }
+    else if (geom->type == Geometry::HAIR) {
+      Hair *hair = static_cast<Hair *>(geom);
+
+      hair->curvekey_offset = curve_key_size;
+      hair->prim_offset = curve_size;
+
+      curve_key_size += hair->curve_keys.size();
+      curve_size += hair->num_curves();
+
+      hair->optix_prim_offset = optix_prim_size;
+      optix_prim_size += hair->num_segments();
+    }
+  }
+}
+
+void GeometryManager::device_update_mesh(
+    Device *, DeviceScene *dscene, Scene *scene, bool for_displacement, Progress &progress)
+{
+  /* Pack triangle, curve and patch data of all geometry into dscene. First count totals. */
+  size_t vert_size = 0;
+  size_t tri_size = 0;
+
+  size_t curve_key_size = 0;
+  size_t curve_size = 0;
+
+  size_t patch_size = 0;
+
+  foreach (Geometry *geom, scene->geometry) {
+    if (geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+
+      vert_size += mesh->verts.size();
+      tri_size += mesh->num_triangles();
+
+      if (mesh->subd_faces.size()) {
+        Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
+        patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+        /* patch tables are stored in same array so include them in patch_size */
+        if (mesh->patch_table) {
+          mesh->patch_table_offset = patch_size;
+          patch_size += mesh->patch_table->total_size();
+        }
+      }
+    }
+    else if (geom->type == Geometry::HAIR) {
+      Hair *hair = static_cast<Hair *>(geom);
+
+      curve_key_size += hair->curve_keys.size();
+      curve_size += hair->num_curves();
+    }
+  }
+
+  /* Create mapping from triangle to primitive triangle array. */
+  vector<uint> tri_prim_index(tri_size);
+  if (for_displacement) {
+    /* Displacement kernels are fed a different layout than the final render
+     * kernels, since no BVH is available yet. Each triangle is mapped to three
+     * times its global index, which matches the prim_tri_verts layout that is
+     * written at the end of this function when for_displacement is set.
+     */
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        for (size_t i = 0; i < mesh->num_triangles(); ++i) {
+          tri_prim_index[i + mesh->prim_offset] = 3 * (i + mesh->prim_offset);
+        }
+      }
+    }
+  }
+  else { /* Otherwise derive the mapping from the primitive arrays already stored in dscene. */
+    for (size_t i = 0; i < dscene->prim_index.size(); ++i) {
+      if ((dscene->prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
+        tri_prim_index[dscene->prim_index[i]] = dscene->prim_tri_index[i];
+      }
+    }
+  }
+
+  /* Fill in all the arrays. */
+  if (tri_size != 0) {
+    /* Per mesh: pack shaders, vertex normals, vertex indices and patch info. */
+    progress.set_status("Updating Mesh", "Computing normals");
+
+    uint *tri_shader = dscene->tri_shader.alloc(tri_size);
+    float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
+    uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
+    uint *tri_patch = dscene->tri_patch.alloc(tri_size);
+    float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        mesh->pack_shaders(scene, &tri_shader[mesh->prim_offset]);
+        mesh->pack_normals(&vnormal[mesh->vert_offset]);
+        mesh->pack_verts(tri_prim_index,
+                         &tri_vindex[mesh->prim_offset],
+                         &tri_patch[mesh->prim_offset],
+                         &tri_patch_uv[mesh->vert_offset],
+                         mesh->vert_offset,
+                         mesh->prim_offset);
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    /* Upload the packed triangle arrays. */
+    progress.set_status("Updating Mesh", "Copying Mesh to device");
+
+    dscene->tri_shader.copy_to_device();
+    dscene->tri_vnormal.copy_to_device();
+    dscene->tri_vindex.copy_to_device();
+    dscene->tri_patch.copy_to_device();
+    dscene->tri_patch_uv.copy_to_device();
+  }
+
+  if (curve_size != 0) {
+    progress.set_status("Updating Mesh", "Copying Strands to device");
+
+    float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size);
+    float4 *curves = dscene->curves.alloc(curve_size);
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->type == Geometry::HAIR) {
+        Hair *hair = static_cast<Hair *>(geom);
+        hair->pack_curves(scene,
+                          &curve_keys[hair->curvekey_offset],
+                          &curves[hair->prim_offset],
+                          hair->curvekey_offset);
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    dscene->curve_keys.copy_to_device();
+    dscene->curves.copy_to_device();
+  }
+
+  if (patch_size != 0) {
+    progress.set_status("Updating Mesh", "Copying Patches to device");
+
+    uint *patch_data = dscene->patches.alloc(patch_size);
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        mesh->pack_patches(&patch_data[mesh->patch_offset],
+                           mesh->vert_offset,
+                           mesh->face_offset,
+                           mesh->corner_offset);
+
+        if (mesh->patch_table) {
+          mesh->patch_table->copy_adjusting_offsets(&patch_data[mesh->patch_table_offset],
+                                                    mesh->patch_table_offset);
+        }
+
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    dscene->patches.copy_to_device();
+  }
+  /* For displacement, write triangle vertices directly, three consecutive entries per triangle. */
+  if (for_displacement) {
+    float4 *prim_tri_verts = dscene->prim_tri_verts.alloc(tri_size * 3);
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        for (size_t i = 0; i < mesh->num_triangles(); ++i) {
+          Mesh::Triangle t = mesh->get_triangle(i);
+          size_t offset = 3 * (i + mesh->prim_offset);
+          prim_tri_verts[offset + 0] = float3_to_float4(mesh->verts[t.v[0]]);
+          prim_tri_verts[offset + 1] = float3_to_float4(mesh->verts[t.v[1]]);
+          prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]);
+        }
+      }
+    }
+    dscene->prim_tri_verts.copy_to_device();
+  }
+}
+
+void GeometryManager::device_update_bvh(Device *device,
+                                        DeviceScene *dscene,
+                                        Scene *scene,
+                                        Progress &progress)
+{
+  /* Build the top-level scene BVH, then hand its packed arrays over to dscene. */
+  progress.set_status("Updating Scene BVH", "Building");
+
+  BVHParams bparams;
+  bparams.top_level = true;
+  bparams.bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
+                                                  device->get_bvh_layout_mask());
+  bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
+  bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
+                                scene->params.use_bvh_unaligned_nodes;
+  bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
+  bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
+  bparams.bvh_type = scene->params.bvh_type;
+  bparams.curve_flags = dscene->data.curve.curveflags;
+  bparams.curve_subdivisions = dscene->data.curve.subdivisions;
+
+  VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout.";
+  /* With the Embree layout, destroy a previously stored Embree scene before building anew. */
+#ifdef WITH_EMBREE
+  if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
+    if (dscene->data.bvh.scene) {
+      BVHEmbree::destroy(dscene->data.bvh.scene);
+    }
+  }
+#endif
+
+  BVH *bvh = BVH::create(bparams, scene->geometry, scene->objects);
+  bvh->build(progress, &device->stats);
+  /* On cancel, drop the BVH (and any stored Embree scene) without touching dscene arrays. */
+  if (progress.get_cancel()) {
+#ifdef WITH_EMBREE
+    if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
+      if (dscene->data.bvh.scene) {
+        BVHEmbree::destroy(dscene->data.bvh.scene);
+      }
+    }
+#endif
+    delete bvh;
+    return;
+  }
+
+  /* Hand each non-empty packed array over to dscene and upload it. */
+  progress.set_status("Updating Scene BVH", "Copying BVH to device");
+
+  PackedBVH &pack = bvh->pack;
+
+  if (pack.nodes.size()) {
+    dscene->bvh_nodes.steal_data(pack.nodes);
+    dscene->bvh_nodes.copy_to_device();
+  }
+  if (pack.leaf_nodes.size()) {
+    dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes);
+    dscene->bvh_leaf_nodes.copy_to_device();
+  }
+  if (pack.object_node.size()) {
+    dscene->object_node.steal_data(pack.object_node);
+    dscene->object_node.copy_to_device();
+  }
+  if (pack.prim_tri_index.size()) {
+    dscene->prim_tri_index.steal_data(pack.prim_tri_index);
+    dscene->prim_tri_index.copy_to_device();
+  }
+  if (pack.prim_tri_verts.size()) {
+    dscene->prim_tri_verts.steal_data(pack.prim_tri_verts);
+    dscene->prim_tri_verts.copy_to_device();
+  }
+  if (pack.prim_type.size()) {
+    dscene->prim_type.steal_data(pack.prim_type);
+    dscene->prim_type.copy_to_device();
+  }
+  if (pack.prim_visibility.size()) {
+    dscene->prim_visibility.steal_data(pack.prim_visibility);
+    dscene->prim_visibility.copy_to_device();
+  }
+  if (pack.prim_index.size()) {
+    dscene->prim_index.steal_data(pack.prim_index);
+    dscene->prim_index.copy_to_device();
+  }
+  if (pack.prim_object.size()) {
+    dscene->prim_object.steal_data(pack.prim_object);
+    dscene->prim_object.copy_to_device();
+  }
+  if (pack.prim_time.size()) {
+    dscene->prim_time.steal_data(pack.prim_time);
+    dscene->prim_time.copy_to_device();
+  }
+  /* Record root index, layout and whether BVH motion steps are used in dscene->data.bvh. */
+  dscene->data.bvh.root = pack.root_index;
+  dscene->data.bvh.bvh_layout = bparams.bvh_layout;
+  dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
+
+  bvh->copy_to_device(progress, dscene);
+
+  delete bvh; /* The BVH object itself is not kept after this function. */
+}
+
+void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Progress &progress)
+{
+  if (!need_update && !need_flags_update) {
+    return;
+  }
+  /* Refresh per-geometry shader flags and, on a full update, create volume meshes. */
+  progress.set_status("Updating Meshes Flags");
+
+  /* Update flags. */
+  bool volume_images_updated = false;
+
+  foreach (Geometry *geom, scene->geometry) {
+    geom->has_volume = false;
+    /* NOTE(review): unlike has_volume, has_surface_bssrdf is not reset before the scan. */
+    foreach (const Shader *shader, geom->used_shaders) {
+      if (shader->has_volume) {
+        geom->has_volume = true;
+      }
+      if (shader->has_surface_bssrdf) {
+        geom->has_surface_bssrdf = true;
+      }
+    }
+
+    if (need_update && geom->has_volume && geom->type == Geometry::MESH) {
+      /* Create volume meshes if there is voxel data. */
+      if (geom->has_voxel_attributes()) {
+        if (!volume_images_updated) { /* Update volume images once, on first use. */
+          progress.set_status("Updating Meshes Volume Bounds");
+          device_update_volume_images(device, scene, progress);
+          volume_images_updated = true;
+        }
+
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        create_volume_mesh(mesh, progress);
+      }
+    }
+  }
+
+  need_flags_update = false; /* need_update itself is left untouched here. */
+}
+
+void GeometryManager::device_update_displacement_images(Device *device,
+                                                        Scene *scene,
+                                                        Progress &progress)
+{
+  progress.set_status("Updating Displacement Images");
+  TaskPool pool;
+  ImageManager *image_manager = scene->image_manager;
+  set<int> bump_images; /* Unique image slots used by non-bump displacement shaders. */
+  foreach (Geometry *geom, scene->geometry) {
+    if (geom->need_update) {
+      foreach (Shader *shader, geom->used_shaders) {
+        if (!shader->has_displacement || shader->displacement_method == DISPLACE_BUMP) {
+          continue;
+        }
+        foreach (ShaderNode *node, shader->graph->nodes) {
+          if (node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
+            continue;
+          }
+          /* Collect the slot of every tile of this image node, skipping slots equal to -1. */
+          ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode *>(node);
+          for (int i = 0; i < image_node->handle.num_tiles(); i++) {
+            const int slot = image_node->handle.svm_slot(i);
+            if (slot != -1) {
+              bump_images.insert(slot);
+            }
+          }
+        }
+      }
+    }
+  }
+  foreach (int slot, bump_images) { /* Queue one ImageManager::device_update_slot() per slot. */
+    pool.push(function_bind(
+        &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
+  }
+  pool.wait_work();
+}
+
+void GeometryManager::device_update_volume_images(Device *device, Scene *scene, Progress &progress)
+{
+  progress.set_status("Updating Volume Images");
+  TaskPool pool;
+  ImageManager *image_manager = scene->image_manager;
+  set<int> volume_images;
+  /* Gather the unique image slots of voxel attributes on geometry tagged for update. */
+  foreach (Geometry *geom, scene->geometry) {
+    if (!geom->need_update) {
+      continue;
+    }
+
+    foreach (Attribute &attr, geom->attributes.attributes) {
+      if (attr.element != ATTR_ELEMENT_VOXEL) {
+        continue;
+      }
+
+      ImageHandle &handle = attr.data_voxel();
+      const int slot = handle.svm_slot();
+      if (slot != -1) {
+        volume_images.insert(slot);
+      }
+    }
+  }
+  /* Queue one ImageManager::device_update_slot() task per slot, then wait on the pool. */
+  foreach (int slot, volume_images) {
+    pool.push(function_bind(
+        &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
+  }
+  pool.wait_work();
+}
+
+void GeometryManager::device_update(Device *device,
+                                    DeviceScene *dscene,
+                                    Scene *scene,
+                                    Progress &progress)
+{
+  if (!need_update)
+    return;
+  /* Bring geometry device data up to date: normals, tessellation, displacement, BVHs, packing. */
+  VLOG(1) << "Total " << scene->geometry.size() << " meshes.";
+
+  bool true_displacement_used = false;
+  size_t total_tess_needed = 0;
+  /* Propagate shader-driven updates to geometry and prepare meshes tagged for update. */
+  foreach (Geometry *geom, scene->geometry) {
+    foreach (Shader *shader, geom->used_shaders) {
+      if (shader->need_update_geometry)
+        geom->need_update = true;
+    }
+
+    if (geom->need_update && geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+
+      /* Update normals. */
+      mesh->add_face_normals();
+      mesh->add_vertex_normals();
+
+      if (mesh->need_attribute(scene, ATTR_STD_POSITION_UNDISPLACED)) {
+        mesh->add_undisplaced();
+      }
+
+      /* Test if we need tessellation. */
+      if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE && mesh->num_subd_verts == 0 &&
+          mesh->subd_params) {
+        total_tess_needed++;
+      }
+
+      /* Test if we need displacement. */
+      if (mesh->has_true_displacement()) {
+        true_displacement_used = true;
+      }
+
+      if (progress.get_cancel())
+        return;
+    }
+  }
+
+  /* Tessellate meshes that are using subdivision */
+  if (total_tess_needed) {
+    Camera *dicing_camera = scene->dicing_camera;
+    dicing_camera->update(scene);
+
+    size_t i = 0;
+    foreach (Geometry *geom, scene->geometry) {
+      if (!(geom->need_update && geom->type == Geometry::MESH)) {
+        continue;
+      }
+
+      Mesh *mesh = static_cast<Mesh *>(geom);
+      if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE && mesh->num_subd_verts == 0 &&
+          mesh->subd_params) {
+        string msg = "Tessellating ";
+        if (mesh->name == "")
+          msg += string_printf("%u/%u", (uint)(i + 1), (uint)total_tess_needed);
+        else
+          msg += string_printf(
+              "%s %u/%u", mesh->name.c_str(), (uint)(i + 1), (uint)total_tess_needed);
+
+        progress.set_status("Updating Mesh", msg);
+
+        mesh->subd_params->camera = dicing_camera;
+        DiagSplit dsplit(*mesh->subd_params);
+        mesh->tessellate(&dsplit);
+
+        i++;
+
+        if (progress.get_cancel())
+          return;
+      }
+    }
+  }
+
+  /* Update images needed for true displacement. */
+  bool old_need_object_flags_update = false;
+  if (true_displacement_used) {
+    VLOG(1) << "Updating images used for true displacement.";
+    device_update_displacement_images(device, scene, progress);
+    old_need_object_flags_update = scene->object_manager->need_flags_update;
+    scene->object_manager->device_update_flags(device, dscene, scene, progress, false);
+  }
+
+  /* Free the previous device arrays and recompute the global offsets. */
+  device_free(device, dscene);
+
+  mesh_calc_offset(scene);
+  if (true_displacement_used) {
+    device_update_mesh(device, dscene, scene, true, progress); /* for_displacement = true */
+  }
+  if (progress.get_cancel())
+    return;
+
+  device_update_attributes(device, dscene, scene, progress);
+  if (progress.get_cancel())
+    return;
+
+  /* Update displacement. */
+  bool displacement_done = false;
+  size_t num_bvh = 0;
+  BVHLayout bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
+                                                    device->get_bvh_layout_mask());
+
+  foreach (Geometry *geom, scene->geometry) {
+    if (geom->need_update) {
+      if (geom->type == Geometry::MESH) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        if (displace(device, dscene, scene, mesh, progress)) {
+          displacement_done = true;
+        }
+      }
+
+      if (geom->need_build_bvh(bvh_layout)) {
+        num_bvh++;
+      }
+    }
+
+    if (progress.get_cancel())
+      return;
+  }
+
+  /* Device re-update after displacement. */
+  if (displacement_done) {
+    device_free(device, dscene);
+
+    device_update_attributes(device, dscene, scene, progress);
+    if (progress.get_cancel())
+      return;
+  }
+  /* Queue compute_bvh() for updated geometry; i counts those needing an own BVH, of num_bvh. */
+  TaskPool pool;
+
+  size_t i = 0;
+  foreach (Geometry *geom, scene->geometry) {
+    if (geom->need_update) {
+      pool.push(function_bind(
+          &Geometry::compute_bvh, geom, device, dscene, &scene->params, &progress, i, num_bvh));
+      if (geom->need_build_bvh(bvh_layout)) {
+        i++;
+      }
+    }
+  }
+
+  TaskPool::Summary summary;
+  pool.wait_work(&summary);
+  VLOG(2) << "Objects BVH build pool statistics:\n" << summary.full_report();
+  /* Shader-driven geometry updates have been consumed; clear the shader tags. */
+  foreach (Shader *shader, scene->shaders) {
+    shader->need_update_geometry = false;
+  }
+
+  Scene::MotionType need_motion = scene->need_motion();
+  bool motion_blur = need_motion == Scene::MOTION_BLUR;
+
+  /* Recompute the bounds of every object, passing along whether the scene
+   * needs motion blur. */
+  foreach (Object *object, scene->objects) {
+    object->compute_bounds(motion_blur);
+  }
+
+  if (progress.get_cancel())
+    return;
+
+  device_update_bvh(device, dscene, scene, progress);
+  if (progress.get_cancel())
+    return;
+
+  device_update_mesh(device, dscene, scene, false, progress);
+  if (progress.get_cancel())
+    return;
+
+  need_update = false;
+
+  if (true_displacement_used) {
+    /* Re-tag flags for update, so they're re-evaluated
+     * for meshes with correct bounding boxes.
+     *
+     * This wouldn't cause wrong results, just true
+     * displacement might be less optimal to calculate.
+     */
+    scene->object_manager->need_flags_update = old_need_object_flags_update;
+  }
+}
+
+void GeometryManager::device_free(Device *device, DeviceScene *dscene)
+{
+  dscene->bvh_nodes.free();
+  dscene->bvh_leaf_nodes.free();
+  dscene->object_node.free();
+  dscene->prim_tri_verts.free();
+  dscene->prim_tri_index.free();
+  dscene->prim_type.free();
+  dscene->prim_visibility.free();
+  dscene->prim_index.free();
+  dscene->prim_object.free();
+  dscene->prim_time.free();
+  dscene->tri_shader.free();
+  dscene->tri_vnormal.free();
+  dscene->tri_vindex.free();
+  dscene->tri_patch.free();
+  dscene->tri_patch_uv.free();
+  dscene->curves.free();
+  dscene->curve_keys.free();
+  dscene->patches.free();
+  dscene->attributes_map.free();
+  dscene->attributes_float.free();
+  dscene->attributes_float2.free();
+  dscene->attributes_float3.free();
+  dscene->attributes_uchar4.free();
+  /* The BVH, triangle, curve, patch and attribute arrays of dscene are freed above.
+   * Signal for shaders like displacement not to do ray tracing. */
+  dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE;
+
+#ifdef WITH_OSL
+  OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+  /* Clear the OSL object-name and attribute lookup tables when OSL memory is available. */
+  if (og) {
+    og->object_name_map.clear();
+    og->attribute_map.clear();
+    og->object_names.clear();
+  }
+#else
+  (void)device; /* Parameter is only used in OSL builds. */
+#endif
+}
+
+void GeometryManager::tag_update(Scene *scene)
+{
+  need_update = true; /* device_update() returns early unless this is set. */
+  scene->object_manager->need_update = true; /* The object manager is tagged as well. */
+}
+
+void GeometryManager::collect_statistics(const Scene *scene, RenderStats *stats)
+{
+  foreach (Geometry *geometry, scene->geometry) { /* One named size entry per geometry. */
+    stats->mesh.geometry.add_entry(
+        NamedSizeEntry(string(geometry->name.c_str()), geometry->get_total_size_in_bytes()));
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/geometry.h b/intern/cycles/render/geometry.h
new file mode 100644
index 00000000000..b0284304843
--- /dev/null
+++ b/intern/cycles/render/geometry.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __GEOMETRY_H__
+#define __GEOMETRY_H__
+
+#include "graph/node.h"
+
+#include "bvh/bvh_params.h"
+
+#include "render/attribute.h"
+
+#include "util/util_boundbox.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVH;
+class Device;
+class DeviceScene;
+class Mesh;
+class Progress;
+class RenderStats;
+class Scene;
+class SceneParams;
+class Shader;
+
+/* Geometry
+ *
+ * Base class for geometric types like Mesh and Hair. */
+
+class Geometry : public Node {
+ public:
+  NODE_ABSTRACT_DECLARE
+
+  enum Type {
+    MESH,
+    HAIR,
+  };
+
+  Type type; /* Concrete kind; checked before a static_cast to Mesh or Hair. */
+
+  /* Attributes */
+  AttributeSet attributes;
+
+  /* Shaders */
+  vector<Shader *> used_shaders;
+
+  /* Transform */
+  BoundBox bounds;
+  bool transform_applied;
+  bool transform_negative_scaled;
+  Transform transform_normal;
+
+  /* Motion Blur */
+  uint motion_steps;
+  bool use_motion_blur;
+
+  /* Maximum number of motion steps supported (due to Embree). */
+  static const uint MAX_MOTION_STEPS = 129;
+
+  /* BVH */
+  BVH *bvh;
+  size_t attr_map_offset;
+  size_t prim_offset;       /* Offset of this geometry's primitives in the global arrays. */
+  size_t optix_prim_offset; /* Running offset counting mesh triangles and hair segments. */
+
+  /* Shader Properties */
+  bool has_volume;         /* Set in GeometryManager::device_update_preprocess(). */
+  bool has_surface_bssrdf; /* Set in GeometryManager::device_update_preprocess(). */
+
+  /* Update Flags */
+  bool need_update; /* Geometry is processed by GeometryManager::device_update() when set. */
+  bool need_update_rebuild;
+
+  /* Constructor/Destructor */
+  explicit Geometry(const NodeType *node_type, const Type type);
+  virtual ~Geometry();
+
+  /* Geometry */
+  virtual void clear();
+  virtual void compute_bounds() = 0;
+  virtual void apply_transform(const Transform &tfm, const bool apply_to_motion) = 0;
+
+  /* Attribute Requests */
+  bool need_attribute(Scene *scene, AttributeStandard std);
+  bool need_attribute(Scene *scene, ustring name);
+
+  /* UDIM */
+  virtual void get_uv_tiles(ustring map, unordered_set<int> &tiles) = 0;
+
+  /* Convert between normalized -1..1 motion time and index in the
+   * VERTEX_MOTION attribute. */
+  float motion_time(int step) const;
+  int motion_step(float time) const;
+
+  /* BVH: invoked per updated geometry from GeometryManager::device_update() via a task pool. */
+  void compute_bvh(Device *device,
+                   DeviceScene *dscene,
+                   SceneParams *params,
+                   Progress *progress,
+                   int n,
+                   int total);
+
+  /* Check whether the geometry should have own BVH built separately. Briefly,
+   * own BVH is needed for geometry, if:
+   *
+   * - It is instanced multiple times, so each instance object should share the
+   *   same BVH tree.
+   * - Special ray intersection is needed, for example to limit subsurface rays
+   *   to only the geometry itself.
+   * - The BVH layout requires the top level to only contain instances.
+   */
+  bool need_build_bvh(BVHLayout layout) const;
+
+  /* Test if the geometry should be treated as instanced. */
+  bool is_instanced() const;
+
+  bool has_true_displacement() const;
+  bool has_motion_blur() const;
+  bool has_voxel_attributes() const;
+
+  /* Updates */
+  void tag_update(Scene *scene, bool rebuild);
+};
+
+/* Geometry Manager */
+
+class GeometryManager {
+ public:
+  /* Update Flags */
+  bool need_update;       /* Set by tag_update(); cleared at the end of device_update(). */
+  bool need_flags_update; /* Cleared by device_update_preprocess(). */
+
+  /* Constructor/Destructor */
+  GeometryManager();
+  ~GeometryManager();
+
+  /* Device Updates */
+  void device_update_preprocess(Device *device, Scene *scene, Progress &progress);
+  void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
+  void device_free(Device *device, DeviceScene *dscene);
+
+  /* Updates */
+  void tag_update(Scene *scene);
+
+  /* Statistics */
+  void collect_statistics(const Scene *scene, RenderStats *stats);
+
+ protected:
+  bool displace(Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress);
+  /* Called from device_update_preprocess() for meshes with volume shaders and voxel data. */
+  void create_volume_mesh(Mesh *mesh, Progress &progress);
+
+  /* Attributes */
+  void update_osl_attributes(Device *device,
+                             Scene *scene,
+                             vector<AttributeRequestSet> &geom_attributes);
+  void update_svm_attributes(Device *device,
+                             DeviceScene *dscene,
+                             Scene *scene,
+                             vector<AttributeRequestSet> &geom_attributes);
+
+  /* Compute verts/triangles/curves offsets in global arrays. */
+  void mesh_calc_offset(Scene *scene);
+
+  void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
+  /* Pack triangle, curve and patch arrays; for_displacement selects the pre-BVH layout. */
+  void device_update_mesh(Device *device,
+                          DeviceScene *dscene,
+                          Scene *scene,
+                          bool for_displacement,
+                          Progress &progress);
+
+  void device_update_attributes(Device *device,
+                                DeviceScene *dscene,
+                                Scene *scene,
+                                Progress &progress);
+  /* Build the top-level scene BVH and store its packed arrays in dscene. */
+  void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
+  /* Update image slots used by non-bump displacement shaders of updated geometry. */
+  void device_update_displacement_images(Device *device, Scene *scene, Progress &progress);
+  /* Update image slots referenced by voxel attributes of updated geometry. */
+  void device_update_volume_images(Device *device, Scene *scene, Progress &progress);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __GEOMETRY_H__ */
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 0e520c700a7..d2db59894ea 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "render/attribute.h"
 #include "render/graph.h"
+#include "render/attribute.h"
+#include "render/constant_fold.h"
 #include "render/nodes.h"
 #include "render/scene.h"
 #include "render/shader.h"
-#include "render/constant_fold.h"
 
 #include "util/util_algorithm.h"
 #include "util/util_foreach.h"
diff --git a/intern/cycles/render/hair.cpp b/intern/cycles/render/hair.cpp
new file mode 100644
index 00000000000..3daa4cc1e35
--- /dev/null
+++ b/intern/cycles/render/hair.cpp
@@ -0,0 +1,487 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/hair.h"
+#include "render/curves.h"
+#include "render/scene.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Hair Curve */
+
+void Hair::Curve::bounds_grow(const int k,
+                              const float3 *curve_keys,
+                              const float *curve_radius,
+                              BoundBox &bounds) const
+{
+  float3 P[4];
+  /* Keys k-1 .. k+2 of this curve, with the outer two clamped to the curve's first/last key. */
+  P[0] = curve_keys[max(first_key + k - 1, first_key)];
+  P[1] = curve_keys[first_key + k];
+  P[2] = curve_keys[first_key + k + 1];
+  P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
+
+  float3 lower;
+  float3 upper;
+  /* Fill lower/upper one axis at a time (x, y, z) via curvebounds(). */
+  curvebounds(&lower.x, &upper.x, P, 0);
+  curvebounds(&lower.y, &upper.y, P, 1);
+  curvebounds(&lower.z, &upper.z, P, 2);
+  /* Grow the bounds by both corners, passing the larger radius of keys k and k+1. */
+  float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
+
+  bounds.grow(lower, mr);
+  bounds.grow(upper, mr);
+}
+
+void Hair::Curve::bounds_grow(const int k,
+                              const float3 *curve_keys,
+                              const float *curve_radius,
+                              const Transform &aligned_space,
+                              BoundBox &bounds) const
+{
+  float3 P[4];
+  /* Keys k-1 .. k+2 of this curve, with the outer two clamped to the curve's first/last key. */
+  P[0] = curve_keys[max(first_key + k - 1, first_key)];
+  P[1] = curve_keys[first_key + k];
+  P[2] = curve_keys[first_key + k + 1];
+  P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
+  /* Same as the overload without a transform, except keys are first moved by aligned_space. */
+  P[0] = transform_point(&aligned_space, P[0]);
+  P[1] = transform_point(&aligned_space, P[1]);
+  P[2] = transform_point(&aligned_space, P[2]);
+  P[3] = transform_point(&aligned_space, P[3]);
+
+  float3 lower;
+  float3 upper;
+  /* Fill lower/upper one axis at a time (x, y, z) via curvebounds(). */
+  curvebounds(&lower.x, &upper.x, P, 0);
+  curvebounds(&lower.y, &upper.y, P, 1);
+  curvebounds(&lower.z, &upper.z, P, 2);
+  /* Grow the bounds by both corners, passing the larger radius of keys k and k+1. */
+  float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
+
+  bounds.grow(lower, mr);
+  bounds.grow(upper, mr);
+}
+
+void Hair::Curve::bounds_grow(float4 keys[4], BoundBox &bounds) const
+{
+  float3 P[4] = {
+      float4_to_float3(keys[0]),
+      float4_to_float3(keys[1]),
+      float4_to_float3(keys[2]),
+      float4_to_float3(keys[3]),
+  };
+
+  float3 lower;
+  float3 upper;
+  /* Fill lower/upper one axis at a time (x, y, z) via curvebounds(). */
+  curvebounds(&lower.x, &upper.x, P, 0);
+  curvebounds(&lower.y, &upper.y, P, 1);
+  curvebounds(&lower.z, &upper.z, P, 2);
+  /* The w component of the two middle keys is used as their radius. */
+  float mr = max(keys[1].w, keys[2].w);
+
+  bounds.grow(lower, mr);
+  bounds.grow(upper, mr);
+}
+
+void Hair::Curve::motion_keys(const float3 *curve_keys,
+                              const float *curve_radius,
+                              const float3 *key_steps,
+                              size_t num_curve_keys,
+                              size_t num_steps,
+                              float time,
+                              size_t k0,
+                              size_t k1,
+                              float4 r_keys[2]) const
+{
+  /* Figure out which steps we need to fetch and their interpolation factor. */
+  const size_t max_step = num_steps - 1; /* NOTE(review): max_step - 1 wraps if num_steps < 2. */
+  const size_t step = min((int)(time * max_step), max_step - 1);
+  const float t = time * max_step - step;
+  /* Fetch the two keys at the step before and the step after the requested time. */
+  float4 curr_keys[2];
+  float4 next_keys[2];
+  keys_for_step(
+      curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step, k0, k1, curr_keys);
+  keys_for_step(
+      curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step + 1, k0, k1, next_keys);
+  /* Linearly interpolate between the two steps, all four components at once. */
+  r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
+  r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
+}
+
+void Hair::Curve::cardinal_motion_keys(const float3 *curve_keys,
+                                       const float *curve_radius,
+                                       const float3 *key_steps,
+                                       size_t num_curve_keys,
+                                       size_t num_steps,
+                                       float time,
+                                       size_t k0,
+                                       size_t k1,
+                                       size_t k2,
+                                       size_t k3,
+                                       float4 r_keys[4]) const
+{
+  /* Figure out which steps we need to fetch and their interpolation factor. */
+  const size_t max_step = num_steps - 1; /* NOTE(review): max_step - 1 wraps if num_steps < 2. */
+  const size_t step = min((int)(time * max_step), max_step - 1);
+  const float t = time * max_step - step;
+  /* Fetch the four keys at the step before and the step after the requested time. */
+  float4 curr_keys[4];
+  float4 next_keys[4];
+  cardinal_keys_for_step(curve_keys,
+                         curve_radius,
+                         key_steps,
+                         num_curve_keys,
+                         num_steps,
+                         step,
+                         k0,
+                         k1,
+                         k2,
+                         k3,
+                         curr_keys);
+  cardinal_keys_for_step(curve_keys,
+                         curve_radius,
+                         key_steps,
+                         num_curve_keys,
+                         num_steps,
+                         step + 1,
+                         k0,
+                         k1,
+                         k2,
+                         k3,
+                         next_keys);
+  /* Linearly interpolate between the two steps, all four components at once. */
+  r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
+  r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
+  r_keys[2] = (1.0f - t) * curr_keys[2] + t * next_keys[2];
+  r_keys[3] = (1.0f - t) * curr_keys[3] + t * next_keys[3];
+}
+
+void Hair::Curve::keys_for_step(const float3 *curve_keys,
+                                const float *curve_radius,
+                                const float3 *key_steps,
+                                size_t num_curve_keys,
+                                size_t num_steps,
+                                size_t step,
+                                size_t k0,
+                                size_t k1,
+                                float4 r_keys[2]) const
+{
+  k0 = max(k0, 0); /* NOTE(review): k0 is an unsigned size_t, so this clamp cannot change it. */
+  k1 = min(k1, num_keys - 1);
+  const size_t center_step = ((num_steps - 1) / 2);
+  if (step == center_step) {
+    /* Center step: regular key location. */
+    /* TODO(sergey): Consider adding make_float4(float3, float)
+     * function.
+     */
+    r_keys[0] = make_float4(curve_keys[first_key + k0].x,
+                            curve_keys[first_key + k0].y,
+                            curve_keys[first_key + k0].z,
+                            curve_radius[first_key + k0]);
+    r_keys[1] = make_float4(curve_keys[first_key + k1].x,
+                            curve_keys[first_key + k1].y,
+                            curve_keys[first_key + k1].z,
+                            curve_radius[first_key + k1]);
+  }
+  else {
+    /* key_steps omits the center step, so later steps shift down by one; radius is per key. */
+    if (step > center_step) {
+      step--;
+    }
+    const size_t offset = first_key + step * num_curve_keys;
+    r_keys[0] = make_float4(key_steps[offset + k0].x,
+                            key_steps[offset + k0].y,
+                            key_steps[offset + k0].z,
+                            curve_radius[first_key + k0]);
+    r_keys[1] = make_float4(key_steps[offset + k1].x,
+                            key_steps[offset + k1].y,
+                            key_steps[offset + k1].z,
+                            curve_radius[first_key + k1]);
+  }
+}
+
+void Hair::Curve::cardinal_keys_for_step(const float3 *curve_keys,
+                                         const float *curve_radius,
+                                         const float3 *key_steps,
+                                         size_t num_curve_keys,
+                                         size_t num_steps,
+                                         size_t step,
+                                         size_t k0,
+                                         size_t k1,
+                                         size_t k2,
+                                         size_t k3,
+                                         float4 r_keys[4]) const
+{
+  /* Fetch four keys (k0..k3) of this curve for one motion step, as float4 with the
+   * position in xyz and the radius in w.
+   *
+   * Four-key counterpart of keys_for_step().
+   *
+   * curve_keys      Center-step key positions, indexed through first_key.
+   * curve_radius    Key radii, indexed like curve_keys and used for every step.
+   * key_steps       Key positions of all steps other than the center one, stored
+   *                 as consecutive blocks of num_curve_keys entries.
+   * num_curve_keys  Size of one such block.
+   * num_steps       Total number of motion steps, center step included.
+   * step            Motion step to fetch.
+   * k0..k3          Key indices relative to first_key.
+   * r_keys          Output, one float4 per requested key.
+   *
+   * Only the outer indices are clamped: k3 against the last key of the curve
+   * (num_keys - 1), while k1 and k2 are used exactly as given.
+   * NOTE(review): k0 is unsigned, so clamping it against zero presumably never
+   * changes it - confirm which max() overload is selected.
+   */
+  k0 = max(k0, 0);
+  k3 = min(k3, num_keys - 1);
+
+  /* Select the position array this step reads from. */
+  const size_t center_step = ((num_steps - 1) / 2);
+  const float3 *positions;
+  if (step == center_step) {
+    /* Center step: regular key location. */
+    positions = curve_keys + first_key;
+  }
+  else {
+    /* Center step is not stored in key_steps, so later steps sit one block earlier. */
+    const size_t block = (step > center_step) ? step - 1 : step;
+    positions = key_steps + (first_key + block * num_curve_keys);
+  }
+
+  /* Positions come from the selected array, radii always from curve_radius. */
+  const float3 &p0 = positions[k0];
+  const float3 &p1 = positions[k1];
+  const float3 &p2 = positions[k2];
+  const float3 &p3 = positions[k3];
+  r_keys[0] = make_float4(p0.x, p0.y, p0.z, curve_radius[first_key + k0]);
+  r_keys[1] = make_float4(p1.x, p1.y, p1.z, curve_radius[first_key + k1]);
+  r_keys[2] = make_float4(p2.x, p2.y, p2.z, curve_radius[first_key + k2]);
+  r_keys[3] = make_float4(p3.x, p3.y, p3.z, curve_radius[first_key + k3]);
+}
+
+/* Hair */
+
+NODE_DEFINE(Hair)
+{
+  NodeType *type = NodeType::add("hair", create, NodeType::NONE, Geometry::node_base_type);
+  /* Expose the curve key, radius, first-key and shader arrays as node sockets. */
+  SOCKET_POINT_ARRAY(curve_keys, "Curve Keys", array<float3>());
+  SOCKET_FLOAT_ARRAY(curve_radius, "Curve Radius", array<float>());
+  SOCKET_INT_ARRAY(curve_first_key, "Curve First Key", array<int>());
+  SOCKET_INT_ARRAY(curve_shader, "Curve Shader", array<int>());
+
+  return type;
+}
+
+Hair::Hair() : Geometry(node_type, Geometry::HAIR), curvekey_offset(0)
+{
+  /* Remaining members are default-constructed. */
+}
+
+Hair::~Hair()
+{ /* No explicit cleanup is performed here. */
+}
+
+void Hair::resize_curves(int numcurves, int numkeys)
+{
+  curve_keys.resize(numkeys);
+  curve_radius.resize(numkeys);
+  curve_first_key.resize(numcurves);
+  curve_shader.resize(numcurves);
+  /* Per-key arrays now hold numkeys entries, per-curve arrays numcurves; resize attributes too. */
+  attributes.resize();
+}
+
+void Hair::reserve_curves(int numcurves, int numkeys)
+{
+  curve_keys.reserve(numkeys);
+  curve_radius.reserve(numkeys);
+  curve_first_key.reserve(numcurves);
+  curve_shader.reserve(numcurves);
+  /* NOTE(review): resize(true) presumably only reserves attribute storage - confirm. */
+  attributes.resize(true);
+}
+
+void Hair::clear()
+{
+  Geometry::clear();
+  /* After the base class clear, drop all curve arrays and the attributes. */
+  curve_keys.clear();
+  curve_radius.clear();
+  curve_first_key.clear();
+  curve_shader.clear();
+
+  attributes.clear();
+}
+
+void Hair::add_curve_key(float3 co, float radius)
+{
+  curve_keys.push_back_reserved(co); /* presumably needs reserve_curves() first - confirm */
+  curve_radius.push_back_reserved(radius); /* Radius is stored per key, next to the position. */
+}
+
+void Hair::add_curve(int first_key, int shader)
+{
+  curve_first_key.push_back_reserved(first_key); /* Index of the curve's first key. */
+  curve_shader.push_back_reserved(shader); /* Shader index, one per curve. */
+}
+
+void Hair::copy_center_to_motion_step(const int motion_step)
+{
+  /* Copy the center-step key positions into block `motion_step` of the motion attribute. */
+  Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  const size_t num = curve_keys.size();
+  if (attr_mP && num > 0) { /* Never address element 0 of an empty key array. */
+    memcpy(attr_mP->data_float3() + motion_step * num, curve_keys.data(), sizeof(float3) * num);
+  }
+}
+
+void Hair::get_uv_tiles(ustring map, unordered_set<int> &tiles)
+{
+  /* Forward to get_uv_tiles() of the attribute selected by `map`.
+   *
+   * An empty name selects the standard UV attribute (ATTR_STD_UV) instead of a
+   * lookup by name. When no matching attribute exists, nothing is called and
+   * `tiles` is not touched by this function. */
+  Attribute *uv_attr = map.empty() ? attributes.find(ATTR_STD_UV) : attributes.find(map);
+
+  if (uv_attr == NULL) {
+    return;
+  }
+
+  uv_attr->get_uv_tiles(this, ATTR_PRIM_GEOMETRY, tiles);
+}
+
+void Hair::compute_bounds()
+{
+  /* Recompute `bounds` from the curve keys (grown with their radius) and, with motion
+   * blur enabled, from the keys of the motion position attribute as well. */
+  BoundBox bnds = BoundBox::empty;
+  const size_t key_count = curve_keys.size();
+
+  if (key_count > 0) {
+    Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+    const bool has_motion = use_motion_blur && attr_mP;
+
+    /* The attribute holds (motion_steps - 1) blocks of keys; no radius is applied to them. */
+    const size_t motion_count = has_motion ? key_count * (motion_steps - 1) : 0;
+    const float3 *motion_keys = has_motion ? attr_mP->data_float3() : NULL;
+
+    /* First pass: grow with every key as-is. */
+    for (size_t i = 0; i < key_count; i++) {
+      bnds.grow(curve_keys[i], curve_radius[i]);
+    }
+    for (size_t i = 0; i < motion_count; i++) {
+      bnds.grow(motion_keys[i]);
+    }
+
+    if (!bnds.valid()) {
+      /* Start over, this time skipping nan or inf coordinates. */
+      bnds = BoundBox::empty;
+      for (size_t i = 0; i < key_count; i++) {
+        bnds.grow_safe(curve_keys[i], curve_radius[i]);
+      }
+      for (size_t i = 0; i < motion_count; i++) {
+        bnds.grow_safe(motion_keys[i]);
+      }
+    }
+  }
+
+  if (!bnds.valid()) {
+    /* No usable keys at all: grow the box with the origin instead. */
+    bnds.grow(make_float3(0.0f, 0.0f, 0.0f));
+  }
+
+  bounds = bnds;
+}
+
+void Hair::apply_transform(const Transform &tfm, const bool apply_to_motion)
+{
+  /* Transform all curve keys by `tfm`, optionally including the motion keys. Radii are scaled
+   * by the cube root of |det| of columns 0..2, which is only correct for uniform scale. */
+  const float3 col0 = transform_get_column(&tfm, 0);
+  const float3 col1 = transform_get_column(&tfm, 1);
+  const float3 col2 = transform_get_column(&tfm, 2);
+  const float radius_scale = powf(fabsf(dot(cross(col0, col1), col2)), 1.0f / 3.0f);
+
+  /* Center step keys. */
+  const size_t key_count = curve_keys.size();
+  for (size_t i = 0; i < key_count; i++) {
+    curve_keys[i] = transform_point(&tfm, curve_keys[i]);
+    curve_radius[i] = curve_radius[i] * radius_scale;
+  }
+
+  if (!apply_to_motion) {
+    return;
+  }
+
+  Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  if (attr_mP == NULL) {
+    return;
+  }
+
+  /* Motion keys, read as float4 with the radius in w.
+   * NOTE(review): other functions in this file read this attribute through
+   * data_float3(); confirm which element type is intended. */
+  const size_t motion_count = key_count * (motion_steps - 1);
+  float4 *motion_keys = attr_mP->data_float4();
+  for (size_t i = 0; i < motion_count; i++) {
+    const float3 co = transform_point(&tfm, float4_to_float3(motion_keys[i]));
+    const float radius = motion_keys[i].w * radius_scale;
+    motion_keys[i] = float3_to_float4(co);
+    motion_keys[i].w = radius;
+  }
+}
+
+void Hair::pack_curves(Scene *scene,
+                       float4 *curve_key_co,
+                       float4 *curve_data,
+                       size_t curvekey_offset)
+{
+  /* Write this hair's keys and per-curve records into the caller's float4 arrays. */
+  /* Keys: position in xyz, radius in w. */
+  const size_t key_count = curve_keys.size();
+  for (size_t k = 0; k < key_count; k++) {
+    const float3 &co = curve_keys[k];
+    curve_key_co[k] = make_float4(co.x, co.y, co.z, curve_radius[k]);
+  }
+
+  /* Curves: first key (shifted by the curvekey_offset parameter, which shadows the member of
+   * the same name), key count and shader id, each passed through __int_as_float(). */
+  const size_t curve_count = num_curves();
+  for (size_t c = 0; c < curve_count; c++) {
+    const Curve curve = get_curve(c);
+    /* Shader indices outside used_shaders fall back to the default surface shader. */
+    const int local_id = curve_shader[c];
+    Shader *shader = scene->default_surface;
+    if (local_id < used_shaders.size()) {
+      shader = used_shaders[local_id];
+    }
+    const int shader_id = scene->shader_manager->get_shader_id(shader, false);
+
+    curve_data[c] = make_float4(__int_as_float(curve.first_key + curvekey_offset),
+                                __int_as_float(curve.num_keys),
+                                __int_as_float(shader_id),
+                                0.0f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/hair.h b/intern/cycles/render/hair.h
new file mode 100644
index 00000000000..79f77a78753
--- /dev/null
+++ b/intern/cycles/render/hair.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __HAIR_H__
+#define __HAIR_H__
+
+#include "render/geometry.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Hair : public Geometry {
+ public:
+  NODE_DECLARE
+
+  /* Hair Curve: key range of a single curve, plus helpers that read its keys. */
+  struct Curve {
+    int first_key; /* Index of the curve's first key in the key arrays. */
+    int num_keys;  /* Number of keys that belong to the curve. */
+
+    int num_segments() const
+    {
+      return num_keys - 1;
+    }
+
+    void bounds_grow(const int k,
+                     const float3 *curve_keys,
+                     const float *curve_radius,
+                     BoundBox &bounds) const;
+    void bounds_grow(float4 keys[4], BoundBox &bounds) const;
+    void bounds_grow(const int k,
+                     const float3 *curve_keys,
+                     const float *curve_radius,
+                     const Transform &aligned_space,
+                     BoundBox &bounds) const;
+
+    void motion_keys(const float3 *curve_keys,
+                     const float *curve_radius,
+                     const float3 *key_steps,
+                     size_t num_curve_keys,
+                     size_t num_steps,
+                     float time,
+                     size_t k0,
+                     size_t k1,
+                     float4 r_keys[2]) const;
+    void cardinal_motion_keys(const float3 *curve_keys,
+                              const float *curve_radius,
+                              const float3 *key_steps,
+                              size_t num_curve_keys,
+                              size_t num_steps,
+                              float time,
+                              size_t k0,
+                              size_t k1,
+                              size_t k2,
+                              size_t k3,
+                              float4 r_keys[4]) const;
+    /* Keys of a single motion step: the center step reads curve_keys, others key_steps. */
+    void keys_for_step(const float3 *curve_keys,
+                       const float *curve_radius,
+                       const float3 *key_steps,
+                       size_t num_curve_keys,
+                       size_t num_steps,
+                       size_t step,
+                       size_t k0,
+                       size_t k1,
+                       float4 r_keys[2]) const;
+    void cardinal_keys_for_step(const float3 *curve_keys,
+                                const float *curve_radius,
+                                const float3 *key_steps,
+                                size_t num_curve_keys,
+                                size_t num_steps,
+                                size_t step,
+                                size_t k0,
+                                size_t k1,
+                                size_t k2,
+                                size_t k3,
+                                float4 r_keys[4]) const;
+  };
+
+  array<float3> curve_keys;   /* Key positions of all curves, one curve after another. */
+  array<float> curve_radius;  /* One radius per key. */
+  array<int> curve_first_key; /* Per curve: index of its first key. */
+  array<int> curve_shader;    /* Per curve: index into used_shaders. */
+
+  /* BVH */
+  size_t curvekey_offset;
+
+  /* Constructor/Destructor */
+  Hair();
+  ~Hair();
+
+  /* Geometry */
+  void clear() override;
+
+  void resize_curves(int numcurves, int numkeys);
+  void reserve_curves(int numcurves, int numkeys);
+  void add_curve_key(float3 loc, float radius);
+  void add_curve(int first_key, int shader);
+
+  void copy_center_to_motion_step(const int motion_step);
+
+  void compute_bounds() override;
+  void apply_transform(const Transform &tfm, const bool apply_to_motion) override;
+
+  /* Curves */
+  Curve get_curve(size_t i) const
+  {
+    int first = curve_first_key[i];
+    int next_first = (i + 1 < curve_first_key.size()) ? curve_first_key[i + 1] : curve_keys.size();
+    /* A curve ends where the next one starts; the last one ends with curve_keys. */
+    Curve curve = {first, next_first - first};
+    return curve;
+  }
+
+  size_t num_keys() const
+  {
+    return curve_keys.size();
+  }
+
+  size_t num_curves() const
+  {
+    return curve_first_key.size();
+  }
+
+  size_t num_segments() const
+  {
+    return curve_keys.size() - curve_first_key.size(); /* keys - 1, per curve */
+  }
+
+  /* UDIM */
+  void get_uv_tiles(ustring map, unordered_set<int> &tiles) override;
+
+  /* BVH */
+  void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __HAIR_H__ */
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 212a867f9cd..67ed1176171 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -17,10 +17,12 @@
 #include "render/image.h"
 #include "device/device.h"
 #include "render/colorspace.h"
+#include "render/image_oiio.h"
 #include "render/scene.h"
 #include "render/stats.h"
 
 #include "util/util_foreach.h"
+#include "util/util_image.h"
 #include "util/util_image_impl.h"
 #include "util/util_logging.h"
 #include "util/util_path.h"
@@ -50,21 +52,6 @@ bool isfinite(uint16_t /*value*/)
   return true;
 }
 
-/* The lower three bits of a device texture slot number indicate its type.
- * These functions convert the slot ids from ImageManager "images" ones
- * to device ones and vice verse.
- */
-int type_index_to_flattened_slot(int slot, ImageDataType type)
-{
-  return (slot << IMAGE_DATA_TYPE_SHIFT) | (type);
-}
-
-int flattened_slot_to_type_index(int flat_slot, ImageDataType *type)
-{
-  *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK);
-  return flat_slot >> IMAGE_DATA_TYPE_SHIFT;
-}
-
 const char *name_from_type(ImageDataType type)
 {
   switch (type) {
@@ -94,342 +81,352 @@ const char *name_from_type(ImageDataType type)
 
 }  // namespace
 
-ImageManager::ImageManager(const DeviceInfo &info)
+/* Image Handle */
+
+ImageHandle::ImageHandle() : manager(NULL)
 {
-  need_update = true;
-  osl_texture_system = NULL;
-  animation_frame = 0;
+}
 
-  /* Set image limits */
-  max_num_images = TEX_NUM_MAX;
-  has_half_images = info.has_half_images;
+ImageHandle::ImageHandle(const ImageHandle &other)
+    : tile_slots(other.tile_slots), manager(other.manager)
+{
+  /* Increase image user count. */
+  foreach (const int slot, tile_slots) {
+    manager->add_image_user(slot);
+  }
+}
+
+ImageHandle &ImageHandle::operator=(const ImageHandle &other)
+{
+  clear();
+  manager = other.manager;
+  tile_slots = other.tile_slots;
 
-  for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    tex_num_images[type] = 0;
+  foreach (const int slot, tile_slots) {
+    manager->add_image_user(slot);
   }
+
+  return *this;
 }
 
-ImageManager::~ImageManager()
+ImageHandle::~ImageHandle()
+{
+  clear();
+}
+
+void ImageHandle::clear()
 {
-  for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (size_t slot = 0; slot < images[type].size(); slot++)
-      assert(!images[type][slot]);
+  foreach (const int slot, tile_slots) {
+    manager->remove_image_user(slot);
   }
+
+  tile_slots.clear();
+  manager = NULL;
 }
 
-void ImageManager::set_osl_texture_system(void *texture_system)
+bool ImageHandle::empty()
 {
-  osl_texture_system = texture_system;
+  return tile_slots.empty();
 }
 
-bool ImageManager::set_animation_frame_update(int frame)
+int ImageHandle::num_tiles()
 {
-  if (frame != animation_frame) {
-    animation_frame = frame;
+  return tile_slots.size();
+}
 
-    for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-      for (size_t slot = 0; slot < images[type].size(); slot++) {
-        if (images[type][slot] && images[type][slot]->animated)
-          return true;
-      }
-    }
+ImageMetaData ImageHandle::metadata()
+{
+  if (tile_slots.empty()) {
+    return ImageMetaData();
   }
 
-  return false;
+  ImageManager::Image *img = manager->images[tile_slots.front()];
+  manager->load_image_metadata(img);
+  return img->metadata;
 }
 
-device_memory *ImageManager::image_memory(int flat_slot)
+int ImageHandle::svm_slot(const int tile_index) const
 {
-  ImageDataType type;
-  int slot = flattened_slot_to_type_index(flat_slot, &type);
+  if (tile_index >= tile_slots.size()) {
+    return -1;
+  }
 
-  Image *img = images[type][slot];
+  if (manager->osl_texture_system) {
+    ImageManager::Image *img = manager->images[tile_slots[tile_index]];
+    if (!img->loader->osl_filepath().empty()) {
+      return -1;
+    }
+  }
 
-  return img->mem;
+  return tile_slots[tile_index];
 }
 
-bool ImageManager::get_image_metadata(int flat_slot, ImageMetaData &metadata)
+device_texture *ImageHandle::image_memory(const int tile_index) const
 {
-  if (flat_slot == -1) {
-    return false;
+  if (tile_index >= tile_slots.size()) {
+    return NULL;
   }
 
-  ImageDataType type;
-  int slot = flattened_slot_to_type_index(flat_slot, &type);
+  ImageManager::Image *img = manager->images[tile_slots[tile_index]];
+  return img ? img->mem : NULL;
+}
 
-  Image *img = images[type][slot];
-  if (img) {
-    metadata = img->metadata;
-    return true;
-  }
+bool ImageHandle::operator==(const ImageHandle &other) const
+{
+  return manager == other.manager && tile_slots == other.tile_slots;
+}
 
-  return false;
+/* Image MetaData */
+
+ImageMetaData::ImageMetaData()
+    : channels(0),
+      width(0),
+      height(0),
+      depth(0),
+      type(IMAGE_DATA_NUM_TYPES),
+      colorspace(u_colorspace_raw),
+      colorspace_file_format(""),
+      use_transform_3d(false),
+      compress_as_srgb(false)
+{
+}
+
+bool ImageMetaData::operator==(const ImageMetaData &other) const
+{
+  return channels == other.channels && width == other.width && height == other.height &&
+         depth == other.depth && use_transform_3d == other.use_transform_3d &&
+         (!use_transform_3d || transform_3d == other.transform_3d) && type == other.type &&
+         colorspace == other.colorspace && compress_as_srgb == other.compress_as_srgb;
+}
+
+bool ImageMetaData::is_float() const
+{
+  return (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4 ||
+          type == IMAGE_DATA_TYPE_HALF || type == IMAGE_DATA_TYPE_HALF4);
 }
 
-void ImageManager::metadata_detect_colorspace(ImageMetaData &metadata, const char *file_format)
+void ImageMetaData::detect_colorspace()
 {
   /* Convert used specified color spaces to one we know how to handle. */
-  metadata.colorspace = ColorSpaceManager::detect_known_colorspace(
-      metadata.colorspace, file_format, metadata.is_float || metadata.is_half);
+  colorspace = ColorSpaceManager::detect_known_colorspace(
+      colorspace, colorspace_file_format, is_float());
 
-  if (metadata.colorspace == u_colorspace_raw) {
+  if (colorspace == u_colorspace_raw) {
     /* Nothing to do. */
   }
-  else if (metadata.colorspace == u_colorspace_srgb) {
+  else if (colorspace == u_colorspace_srgb) {
     /* Keep sRGB colorspace stored as sRGB, to save memory and/or loading time
      * for the common case of 8bit sRGB images like PNG. */
-    metadata.compress_as_srgb = true;
+    compress_as_srgb = true;
   }
   else {
     /* Always compress non-raw 8bit images as scene linear + sRGB, as a
      * heuristic to keep memory usage the same without too much data loss
      * due to quantization in common cases. */
-    metadata.compress_as_srgb = (metadata.type == IMAGE_DATA_TYPE_BYTE ||
-                                 metadata.type == IMAGE_DATA_TYPE_BYTE4);
+    compress_as_srgb = (type == IMAGE_DATA_TYPE_BYTE || type == IMAGE_DATA_TYPE_BYTE4);
 
     /* If colorspace conversion needed, use half instead of short so we can
      * represent HDR values that might result from conversion. */
-    if (metadata.type == IMAGE_DATA_TYPE_USHORT) {
-      metadata.type = IMAGE_DATA_TYPE_HALF;
+    if (type == IMAGE_DATA_TYPE_USHORT) {
+      type = IMAGE_DATA_TYPE_HALF;
     }
-    else if (metadata.type == IMAGE_DATA_TYPE_USHORT4) {
-      metadata.type = IMAGE_DATA_TYPE_HALF4;
+    else if (type == IMAGE_DATA_TYPE_USHORT4) {
+      type = IMAGE_DATA_TYPE_HALF4;
     }
   }
 }
 
-bool ImageManager::get_image_metadata(const string &filename,
-                                      void *builtin_data,
-                                      ustring colorspace,
-                                      ImageMetaData &metadata)
-{
-  metadata = ImageMetaData();
-  metadata.colorspace = colorspace;
-
-  if (builtin_data) {
-    if (builtin_image_info_cb) {
-      builtin_image_info_cb(filename, builtin_data, metadata);
-    }
-    else {
-      return false;
-    }
+/* Image Loader */
 
-    if (metadata.is_float) {
-      metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
-    }
-    else {
-      metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
-    }
+ImageLoader::ImageLoader()
+{
+}
 
-    metadata_detect_colorspace(metadata, "");
+ustring ImageLoader::osl_filepath() const
+{
+  return ustring();
+}
 
+bool ImageLoader::equals(const ImageLoader *a, const ImageLoader *b)
+{
+  if (a == NULL && b == NULL) {
     return true;
   }
-
-  /* Perform preliminary checks, with meaningful logging. */
-  if (!path_exists(filename)) {
-    VLOG(1) << "File '" << filename << "' does not exist.";
-    return false;
-  }
-  if (path_is_directory(filename)) {
-    VLOG(1) << "File '" << filename << "' is a directory, can't use as image.";
-    return false;
+  else {
+    return (a && b && typeid(*a) == typeid(*b) && a->equals(*b));
   }
+}
 
-  unique_ptr<ImageInput> in(ImageInput::create(filename));
+/* Image Manager */
 
-  if (!in) {
-    return false;
-  }
+ImageManager::ImageManager(const DeviceInfo &info)
+{
+  need_update = true;
+  osl_texture_system = NULL;
+  animation_frame = 0;
 
-  ImageSpec spec;
-  if (!in->open(filename, spec)) {
-    return false;
-  }
+  /* Set image limits */
+  has_half_images = info.has_half_images;
+}
 
-  metadata.width = spec.width;
-  metadata.height = spec.height;
-  metadata.depth = spec.depth;
-  metadata.compress_as_srgb = false;
+ImageManager::~ImageManager()
+{
+  for (size_t slot = 0; slot < images.size(); slot++)
+    assert(!images[slot]);
+}
 
-  /* Check the main format, and channel formats. */
-  size_t channel_size = spec.format.basesize();
+void ImageManager::set_osl_texture_system(void *texture_system)
+{
+  osl_texture_system = texture_system;
+}
 
-  if (spec.format.is_floating_point()) {
-    metadata.is_float = true;
-  }
+bool ImageManager::set_animation_frame_update(int frame)
+{
+  if (frame != animation_frame) {
+    animation_frame = frame;
 
-  for (size_t channel = 0; channel < spec.channelformats.size(); channel++) {
-    channel_size = max(channel_size, spec.channelformats[channel].basesize());
-    if (spec.channelformats[channel].is_floating_point()) {
-      metadata.is_float = true;
+    for (size_t slot = 0; slot < images.size(); slot++) {
+      if (images[slot] && images[slot]->params.animated)
+        return true;
     }
   }
 
-  /* check if it's half float */
-  if (spec.format == TypeDesc::HALF) {
-    metadata.is_half = true;
-  }
-
-  /* set type and channels */
-  metadata.channels = spec.nchannels;
+  return false;
+}
 
-  if (metadata.is_half) {
-    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF;
+void ImageManager::load_image_metadata(Image *img)
+{
+  if (!img->need_metadata) {
+    return;
   }
-  else if (metadata.is_float) {
-    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
+
+  thread_scoped_lock image_lock(img->mutex);
+  if (!img->need_metadata) {
+    return;
   }
-  else if (spec.format == TypeDesc::USHORT) {
-    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_USHORT4 : IMAGE_DATA_TYPE_USHORT;
+
+  ImageMetaData &metadata = img->metadata;
+  metadata = ImageMetaData();
+  metadata.colorspace = img->params.colorspace;
+
+  if (img->loader->load_metadata(metadata)) {
+    assert(metadata.type != IMAGE_DATA_NUM_TYPES);
   }
   else {
-    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+    metadata.type = IMAGE_DATA_TYPE_BYTE4;
+  }
+
+  metadata.detect_colorspace();
+
+  /* No half textures on OpenCL, use full float instead. */
+  if (!has_half_images) {
+    if (metadata.type == IMAGE_DATA_TYPE_HALF4) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+    }
+    else if (metadata.type == IMAGE_DATA_TYPE_HALF) {
+      metadata.type = IMAGE_DATA_TYPE_FLOAT;
+    }
   }
 
-  metadata_detect_colorspace(metadata, in->format_name());
+  img->need_metadata = false;
+}
+
+ImageHandle ImageManager::add_image(const string &filename, const ImageParams &params)
+{
+  const int slot = add_image_slot(new OIIOImageLoader(filename), params, false);
 
-  in->close();
+  ImageHandle handle;
+  handle.tile_slots.push_back(slot);
+  handle.manager = this;
+  return handle;
+}
 
-  return true;
+ImageHandle ImageManager::add_image(const string &filename,
+                                    const ImageParams &params,
+                                    const vector<int> &tiles)
+{
+  ImageHandle handle;
+  handle.manager = this;
+
+  foreach (int tile, tiles) {
+    string tile_filename = filename;
+    if (tile != 0) {
+      string_replace(tile_filename, "<UDIM>", string_printf("%04d", tile));
+    }
+    const int slot = add_image_slot(new OIIOImageLoader(tile_filename), params, false);
+    handle.tile_slots.push_back(slot);
+  }
+
+  return handle;
 }
 
-static bool image_equals(ImageManager::Image *image,
-                         const string &filename,
-                         void *builtin_data,
-                         InterpolationType interpolation,
-                         ExtensionType extension,
-                         ImageAlphaType alpha_type,
-                         ustring colorspace)
+ImageHandle ImageManager::add_image(ImageLoader *loader, const ImageParams &params)
 {
-  return image->filename == filename && image->builtin_data == builtin_data &&
-         image->interpolation == interpolation && image->extension == extension &&
-         image->alpha_type == alpha_type && image->colorspace == colorspace;
+  const int slot = add_image_slot(loader, params, true);
+
+  ImageHandle handle;
+  handle.tile_slots.push_back(slot);
+  handle.manager = this;
+  return handle;
 }
 
-int ImageManager::add_image(const string &filename,
-                            void *builtin_data,
-                            bool animated,
-                            float frame,
-                            InterpolationType interpolation,
-                            ExtensionType extension,
-                            ImageAlphaType alpha_type,
-                            ustring colorspace,
-                            ImageMetaData &metadata)
+int ImageManager::add_image_slot(ImageLoader *loader,
+                                 const ImageParams &params,
+                                 const bool builtin)
 {
   Image *img;
   size_t slot;
 
-  get_image_metadata(filename, builtin_data, colorspace, metadata);
-  ImageDataType type = metadata.type;
-
   thread_scoped_lock device_lock(device_mutex);
 
-  /* No half textures on OpenCL, use full float instead. */
-  if (!has_half_images) {
-    if (type == IMAGE_DATA_TYPE_HALF4) {
-      type = IMAGE_DATA_TYPE_FLOAT4;
-    }
-    else if (type == IMAGE_DATA_TYPE_HALF) {
-      type = IMAGE_DATA_TYPE_FLOAT;
-    }
-  }
-
   /* Fnd existing image. */
-  for (slot = 0; slot < images[type].size(); slot++) {
-    img = images[type][slot];
-    if (img &&
-        image_equals(
-            img, filename, builtin_data, interpolation, extension, alpha_type, colorspace)) {
-      if (img->frame != frame) {
-        img->frame = frame;
-        img->need_load = true;
-      }
-      if (img->alpha_type != alpha_type) {
-        img->alpha_type = alpha_type;
-        img->need_load = true;
-      }
-      if (img->colorspace != colorspace) {
-        img->colorspace = colorspace;
-        img->need_load = true;
-      }
-      if (!(img->metadata == metadata)) {
-        img->metadata = metadata;
-        img->need_load = true;
-      }
+  for (slot = 0; slot < images.size(); slot++) {
+    img = images[slot];
+    if (img && ImageLoader::equals(img->loader, loader) && img->params == params) {
       img->users++;
-      return type_index_to_flattened_slot(slot, type);
+      delete loader;
+      return slot;
     }
   }
 
   /* Find free slot. */
-  for (slot = 0; slot < images[type].size(); slot++) {
-    if (!images[type][slot])
+  for (slot = 0; slot < images.size(); slot++) {
+    if (!images[slot])
       break;
   }
 
-  /* Count if we're over the limit.
-   * Very unlikely, since max_num_images is insanely big. But better safe
-   * than sorry.
-   */
-  int tex_count = 0;
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    tex_count += tex_num_images[type];
-  }
-  if (tex_count > max_num_images) {
-    printf(
-        "ImageManager::add_image: Reached image limit (%d), "
-        "skipping '%s'\n",
-        max_num_images,
-        filename.c_str());
-    return -1;
-  }
-
-  if (slot == images[type].size()) {
-    images[type].resize(images[type].size() + 1);
+  if (slot == images.size()) {
+    images.resize(images.size() + 1);
   }
 
   /* Add new image. */
   img = new Image();
-  img->filename = filename;
-  img->builtin_data = builtin_data;
-  img->metadata = metadata;
-  img->need_load = true;
-  img->animated = animated;
-  img->frame = frame;
-  img->interpolation = interpolation;
-  img->extension = extension;
+  img->params = params;
+  img->loader = loader;
+  img->need_metadata = true;
+  img->need_load = !(osl_texture_system && !img->loader->osl_filepath().empty());
+  img->builtin = builtin;
   img->users = 1;
-  img->alpha_type = alpha_type;
-  img->colorspace = colorspace;
   img->mem = NULL;
 
-  images[type][slot] = img;
-
-  ++tex_num_images[type];
+  images[slot] = img;
 
   need_update = true;
 
-  return type_index_to_flattened_slot(slot, type);
+  return slot;
 }
 
-void ImageManager::add_image_user(int flat_slot)
+void ImageManager::add_image_user(int slot)
 {
-  ImageDataType type;
-  int slot = flattened_slot_to_type_index(flat_slot, &type);
-
-  Image *image = images[type][slot];
+  Image *image = images[slot];
   assert(image && image->users >= 1);
 
   image->users++;
 }
 
-void ImageManager::remove_image(int flat_slot)
+void ImageManager::remove_image_user(int slot)
 {
-  ImageDataType type;
-  int slot = flattened_slot_to_type_index(flat_slot, &type);
-
-  Image *image = images[type][slot];
+  Image *image = images[slot];
   assert(image && image->users >= 1);
 
   /* decrement user count */
@@ -442,119 +439,20 @@ void ImageManager::remove_image(int flat_slot)
     need_update = true;
 }
 
-void ImageManager::remove_image(const string &filename,
-                                void *builtin_data,
-                                InterpolationType interpolation,
-                                ExtensionType extension,
-                                ImageAlphaType alpha_type,
-                                ustring colorspace)
-{
-  size_t slot;
-
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (slot = 0; slot < images[type].size(); slot++) {
-      if (images[type][slot] && image_equals(images[type][slot],
-                                             filename,
-                                             builtin_data,
-                                             interpolation,
-                                             extension,
-                                             alpha_type,
-                                             colorspace)) {
-        remove_image(type_index_to_flattened_slot(slot, (ImageDataType)type));
-        return;
-      }
-    }
-  }
-}
-
-/* TODO(sergey): Deduplicate with the iteration above, but make it pretty,
- * without bunch of arguments passing around making code readability even
- * more cluttered.
- */
-void ImageManager::tag_reload_image(const string &filename,
-                                    void *builtin_data,
-                                    InterpolationType interpolation,
-                                    ExtensionType extension,
-                                    ImageAlphaType alpha_type,
-                                    ustring colorspace)
-{
-  for (size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (size_t slot = 0; slot < images[type].size(); slot++) {
-      if (images[type][slot] && image_equals(images[type][slot],
-                                             filename,
-                                             builtin_data,
-                                             interpolation,
-                                             extension,
-                                             alpha_type,
-                                             colorspace)) {
-        images[type][slot]->need_load = true;
-        break;
-      }
-    }
-  }
-}
-
 static bool image_associate_alpha(ImageManager::Image *img)
 {
   /* For typical RGBA images we let OIIO convert to associated alpha,
    * but some types we want to leave the RGB channels untouched. */
-  return !(ColorSpaceManager::colorspace_is_data(img->colorspace) ||
-           img->alpha_type == IMAGE_ALPHA_IGNORE || img->alpha_type == IMAGE_ALPHA_CHANNEL_PACKED);
+  return !(ColorSpaceManager::colorspace_is_data(img->params.colorspace) ||
+           img->params.alpha_type == IMAGE_ALPHA_IGNORE ||
+           img->params.alpha_type == IMAGE_ALPHA_CHANNEL_PACKED);
 }
 
-bool ImageManager::file_load_image_generic(Image *img, unique_ptr<ImageInput> *in)
+template<TypeDesc::BASETYPE FileFormat, typename StorageType>
+bool ImageManager::file_load_image(Image *img, int texture_limit)
 {
-  if (img->filename == "")
-    return false;
-
-  if (!img->builtin_data) {
-    /* NOTE: Error logging is done in meta data acquisition. */
-    if (!path_exists(img->filename) || path_is_directory(img->filename)) {
-      return false;
-    }
-
-    /* load image from file through OIIO */
-    *in = unique_ptr<ImageInput>(ImageInput::create(img->filename));
-
-    if (!*in)
-      return false;
-
-    ImageSpec spec = ImageSpec();
-    ImageSpec config = ImageSpec();
-
-    if (!image_associate_alpha(img)) {
-      config.attribute("oiio:UnassociatedAlpha", 1);
-    }
-
-    if (!(*in)->open(img->filename, spec, config)) {
-      return false;
-    }
-  }
-  else {
-    /* load image using builtin images callbacks */
-    if (!builtin_image_info_cb || !builtin_image_pixels_cb)
-      return false;
-  }
-
   /* we only handle certain number of components */
   if (!(img->metadata.channels >= 1 && img->metadata.channels <= 4)) {
-    if (*in) {
-      (*in)->close();
-    }
-    return false;
-  }
-
-  return true;
-}
-
-template<TypeDesc::BASETYPE FileFormat, typename StorageType, typename DeviceType>
-bool ImageManager::file_load_image(Image *img,
-                                   ImageDataType type,
-                                   int texture_limit,
-                                   device_vector<DeviceType> &tex_img)
-{
-  unique_ptr<ImageInput> in = NULL;
-  if (!file_load_image_generic(img, &in)) {
     return false;
   }
 
@@ -580,7 +478,7 @@ bool ImageManager::file_load_image(Image *img,
   }
   else {
     thread_scoped_lock device_lock(device_mutex);
-    pixels = (StorageType *)tex_img.alloc(width, height, depth);
+    pixels = (StorageType *)img->mem->alloc(width, height, depth);
   }
 
   if (pixels == NULL) {
@@ -588,90 +486,21 @@ bool ImageManager::file_load_image(Image *img,
     return false;
   }
 
-  bool cmyk = false;
   const size_t num_pixels = ((size_t)width) * height * depth;
-  if (in) {
-    /* Read pixels through OpenImageIO. */
-    StorageType *readpixels = pixels;
-    vector<StorageType> tmppixels;
-    if (components > 4) {
-      tmppixels.resize(((size_t)width) * height * components);
-      readpixels = &tmppixels[0];
-    }
-
-    if (depth <= 1) {
-      size_t scanlinesize = ((size_t)width) * components * sizeof(StorageType);
-      in->read_image(FileFormat,
-                     (uchar *)readpixels + (height - 1) * scanlinesize,
-                     AutoStride,
-                     -scanlinesize,
-                     AutoStride);
-    }
-    else {
-      in->read_image(FileFormat, (uchar *)readpixels);
-    }
-
-    if (components > 4) {
-      size_t dimensions = ((size_t)width) * height;
-      for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) {
-        pixels[i * 4 + 3] = tmppixels[i * components + 3];
-        pixels[i * 4 + 2] = tmppixels[i * components + 2];
-        pixels[i * 4 + 1] = tmppixels[i * components + 1];
-        pixels[i * 4 + 0] = tmppixels[i * components + 0];
-      }
-      tmppixels.clear();
-    }
-
-    cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
-    in->close();
-  }
-  else {
-    /* Read pixels through callback. */
-    if (FileFormat == TypeDesc::FLOAT) {
-      builtin_image_float_pixels_cb(img->filename,
-                                    img->builtin_data,
-                                    0, /* TODO(lukas): Support tiles here? */
-                                    (float *)&pixels[0],
-                                    num_pixels * components,
-                                    image_associate_alpha(img),
-                                    img->metadata.builtin_free_cache);
-    }
-    else if (FileFormat == TypeDesc::UINT8) {
-      builtin_image_pixels_cb(img->filename,
-                              img->builtin_data,
-                              0, /* TODO(lukas): Support tiles here? */
-                              (uchar *)&pixels[0],
-                              num_pixels * components,
-                              image_associate_alpha(img),
-                              img->metadata.builtin_free_cache);
-    }
-    else {
-      /* TODO(dingto): Support half for ImBuf. */
-    }
-  }
+  img->loader->load_pixels(
+      img->metadata, pixels, num_pixels * components, image_associate_alpha(img));
 
   /* The kernel can handle 1 and 4 channel images. Anything that is not a single
    * channel image is converted to RGBA format. */
-  bool is_rgba = (type == IMAGE_DATA_TYPE_FLOAT4 || type == IMAGE_DATA_TYPE_HALF4 ||
-                  type == IMAGE_DATA_TYPE_BYTE4 || type == IMAGE_DATA_TYPE_USHORT4);
+  bool is_rgba = (img->metadata.type == IMAGE_DATA_TYPE_FLOAT4 ||
+                  img->metadata.type == IMAGE_DATA_TYPE_HALF4 ||
+                  img->metadata.type == IMAGE_DATA_TYPE_BYTE4 ||
+                  img->metadata.type == IMAGE_DATA_TYPE_USHORT4);
 
   if (is_rgba) {
     const StorageType one = util_image_cast_from_float<StorageType>(1.0f);
 
-    if (cmyk) {
-      /* CMYK to RGBA. */
-      for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-        float c = util_image_cast_to_float(pixels[i * 4 + 0]);
-        float m = util_image_cast_to_float(pixels[i * 4 + 1]);
-        float y = util_image_cast_to_float(pixels[i * 4 + 2]);
-        float k = util_image_cast_to_float(pixels[i * 4 + 3]);
-        pixels[i * 4 + 0] = util_image_cast_from_float<StorageType>((1.0f - c) * (1.0f - k));
-        pixels[i * 4 + 1] = util_image_cast_from_float<StorageType>((1.0f - m) * (1.0f - k));
-        pixels[i * 4 + 2] = util_image_cast_from_float<StorageType>((1.0f - y) * (1.0f - k));
-        pixels[i * 4 + 3] = one;
-      }
-    }
-    else if (components == 2) {
+    if (components == 2) {
       /* Grayscale + alpha to RGBA. */
       for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
         pixels[i * 4 + 3] = pixels[i * 2 + 1];
@@ -700,7 +529,7 @@ bool ImageManager::file_load_image(Image *img,
     }
 
     /* Disable alpha if requested by the user. */
-    if (img->alpha_type == IMAGE_ALPHA_IGNORE) {
+    if (img->params.alpha_type == IMAGE_ALPHA_IGNORE) {
       for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
         pixels[i * 4 + 3] = one;
       }
@@ -710,7 +539,7 @@ bool ImageManager::file_load_image(Image *img,
         img->metadata.colorspace != u_colorspace_srgb) {
       /* Convert to scene linear. */
       ColorSpaceManager::to_scene_linear(
-          img->metadata.colorspace, pixels, width, height, depth, img->metadata.compress_as_srgb);
+          img->metadata.colorspace, pixels, num_pixels, img->metadata.compress_as_srgb);
     }
   }
 
@@ -747,7 +576,8 @@ bool ImageManager::file_load_image(Image *img,
     while (max_size * scale_factor > texture_limit) {
       scale_factor *= 0.5f;
     }
-    VLOG(1) << "Scaling image " << img->filename << " by a factor of " << scale_factor << ".";
+    VLOG(1) << "Scaling image " << img->loader->name() << " by a factor of " << scale_factor
+            << ".";
     vector<StorageType> scaled_pixels;
     size_t scaled_width, scaled_height, scaled_depth;
     util_image_resize_pixels(pixels_storage,
@@ -765,7 +595,7 @@ bool ImageManager::file_load_image(Image *img,
 
     {
       thread_scoped_lock device_lock(device_mutex);
-      texture_pixels = (StorageType *)tex_img.alloc(scaled_width, scaled_height, scaled_depth);
+      texture_pixels = (StorageType *)img->mem->alloc(scaled_width, scaled_height, scaled_depth);
     }
 
     memcpy(texture_pixels, &scaled_pixels[0], scaled_pixels.size() * sizeof(StorageType));
@@ -774,25 +604,23 @@ bool ImageManager::file_load_image(Image *img,
   return true;
 }
 
-void ImageManager::device_load_image(
-    Device *device, Scene *scene, ImageDataType type, int slot, Progress *progress)
+void ImageManager::device_load_image(Device *device, Scene *scene, int slot, Progress *progress)
 {
-  if (progress->get_cancel())
+  if (progress->get_cancel()) {
     return;
+  }
 
-  Image *img = images[type][slot];
+  Image *img = images[slot];
 
-  if (osl_texture_system && !img->builtin_data)
-    return;
-
-  string filename = path_filename(images[type][slot]->filename);
-  progress->set_status("Updating Images", "Loading " + filename);
+  progress->set_status("Updating Images", "Loading " + img->loader->name());
 
   const int texture_limit = scene->params.texture_limit;
 
-  /* Slot assignment */
-  int flat_slot = type_index_to_flattened_slot(slot, type);
-  img->mem_name = string_printf("__tex_image_%s_%03d", name_from_type(type), flat_slot);
+  load_image_metadata(img);
+  ImageDataType type = img->metadata.type;
+
+  /* Name for debugging. */
+  img->mem_name = string_printf("__tex_image_%s_%03d", name_from_type(type), slot);
 
   /* Free previous texture in slot. */
   if (img->mem) {
@@ -801,195 +629,131 @@ void ImageManager::device_load_image(
     img->mem = NULL;
   }
 
+  img->mem = new device_texture(
+      device, img->mem_name.c_str(), slot, type, img->params.interpolation, img->params.extension);
+  img->mem->info.use_transform_3d = img->metadata.use_transform_3d;
+  img->mem->info.transform_3d = img->metadata.transform_3d;
+
   /* Create new texture. */
   if (type == IMAGE_DATA_TYPE_FLOAT4) {
-    device_vector<float4> *tex_img = new device_vector<float4>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::FLOAT, float>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::FLOAT, float>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      float *pixels = (float *)tex_img->alloc(1, 1);
+      float *pixels = (float *)img->mem->alloc(1, 1);
 
       pixels[0] = TEX_IMAGE_MISSING_R;
       pixels[1] = TEX_IMAGE_MISSING_G;
       pixels[2] = TEX_IMAGE_MISSING_B;
       pixels[3] = TEX_IMAGE_MISSING_A;
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_FLOAT) {
-    device_vector<float> *tex_img = new device_vector<float>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::FLOAT, float>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::FLOAT, float>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      float *pixels = (float *)tex_img->alloc(1, 1);
+      float *pixels = (float *)img->mem->alloc(1, 1);
 
       pixels[0] = TEX_IMAGE_MISSING_R;
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_BYTE4) {
-    device_vector<uchar4> *tex_img = new device_vector<uchar4>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::UINT8, uchar>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::UINT8, uchar>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      uchar *pixels = (uchar *)tex_img->alloc(1, 1);
+      uchar *pixels = (uchar *)img->mem->alloc(1, 1);
 
       pixels[0] = (TEX_IMAGE_MISSING_R * 255);
       pixels[1] = (TEX_IMAGE_MISSING_G * 255);
       pixels[2] = (TEX_IMAGE_MISSING_B * 255);
       pixels[3] = (TEX_IMAGE_MISSING_A * 255);
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_BYTE) {
-    device_vector<uchar> *tex_img = new device_vector<uchar>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::UINT8, uchar>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::UINT8, uchar>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      uchar *pixels = (uchar *)tex_img->alloc(1, 1);
+      uchar *pixels = (uchar *)img->mem->alloc(1, 1);
 
       pixels[0] = (TEX_IMAGE_MISSING_R * 255);
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_HALF4) {
-    device_vector<half4> *tex_img = new device_vector<half4>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::HALF, half>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::HALF, half>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      half *pixels = (half *)tex_img->alloc(1, 1);
+      half *pixels = (half *)img->mem->alloc(1, 1);
 
       pixels[0] = TEX_IMAGE_MISSING_R;
       pixels[1] = TEX_IMAGE_MISSING_G;
       pixels[2] = TEX_IMAGE_MISSING_B;
       pixels[3] = TEX_IMAGE_MISSING_A;
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_USHORT) {
-    device_vector<uint16_t> *tex_img = new device_vector<uint16_t>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      uint16_t *pixels = (uint16_t *)tex_img->alloc(1, 1);
+      uint16_t *pixels = (uint16_t *)img->mem->alloc(1, 1);
 
       pixels[0] = (TEX_IMAGE_MISSING_R * 65535);
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_USHORT4) {
-    device_vector<ushort4> *tex_img = new device_vector<ushort4>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::USHORT, uint16_t>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      uint16_t *pixels = (uint16_t *)tex_img->alloc(1, 1);
+      uint16_t *pixels = (uint16_t *)img->mem->alloc(1, 1);
 
       pixels[0] = (TEX_IMAGE_MISSING_R * 65535);
       pixels[1] = (TEX_IMAGE_MISSING_G * 65535);
       pixels[2] = (TEX_IMAGE_MISSING_B * 65535);
       pixels[3] = (TEX_IMAGE_MISSING_A * 65535);
     }
-
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
-    thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
   }
   else if (type == IMAGE_DATA_TYPE_HALF) {
-    device_vector<half> *tex_img = new device_vector<half>(
-        device, img->mem_name.c_str(), MEM_TEXTURE);
-
-    if (!file_load_image<TypeDesc::HALF, half>(img, type, texture_limit, *tex_img)) {
+    if (!file_load_image<TypeDesc::HALF, half>(img, texture_limit)) {
       /* on failure to load, we set a 1x1 pixels pink image */
       thread_scoped_lock device_lock(device_mutex);
-      half *pixels = (half *)tex_img->alloc(1, 1);
+      half *pixels = (half *)img->mem->alloc(1, 1);
 
       pixels[0] = TEX_IMAGE_MISSING_R;
     }
+  }
 
-    img->mem = tex_img;
-    img->mem->interpolation = img->interpolation;
-    img->mem->extension = img->extension;
-
+  {
     thread_scoped_lock device_lock(device_mutex);
-    tex_img->copy_to_device();
+    img->mem->copy_to_device();
   }
+
+  /* Cleanup memory in image loader. */
+  img->loader->cleanup();
   img->need_load = false;
 }
 
-void ImageManager::device_free_image(Device *, ImageDataType type, int slot)
+void ImageManager::device_free_image(Device *, int slot)
 {
-  Image *img = images[type][slot];
+  Image *img = images[slot];
+  if (img == NULL) {
+    return;
+  }
 
-  if (img) {
-    if (osl_texture_system && !img->builtin_data) {
+  if (osl_texture_system) {
 #ifdef WITH_OSL
-      ustring filename(images[type][slot]->filename);
-      ((OSL::TextureSystem *)osl_texture_system)->invalidate(filename);
-#endif
-    }
-
-    if (img->mem) {
-      thread_scoped_lock device_lock(device_mutex);
-      delete img->mem;
+    ustring filepath = img->loader->osl_filepath();
+    if (!filepath.empty()) {
+      ((OSL::TextureSystem *)osl_texture_system)->invalidate(filepath);
     }
+#endif
+  }
 
-    delete img;
-    images[type][slot] = NULL;
-    --tex_num_images[type];
+  if (img->mem) {
+    thread_scoped_lock device_lock(device_mutex);
+    delete img->mem;
   }
+
+  delete img->loader;
+  delete img;
+  images[slot] = NULL;
 }
 
 void ImageManager::device_update(Device *device, Scene *scene, Progress &progress)
@@ -999,24 +763,14 @@ void ImageManager::device_update(Device *device, Scene *scene, Progress &progres
   }
 
   TaskPool pool;
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (size_t slot = 0; slot < images[type].size(); slot++) {
-      if (!images[type][slot])
-        continue;
-
-      if (images[type][slot]->users == 0) {
-        device_free_image(device, (ImageDataType)type, slot);
-      }
-      else if (images[type][slot]->need_load) {
-        if (!osl_texture_system || images[type][slot]->builtin_data)
-          pool.push(function_bind(&ImageManager::device_load_image,
-                                  this,
-                                  device,
-                                  scene,
-                                  (ImageDataType)type,
-                                  slot,
-                                  &progress));
-      }
+  for (size_t slot = 0; slot < images.size(); slot++) {
+    Image *img = images[slot];
+    if (img && img->users == 0) {
+      device_free_image(device, slot);
+    }
+    else if (img && img->need_load) {
+      pool.push(
+          function_bind(&ImageManager::device_load_image, this, device, scene, slot, &progress));
     }
   }
 
@@ -1025,23 +779,16 @@ void ImageManager::device_update(Device *device, Scene *scene, Progress &progres
   need_update = false;
 }
 
-void ImageManager::device_update_slot(Device *device,
-                                      Scene *scene,
-                                      int flat_slot,
-                                      Progress *progress)
+void ImageManager::device_update_slot(Device *device, Scene *scene, int slot, Progress *progress)
 {
-  ImageDataType type;
-  int slot = flattened_slot_to_type_index(flat_slot, &type);
-
-  Image *image = images[type][slot];
-  assert(image != NULL);
+  Image *img = images[slot];
+  assert(img != NULL);
 
-  if (image->users == 0) {
-    device_free_image(device, type, slot);
+  if (img->users == 0) {
+    device_free_image(device, slot);
   }
-  else if (image->need_load) {
-    if (!osl_texture_system || image->builtin_data)
-      device_load_image(device, scene, type, slot, progress);
+  else if (img->need_load) {
+    device_load_image(device, scene, slot, progress);
   }
 }
 
@@ -1054,22 +801,11 @@ void ImageManager::device_load_builtin(Device *device, Scene *scene, Progress &p
   }
 
   TaskPool pool;
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (size_t slot = 0; slot < images[type].size(); slot++) {
-      if (!images[type][slot])
-        continue;
-
-      if (images[type][slot]->need_load) {
-        if (images[type][slot]->builtin_data) {
-          pool.push(function_bind(&ImageManager::device_load_image,
-                                  this,
-                                  device,
-                                  scene,
-                                  (ImageDataType)type,
-                                  slot,
-                                  &progress));
-        }
-      }
+  for (size_t slot = 0; slot < images.size(); slot++) {
+    Image *img = images[slot];
+    if (img && img->need_load && img->builtin) {
+      pool.push(
+          function_bind(&ImageManager::device_load_image, this, device, scene, slot, &progress));
     }
   }
 
@@ -1078,31 +814,32 @@ void ImageManager::device_load_builtin(Device *device, Scene *scene, Progress &p
 
 void ImageManager::device_free_builtin(Device *device)
 {
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (size_t slot = 0; slot < images[type].size(); slot++) {
-      if (images[type][slot] && images[type][slot]->builtin_data)
-        device_free_image(device, (ImageDataType)type, slot);
+  for (size_t slot = 0; slot < images.size(); slot++) {
+    Image *img = images[slot];
+    if (img && img->builtin) {
+      device_free_image(device, slot);
     }
   }
 }
 
 void ImageManager::device_free(Device *device)
 {
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    for (size_t slot = 0; slot < images[type].size(); slot++) {
-      device_free_image(device, (ImageDataType)type, slot);
-    }
-    images[type].clear();
+  for (size_t slot = 0; slot < images.size(); slot++) {
+    device_free_image(device, slot);
   }
+  images.clear();
 }
 
 void ImageManager::collect_statistics(RenderStats *stats)
 {
-  for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
-    foreach (const Image *image, images[type]) {
-      stats->image.textures.add_entry(
-          NamedSizeEntry(path_filename(image->filename), image->mem->memory_size()));
-    }
+  foreach (const Image *image, images) {
+    /* Slots are reset to NULL by device_free_image(), and mem stays NULL
+     * until device_load_image() has run; skip both instead of crashing. */
+    if (image == NULL || image->mem == NULL) {
+      continue;
+    }
+    stats->image.textures.add_entry(
+        NamedSizeEntry(image->loader->name(), image->mem->memory_size()));
   }
 }
 
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index bc04a667953..00ab12afd7a 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -22,92 +22,157 @@
 
 #include "render/colorspace.h"
 
-#include "util/util_image.h"
 #include "util/util_string.h"
 #include "util/util_thread.h"
+#include "util/util_transform.h"
 #include "util/util_unique_ptr.h"
 #include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
 class Device;
+class ImageHandle;
+class ImageKey;
+class ImageMetaData;
+class ImageManager;
 class Progress;
 class RenderStats;
 class Scene;
 class ColorSpaceProcessor;
 
-class ImageMetaData {
+/* Image Parameters */
+class ImageParams {
  public:
-  /* Must be set by image file or builtin callback. */
-  bool is_float, is_half;
-  int channels;
-  size_t width, height, depth;
-  bool builtin_free_cache;
-
-  /* Automatically set. */
-  ImageDataType type;
+  bool animated;
+  InterpolationType interpolation;
+  ExtensionType extension;
+  ImageAlphaType alpha_type;
   ustring colorspace;
-  bool compress_as_srgb;
+  float frame;
 
-  ImageMetaData()
-      : is_float(false),
-        is_half(false),
-        channels(0),
-        width(0),
-        height(0),
-        depth(0),
-        builtin_free_cache(false),
-        type((ImageDataType)0),
+  ImageParams()
+      : animated(false),
+        interpolation(INTERPOLATION_LINEAR),
+        extension(EXTENSION_CLIP),
+        alpha_type(IMAGE_ALPHA_AUTO),
         colorspace(u_colorspace_raw),
-        compress_as_srgb(false)
+        frame(0.0f)
   {
   }
 
-  bool operator==(const ImageMetaData &other) const
+  bool operator==(const ImageParams &other) const
   {
-    return is_float == other.is_float && is_half == other.is_half && channels == other.channels &&
-           width == other.width && height == other.height && depth == other.depth &&
-           type == other.type && colorspace == other.colorspace &&
-           compress_as_srgb == other.compress_as_srgb;
+    return (animated == other.animated && interpolation == other.interpolation &&
+            extension == other.extension && alpha_type == other.alpha_type &&
+            colorspace == other.colorspace && frame == other.frame);
   }
 };
 
+/* Image MetaData
+ *
+ * Information about the image that is available before the image pixels are loaded. */
+class ImageMetaData {
+ public:
+  /* Set by ImageLoader.load_metadata(). */
+  int channels;
+  size_t width, height, depth;
+  ImageDataType type;
+
+  /* Optional color space, defaults to raw. */
+  ustring colorspace;
+  const char *colorspace_file_format;
+
+  /* Optional transform for 3D images. */
+  bool use_transform_3d;
+  Transform transform_3d;
+
+  /* Automatically set. */
+  bool compress_as_srgb;
+
+  ImageMetaData();
+  bool operator==(const ImageMetaData &other) const;
+  bool is_float() const;
+  void detect_colorspace();
+};
+
+/* Image loader base class, that can be subclassed to load image data
+ * from custom sources (file, memory, procedurally generated, etc). */
+class ImageLoader {
+ public:
+  ImageLoader();
+  virtual ~ImageLoader(){};
+
+  /* Load metadata without actual image yet, should be fast. */
+  virtual bool load_metadata(ImageMetaData &metadata) = 0;
+
+  /* Load actual image contents. */
+  virtual bool load_pixels(const ImageMetaData &metadata,
+                           void *pixels,
+                           const size_t pixels_size,
+                           const bool associate_alpha) = 0;
+
+  /* Name for logs and stats. */
+  virtual string name() const = 0;
+
+  /* Optional for OSL texture cache. */
+  virtual ustring osl_filepath() const;
+
+  /* Free any memory used for loading metadata and pixels. */
+  virtual void cleanup(){};
+
+  /* Compare loaders to avoid loading the same image multiple times. */
+  virtual bool equals(const ImageLoader &other) const = 0;
+  static bool equals(const ImageLoader *a, const ImageLoader *b);
+
+  /* Work around for no RTTI. */
+};
+
+/* Image Handle
+ *
+ * Access handle for image in the image manager. Multiple shader nodes may
+ * share the same image, and this class handles reference counting for that. */
+class ImageHandle {
+ public:
+  ImageHandle();
+  ImageHandle(const ImageHandle &other);
+  ImageHandle &operator=(const ImageHandle &other);
+  ~ImageHandle();
+
+  bool operator==(const ImageHandle &other) const;
+
+  void clear();
+
+  bool empty();
+  int num_tiles();
+
+  ImageMetaData metadata();
+  int svm_slot(const int tile_index = 0) const;
+  device_texture *image_memory(const int tile_index = 0) const;
+
+ protected:
+  vector<int> tile_slots;
+  ImageManager *manager;
+
+  friend class ImageManager;
+};
+
+/* Image Manager
+ *
+ * Handles loading and storage of all images in the scene. This includes 2D
+ * texture images and 3D volume images. */
 class ImageManager {
  public:
   explicit ImageManager(const DeviceInfo &info);
   ~ImageManager();
 
-  int add_image(const string &filename,
-                void *builtin_data,
-                bool animated,
-                float frame,
-                InterpolationType interpolation,
-                ExtensionType extension,
-                ImageAlphaType alpha_type,
-                ustring colorspace,
-                ImageMetaData &metadata);
-  void add_image_user(int flat_slot);
-  void remove_image(int flat_slot);
-  void remove_image(const string &filename,
-                    void *builtin_data,
-                    InterpolationType interpolation,
-                    ExtensionType extension,
-                    ImageAlphaType alpha_type,
-                    ustring colorspace);
-  void tag_reload_image(const string &filename,
-                        void *builtin_data,
-                        InterpolationType interpolation,
-                        ExtensionType extension,
-                        ImageAlphaType alpha_type,
-                        ustring colorspace);
-  bool get_image_metadata(const string &filename,
-                          void *builtin_data,
-                          ustring colorspace,
-                          ImageMetaData &metadata);
-  bool get_image_metadata(int flat_slot, ImageMetaData &metadata);
+  ImageHandle add_image(const string &filename, const ImageParams &params);
+  ImageHandle add_image(const string &filename,
+                        const ImageParams &params,
+                        const vector<int> &tiles);
+  ImageHandle add_image(ImageLoader *loader, const ImageParams &params);
 
   void device_update(Device *device, Scene *scene, Progress &progress);
-  void device_update_slot(Device *device, Scene *scene, int flat_slot, Progress *progress);
+  void device_update_slot(Device *device, Scene *scene, int slot, Progress *progress);
   void device_free(Device *device);
 
   void device_load_builtin(Device *device, Scene *scene, Progress &progress);
@@ -116,78 +181,49 @@ class ImageManager {
   void set_osl_texture_system(void *texture_system);
   bool set_animation_frame_update(int frame);
 
-  device_memory *image_memory(int flat_slot);
-
   void collect_statistics(RenderStats *stats);
 
   bool need_update;
 
-  /* NOTE: Here pixels_size is a size of storage, which equals to
-   *       width * height * depth.
-   *       Use this to avoid some nasty memory corruptions.
-   */
-  function<void(const string &filename, void *data, ImageMetaData &metadata)>
-      builtin_image_info_cb;
-  function<bool(const string &filename,
-                void *data,
-                int tile,
-                unsigned char *pixels,
-                const size_t pixels_size,
-                const bool associate_alpha,
-                const bool free_cache)>
-      builtin_image_pixels_cb;
-  function<bool(const string &filename,
-                void *data,
-                int tile,
-                float *pixels,
-                const size_t pixels_size,
-                const bool associate_alpha,
-                const bool free_cache)>
-      builtin_image_float_pixels_cb;
-
   struct Image {
-    string filename;
-    void *builtin_data;
+    ImageParams params;
     ImageMetaData metadata;
+    ImageLoader *loader;
 
-    ustring colorspace;
-    ImageAlphaType alpha_type;
-    bool need_load;
-    bool animated;
     float frame;
-    InterpolationType interpolation;
-    ExtensionType extension;
+    bool need_metadata;
+    bool need_load;
+    bool builtin;
 
     string mem_name;
-    device_memory *mem;
+    device_texture *mem;
 
     int users;
+    thread_mutex mutex;
   };
 
  private:
-  int tex_num_images[IMAGE_DATA_NUM_TYPES];
-  int max_num_images;
   bool has_half_images;
 
   thread_mutex device_mutex;
   int animation_frame;
 
-  vector<Image *> images[IMAGE_DATA_NUM_TYPES];
+  vector<Image *> images;
   void *osl_texture_system;
 
-  bool file_load_image_generic(Image *img, unique_ptr<ImageInput> *in);
+  int add_image_slot(ImageLoader *loader, const ImageParams &params, const bool builtin);
+  void add_image_user(int slot);
+  void remove_image_user(int slot);
+
+  void load_image_metadata(Image *img);
 
-  template<TypeDesc::BASETYPE FileFormat, typename StorageType, typename DeviceType>
-  bool file_load_image(Image *img,
-                       ImageDataType type,
-                       int texture_limit,
-                       device_vector<DeviceType> &tex_img);
+  template<TypeDesc::BASETYPE FileFormat, typename StorageType>
+  bool file_load_image(Image *img, int texture_limit);
 
-  void metadata_detect_colorspace(ImageMetaData &metadata, const char *file_format);
+  void device_load_image(Device *device, Scene *scene, int slot, Progress *progress);
+  void device_free_image(Device *device, int slot);
 
-  void device_load_image(
-      Device *device, Scene *scene, ImageDataType type, int slot, Progress *progress);
-  void device_free_image(Device *device, ImageDataType type, int slot);
+  friend class ImageHandle;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_oiio.cpp b/intern/cycles/render/image_oiio.cpp
new file mode 100644
index 00000000000..c4f95c6b4bc
--- /dev/null
+++ b/intern/cycles/render/image_oiio.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image_oiio.h"
+
+#include "util/util_image.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Only records the path; the file is first touched by load_metadata() or load_pixels(). */
+OIIOImageLoader::OIIOImageLoader(const string &path) : filepath(path)
+{
+}
+
+/* This class adds nothing but the path string, so there is nothing to release by hand. */
+OIIOImageLoader::~OIIOImageLoader() {}
+
+bool OIIOImageLoader::load_metadata(ImageMetaData &metadata)
+{
+  /* Fill `metadata` from the file header. Perform preliminary checks, with meaningful logging. */
+  if (!path_exists(filepath.string())) {
+    VLOG(1) << "File '" << filepath.string() << "' does not exist.";
+    return false;
+  }
+  if (path_is_directory(filepath.string())) {
+    VLOG(1) << "File '" << filepath.string() << "' is a directory, can't use as image.";
+    return false;
+  }
+
+  unique_ptr<ImageInput> in(ImageInput::create(filepath.string()));
+
+  if (!in) {
+    return false;
+  }
+
+  ImageSpec spec;
+  if (!in->open(filepath.string(), spec)) {
+    return false;
+  }
+
+  metadata.width = spec.width;
+  metadata.height = spec.height;
+  metadata.depth = spec.depth;
+  metadata.compress_as_srgb = false;
+
+  /* Check the main format, and channel formats: the image is loaded as float when either the
+   * main format or any individual channel format is floating point. */
+  bool is_float = false;
+  bool is_half = false;
+
+  if (spec.format.is_floating_point()) {
+    is_float = true;
+  }
+
+  for (size_t channel = 0; channel < spec.channelformats.size(); channel++) {
+    /* One floating point channel is enough. */
+    if (spec.channelformats[channel].is_floating_point()) {
+      is_float = true;
+    }
+  }
+
+  /* check if it's half float */
+  if (spec.format == TypeDesc::HALF) {
+    is_half = true;
+  }
+
+  /* set type and channels */
+  metadata.channels = spec.nchannels;
+
+  /* Half wins over float; more than one channel always selects the 4 component type. */
+  if (is_half) {
+    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF;
+  }
+  else if (is_float) {
+    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT;
+  }
+  else if (spec.format == TypeDesc::USHORT) {
+    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_USHORT4 : IMAGE_DATA_TYPE_USHORT;
+  }
+  else {
+    metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE;
+  }
+
+  metadata.colorspace_file_format = in->format_name();
+
+  in->close();
+
+  return true;
+}
+
+template<TypeDesc::BASETYPE FileFormat, typename StorageType>
+static void oiio_load_pixels(const ImageMetaData &metadata,
+                             const unique_ptr<ImageInput> &in,
+                             StorageType *pixels)
+{
+  const int width = metadata.width;
+  const int height = metadata.height;
+  const int depth = metadata.depth;
+  const int components = metadata.channels;
+  /* Texels in the whole image; a depth of 0 or 1 is read as a plain 2D image below. */
+  const size_t num_texels = ((size_t)width) * height * ((depth > 1) ? depth : 1);
+
+  /* Read pixels through OpenImageIO. With more than 4 channels, read into a temporary buffer
+   * covering every depth slice, then keep only the first 4 channels of each texel. */
+  StorageType *readpixels = pixels;
+  vector<StorageType> tmppixels;
+  if (components > 4) {
+    tmppixels.resize(num_texels * components);
+    readpixels = &tmppixels[0];
+  }
+
+  if (depth <= 1) {
+    /* Start at the last row and step backwards, which stores the scanlines bottom-up. */
+    size_t scanlinesize = ((size_t)width) * components * sizeof(StorageType);
+    in->read_image(FileFormat,
+                   (uchar *)readpixels + (height - 1) * scanlinesize,
+                   AutoStride,
+                   -scanlinesize,
+                   AutoStride);
+  }
+  else {
+    in->read_image(FileFormat, (uchar *)readpixels);
+  }
+
+  if (components > 4) {
+    for (size_t i = 0; i < num_texels; i++) {
+      for (int c = 0; c < 4; c++) {
+        pixels[i * 4 + c] = tmppixels[i * components + c];
+      }
+    }
+    tmppixels.clear();
+  }
+
+  /* CMYK to RGBA. */
+  const bool cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
+  if (cmyk) {
+    const StorageType one = util_image_cast_from_float<StorageType>(1.0f);
+    for (size_t i = 0; i < num_texels; i++) {
+      float c = util_image_cast_to_float(pixels[i * 4 + 0]);
+      float m = util_image_cast_to_float(pixels[i * 4 + 1]);
+      float y = util_image_cast_to_float(pixels[i * 4 + 2]);
+      float k = util_image_cast_to_float(pixels[i * 4 + 3]);
+      pixels[i * 4 + 0] = util_image_cast_from_float<StorageType>((1.0f - c) * (1.0f - k));
+      pixels[i * 4 + 1] = util_image_cast_from_float<StorageType>((1.0f - m) * (1.0f - k));
+      pixels[i * 4 + 2] = util_image_cast_from_float<StorageType>((1.0f - y) * (1.0f - k));
+      pixels[i * 4 + 3] = one;
+    }
+  }
+}
+
+bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata,
+                                  void *pixels,
+                                  const size_t,
+                                  const bool associate_alpha)
+{
+  /* Decode the file into `pixels`, using the storage type selected by `metadata.type`.
+   * NOTE: Error logging is done in meta data acquisition. */
+  const string &path = filepath.string();
+  if (!path_exists(path) || path_is_directory(path)) {
+    return false;
+  }
+
+  /* load image from file through OIIO */
+  unique_ptr<ImageInput> in(ImageInput::create(path));
+  if (!in) {
+    return false;
+  }
+
+  /* Unassociated alpha is only requested when the caller does not want it associated. */
+  ImageSpec spec, config;
+  if (!associate_alpha) {
+    config.attribute("oiio:UnassociatedAlpha", 1);
+  }
+
+  if (!in->open(path, spec, config)) {
+    return false;
+  }
+
+  /* Scalar and 4-channel variants of a type share one storage type and reader. */
+  switch (metadata.type) {
+    case IMAGE_DATA_TYPE_BYTE:
+    case IMAGE_DATA_TYPE_BYTE4:
+      oiio_load_pixels<TypeDesc::UINT8, uchar>(metadata, in, (uchar *)pixels);
+      break;
+    case IMAGE_DATA_TYPE_USHORT:
+    case IMAGE_DATA_TYPE_USHORT4:
+      oiio_load_pixels<TypeDesc::USHORT, uint16_t>(metadata, in, (uint16_t *)pixels);
+      break;
+    case IMAGE_DATA_TYPE_HALF:
+    case IMAGE_DATA_TYPE_HALF4:
+      oiio_load_pixels<TypeDesc::HALF, half>(metadata, in, (half *)pixels);
+      break;
+    case IMAGE_DATA_TYPE_FLOAT:
+    case IMAGE_DATA_TYPE_FLOAT4:
+      oiio_load_pixels<TypeDesc::FLOAT, float>(metadata, in, (float *)pixels);
+      break;
+    case IMAGE_DATA_NUM_TYPES:
+      break;
+  }
+
+  in->close();
+  return true;
+}
+
+string OIIOImageLoader::name() const
+{
+  return path_filename(filepath.string());
+}
+
+ustring OIIOImageLoader::osl_filepath() const
+{
+  return filepath;
+}
+
+bool OIIOImageLoader::equals(const ImageLoader &other) const
+{
+  /* Same file path means same image; `other` is downcast without a type check. */
+  return filepath == static_cast<const OIIOImageLoader &>(other).filepath;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_oiio.h b/intern/cycles/render/image_oiio.h
new file mode 100644
index 00000000000..a234b968557
--- /dev/null
+++ b/intern/cycles/render/image_oiio.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __IMAGE_OIIO__
+#define __IMAGE_OIIO__
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OIIOImageLoader : public ImageLoader {
+ public:
+  OIIOImageLoader(const string &filepath);
+  ~OIIOImageLoader();
+  /* Query size, channel count and data type of the file; false when it can't be opened. */
+  bool load_metadata(ImageMetaData &metadata) override;
+  /* Decode the file into the caller's `pixels` buffer; `pixels_size` is not used. */
+  bool load_pixels(const ImageMetaData &metadata,
+                   void *pixels,
+                   const size_t pixels_size,
+                   const bool associate_alpha) override;
+  /* Name derived from the stored path with path_filename(). */
+  string name() const override;
+  /* The stored file path, unchanged. */
+  ustring osl_filepath() const override;
+  /* True when `other` holds the same path; `other` is cast to OIIOImageLoader unchecked. */
+  bool equals(const ImageLoader &other) const override;
+
+ protected:
+  ustring filepath;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __IMAGE_OIIO__ */
diff --git a/intern/cycles/render/image_vdb.cpp b/intern/cycles/render/image_vdb.cpp
new file mode 100644
index 00000000000..500131c2d84
--- /dev/null
+++ b/intern/cycles/render/image_vdb.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/image_vdb.h"
+
+#ifdef WITH_OPENVDB
+#  include <openvdb/openvdb.h>
+#  include <openvdb/tools/Dense.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Stores the grid name only; nothing in this file assigns `grid`, so it starts out empty. */
+VDBImageLoader::VDBImageLoader(const string &vdb_grid_name) : grid_name(vdb_grid_name)
+{
+}
+
+/* No manual cleanup: the members release themselves. */
+VDBImageLoader::~VDBImageLoader() {}
+
+bool VDBImageLoader::load_metadata(ImageMetaData &metadata)
+{
+#ifdef WITH_OPENVDB
+  if (!grid) {
+    return false;
+  }
+  /* Describe the dense image spanning the active voxels; `bbox` is kept for load_pixels(). */
+  bbox = grid->evalActiveVoxelBoundingBox();
+  if (bbox.empty()) {
+    return false;
+  }
+
+  /* Set dimensions. */
+  openvdb::Coord dim = bbox.dim();
+  openvdb::Coord min = bbox.min();
+  metadata.width = dim.x();
+  metadata.height = dim.y();
+  metadata.depth = dim.z();
+
+  /* Set data type: 1 channel for scalar grids, 3 for vector grids, failure for the rest. */
+  if (grid->isType<openvdb::FloatGrid>()) {
+    metadata.channels = 1;
+  }
+  else if (grid->isType<openvdb::Vec3fGrid>()) {
+    metadata.channels = 3;
+  }
+  else if (grid->isType<openvdb::BoolGrid>()) {
+    metadata.channels = 1;
+  }
+  else if (grid->isType<openvdb::DoubleGrid>()) {
+    metadata.channels = 1;
+  }
+  else if (grid->isType<openvdb::Int32Grid>()) {
+    metadata.channels = 1;
+  }
+  else if (grid->isType<openvdb::Int64Grid>()) {
+    metadata.channels = 1;
+  }
+  else if (grid->isType<openvdb::Vec3IGrid>()) {
+    metadata.channels = 3;
+  }
+  else if (grid->isType<openvdb::Vec3dGrid>()) {
+    metadata.channels = 3;
+  }
+  else if (grid->isType<openvdb::MaskGrid>()) {
+    metadata.channels = 1;
+  }
+  else {
+    return false;
+  }
+  /* All supported grids are exposed as float data; 3 channels use the 4 channel type. */
+  if (metadata.channels == 1) {
+    metadata.type = IMAGE_DATA_TYPE_FLOAT;
+  }
+  else {
+    metadata.type = IMAGE_DATA_TYPE_FLOAT4;
+  }
+
+  /* Set transform from object space to voxel index. */
+  openvdb::math::Mat4f grid_matrix = grid->transform().baseMap()->getAffineMap()->getMat4();
+  Transform index_to_object;
+  for (int col = 0; col < 4; col++) {
+    for (int row = 0; row < 3; row++) {
+      index_to_object[row][col] = (float)grid_matrix[col][row];
+    }
+  }
+  /* Texture to index space, built from the box minimum and the box dimensions. */
+  Transform texture_to_index = transform_translate(min.x(), min.y(), min.z()) *
+                               transform_scale(dim.x(), dim.y(), dim.z());
+
+  metadata.transform_3d = transform_inverse(index_to_object * texture_to_index);
+  metadata.use_transform_3d = true;
+
+  return true;
+#else
+  (void)metadata;
+  return false;
+#endif
+}
+
+#ifdef WITH_OPENVDB
+/* Copy `grid` into the dense `pixels` array covering `bbox`, provided the grid really is a
+ * GridType; OpenVDB converts the values to ValueType while copying.
+ * Returns false, without touching `pixels`, when the grid has another type. */
+template<typename GridType, typename ValueType>
+static bool vdb_copy_if_type(const openvdb::GridBase::ConstPtr &grid,
+                             const openvdb::CoordBBox &bbox,
+                             void *pixels)
+{
+  if (!grid->isType<GridType>()) {
+    return false;
+  }
+
+  openvdb::tools::Dense<ValueType, openvdb::tools::LayoutXYZ> dense(bbox, (ValueType *)pixels);
+  openvdb::tools::copyToDense(*openvdb::gridConstPtrCast<GridType>(grid), dense);
+  return true;
+}
+#endif
+
+/* Write the grid's voxels inside `bbox` to `pixels` as a dense array. The metadata, buffer
+ * size and alpha arguments are not needed for that and stay unnamed. */
+bool VDBImageLoader::load_pixels(const ImageMetaData &, void *pixels, const size_t, const bool)
+{
+#ifdef WITH_OPENVDB
+  /* Nothing can be copied without a grid (never assigned, or already released by cleanup())
+   * or without the bounding box that load_metadata() computes. Fail instead of dereferencing
+   * an empty grid pointer. */
+  if (!grid || bbox.empty()) {
+    return false;
+  }
+
+  /* Scalar grids are written as one float per voxel, vector grids as three. The first helper
+   * whose grid type matches does the copy; a grid of any other type is reported as a failure
+   * rather than as a successful load that wrote nothing. */
+  return vdb_copy_if_type<openvdb::FloatGrid, float>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::Vec3fGrid, openvdb::Vec3f>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::BoolGrid, float>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::DoubleGrid, float>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::Int32Grid, float>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::Int64Grid, float>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::Vec3IGrid, openvdb::Vec3f>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::Vec3dGrid, openvdb::Vec3f>(grid, bbox, pixels) ||
+         vdb_copy_if_type<openvdb::MaskGrid, float>(grid, bbox, pixels);
+#else
+  /* Built without OpenVDB: there is never a grid to read. */
+  (void)pixels;
+  return false;
+#endif
+}
+
+string VDBImageLoader::name() const
+{
+  return grid_name;
+}
+
+bool VDBImageLoader::equals(const ImageLoader &other) const
+{
+#ifdef WITH_OPENVDB
+  /* Same grid object means same image; `other` is downcast without a type check. */
+  return grid == static_cast<const VDBImageLoader &>(other).grid;
+#else
+  (void)other;
+  return true;
+#endif
+}
+
+void VDBImageLoader::cleanup()
+{
+#ifdef WITH_OPENVDB
+  /* Free OpenVDB grid memory as soon as we can. */
+  grid.reset();
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image_vdb.h b/intern/cycles/render/image_vdb.h
new file mode 100644
index 00000000000..7dec63b11e6
--- /dev/null
+++ b/intern/cycles/render/image_vdb.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __IMAGE_VDB__
+#define __IMAGE_VDB__
+
+#ifdef WITH_OPENVDB
+#  include <openvdb/openvdb.h>
+#endif
+
+#include "render/image.h"
+
+CCL_NAMESPACE_BEGIN
+
+class VDBImageLoader : public ImageLoader {
+ public:
+  VDBImageLoader(const string &grid_name);
+  ~VDBImageLoader();
+  /* Describe the dense image over the grid's active voxels; false without a usable grid. */
+  virtual bool load_metadata(ImageMetaData &metadata) override;
+  /* Copy the grid into the caller's `pixels` buffer; the other arguments are not used. */
+  virtual bool load_pixels(const ImageMetaData &metadata,
+                           void *pixels,
+                           const size_t pixels_size,
+                           const bool associate_alpha) override;
+  /* Returns the grid name given at construction. */
+  virtual string name() const override;
+  /* With OpenVDB: true when both loaders hold the same grid pointer. Without: always true. */
+  virtual bool equals(const ImageLoader &other) const override;
+  /* Drops this loader's reference to the grid. */
+  virtual void cleanup() override;
+
+ protected:
+  string grid_name;
+#ifdef WITH_OPENVDB
+  openvdb::GridBase::ConstPtr grid;
+  openvdb::CoordBBox bbox;
+#endif
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __IMAGE_VDB__ */
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 530c32106b7..2f9d088899e 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -14,17 +14,21 @@
  * limitations under the License.
  */
 
+#include "render/integrator.h"
 #include "device/device.h"
 #include "render/background.h"
-#include "render/integrator.h"
 #include "render/film.h"
+#include "render/jitter.h"
 #include "render/light.h"
 #include "render/scene.h"
 #include "render/shader.h"
 #include "render/sobol.h"
 
+#include "kernel/kernel_types.h"
+
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,7 +50,7 @@ NODE_DEFINE(Integrator)
   SOCKET_INT(ao_bounces, "AO Bounces", 0);
 
   SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024);
-  SOCKET_FLOAT(volume_step_size, "Volume Step Size", 0.1f);
+  SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
 
   SOCKET_BOOLEAN(caustics_reflective, "Reflective Caustics", true);
   SOCKET_BOOLEAN(caustics_refractive, "Refractive Caustics", true);
@@ -66,6 +70,9 @@ NODE_DEFINE(Integrator)
   SOCKET_INT(volume_samples, "Volume Samples", 1);
   SOCKET_INT(start_sample, "Start Sample", 0);
 
+  SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
+  SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
+
   SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
   SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
   SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
@@ -78,6 +85,7 @@ NODE_DEFINE(Integrator)
   static NodeEnum sampling_pattern_enum;
   sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL);
   sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ);
+  sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ);
   SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL);
 
   return type;
@@ -135,7 +143,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
   }
 
   kintegrator->volume_max_steps = volume_max_steps;
-  kintegrator->volume_step_size = volume_step_size;
+  kintegrator->volume_step_rate = volume_step_rate;
 
   kintegrator->caustics_reflective = caustics_reflective;
   kintegrator->caustics_refractive = caustics_refractive;
@@ -174,6 +182,22 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 
   kintegrator->sampling_pattern = sampling_pattern;
   kintegrator->aa_samples = aa_samples;
+  if (aa_samples > 0 && adaptive_min_samples == 0) {
+    kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
+    VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+            << kintegrator->adaptive_min_samples;
+  }
+  else {
+    kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
+  }
+  if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+    kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
+    VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
+            << kintegrator->adaptive_threshold;
+  }
+  else {
+    kintegrator->adaptive_threshold = adaptive_threshold;
+  }
 
   if (light_sampling_threshold > 0.0f) {
     kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
@@ -203,18 +227,34 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
   int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM;
   dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS);
 
-  uint *directions = dscene->sobol_directions.alloc(SOBOL_BITS * dimensions);
+  if (sampling_pattern == SAMPLING_PATTERN_SOBOL) {
+    uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
 
-  sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
+    sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
 
-  dscene->sobol_directions.copy_to_device();
+    dscene->sample_pattern_lut.copy_to_device();
+  }
+  else {
+    constexpr int sequence_size = NUM_PMJ_SAMPLES;
+    constexpr int num_sequences = NUM_PMJ_PATTERNS;
+    float2 *directions = (float2 *)dscene->sample_pattern_lut.alloc(sequence_size * num_sequences *
+                                                                    2);
+    TaskPool pool;
+    for (int j = 0; j < num_sequences; ++j) {
+      float2 *sequence = directions + j * sequence_size;
+      pool.push(
+          function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j));
+    }
+    pool.wait_work();
+    dscene->sample_pattern_lut.copy_to_device();
+  }
 
   need_update = false;
 }
 
 void Integrator::device_free(Device *, DeviceScene *dscene)
 {
-  dscene->sobol_directions.free();
+  dscene->sample_pattern_lut.free();
 }
 
 bool Integrator::modified(const Integrator &integrator)
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 32d84c27072..9804caebe6e 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -45,7 +45,7 @@ class Integrator : public Node {
   int ao_bounces;
 
   int volume_max_steps;
-  float volume_step_size;
+  float volume_step_rate;
 
   bool caustics_reflective;
   bool caustics_refractive;
@@ -75,6 +75,9 @@ class Integrator : public Node {
   bool sample_all_lights_indirect;
   float light_sampling_threshold;
 
+  int adaptive_min_samples;
+  float adaptive_threshold;
+
   enum Method {
     BRANCHED_PATH = 0,
     PATH = 1,
diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp
new file mode 100644
index 00000000000..fc47b0e8f0a
--- /dev/null
+++ b/intern/cycles/render/jitter.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This file is based on "Progressive Multi-Jittered Sample Sequences"
+ * by Per Christensen, Andrew Kensler and Charlie Kilpatrick.
+ * http://graphics.pixar.com/library/ProgressiveMultiJitteredSampling/paper.pdf
+ *
+ * Performance can be improved in the future by implementing the new
+ * algorithm from Matt Pharr in http://jcgt.org/published/0008/01/04/
+ * "Efficient Generation of Points that Satisfy Two-Dimensional Elementary Intervals"
+ */
+
+#include "render/jitter.h"
+
+#include <math.h>
+#include <vector>
+
+CCL_NAMESPACE_BEGIN
+
+static uint cmj_hash(uint i, uint p)
+{
+  i ^= p;
+  i ^= i >> 17;
+  i ^= i >> 10;
+  i *= 0xb36534e5;
+  i ^= i >> 12;
+  i ^= i >> 21;
+  i *= 0x93fc4795;
+  i ^= 0xdf6e307f;
+  i ^= i >> 17;
+  i *= 1 | p >> 18;
+  /* Index `i` mixed with key `p`; the `uint` multiplications above wrap around on overflow. */
+  return i;
+}
+
+static float cmj_randfloat(uint i, uint p)
+{
+  return cmj_hash(i, p) * (1.0f / 4294967808.0f);
+}
+
+class PMJ_Generator {
+ public:
+  static void generate_2D(float2 points[], int size, int rng_seed_in)
+  {
+    PMJ_Generator g(rng_seed_in);
+    points[0].x = g.rnd();
+    points[0].y = g.rnd();
+    int N = 1;
+    while (N < size) {
+      g.extend_sequence_even(points, N);
+      g.extend_sequence_odd(points, 2 * N);
+      N = 4 * N;
+    }
+  }
+  /* generate_2D() always runs on a plain PMJ_Generator; each round fills points up to 4 * N. */
+ protected:
+  PMJ_Generator(int rnd_seed_in) : num_samples(1), rnd_index(2), rnd_seed(rnd_seed_in)
+  {
+  }
+  /* Next pseudo-random number, from the hash of a running index and the seed. */
+  float rnd()
+  {
+    return cmj_randfloat(++rnd_index, rnd_seed);
+  }
+  /* Clear the 2N strata on each axis, then mark those containing the first N points. */
+  virtual void mark_occupied_strata(float2 points[], int N)
+  {
+    int NN = 2 * N;
+    for (int s = 0; s < NN; ++s) {
+      occupied1Dx[s] = occupied1Dy[s] = false;
+    }
+    for (int s = 0; s < N; ++s) {
+      int xstratum = (int)(NN * points[s].x);
+      int ystratum = (int)(NN * points[s].y);
+      occupied1Dx[xstratum] = true;
+      occupied1Dy[ystratum] = true;
+    }
+  }
+  /* Place a new point in sub-cell (i, j), retrying each axis until its 1D stratum is free. */
+  virtual void generate_sample_point(
+      float2 points[], float i, float j, float xhalf, float yhalf, int n, int N)
+  {
+    int NN = 2 * N;
+    float2 pt;
+    int xstratum, ystratum;
+    do {
+      pt.x = (i + 0.5f * (xhalf + rnd())) / n;
+      xstratum = (int)(NN * pt.x);
+    } while (occupied1Dx[xstratum]);
+    do {
+      pt.y = (j + 0.5f * (yhalf + rnd())) / n;
+      ystratum = (int)(NN * pt.y);
+    } while (occupied1Dy[ystratum]);
+    occupied1Dx[xstratum] = true;
+    occupied1Dy[ystratum] = true;
+    points[num_samples] = pt;
+    ++num_samples;
+  }
+  /* Add N points, one per existing point, each with both halves of its cell flipped. */
+  void extend_sequence_even(float2 points[], int N)
+  {
+    int n = (int)sqrtf(N);
+    occupied1Dx.resize(2 * N);
+    occupied1Dy.resize(2 * N);
+    mark_occupied_strata(points, N);
+    for (int s = 0; s < N; ++s) {
+      float2 oldpt = points[s];
+      float i = floorf(n * oldpt.x);
+      float j = floorf(n * oldpt.y);
+      float xhalf = floorf(2.0f * (n * oldpt.x - i));
+      float yhalf = floorf(2.0f * (n * oldpt.y - j));
+      xhalf = 1.0f - xhalf;
+      yhalf = 1.0f - yhalf;
+      generate_sample_point(points, i, j, xhalf, yhalf, n, N);
+    }
+  }
+  /* Add N points: per old cell one with a randomly chosen half flipped, then its mirror. */
+  void extend_sequence_odd(float2 points[], int N)
+  {
+    int n = (int)sqrtf(N / 2);
+    occupied1Dx.resize(2 * N);
+    occupied1Dy.resize(2 * N);
+    mark_occupied_strata(points, N);
+    std::vector<float> xhalves(N / 2);
+    std::vector<float> yhalves(N / 2);
+    for (int s = 0; s < N / 2; ++s) {
+      float2 oldpt = points[s];
+      float i = floorf(n * oldpt.x);
+      float j = floorf(n * oldpt.y);
+      float xhalf = floorf(2.0f * (n * oldpt.x - i));
+      float yhalf = floorf(2.0f * (n * oldpt.y - j));
+      if (rnd() > 0.5f) {
+        xhalf = 1.0f - xhalf;
+      }
+      else {
+        yhalf = 1.0f - yhalf;
+      }
+      xhalves[s] = xhalf;
+      yhalves[s] = yhalf;
+      generate_sample_point(points, i, j, xhalf, yhalf, n, N);
+    }
+    for (int s = 0; s < N / 2; ++s) {
+      float2 oldpt = points[s];
+      float i = floorf(n * oldpt.x);
+      float j = floorf(n * oldpt.y);
+      float xhalf = 1.0f - xhalves[s];
+      float yhalf = 1.0f - yhalves[s];
+      generate_sample_point(points, i, j, xhalf, yhalf, n, N);
+    }
+  }
+  /* 1D occupancy per axis, number of points written so far, and random number state. */
+  std::vector<bool> occupied1Dx, occupied1Dy;
+  int num_samples;
+  int rnd_index, rnd_seed;
+};
+
+class PMJ02_Generator : public PMJ_Generator {
+ public:
+  /* Same driver loop as PMJ_Generator::generate_2D(), but run on a PMJ02_Generator: the base
+   * version always builds a plain PMJ_Generator, so the overrides below were never called. */
+  static void generate_2D(float2 points[], int size, int rng_seed_in)
+  {
+    PMJ02_Generator g(rng_seed_in);
+    points[0].x = g.rnd();
+    points[0].y = g.rnd();
+    for (int N = 1; N < size; N *= 4) {
+      g.extend_sequence_even(points, N);
+      g.extend_sequence_odd(points, 2 * N);
+    }
+  }
+
+ protected:
+  PMJ02_Generator(int rnd_seed_in) : PMJ_Generator(rnd_seed_in) {}
+
+  /* Draw candidates in the requested sub-cell until one lands in no occupied stratum. */
+  void generate_sample_point(
+      float2 points[], float i, float j, float xhalf, float yhalf, int n, int N) override
+  {
+    int NN = 2 * N;
+    float2 pt;
+    do {
+      pt.x = (i + 0.5f * (xhalf + rnd())) / n;
+      pt.y = (j + 0.5f * (yhalf + rnd())) / n;
+    } while (is_occupied(pt, NN));
+    mark_occupied_strata1(pt, NN);
+    points[num_samples++] = pt;
+  }
+
+  /* Clear the table of each stratum shape to 2N free strata, then mark the first N points. */
+  void mark_occupied_strata(float2 points[], int N) override
+  {
+    int NN = 2 * N;
+    int num_shapes = (int)log2f(NN) + 1;
+    occupiedStrata.resize(num_shapes);
+    for (int shape = 0; shape < num_shapes; ++shape) {
+      occupiedStrata[shape].assign(NN, false);
+    }
+    for (int s = 0; s < N; ++s) {
+      mark_occupied_strata1(points[s], NN);
+    }
+  }
+
+  /* Mark the stratum holding `pt` in every shape, x divisions halving as y divisions double. */
+  void mark_occupied_strata1(float2 pt, int NN)
+  {
+    for (int shape = 0, xdivs = NN, ydivs = 1; xdivs > 0; ++shape, xdivs /= 2, ydivs *= 2) {
+      size_t index = (int)(ydivs * pt.y) * xdivs + (int)(xdivs * pt.x);
+      assert(index < NN);
+      occupiedStrata[shape][index] = true;
+    }
+  }
+
+  /* True when `pt` lies in an already occupied stratum of at least one shape. */
+  bool is_occupied(float2 pt, int NN)
+  {
+    for (int shape = 0, xdivs = NN, ydivs = 1; xdivs > 0; ++shape, xdivs /= 2, ydivs *= 2) {
+      size_t index = (int)(ydivs * pt.y) * xdivs + (int)(xdivs * pt.x);
+      assert(index < NN);
+      if (occupiedStrata[shape][index]) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  std::vector<std::vector<bool>> occupiedStrata;
+};
+
+static void shuffle(float2 points[], int size, int rng_seed)
+{
+  /* Offset samples by 1.0 for faster scrambling in kernel_random.h */
+  for (int s = 0; s < size; s++) {
+    points[s].x += 1.0f;
+    points[s].y += 1.0f;
+  }
+
+  /* Seed 0 keeps the generated order. */
+  if (rng_seed == 0) {
+    return;
+  }
+
+  constexpr int odd[8] = {0, 1, 4, 5, 10, 11, 14, 15};
+  constexpr int even[8] = {2, 3, 6, 7, 8, 9, 12, 13};
+
+  /* Permute one 8 entry index set inside a group of 16: slot xx swaps with a slot >= xx. */
+  int rng_index = 0;
+  auto permute = [&](const int *set, float2 *block) {
+    for (int xx = 0; xx < 8; ++xx) {
+      const int other = (int)(cmj_randfloat(++rng_index, rng_seed) * (8.0f - xx) + xx);
+      const float2 tmp = block[set[other]];
+      block[set[other]] = block[set[xx]];
+      block[set[xx]] = tmp;
+    }
+  };
+  for (int yy = 0; yy < size / 16; ++yy) {
+    permute(odd, points + yy * 16);
+    permute(even, points + yy * 16);
+  }
+}
+
+void progressive_multi_jitter_generate_2D(float2 points[], int size, int rng_seed)
+{
+  PMJ_Generator::generate_2D(points, size, rng_seed);
+  shuffle(points, size, rng_seed);
+}
+
+void progressive_multi_jitter_02_generate_2D(float2 points[], int size, int rng_seed)
+{
+  PMJ02_Generator::generate_2D(points, size, rng_seed);
+  shuffle(points, size, rng_seed);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/jitter.h b/intern/cycles/render/jitter.h
new file mode 100644
index 00000000000..ed34c7a4f4d
--- /dev/null
+++ b/intern/cycles/render/jitter.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __JITTER_H__
+#define __JITTER_H__
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Both functions run their sample generator on points[] and then post-process the result:
+ * every coordinate of the first `size` entries is offset by 1.0, and when rng_seed is non-zero
+ * the samples are additionally shuffled within each complete group of 16.
+ * NOTE(review): the generators presumably write exactly `size` entries - confirm in jitter.cpp. */
+void progressive_multi_jitter_generate_2D(float2 points[], int size, int rng_seed);
+void progressive_multi_jitter_02_generate_2D(float2 points[], int size, int rng_seed);
+
+CCL_NAMESPACE_END
+
+#endif /* __JITTER_H__ */
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 664217d6f26..9adf8e5341a 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "render/background.h"
+#include "render/light.h"
 #include "device/device.h"
-#include "render/integrator.h"
+#include "render/background.h"
 #include "render/film.h"
 #include "render/graph.h"
-#include "render/light.h"
+#include "render/integrator.h"
 #include "render/mesh.h"
 #include "render/nodes.h"
 #include "render/object.h"
@@ -28,9 +28,9 @@
 
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
+#include "util/util_logging.h"
 #include "util/util_path.h"
 #include "util/util_progress.h"
-#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -232,7 +232,10 @@ void LightManager::disable_ineffective_light(Scene *scene)
 
 bool LightManager::object_usable_as_light(Object *object)
 {
-  Mesh *mesh = object->mesh;
+  Geometry *geom = object->geometry;
+  if (geom->type != Geometry::MESH) {
+    return false;
+  }
   /* Skip objects with NaNs */
   if (!object->bounds.valid()) {
     return false;
@@ -243,10 +246,10 @@ bool LightManager::object_usable_as_light(Object *object)
   }
   /* Skip if we have no emission shaders. */
   /* TODO(sergey): Ideally we want to avoid such duplicated loop, since it'll
-   * iterate all mesh shaders twice (when counting and when calculating
+   * iterate all geometry shaders twice (when counting and when calculating
    * triangle area.
    */
-  foreach (const Shader *shader, mesh->used_shaders) {
+  foreach (const Shader *shader, geom->used_shaders) {
     if (shader->use_mis && shader->has_surface_emission) {
       return true;
     }
@@ -285,8 +288,9 @@ void LightManager::device_update_distribution(Device *,
     if (!object_usable_as_light(object)) {
       continue;
     }
+
     /* Count triangles. */
-    Mesh *mesh = object->mesh;
+    Mesh *mesh = static_cast<Mesh *>(object->geometry);
     size_t mesh_num_triangles = mesh->num_triangles();
     for (size_t i = 0; i < mesh_num_triangles; i++) {
       int shader_index = mesh->shader[i];
@@ -320,7 +324,7 @@ void LightManager::device_update_distribution(Device *,
       continue;
     }
     /* Sum area. */
-    Mesh *mesh = object->mesh;
+    Mesh *mesh = static_cast<Mesh *>(object->geometry);
     bool transform_applied = mesh->transform_applied;
     Transform tfm = object->tfm;
     int object_id = j;
@@ -352,7 +356,7 @@ void LightManager::device_update_distribution(Device *,
 
       if (shader->use_mis && shader->has_surface_emission) {
         distribution[offset].totarea = totarea;
-        distribution[offset].prim = i + mesh->tri_offset;
+        distribution[offset].prim = i + mesh->prim_offset;
         distribution[offset].mesh_light.shader_flag = shader_flag;
         distribution[offset].mesh_light.object_id = object_id;
         offset++;
@@ -573,8 +577,8 @@ void LightManager::device_update_background(Device *device,
       if (node->type == EnvironmentTextureNode::node_type) {
         EnvironmentTextureNode *env = (EnvironmentTextureNode *)node;
         ImageMetaData metadata;
-        if (env->image_manager && !env->slots.empty() &&
-            env->image_manager->get_image_metadata(env->slots[0], metadata)) {
+        if (!env->handle.empty()) {
+          metadata = env->handle.metadata(); /* Reuse the variable declared above. */
           res.x = max(res.x, metadata.width);
           res.y = max(res.y, metadata.height);
         }
diff --git a/intern/cycles/render/merge.cpp b/intern/cycles/render/merge.cpp
index cac07e59fe3..3ea3952b96c 100644
--- a/intern/cycles/render/merge.cpp
+++ b/intern/cycles/render/merge.cpp
@@ -22,8 +22,8 @@
 #include "util/util_time.h"
 #include "util/util_unique_ptr.h"
 
-#include <OpenImageIO/imageio.h>
 #include <OpenImageIO/filesystem.h>
+#include <OpenImageIO/imageio.h>
 
 OIIO_NAMESPACE_USING
 
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index d9e6d998ebd..c262d770331 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -17,32 +17,22 @@
 #include "bvh/bvh.h"
 #include "bvh/bvh_build.h"
 
-#include "render/camera.h"
-#include "render/curves.h"
 #include "device/device.h"
+
 #include "render/graph.h"
-#include "render/shader.h"
-#include "render/light.h"
+#include "render/hair.h"
 #include "render/mesh.h"
-#include "render/nodes.h"
 #include "render/object.h"
 #include "render/scene.h"
-#include "render/stats.h"
-
-#include "kernel/osl/osl_globals.h"
 
-#include "subd/subd_split.h"
 #include "subd/subd_patch_table.h"
+#include "subd/subd_split.h"
 
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_progress.h"
 #include "util/util_set.h"
 
-#ifdef WITH_EMBREE
-#  include "bvh/bvh_embree.h"
-#endif
-
 CCL_NAMESPACE_BEGIN
 
 /* Triangle */
@@ -120,263 +110,6 @@ bool Mesh::Triangle::valid(const float3 *verts) const
   return isfinite3_safe(verts[v[0]]) && isfinite3_safe(verts[v[1]]) && isfinite3_safe(verts[v[2]]);
 }
 
-/* Curve */
-
-void Mesh::Curve::bounds_grow(const int k,
-                              const float3 *curve_keys,
-                              const float *curve_radius,
-                              BoundBox &bounds) const
-{
-  float3 P[4];
-
-  P[0] = curve_keys[max(first_key + k - 1, first_key)];
-  P[1] = curve_keys[first_key + k];
-  P[2] = curve_keys[first_key + k + 1];
-  P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
-
-  float3 lower;
-  float3 upper;
-
-  curvebounds(&lower.x, &upper.x, P, 0);
-  curvebounds(&lower.y, &upper.y, P, 1);
-  curvebounds(&lower.z, &upper.z, P, 2);
-
-  float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
-
-  bounds.grow(lower, mr);
-  bounds.grow(upper, mr);
-}
-
-void Mesh::Curve::bounds_grow(const int k,
-                              const float3 *curve_keys,
-                              const float *curve_radius,
-                              const Transform &aligned_space,
-                              BoundBox &bounds) const
-{
-  float3 P[4];
-
-  P[0] = curve_keys[max(first_key + k - 1, first_key)];
-  P[1] = curve_keys[first_key + k];
-  P[2] = curve_keys[first_key + k + 1];
-  P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)];
-
-  P[0] = transform_point(&aligned_space, P[0]);
-  P[1] = transform_point(&aligned_space, P[1]);
-  P[2] = transform_point(&aligned_space, P[2]);
-  P[3] = transform_point(&aligned_space, P[3]);
-
-  float3 lower;
-  float3 upper;
-
-  curvebounds(&lower.x, &upper.x, P, 0);
-  curvebounds(&lower.y, &upper.y, P, 1);
-  curvebounds(&lower.z, &upper.z, P, 2);
-
-  float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]);
-
-  bounds.grow(lower, mr);
-  bounds.grow(upper, mr);
-}
-
-void Mesh::Curve::bounds_grow(float4 keys[4], BoundBox &bounds) const
-{
-  float3 P[4] = {
-      float4_to_float3(keys[0]),
-      float4_to_float3(keys[1]),
-      float4_to_float3(keys[2]),
-      float4_to_float3(keys[3]),
-  };
-
-  float3 lower;
-  float3 upper;
-
-  curvebounds(&lower.x, &upper.x, P, 0);
-  curvebounds(&lower.y, &upper.y, P, 1);
-  curvebounds(&lower.z, &upper.z, P, 2);
-
-  float mr = max(keys[1].w, keys[2].w);
-
-  bounds.grow(lower, mr);
-  bounds.grow(upper, mr);
-}
-
-void Mesh::Curve::motion_keys(const float3 *curve_keys,
-                              const float *curve_radius,
-                              const float3 *key_steps,
-                              size_t num_curve_keys,
-                              size_t num_steps,
-                              float time,
-                              size_t k0,
-                              size_t k1,
-                              float4 r_keys[2]) const
-{
-  /* Figure out which steps we need to fetch and their interpolation factor. */
-  const size_t max_step = num_steps - 1;
-  const size_t step = min((int)(time * max_step), max_step - 1);
-  const float t = time * max_step - step;
-  /* Fetch vertex coordinates. */
-  float4 curr_keys[2];
-  float4 next_keys[2];
-  keys_for_step(
-      curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step, k0, k1, curr_keys);
-  keys_for_step(
-      curve_keys, curve_radius, key_steps, num_curve_keys, num_steps, step + 1, k0, k1, next_keys);
-  /* Interpolate between steps. */
-  r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
-  r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
-}
-
-void Mesh::Curve::cardinal_motion_keys(const float3 *curve_keys,
-                                       const float *curve_radius,
-                                       const float3 *key_steps,
-                                       size_t num_curve_keys,
-                                       size_t num_steps,
-                                       float time,
-                                       size_t k0,
-                                       size_t k1,
-                                       size_t k2,
-                                       size_t k3,
-                                       float4 r_keys[4]) const
-{
-  /* Figure out which steps we need to fetch and their interpolation factor. */
-  const size_t max_step = num_steps - 1;
-  const size_t step = min((int)(time * max_step), max_step - 1);
-  const float t = time * max_step - step;
-  /* Fetch vertex coordinates. */
-  float4 curr_keys[4];
-  float4 next_keys[4];
-  cardinal_keys_for_step(curve_keys,
-                         curve_radius,
-                         key_steps,
-                         num_curve_keys,
-                         num_steps,
-                         step,
-                         k0,
-                         k1,
-                         k2,
-                         k3,
-                         curr_keys);
-  cardinal_keys_for_step(curve_keys,
-                         curve_radius,
-                         key_steps,
-                         num_curve_keys,
-                         num_steps,
-                         step + 1,
-                         k0,
-                         k1,
-                         k2,
-                         k3,
-                         next_keys);
-  /* Interpolate between steps. */
-  r_keys[0] = (1.0f - t) * curr_keys[0] + t * next_keys[0];
-  r_keys[1] = (1.0f - t) * curr_keys[1] + t * next_keys[1];
-  r_keys[2] = (1.0f - t) * curr_keys[2] + t * next_keys[2];
-  r_keys[3] = (1.0f - t) * curr_keys[3] + t * next_keys[3];
-}
-
-void Mesh::Curve::keys_for_step(const float3 *curve_keys,
-                                const float *curve_radius,
-                                const float3 *key_steps,
-                                size_t num_curve_keys,
-                                size_t num_steps,
-                                size_t step,
-                                size_t k0,
-                                size_t k1,
-                                float4 r_keys[2]) const
-{
-  k0 = max(k0, 0);
-  k1 = min(k1, num_keys - 1);
-  const size_t center_step = ((num_steps - 1) / 2);
-  if (step == center_step) {
-    /* Center step: regular key location. */
-    /* TODO(sergey): Consider adding make_float4(float3, float)
-     * function.
-     */
-    r_keys[0] = make_float4(curve_keys[first_key + k0].x,
-                            curve_keys[first_key + k0].y,
-                            curve_keys[first_key + k0].z,
-                            curve_radius[first_key + k0]);
-    r_keys[1] = make_float4(curve_keys[first_key + k1].x,
-                            curve_keys[first_key + k1].y,
-                            curve_keys[first_key + k1].z,
-                            curve_radius[first_key + k1]);
-  }
-  else {
-    /* Center step is not stored in this array. */
-    if (step > center_step) {
-      step--;
-    }
-    const size_t offset = first_key + step * num_curve_keys;
-    r_keys[0] = make_float4(key_steps[offset + k0].x,
-                            key_steps[offset + k0].y,
-                            key_steps[offset + k0].z,
-                            curve_radius[first_key + k0]);
-    r_keys[1] = make_float4(key_steps[offset + k1].x,
-                            key_steps[offset + k1].y,
-                            key_steps[offset + k1].z,
-                            curve_radius[first_key + k1]);
-  }
-}
-
-void Mesh::Curve::cardinal_keys_for_step(const float3 *curve_keys,
-                                         const float *curve_radius,
-                                         const float3 *key_steps,
-                                         size_t num_curve_keys,
-                                         size_t num_steps,
-                                         size_t step,
-                                         size_t k0,
-                                         size_t k1,
-                                         size_t k2,
-                                         size_t k3,
-                                         float4 r_keys[4]) const
-{
-  k0 = max(k0, 0);
-  k3 = min(k3, num_keys - 1);
-  const size_t center_step = ((num_steps - 1) / 2);
-  if (step == center_step) {
-    /* Center step: regular key location. */
-    r_keys[0] = make_float4(curve_keys[first_key + k0].x,
-                            curve_keys[first_key + k0].y,
-                            curve_keys[first_key + k0].z,
-                            curve_radius[first_key + k0]);
-    r_keys[1] = make_float4(curve_keys[first_key + k1].x,
-                            curve_keys[first_key + k1].y,
-                            curve_keys[first_key + k1].z,
-                            curve_radius[first_key + k1]);
-    r_keys[2] = make_float4(curve_keys[first_key + k2].x,
-                            curve_keys[first_key + k2].y,
-                            curve_keys[first_key + k2].z,
-                            curve_radius[first_key + k2]);
-    r_keys[3] = make_float4(curve_keys[first_key + k3].x,
-                            curve_keys[first_key + k3].y,
-                            curve_keys[first_key + k3].z,
-                            curve_radius[first_key + k3]);
-  }
-  else {
-    /* Center step is not stored in this array. */
-    if (step > center_step) {
-      step--;
-    }
-    const size_t offset = first_key + step * num_curve_keys;
-    r_keys[0] = make_float4(key_steps[offset + k0].x,
-                            key_steps[offset + k0].y,
-                            key_steps[offset + k0].z,
-                            curve_radius[first_key + k0]);
-    r_keys[1] = make_float4(key_steps[offset + k1].x,
-                            key_steps[offset + k1].y,
-                            key_steps[offset + k1].z,
-                            curve_radius[first_key + k1]);
-    r_keys[2] = make_float4(key_steps[offset + k2].x,
-                            key_steps[offset + k2].y,
-                            key_steps[offset + k2].z,
-                            curve_radius[first_key + k2]);
-    r_keys[3] = make_float4(key_steps[offset + k3].x,
-                            key_steps[offset + k3].y,
-                            key_steps[offset + k3].z,
-                            curve_radius[first_key + k3]);
-  }
-}
-
 /* SubdFace */
 
 float3 Mesh::SubdFace::normal(const Mesh *mesh) const
@@ -392,60 +125,29 @@ float3 Mesh::SubdFace::normal(const Mesh *mesh) const
 
 NODE_DEFINE(Mesh)
 {
-  NodeType *type = NodeType::add("mesh", create);
-
-  SOCKET_UINT(motion_steps, "Motion Steps", 3);
-  SOCKET_BOOLEAN(use_motion_blur, "Use Motion Blur", false);
+  NodeType *type = NodeType::add("mesh", create, NodeType::NONE, Geometry::node_base_type);
 
   SOCKET_INT_ARRAY(triangles, "Triangles", array<int>());
   SOCKET_POINT_ARRAY(verts, "Vertices", array<float3>());
   SOCKET_INT_ARRAY(shader, "Shader", array<int>());
   SOCKET_BOOLEAN_ARRAY(smooth, "Smooth", array<bool>());
 
-  SOCKET_POINT_ARRAY(curve_keys, "Curve Keys", array<float3>());
-  SOCKET_FLOAT_ARRAY(curve_radius, "Curve Radius", array<float>());
-  SOCKET_INT_ARRAY(curve_first_key, "Curve First Key", array<int>());
-  SOCKET_INT_ARRAY(curve_shader, "Curve Shader", array<int>());
-
   return type;
 }
 
-Mesh::Mesh() : Node(node_type)
+Mesh::Mesh() : Geometry(node_type, Geometry::MESH), subd_attributes(this, ATTR_PRIM_SUBD)
 {
-  need_update = true;
-  need_update_rebuild = false;
-  transform_applied = false;
-  transform_negative_scaled = false;
-  transform_normal = transform_identity();
-  bounds = BoundBox::empty;
-
-  bvh = NULL;
-
-  tri_offset = 0;
   vert_offset = 0;
 
-  curve_offset = 0;
-  curvekey_offset = 0;
-
   patch_offset = 0;
   face_offset = 0;
   corner_offset = 0;
 
-  attr_map_offset = 0;
-
-  prim_offset = 0;
-
   num_subd_verts = 0;
 
-  attributes.triangle_mesh = this;
-  curve_attributes.curve_mesh = this;
-  subd_attributes.subd_mesh = this;
-
-  geometry_flags = GEOMETRY_NONE;
-
-  volume_isovalue = 0.001f;
-  has_volume = false;
-  has_surface_bssrdf = false;
+  volume_clipping = 0.001f;
+  volume_step_size = 0.0f;
+  volume_object_space = false;
 
   num_ngons = 0;
 
@@ -457,7 +159,6 @@ Mesh::Mesh() : Node(node_type)
 
 Mesh::~Mesh()
 {
-  delete bvh;
   delete patch_table;
   delete subd_params;
 }
@@ -493,26 +194,6 @@ void Mesh::reserve_mesh(int numverts, int numtris)
   attributes.resize(true);
 }
 
-void Mesh::resize_curves(int numcurves, int numkeys)
-{
-  curve_keys.resize(numkeys);
-  curve_radius.resize(numkeys);
-  curve_first_key.resize(numcurves);
-  curve_shader.resize(numcurves);
-
-  curve_attributes.resize();
-}
-
-void Mesh::reserve_curves(int numcurves, int numkeys)
-{
-  curve_keys.reserve(numkeys);
-  curve_radius.reserve(numkeys);
-  curve_first_key.reserve(numcurves);
-  curve_shader.reserve(numcurves);
-
-  curve_attributes.resize(true);
-}
-
 void Mesh::resize_subd_faces(int numfaces, int num_ngons_, int numcorners)
 {
   subd_faces.resize(numfaces);
@@ -533,6 +214,8 @@ void Mesh::reserve_subd_faces(int numfaces, int num_ngons_, int numcorners)
 
 void Mesh::clear(bool preserve_voxel_data)
 {
+  Geometry::clear();
+
   /* clear all verts and triangles */
   verts.clear();
   triangles.clear();
@@ -542,11 +225,6 @@ void Mesh::clear(bool preserve_voxel_data)
   triangle_patch.clear();
   vert_patch_uv.clear();
 
-  curve_keys.clear();
-  curve_radius.clear();
-  curve_first_key.clear();
-  curve_shader.clear();
-
   subd_faces.clear();
   subd_face_corners.clear();
 
@@ -554,27 +232,21 @@ void Mesh::clear(bool preserve_voxel_data)
 
   subd_creases.clear();
 
-  curve_attributes.clear();
   subd_attributes.clear();
   attributes.clear(preserve_voxel_data);
 
-  used_shaders.clear();
-
   vert_to_stitching_key_map.clear();
   vert_stitching_map.clear();
 
-  if (!preserve_voxel_data) {
-    geometry_flags = GEOMETRY_NONE;
-  }
-
-  transform_applied = false;
-  transform_negative_scaled = false;
-  transform_normal = transform_identity();
-
   delete patch_table;
   patch_table = NULL;
 }
 
+void Mesh::clear()
+{
+  clear(false); /* Do not preserve voxel data. */
+}
+
 void Mesh::add_vertex(float3 P)
 {
   verts.push_back_reserved(P);
@@ -606,18 +278,6 @@ void Mesh::add_triangle(int v0, int v1, int v2, int shader_, bool smooth_)
   }
 }
 
-void Mesh::add_curve_key(float3 co, float radius)
-{
-  curve_keys.push_back_reserved(co);
-  curve_radius.push_back_reserved(radius);
-}
-
-void Mesh::add_curve(int first_key, int shader)
-{
-  curve_first_key.push_back_reserved(first_key);
-  curve_shader.push_back_reserved(shader);
-}
-
 void Mesh::add_subd_face(int *corners, int num_corners, int shader_, bool smooth_)
 {
   int start_corner = subd_face_corners.size();
@@ -637,47 +297,41 @@ void Mesh::add_subd_face(int *corners, int num_corners, int shader_, bool smooth
   subd_faces.push_back_reserved(face);
 }
 
-static void get_uv_tiles_from_attribute(Attribute *attr, int num, unordered_set<int> &tiles)
+void Mesh::copy_center_to_motion_step(const int motion_step)
 {
-  if (attr == NULL) {
-    return;
-  }
+  Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
 
-  const float2 *uv = attr->data_float2();
-  for (int i = 0; i < num; i++, uv++) {
-    float u = uv->x, v = uv->y;
-    int x = (int)u, y = (int)v;
+  if (attr_mP) {
+    Attribute *attr_mN = attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+    Attribute *attr_N = attributes.find(ATTR_STD_VERTEX_NORMAL);
+    float3 *P = verts.data();
+    float3 *N = (attr_N) ? attr_N->data_float3() : NULL;
+    size_t numverts = verts.size();
 
-    if (x < 0 || y < 0 || x >= 10) {
-      continue;
-    }
-
-    /* Be conservative in corners - precisely touching the right or upper edge of a tile
-     * should not load its right/upper neighbor as well. */
-    if (x > 0 && (u < x + 1e-6f)) {
-      x--;
-    }
-    if (y > 0 && (v < y + 1e-6f)) {
-      y--;
-    }
-
-    tiles.insert(1001 + 10 * y + x);
+    memcpy(attr_mP->data_float3() + motion_step * numverts, P, sizeof(float3) * numverts);
+    if (attr_mN && N) /* N is NULL when there is no vertex normal attribute to copy from. */
+      memcpy(attr_mN->data_float3() + motion_step * numverts, N, sizeof(float3) * numverts);
   }
 }
 
 void Mesh::get_uv_tiles(ustring map, unordered_set<int> &tiles)
 {
+  /* Find the UV layer in both attribute sets; an empty name selects ATTR_STD_UV. */
+  Attribute *attr, *subd_attr;
   if (map.empty()) {
-    get_uv_tiles_from_attribute(attributes.find(ATTR_STD_UV), num_triangles() * 3, tiles);
-    get_uv_tiles_from_attribute(
-        subd_attributes.find(ATTR_STD_UV), subd_face_corners.size() + num_ngons, tiles);
-    get_uv_tiles_from_attribute(curve_attributes.find(ATTR_STD_UV), num_curves(), tiles);
+    attr = attributes.find(ATTR_STD_UV);
+    subd_attr = subd_attributes.find(ATTR_STD_UV);
   }
   else {
-    get_uv_tiles_from_attribute(attributes.find(map), num_triangles() * 3, tiles);
-    get_uv_tiles_from_attribute(
-        subd_attributes.find(map), subd_face_corners.size() + num_ngons, tiles);
-    get_uv_tiles_from_attribute(curve_attributes.find(map), num_curves(), tiles);
+    attr = attributes.find(map);
+    subd_attr = subd_attributes.find(map);
+  }
+
+  if (attr) {
+    attr->get_uv_tiles(this, ATTR_PRIM_GEOMETRY, tiles);
+  }
+  if (subd_attr) {
+    subd_attr->get_uv_tiles(this, ATTR_PRIM_SUBD, tiles);
   }
 }
 
@@ -685,15 +339,11 @@ void Mesh::compute_bounds()
 {
   BoundBox bnds = BoundBox::empty;
   size_t verts_size = verts.size();
-  size_t curve_keys_size = curve_keys.size();
 
-  if (verts_size + curve_keys_size > 0) {
+  if (verts_size > 0) {
     for (size_t i = 0; i < verts_size; i++)
       bnds.grow(verts[i]);
 
-    for (size_t i = 0; i < curve_keys_size; i++)
-      bnds.grow(curve_keys[i], curve_radius[i]);
-
     Attribute *attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
     if (use_motion_blur && attr) {
       size_t steps_size = verts.size() * (motion_steps - 1);
@@ -703,15 +353,6 @@ void Mesh::compute_bounds()
         bnds.grow(vert_steps[i]);
     }
 
-    Attribute *curve_attr = curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-    if (use_motion_blur && curve_attr) {
-      size_t steps_size = curve_keys.size() * (motion_steps - 1);
-      float3 *key_steps = curve_attr->data_float3();
-
-      for (size_t i = 0; i < steps_size; i++)
-        bnds.grow(key_steps[i]);
-    }
-
     if (!bnds.valid()) {
       bnds = BoundBox::empty;
 
@@ -719,9 +360,6 @@ void Mesh::compute_bounds()
       for (size_t i = 0; i < verts_size; i++)
         bnds.grow_safe(verts[i]);
 
-      for (size_t i = 0; i < curve_keys_size; i++)
-        bnds.grow_safe(curve_keys[i], curve_radius[i]);
-
       if (use_motion_blur && attr) {
         size_t steps_size = verts.size() * (motion_steps - 1);
         float3 *vert_steps = attr->data_float3();
@@ -729,14 +367,6 @@ void Mesh::compute_bounds()
         for (size_t i = 0; i < steps_size; i++)
           bnds.grow_safe(vert_steps[i]);
       }
-
-      if (use_motion_blur && curve_attr) {
-        size_t steps_size = curve_keys.size() * (motion_steps - 1);
-        float3 *key_steps = curve_attr->data_float3();
-
-        for (size_t i = 0; i < steps_size; i++)
-          bnds.grow_safe(key_steps[i]);
-      }
     }
   }
 
@@ -748,6 +378,38 @@ void Mesh::compute_bounds()
   bounds = bnds;
 }
 
+/* Transform vertex positions by tfm, optionally including motion positions and normals. */
+void Mesh::apply_transform(const Transform &tfm, const bool apply_to_motion)
+{
+  transform_normal = transform_transposed_inverse(tfm);
+
+  const size_t num_verts = verts.size();
+  for (size_t v = 0; v < num_verts; v++) {
+    verts[v] = transform_point(&tfm, verts[v]);
+  }
+  if (!apply_to_motion) {
+    return;
+  }
+
+  /* Both motion attributes are walked over verts.size() * (motion_steps - 1) entries. */
+  const size_t num_motion = num_verts * (motion_steps - 1);
+  Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  if (attr_mP) {
+    float3 *motion_P = attr_mP->data_float3();
+    for (size_t s = 0; s < num_motion; s++) {
+      motion_P[s] = transform_point(&tfm, motion_P[s]);
+    }
+  }
+  Attribute *attr_mN = attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+  if (attr_mN) {
+    /* Normals use the transposed inverse stored above and are re-normalized. */
+    float3 *motion_N = attr_mN->data_float3();
+    for (size_t s = 0; s < num_motion; s++) {
+      motion_N[s] = normalize(transform_direction(&transform_normal, motion_N[s]));
+    }
+  }
+}
+
 void Mesh::add_face_normals()
 {
   /* don't compute if already there */
@@ -885,8 +547,7 @@ void Mesh::add_undisplaced()
   float3 *data = attr->data_float3();
 
   /* copy verts */
-  size_t size = attr->buffer_size(
-      this, (subdivision_type == SUBDIVISION_NONE) ? ATTR_PRIM_TRIANGLE : ATTR_PRIM_SUBD);
+  size_t size = attr->buffer_size(this, attrs.prim);
 
   /* Center points for ngons aren't stored in Mesh::verts but are included in size since they will
    * be calculated later, we subtract them from size here so we don't have an overflow while
@@ -975,39 +636,6 @@ void Mesh::pack_verts(const vector<uint> &tri_prim_index,
   }
 }
 
-void Mesh::pack_curves(Scene *scene,
-                       float4 *curve_key_co,
-                       float4 *curve_data,
-                       size_t curvekey_offset)
-{
-  size_t curve_keys_size = curve_keys.size();
-
-  /* pack curve keys */
-  if (curve_keys_size) {
-    float3 *keys_ptr = curve_keys.data();
-    float *radius_ptr = curve_radius.data();
-
-    for (size_t i = 0; i < curve_keys_size; i++)
-      curve_key_co[i] = make_float4(keys_ptr[i].x, keys_ptr[i].y, keys_ptr[i].z, radius_ptr[i]);
-  }
-
-  /* pack curve segments */
-  size_t curve_num = num_curves();
-
-  for (size_t i = 0; i < curve_num; i++) {
-    Curve curve = get_curve(i);
-    int shader_id = curve_shader[i];
-    Shader *shader = (shader_id < used_shaders.size()) ? used_shaders[shader_id] :
-                                                         scene->default_surface;
-    shader_id = scene->shader_manager->get_shader_id(shader, false);
-
-    curve_data[i] = make_float4(__int_as_float(curve.first_key + curvekey_offset),
-                                __int_as_float(curve.num_keys),
-                                __int_as_float(shader_id),
-                                0.0f);
-  }
-}
-
 void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset)
 {
   size_t num_faces = subd_faces.size();
@@ -1054,1391 +682,4 @@ void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, ui
   }
 }
 
-void Mesh::compute_bvh(
-    Device *device, DeviceScene *dscene, SceneParams *params, Progress *progress, int n, int total)
-{
-  if (progress->get_cancel())
-    return;
-
-  compute_bounds();
-
-  const BVHLayout bvh_layout = BVHParams::best_bvh_layout(params->bvh_layout,
-                                                          device->get_bvh_layout_mask());
-  if (need_build_bvh(bvh_layout)) {
-    string msg = "Updating Mesh BVH ";
-    if (name.empty())
-      msg += string_printf("%u/%u", (uint)(n + 1), (uint)total);
-    else
-      msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
-
-    Object object;
-    object.mesh = this;
-
-    vector<Mesh *> meshes;
-    meshes.push_back(this);
-    vector<Object *> objects;
-    objects.push_back(&object);
-
-    if (bvh && !need_update_rebuild) {
-      progress->set_status(msg, "Refitting BVH");
-
-      bvh->meshes = meshes;
-      bvh->objects = objects;
-
-      bvh->refit(*progress);
-    }
-    else {
-      progress->set_status(msg, "Building BVH");
-
-      BVHParams bparams;
-      bparams.use_spatial_split = params->use_bvh_spatial_split;
-      bparams.bvh_layout = bvh_layout;
-      bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
-                                    params->use_bvh_unaligned_nodes;
-      bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
-      bparams.num_motion_curve_steps = params->num_bvh_time_steps;
-      bparams.bvh_type = params->bvh_type;
-      bparams.curve_flags = dscene->data.curve.curveflags;
-      bparams.curve_subdivisions = dscene->data.curve.subdivisions;
-
-      delete bvh;
-      bvh = BVH::create(bparams, meshes, objects);
-      MEM_GUARDED_CALL(progress, bvh->build, *progress);
-    }
-  }
-
-  need_update = false;
-  need_update_rebuild = false;
-}
-
-void Mesh::tag_update(Scene *scene, bool rebuild)
-{
-  need_update = true;
-
-  if (rebuild) {
-    need_update_rebuild = true;
-    scene->light_manager->need_update = true;
-  }
-  else {
-    foreach (Shader *shader, used_shaders)
-      if (shader->has_surface_emission)
-        scene->light_manager->need_update = true;
-  }
-
-  scene->mesh_manager->need_update = true;
-  scene->object_manager->need_update = true;
-}
-
-bool Mesh::has_motion_blur() const
-{
-  return (use_motion_blur && (attributes.find(ATTR_STD_MOTION_VERTEX_POSITION) ||
-                              curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)));
-}
-
-bool Mesh::has_true_displacement() const
-{
-  foreach (Shader *shader, used_shaders) {
-    if (shader->has_displacement && shader->displacement_method != DISPLACE_BUMP) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-bool Mesh::has_voxel_attributes() const
-{
-  foreach (const Attribute &attr, attributes.attributes) {
-    if (attr.element == ATTR_ELEMENT_VOXEL) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-float Mesh::motion_time(int step) const
-{
-  return (motion_steps > 1) ? 2.0f * step / (motion_steps - 1) - 1.0f : 0.0f;
-}
-
-int Mesh::motion_step(float time) const
-{
-  if (motion_steps > 1) {
-    int attr_step = 0;
-
-    for (int step = 0; step < motion_steps; step++) {
-      float step_time = motion_time(step);
-      if (step_time == time) {
-        return attr_step;
-      }
-
-      /* Center step is stored in a separate attribute. */
-      if (step != motion_steps / 2) {
-        attr_step++;
-      }
-    }
-  }
-
-  return -1;
-}
-
-bool Mesh::need_build_bvh(BVHLayout layout) const
-{
-  return !transform_applied || has_surface_bssrdf || layout == BVH_LAYOUT_OPTIX;
-}
-
-bool Mesh::is_instanced() const
-{
-  /* Currently we treat subsurface objects as instanced.
-   *
-   * While it might be not very optimal for ray traversal, it avoids having
-   * duplicated BVH in the memory, saving quite some space.
-   */
-  return !transform_applied || has_surface_bssrdf;
-}
-
-/* Mesh Manager */
-
-MeshManager::MeshManager()
-{
-  need_update = true;
-  need_flags_update = true;
-}
-
-MeshManager::~MeshManager()
-{
-}
-
-void MeshManager::update_osl_attributes(Device *device,
-                                        Scene *scene,
-                                        vector<AttributeRequestSet> &mesh_attributes)
-{
-#ifdef WITH_OSL
-  /* for OSL, a hash map is used to lookup the attribute by name. */
-  OSLGlobals *og = (OSLGlobals *)device->osl_memory();
-
-  og->object_name_map.clear();
-  og->attribute_map.clear();
-  og->object_names.clear();
-
-  og->attribute_map.resize(scene->objects.size() * ATTR_PRIM_TYPES);
-
-  for (size_t i = 0; i < scene->objects.size(); i++) {
-    /* set object name to object index map */
-    Object *object = scene->objects[i];
-    og->object_name_map[object->name] = i;
-    og->object_names.push_back(object->name);
-
-    /* set object attributes */
-    foreach (ParamValue &attr, object->attributes) {
-      OSLGlobals::Attribute osl_attr;
-
-      osl_attr.type = attr.type();
-      osl_attr.desc.element = ATTR_ELEMENT_OBJECT;
-      osl_attr.value = attr;
-      osl_attr.desc.offset = 0;
-      osl_attr.desc.flags = 0;
-
-      og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][attr.name()] = osl_attr;
-      og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][attr.name()] = osl_attr;
-      og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][attr.name()] = osl_attr;
-    }
-
-    /* find mesh attributes */
-    size_t j;
-
-    for (j = 0; j < scene->meshes.size(); j++)
-      if (scene->meshes[j] == object->mesh)
-        break;
-
-    AttributeRequestSet &attributes = mesh_attributes[j];
-
-    /* set object attributes */
-    foreach (AttributeRequest &req, attributes.requests) {
-      OSLGlobals::Attribute osl_attr;
-
-      if (req.triangle_desc.element != ATTR_ELEMENT_NONE) {
-        osl_attr.desc = req.triangle_desc;
-
-        if (req.triangle_type == TypeDesc::TypeFloat)
-          osl_attr.type = TypeDesc::TypeFloat;
-        else if (req.triangle_type == TypeDesc::TypeMatrix)
-          osl_attr.type = TypeDesc::TypeMatrix;
-        else if (req.triangle_type == TypeFloat2)
-          osl_attr.type = TypeFloat2;
-        else if (req.triangle_type == TypeRGBA)
-          osl_attr.type = TypeRGBA;
-        else
-          osl_attr.type = TypeDesc::TypeColor;
-
-        if (req.std != ATTR_STD_NONE) {
-          /* if standard attribute, add lookup by geom: name convention */
-          ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
-          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][stdname] = osl_attr;
-        }
-        else if (req.name != ustring()) {
-          /* add lookup by mesh attribute name */
-          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_TRIANGLE][req.name] = osl_attr;
-        }
-      }
-
-      if (req.curve_desc.element != ATTR_ELEMENT_NONE) {
-        osl_attr.desc = req.curve_desc;
-
-        if (req.curve_type == TypeDesc::TypeFloat)
-          osl_attr.type = TypeDesc::TypeFloat;
-        else if (req.curve_type == TypeDesc::TypeMatrix)
-          osl_attr.type = TypeDesc::TypeMatrix;
-        else if (req.curve_type == TypeFloat2)
-          osl_attr.type = TypeFloat2;
-        else if (req.curve_type == TypeRGBA)
-          osl_attr.type = TypeRGBA;
-        else
-          osl_attr.type = TypeDesc::TypeColor;
-
-        if (req.std != ATTR_STD_NONE) {
-          /* if standard attribute, add lookup by geom: name convention */
-          ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
-          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][stdname] = osl_attr;
-        }
-        else if (req.name != ustring()) {
-          /* add lookup by mesh attribute name */
-          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_CURVE][req.name] = osl_attr;
-        }
-      }
-
-      if (req.subd_desc.element != ATTR_ELEMENT_NONE) {
-        osl_attr.desc = req.subd_desc;
-
-        if (req.subd_type == TypeDesc::TypeFloat)
-          osl_attr.type = TypeDesc::TypeFloat;
-        else if (req.subd_type == TypeDesc::TypeMatrix)
-          osl_attr.type = TypeDesc::TypeMatrix;
-        else if (req.subd_type == TypeFloat2)
-          osl_attr.type = TypeFloat2;
-        else if (req.subd_type == TypeRGBA)
-          osl_attr.type = TypeRGBA;
-        else
-          osl_attr.type = TypeDesc::TypeColor;
-
-        if (req.std != ATTR_STD_NONE) {
-          /* if standard attribute, add lookup by geom: name convention */
-          ustring stdname(string("geom:") + string(Attribute::standard_name(req.std)));
-          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][stdname] = osl_attr;
-        }
-        else if (req.name != ustring()) {
-          /* add lookup by mesh attribute name */
-          og->attribute_map[i * ATTR_PRIM_TYPES + ATTR_PRIM_SUBD][req.name] = osl_attr;
-        }
-      }
-    }
-  }
-#else
-  (void)device;
-  (void)scene;
-  (void)mesh_attributes;
-#endif
-}
-
-void MeshManager::update_svm_attributes(Device *,
-                                        DeviceScene *dscene,
-                                        Scene *scene,
-                                        vector<AttributeRequestSet> &mesh_attributes)
-{
-  /* for SVM, the attributes_map table is used to lookup the offset of an
-   * attribute, based on a unique shader attribute id. */
-
-  /* compute array stride */
-  int attr_map_size = 0;
-
-  for (size_t i = 0; i < scene->meshes.size(); i++) {
-    Mesh *mesh = scene->meshes[i];
-    mesh->attr_map_offset = attr_map_size;
-    attr_map_size += (mesh_attributes[i].size() + 1) * ATTR_PRIM_TYPES;
-  }
-
-  if (attr_map_size == 0)
-    return;
-
-  /* create attribute map */
-  uint4 *attr_map = dscene->attributes_map.alloc(attr_map_size);
-  memset(attr_map, 0, dscene->attributes_map.size() * sizeof(uint));
-
-  for (size_t i = 0; i < scene->meshes.size(); i++) {
-    Mesh *mesh = scene->meshes[i];
-    AttributeRequestSet &attributes = mesh_attributes[i];
-
-    /* set object attributes */
-    int index = mesh->attr_map_offset;
-
-    foreach (AttributeRequest &req, attributes.requests) {
-      uint id;
-
-      if (req.std == ATTR_STD_NONE)
-        id = scene->shader_manager->get_attribute_id(req.name);
-      else
-        id = scene->shader_manager->get_attribute_id(req.std);
-
-      if (mesh->num_triangles()) {
-        attr_map[index].x = id;
-        attr_map[index].y = req.triangle_desc.element;
-        attr_map[index].z = as_uint(req.triangle_desc.offset);
-
-        if (req.triangle_type == TypeDesc::TypeFloat)
-          attr_map[index].w = NODE_ATTR_FLOAT;
-        else if (req.triangle_type == TypeDesc::TypeMatrix)
-          attr_map[index].w = NODE_ATTR_MATRIX;
-        else if (req.triangle_type == TypeFloat2)
-          attr_map[index].w = NODE_ATTR_FLOAT2;
-        else if (req.triangle_type == TypeRGBA)
-          attr_map[index].w = NODE_ATTR_RGBA;
-        else
-          attr_map[index].w = NODE_ATTR_FLOAT3;
-
-        attr_map[index].w |= req.triangle_desc.flags << 8;
-      }
-
-      index++;
-
-      if (mesh->num_curves()) {
-        attr_map[index].x = id;
-        attr_map[index].y = req.curve_desc.element;
-        attr_map[index].z = as_uint(req.curve_desc.offset);
-
-        if (req.curve_type == TypeDesc::TypeFloat)
-          attr_map[index].w = NODE_ATTR_FLOAT;
-        else if (req.curve_type == TypeDesc::TypeMatrix)
-          attr_map[index].w = NODE_ATTR_MATRIX;
-        else if (req.curve_type == TypeFloat2)
-          attr_map[index].w = NODE_ATTR_FLOAT2;
-        else
-          attr_map[index].w = NODE_ATTR_FLOAT3;
-
-        attr_map[index].w |= req.curve_desc.flags << 8;
-      }
-
-      index++;
-
-      if (mesh->subd_faces.size()) {
-        attr_map[index].x = id;
-        attr_map[index].y = req.subd_desc.element;
-        attr_map[index].z = as_uint(req.subd_desc.offset);
-
-        if (req.subd_type == TypeDesc::TypeFloat)
-          attr_map[index].w = NODE_ATTR_FLOAT;
-        else if (req.subd_type == TypeDesc::TypeMatrix)
-          attr_map[index].w = NODE_ATTR_MATRIX;
-        else if (req.subd_type == TypeFloat2)
-          attr_map[index].w = NODE_ATTR_FLOAT2;
-        else if (req.triangle_type == TypeRGBA)
-          attr_map[index].w = NODE_ATTR_RGBA;
-        else
-          attr_map[index].w = NODE_ATTR_FLOAT3;
-
-        attr_map[index].w |= req.subd_desc.flags << 8;
-      }
-
-      index++;
-    }
-
-    /* terminator */
-    for (int j = 0; j < ATTR_PRIM_TYPES; j++) {
-      attr_map[index].x = ATTR_STD_NONE;
-      attr_map[index].y = 0;
-      attr_map[index].z = 0;
-      attr_map[index].w = 0;
-
-      index++;
-    }
-  }
-
-  /* copy to device */
-  dscene->attributes_map.copy_to_device();
-}
-
-static void update_attribute_element_size(Mesh *mesh,
-                                          Attribute *mattr,
-                                          AttributePrimitive prim,
-                                          size_t *attr_float_size,
-                                          size_t *attr_float2_size,
-                                          size_t *attr_float3_size,
-                                          size_t *attr_uchar4_size)
-{
-  if (mattr) {
-    size_t size = mattr->element_size(mesh, prim);
-
-    if (mattr->element == ATTR_ELEMENT_VOXEL) {
-      /* pass */
-    }
-    else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
-      *attr_uchar4_size += size;
-    }
-    else if (mattr->type == TypeDesc::TypeFloat) {
-      *attr_float_size += size;
-    }
-    else if (mattr->type == TypeFloat2) {
-      *attr_float2_size += size;
-    }
-    else if (mattr->type == TypeDesc::TypeMatrix) {
-      *attr_float3_size += size * 4;
-    }
-    else {
-      *attr_float3_size += size;
-    }
-  }
-}
-
-static void update_attribute_element_offset(Mesh *mesh,
-                                            device_vector<float> &attr_float,
-                                            size_t &attr_float_offset,
-                                            device_vector<float2> &attr_float2,
-                                            size_t &attr_float2_offset,
-                                            device_vector<float4> &attr_float3,
-                                            size_t &attr_float3_offset,
-                                            device_vector<uchar4> &attr_uchar4,
-                                            size_t &attr_uchar4_offset,
-                                            Attribute *mattr,
-                                            AttributePrimitive prim,
-                                            TypeDesc &type,
-                                            AttributeDescriptor &desc)
-{
-  if (mattr) {
-    /* store element and type */
-    desc.element = mattr->element;
-    desc.flags = mattr->flags;
-    type = mattr->type;
-
-    /* store attribute data in arrays */
-    size_t size = mattr->element_size(mesh, prim);
-
-    AttributeElement &element = desc.element;
-    int &offset = desc.offset;
-
-    if (mattr->element == ATTR_ELEMENT_VOXEL) {
-      /* store slot in offset value */
-      VoxelAttribute *voxel_data = mattr->data_voxel();
-      offset = voxel_data->slot;
-    }
-    else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
-      uchar4 *data = mattr->data_uchar4();
-      offset = attr_uchar4_offset;
-
-      assert(attr_uchar4.size() >= offset + size);
-      for (size_t k = 0; k < size; k++) {
-        attr_uchar4[offset + k] = data[k];
-      }
-      attr_uchar4_offset += size;
-    }
-    else if (mattr->type == TypeDesc::TypeFloat) {
-      float *data = mattr->data_float();
-      offset = attr_float_offset;
-
-      assert(attr_float.size() >= offset + size);
-      for (size_t k = 0; k < size; k++) {
-        attr_float[offset + k] = data[k];
-      }
-      attr_float_offset += size;
-    }
-    else if (mattr->type == TypeFloat2) {
-      float2 *data = mattr->data_float2();
-      offset = attr_float2_offset;
-
-      assert(attr_float2.size() >= offset + size);
-      for (size_t k = 0; k < size; k++) {
-        attr_float2[offset + k] = data[k];
-      }
-      attr_float2_offset += size;
-    }
-    else if (mattr->type == TypeDesc::TypeMatrix) {
-      Transform *tfm = mattr->data_transform();
-      offset = attr_float3_offset;
-
-      assert(attr_float3.size() >= offset + size * 3);
-      for (size_t k = 0; k < size * 3; k++) {
-        attr_float3[offset + k] = (&tfm->x)[k];
-      }
-      attr_float3_offset += size * 3;
-    }
-    else {
-      float4 *data = mattr->data_float4();
-      offset = attr_float3_offset;
-
-      assert(attr_float3.size() >= offset + size);
-      for (size_t k = 0; k < size; k++) {
-        attr_float3[offset + k] = data[k];
-      }
-      attr_float3_offset += size;
-    }
-
-    /* mesh vertex/curve index is global, not per object, so we sneak
-     * a correction for that in here */
-    if (mesh->subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK &&
-        desc.flags & ATTR_SUBDIVIDED) {
-      /* indices for subdivided attributes are retrieved
-       * from patch table so no need for correction here*/
-    }
-    else if (element == ATTR_ELEMENT_VERTEX)
-      offset -= mesh->vert_offset;
-    else if (element == ATTR_ELEMENT_VERTEX_MOTION)
-      offset -= mesh->vert_offset;
-    else if (element == ATTR_ELEMENT_FACE) {
-      if (prim == ATTR_PRIM_TRIANGLE)
-        offset -= mesh->tri_offset;
-      else
-        offset -= mesh->face_offset;
-    }
-    else if (element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) {
-      if (prim == ATTR_PRIM_TRIANGLE)
-        offset -= 3 * mesh->tri_offset;
-      else
-        offset -= mesh->corner_offset;
-    }
-    else if (element == ATTR_ELEMENT_CURVE)
-      offset -= mesh->curve_offset;
-    else if (element == ATTR_ELEMENT_CURVE_KEY)
-      offset -= mesh->curvekey_offset;
-    else if (element == ATTR_ELEMENT_CURVE_KEY_MOTION)
-      offset -= mesh->curvekey_offset;
-  }
-  else {
-    /* attribute not found */
-    desc.element = ATTR_ELEMENT_NONE;
-    desc.offset = 0;
-  }
-}
-
-void MeshManager::device_update_attributes(Device *device,
-                                           DeviceScene *dscene,
-                                           Scene *scene,
-                                           Progress &progress)
-{
-  progress.set_status("Updating Mesh", "Computing attributes");
-
-  /* gather per mesh requested attributes. as meshes may have multiple
-   * shaders assigned, this merges the requested attributes that have
-   * been set per shader by the shader manager */
-  vector<AttributeRequestSet> mesh_attributes(scene->meshes.size());
-
-  for (size_t i = 0; i < scene->meshes.size(); i++) {
-    Mesh *mesh = scene->meshes[i];
-
-    scene->need_global_attributes(mesh_attributes[i]);
-
-    foreach (Shader *shader, mesh->used_shaders) {
-      mesh_attributes[i].add(shader->attributes);
-    }
-  }
-
-  /* mesh attribute are stored in a single array per data type. here we fill
-   * those arrays, and set the offset and element type to create attribute
-   * maps next */
-
-  /* Pre-allocate attributes to avoid arrays re-allocation which would
-   * take 2x of overall attribute memory usage.
-   */
-  size_t attr_float_size = 0;
-  size_t attr_float2_size = 0;
-  size_t attr_float3_size = 0;
-  size_t attr_uchar4_size = 0;
-  for (size_t i = 0; i < scene->meshes.size(); i++) {
-    Mesh *mesh = scene->meshes[i];
-    AttributeRequestSet &attributes = mesh_attributes[i];
-    foreach (AttributeRequest &req, attributes.requests) {
-      Attribute *triangle_mattr = mesh->attributes.find(req);
-      Attribute *curve_mattr = mesh->curve_attributes.find(req);
-      Attribute *subd_mattr = mesh->subd_attributes.find(req);
-
-      update_attribute_element_size(mesh,
-                                    triangle_mattr,
-                                    ATTR_PRIM_TRIANGLE,
-                                    &attr_float_size,
-                                    &attr_float2_size,
-                                    &attr_float3_size,
-                                    &attr_uchar4_size);
-      update_attribute_element_size(mesh,
-                                    curve_mattr,
-                                    ATTR_PRIM_CURVE,
-                                    &attr_float_size,
-                                    &attr_float2_size,
-                                    &attr_float3_size,
-                                    &attr_uchar4_size);
-      update_attribute_element_size(mesh,
-                                    subd_mattr,
-                                    ATTR_PRIM_SUBD,
-                                    &attr_float_size,
-                                    &attr_float2_size,
-                                    &attr_float3_size,
-                                    &attr_uchar4_size);
-    }
-  }
-
-  dscene->attributes_float.alloc(attr_float_size);
-  dscene->attributes_float2.alloc(attr_float2_size);
-  dscene->attributes_float3.alloc(attr_float3_size);
-  dscene->attributes_uchar4.alloc(attr_uchar4_size);
-
-  size_t attr_float_offset = 0;
-  size_t attr_float2_offset = 0;
-  size_t attr_float3_offset = 0;
-  size_t attr_uchar4_offset = 0;
-
-  /* Fill in attributes. */
-  for (size_t i = 0; i < scene->meshes.size(); i++) {
-    Mesh *mesh = scene->meshes[i];
-    AttributeRequestSet &attributes = mesh_attributes[i];
-
-    /* todo: we now store std and name attributes from requests even if
-     * they actually refer to the same mesh attributes, optimize */
-    foreach (AttributeRequest &req, attributes.requests) {
-      Attribute *triangle_mattr = mesh->attributes.find(req);
-      Attribute *curve_mattr = mesh->curve_attributes.find(req);
-      Attribute *subd_mattr = mesh->subd_attributes.find(req);
-
-      update_attribute_element_offset(mesh,
-                                      dscene->attributes_float,
-                                      attr_float_offset,
-                                      dscene->attributes_float2,
-                                      attr_float2_offset,
-                                      dscene->attributes_float3,
-                                      attr_float3_offset,
-                                      dscene->attributes_uchar4,
-                                      attr_uchar4_offset,
-                                      triangle_mattr,
-                                      ATTR_PRIM_TRIANGLE,
-                                      req.triangle_type,
-                                      req.triangle_desc);
-
-      update_attribute_element_offset(mesh,
-                                      dscene->attributes_float,
-                                      attr_float_offset,
-                                      dscene->attributes_float2,
-                                      attr_float2_offset,
-                                      dscene->attributes_float3,
-                                      attr_float3_offset,
-                                      dscene->attributes_uchar4,
-                                      attr_uchar4_offset,
-                                      curve_mattr,
-                                      ATTR_PRIM_CURVE,
-                                      req.curve_type,
-                                      req.curve_desc);
-
-      update_attribute_element_offset(mesh,
-                                      dscene->attributes_float,
-                                      attr_float_offset,
-                                      dscene->attributes_float2,
-                                      attr_float2_offset,
-                                      dscene->attributes_float3,
-                                      attr_float3_offset,
-                                      dscene->attributes_uchar4,
-                                      attr_uchar4_offset,
-                                      subd_mattr,
-                                      ATTR_PRIM_SUBD,
-                                      req.subd_type,
-                                      req.subd_desc);
-
-      if (progress.get_cancel())
-        return;
-    }
-  }
-
-  /* create attribute lookup maps */
-  if (scene->shader_manager->use_osl())
-    update_osl_attributes(device, scene, mesh_attributes);
-
-  update_svm_attributes(device, dscene, scene, mesh_attributes);
-
-  if (progress.get_cancel())
-    return;
-
-  /* copy to device */
-  progress.set_status("Updating Mesh", "Copying Attributes to device");
-
-  if (dscene->attributes_float.size()) {
-    dscene->attributes_float.copy_to_device();
-  }
-  if (dscene->attributes_float2.size()) {
-    dscene->attributes_float2.copy_to_device();
-  }
-  if (dscene->attributes_float3.size()) {
-    dscene->attributes_float3.copy_to_device();
-  }
-  if (dscene->attributes_uchar4.size()) {
-    dscene->attributes_uchar4.copy_to_device();
-  }
-
-  if (progress.get_cancel())
-    return;
-
-  /* After mesh attributes and patch tables have been copied to device memory,
-   * we need to update offsets in the objects. */
-  scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
-}
-
-void MeshManager::mesh_calc_offset(Scene *scene)
-{
-  size_t vert_size = 0;
-  size_t tri_size = 0;
-
-  size_t curve_key_size = 0;
-  size_t curve_size = 0;
-
-  size_t patch_size = 0;
-  size_t face_size = 0;
-  size_t corner_size = 0;
-
-  size_t prim_size = 0;
-
-  foreach (Mesh *mesh, scene->meshes) {
-    mesh->vert_offset = vert_size;
-    mesh->tri_offset = tri_size;
-
-    mesh->curvekey_offset = curve_key_size;
-    mesh->curve_offset = curve_size;
-
-    mesh->patch_offset = patch_size;
-    mesh->face_offset = face_size;
-    mesh->corner_offset = corner_size;
-
-    vert_size += mesh->verts.size();
-    tri_size += mesh->num_triangles();
-
-    curve_key_size += mesh->curve_keys.size();
-    curve_size += mesh->num_curves();
-
-    if (mesh->subd_faces.size()) {
-      Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
-      patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
-
-      /* patch tables are stored in same array so include them in patch_size */
-      if (mesh->patch_table) {
-        mesh->patch_table_offset = patch_size;
-        patch_size += mesh->patch_table->total_size();
-      }
-    }
-    face_size += mesh->subd_faces.size();
-    corner_size += mesh->subd_face_corners.size();
-
-    mesh->prim_offset = prim_size;
-    prim_size += mesh->num_primitives();
-  }
-}
-
-void MeshManager::device_update_mesh(
-    Device *, DeviceScene *dscene, Scene *scene, bool for_displacement, Progress &progress)
-{
-  /* Count. */
-  size_t vert_size = 0;
-  size_t tri_size = 0;
-
-  size_t curve_key_size = 0;
-  size_t curve_size = 0;
-
-  size_t patch_size = 0;
-
-  foreach (Mesh *mesh, scene->meshes) {
-    vert_size += mesh->verts.size();
-    tri_size += mesh->num_triangles();
-
-    curve_key_size += mesh->curve_keys.size();
-    curve_size += mesh->num_curves();
-
-    if (mesh->subd_faces.size()) {
-      Mesh::SubdFace &last = mesh->subd_faces[mesh->subd_faces.size() - 1];
-      patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
-
-      /* patch tables are stored in same array so include them in patch_size */
-      if (mesh->patch_table) {
-        mesh->patch_table_offset = patch_size;
-        patch_size += mesh->patch_table->total_size();
-      }
-    }
-  }
-
-  /* Create mapping from triangle to primitive triangle array. */
-  vector<uint> tri_prim_index(tri_size);
-  if (for_displacement) {
-    /* For displacement kernels we do some trickery to make them believe
-     * we've got all required data ready. However, that data is different
-     * from final render kernels since we don't have BVH yet, so can't
-     * really use same semantic of arrays.
-     */
-    foreach (Mesh *mesh, scene->meshes) {
-      for (size_t i = 0; i < mesh->num_triangles(); ++i) {
-        tri_prim_index[i + mesh->tri_offset] = 3 * (i + mesh->tri_offset);
-      }
-    }
-  }
-  else {
-    for (size_t i = 0; i < dscene->prim_index.size(); ++i) {
-      if ((dscene->prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
-        tri_prim_index[dscene->prim_index[i]] = dscene->prim_tri_index[i];
-      }
-    }
-  }
-
-  /* Fill in all the arrays. */
-  if (tri_size != 0) {
-    /* normals */
-    progress.set_status("Updating Mesh", "Computing normals");
-
-    uint *tri_shader = dscene->tri_shader.alloc(tri_size);
-    float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
-    uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
-    uint *tri_patch = dscene->tri_patch.alloc(tri_size);
-    float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
-
-    foreach (Mesh *mesh, scene->meshes) {
-      mesh->pack_shaders(scene, &tri_shader[mesh->tri_offset]);
-      mesh->pack_normals(&vnormal[mesh->vert_offset]);
-      mesh->pack_verts(tri_prim_index,
-                       &tri_vindex[mesh->tri_offset],
-                       &tri_patch[mesh->tri_offset],
-                       &tri_patch_uv[mesh->vert_offset],
-                       mesh->vert_offset,
-                       mesh->tri_offset);
-      if (progress.get_cancel())
-        return;
-    }
-
-    /* vertex coordinates */
-    progress.set_status("Updating Mesh", "Copying Mesh to device");
-
-    dscene->tri_shader.copy_to_device();
-    dscene->tri_vnormal.copy_to_device();
-    dscene->tri_vindex.copy_to_device();
-    dscene->tri_patch.copy_to_device();
-    dscene->tri_patch_uv.copy_to_device();
-  }
-
-  if (curve_size != 0) {
-    progress.set_status("Updating Mesh", "Copying Strands to device");
-
-    float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size);
-    float4 *curves = dscene->curves.alloc(curve_size);
-
-    foreach (Mesh *mesh, scene->meshes) {
-      mesh->pack_curves(scene,
-                        &curve_keys[mesh->curvekey_offset],
-                        &curves[mesh->curve_offset],
-                        mesh->curvekey_offset);
-      if (progress.get_cancel())
-        return;
-    }
-
-    dscene->curve_keys.copy_to_device();
-    dscene->curves.copy_to_device();
-  }
-
-  if (patch_size != 0) {
-    progress.set_status("Updating Mesh", "Copying Patches to device");
-
-    uint *patch_data = dscene->patches.alloc(patch_size);
-
-    foreach (Mesh *mesh, scene->meshes) {
-      mesh->pack_patches(&patch_data[mesh->patch_offset],
-                         mesh->vert_offset,
-                         mesh->face_offset,
-                         mesh->corner_offset);
-
-      if (mesh->patch_table) {
-        mesh->patch_table->copy_adjusting_offsets(&patch_data[mesh->patch_table_offset],
-                                                  mesh->patch_table_offset);
-      }
-
-      if (progress.get_cancel())
-        return;
-    }
-
-    dscene->patches.copy_to_device();
-  }
-
-  if (for_displacement) {
-    float4 *prim_tri_verts = dscene->prim_tri_verts.alloc(tri_size * 3);
-    foreach (Mesh *mesh, scene->meshes) {
-      for (size_t i = 0; i < mesh->num_triangles(); ++i) {
-        Mesh::Triangle t = mesh->get_triangle(i);
-        size_t offset = 3 * (i + mesh->tri_offset);
-        prim_tri_verts[offset + 0] = float3_to_float4(mesh->verts[t.v[0]]);
-        prim_tri_verts[offset + 1] = float3_to_float4(mesh->verts[t.v[1]]);
-        prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]);
-      }
-    }
-    dscene->prim_tri_verts.copy_to_device();
-  }
-}
-
-void MeshManager::device_update_bvh(Device *device,
-                                    DeviceScene *dscene,
-                                    Scene *scene,
-                                    Progress &progress)
-{
-  /* bvh build */
-  progress.set_status("Updating Scene BVH", "Building");
-
-  BVHParams bparams;
-  bparams.top_level = true;
-  bparams.bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
-                                                  device->get_bvh_layout_mask());
-  bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
-  bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
-                                scene->params.use_bvh_unaligned_nodes;
-  bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
-  bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
-  bparams.bvh_type = scene->params.bvh_type;
-  bparams.curve_flags = dscene->data.curve.curveflags;
-  bparams.curve_subdivisions = dscene->data.curve.subdivisions;
-
-  VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout.";
-
-#ifdef WITH_EMBREE
-  if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
-    if (dscene->data.bvh.scene) {
-      BVHEmbree::destroy(dscene->data.bvh.scene);
-    }
-  }
-#endif
-
-  BVH *bvh = BVH::create(bparams, scene->meshes, scene->objects);
-  bvh->build(progress, &device->stats);
-
-  if (progress.get_cancel()) {
-#ifdef WITH_EMBREE
-    if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) {
-      if (dscene->data.bvh.scene) {
-        BVHEmbree::destroy(dscene->data.bvh.scene);
-      }
-    }
-#endif
-    delete bvh;
-    return;
-  }
-
-  /* copy to device */
-  progress.set_status("Updating Scene BVH", "Copying BVH to device");
-
-  PackedBVH &pack = bvh->pack;
-
-  if (pack.nodes.size()) {
-    dscene->bvh_nodes.steal_data(pack.nodes);
-    dscene->bvh_nodes.copy_to_device();
-  }
-  if (pack.leaf_nodes.size()) {
-    dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes);
-    dscene->bvh_leaf_nodes.copy_to_device();
-  }
-  if (pack.object_node.size()) {
-    dscene->object_node.steal_data(pack.object_node);
-    dscene->object_node.copy_to_device();
-  }
-  if (pack.prim_tri_index.size()) {
-    dscene->prim_tri_index.steal_data(pack.prim_tri_index);
-    dscene->prim_tri_index.copy_to_device();
-  }
-  if (pack.prim_tri_verts.size()) {
-    dscene->prim_tri_verts.steal_data(pack.prim_tri_verts);
-    dscene->prim_tri_verts.copy_to_device();
-  }
-  if (pack.prim_type.size()) {
-    dscene->prim_type.steal_data(pack.prim_type);
-    dscene->prim_type.copy_to_device();
-  }
-  if (pack.prim_visibility.size()) {
-    dscene->prim_visibility.steal_data(pack.prim_visibility);
-    dscene->prim_visibility.copy_to_device();
-  }
-  if (pack.prim_index.size()) {
-    dscene->prim_index.steal_data(pack.prim_index);
-    dscene->prim_index.copy_to_device();
-  }
-  if (pack.prim_object.size()) {
-    dscene->prim_object.steal_data(pack.prim_object);
-    dscene->prim_object.copy_to_device();
-  }
-  if (pack.prim_time.size()) {
-    dscene->prim_time.steal_data(pack.prim_time);
-    dscene->prim_time.copy_to_device();
-  }
-
-  dscene->data.bvh.root = pack.root_index;
-  dscene->data.bvh.bvh_layout = bparams.bvh_layout;
-  dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
-
-  bvh->copy_to_device(progress, dscene);
-
-  delete bvh;
-}
-
-void MeshManager::device_update_preprocess(Device *device, Scene *scene, Progress &progress)
-{
-  if (!need_update && !need_flags_update) {
-    return;
-  }
-
-  progress.set_status("Updating Meshes Flags");
-
-  /* Update flags. */
-  bool volume_images_updated = false;
-
-  foreach (Mesh *mesh, scene->meshes) {
-    mesh->has_volume = false;
-
-    foreach (const Shader *shader, mesh->used_shaders) {
-      if (shader->has_volume) {
-        mesh->has_volume = true;
-      }
-      if (shader->has_surface_bssrdf) {
-        mesh->has_surface_bssrdf = true;
-      }
-    }
-
-    if (need_update && mesh->has_volume) {
-      /* Create volume meshes if there is voxel data. */
-      if (mesh->has_voxel_attributes()) {
-        if (!volume_images_updated) {
-          progress.set_status("Updating Meshes Volume Bounds");
-          device_update_volume_images(device, scene, progress);
-          volume_images_updated = true;
-        }
-
-        create_volume_mesh(scene, mesh, progress);
-      }
-    }
-  }
-
-  need_flags_update = false;
-}
-
-void MeshManager::device_update_displacement_images(Device *device,
-                                                    Scene *scene,
-                                                    Progress &progress)
-{
-  progress.set_status("Updating Displacement Images");
-  TaskPool pool;
-  ImageManager *image_manager = scene->image_manager;
-  set<int> bump_images;
-  foreach (Mesh *mesh, scene->meshes) {
-    if (mesh->need_update) {
-      foreach (Shader *shader, mesh->used_shaders) {
-        if (!shader->has_displacement || shader->displacement_method == DISPLACE_BUMP) {
-          continue;
-        }
-        foreach (ShaderNode *node, shader->graph->nodes) {
-          if (node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
-            continue;
-          }
-
-          ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode *>(node);
-          foreach (int slot, image_node->slots) {
-            if (slot != -1) {
-              bump_images.insert(slot);
-            }
-          }
-        }
-      }
-    }
-  }
-  foreach (int slot, bump_images) {
-    pool.push(function_bind(
-        &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
-  }
-  pool.wait_work();
-}
-
-void MeshManager::device_update_volume_images(Device *device, Scene *scene, Progress &progress)
-{
-  progress.set_status("Updating Volume Images");
-  TaskPool pool;
-  ImageManager *image_manager = scene->image_manager;
-  set<int> volume_images;
-
-  foreach (Mesh *mesh, scene->meshes) {
-    if (!mesh->need_update) {
-      continue;
-    }
-
-    foreach (Attribute &attr, mesh->attributes.attributes) {
-      if (attr.element != ATTR_ELEMENT_VOXEL) {
-        continue;
-      }
-
-      VoxelAttribute *voxel = attr.data_voxel();
-
-      if (voxel->slot != -1) {
-        volume_images.insert(voxel->slot);
-      }
-    }
-  }
-
-  foreach (int slot, volume_images) {
-    pool.push(function_bind(
-        &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
-  }
-  pool.wait_work();
-}
-
-void MeshManager::device_update(Device *device,
-                                DeviceScene *dscene,
-                                Scene *scene,
-                                Progress &progress)
-{
-  if (!need_update)
-    return;
-
-  VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
-
-  bool true_displacement_used = false;
-  size_t total_tess_needed = 0;
-
-  foreach (Mesh *mesh, scene->meshes) {
-    foreach (Shader *shader, mesh->used_shaders) {
-      if (shader->need_update_mesh)
-        mesh->need_update = true;
-    }
-
-    if (mesh->need_update) {
-      /* Update normals. */
-      mesh->add_face_normals();
-      mesh->add_vertex_normals();
-
-      if (mesh->need_attribute(scene, ATTR_STD_POSITION_UNDISPLACED)) {
-        mesh->add_undisplaced();
-      }
-
-      /* Test if we need tessellation. */
-      if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE && mesh->num_subd_verts == 0 &&
-          mesh->subd_params) {
-        total_tess_needed++;
-      }
-
-      /* Test if we need displacement. */
-      if (mesh->has_true_displacement()) {
-        true_displacement_used = true;
-      }
-
-      if (progress.get_cancel())
-        return;
-    }
-  }
-
-  /* Tessellate meshes that are using subdivision */
-  if (total_tess_needed) {
-    Camera *dicing_camera = scene->dicing_camera;
-    dicing_camera->update(scene);
-
-    size_t i = 0;
-    foreach (Mesh *mesh, scene->meshes) {
-      if (mesh->need_update && mesh->subdivision_type != Mesh::SUBDIVISION_NONE &&
-          mesh->num_subd_verts == 0 && mesh->subd_params) {
-        string msg = "Tessellating ";
-        if (mesh->name == "")
-          msg += string_printf("%u/%u", (uint)(i + 1), (uint)total_tess_needed);
-        else
-          msg += string_printf(
-              "%s %u/%u", mesh->name.c_str(), (uint)(i + 1), (uint)total_tess_needed);
-
-        progress.set_status("Updating Mesh", msg);
-
-        mesh->subd_params->camera = dicing_camera;
-        DiagSplit dsplit(*mesh->subd_params);
-        mesh->tessellate(&dsplit);
-
-        i++;
-
-        if (progress.get_cancel())
-          return;
-      }
-    }
-  }
-
-  /* Update images needed for true displacement. */
-  bool old_need_object_flags_update = false;
-  if (true_displacement_used) {
-    VLOG(1) << "Updating images used for true displacement.";
-    device_update_displacement_images(device, scene, progress);
-    old_need_object_flags_update = scene->object_manager->need_flags_update;
-    scene->object_manager->device_update_flags(device, dscene, scene, progress, false);
-  }
-
-  /* Device update. */
-  device_free(device, dscene);
-
-  mesh_calc_offset(scene);
-  if (true_displacement_used) {
-    device_update_mesh(device, dscene, scene, true, progress);
-  }
-  if (progress.get_cancel())
-    return;
-
-  device_update_attributes(device, dscene, scene, progress);
-  if (progress.get_cancel())
-    return;
-
-  /* Update displacement. */
-  bool displacement_done = false;
-  size_t num_bvh = 0;
-  BVHLayout bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
-                                                    device->get_bvh_layout_mask());
-
-  foreach (Mesh *mesh, scene->meshes) {
-    if (mesh->need_update) {
-      if (displace(device, dscene, scene, mesh, progress)) {
-        displacement_done = true;
-      }
-
-      if (mesh->need_build_bvh(bvh_layout)) {
-        num_bvh++;
-      }
-    }
-
-    if (progress.get_cancel())
-      return;
-  }
-
-  /* Device re-update after displacement. */
-  if (displacement_done) {
-    device_free(device, dscene);
-
-    device_update_attributes(device, dscene, scene, progress);
-    if (progress.get_cancel())
-      return;
-  }
-
-  TaskPool pool;
-
-  size_t i = 0;
-  foreach (Mesh *mesh, scene->meshes) {
-    if (mesh->need_update) {
-      pool.push(function_bind(
-          &Mesh::compute_bvh, mesh, device, dscene, &scene->params, &progress, i, num_bvh));
-      if (mesh->need_build_bvh(bvh_layout)) {
-        i++;
-      }
-    }
-  }
-
-  TaskPool::Summary summary;
-  pool.wait_work(&summary);
-  VLOG(2) << "Objects BVH build pool statistics:\n" << summary.full_report();
-
-  foreach (Shader *shader, scene->shaders) {
-    shader->need_update_mesh = false;
-  }
-
-  Scene::MotionType need_motion = scene->need_motion();
-  bool motion_blur = need_motion == Scene::MOTION_BLUR;
-
-  /* Update objects. */
-  vector<Object *> volume_objects;
-  foreach (Object *object, scene->objects) {
-    object->compute_bounds(motion_blur);
-  }
-
-  if (progress.get_cancel())
-    return;
-
-  device_update_bvh(device, dscene, scene, progress);
-  if (progress.get_cancel())
-    return;
-
-  device_update_mesh(device, dscene, scene, false, progress);
-  if (progress.get_cancel())
-    return;
-
-  need_update = false;
-
-  if (true_displacement_used) {
-    /* Re-tag flags for update, so they're re-evaluated
-     * for meshes with correct bounding boxes.
-     *
-     * This wouldn't cause wrong results, just true
-     * displacement might be less optimal ot calculate.
-     */
-    scene->object_manager->need_flags_update = old_need_object_flags_update;
-  }
-}
-
-void MeshManager::device_free(Device *device, DeviceScene *dscene)
-{
-  dscene->bvh_nodes.free();
-  dscene->bvh_leaf_nodes.free();
-  dscene->object_node.free();
-  dscene->prim_tri_verts.free();
-  dscene->prim_tri_index.free();
-  dscene->prim_type.free();
-  dscene->prim_visibility.free();
-  dscene->prim_index.free();
-  dscene->prim_object.free();
-  dscene->prim_time.free();
-  dscene->tri_shader.free();
-  dscene->tri_vnormal.free();
-  dscene->tri_vindex.free();
-  dscene->tri_patch.free();
-  dscene->tri_patch_uv.free();
-  dscene->curves.free();
-  dscene->curve_keys.free();
-  dscene->patches.free();
-  dscene->attributes_map.free();
-  dscene->attributes_float.free();
-  dscene->attributes_float2.free();
-  dscene->attributes_float3.free();
-  dscene->attributes_uchar4.free();
-
-  /* Signal for shaders like displacement not to do ray tracing. */
-  dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE;
-
-#ifdef WITH_OSL
-  OSLGlobals *og = (OSLGlobals *)device->osl_memory();
-
-  if (og) {
-    og->object_name_map.clear();
-    og->attribute_map.clear();
-    og->object_names.clear();
-  }
-#else
-  (void)device;
-#endif
-}
-
-void MeshManager::tag_update(Scene *scene)
-{
-  need_update = true;
-  scene->object_manager->need_update = true;
-}
-
-void MeshManager::collect_statistics(const Scene *scene, RenderStats *stats)
-{
-  foreach (Mesh *mesh, scene->meshes) {
-    stats->mesh.geometry.add_entry(
-        NamedSizeEntry(string(mesh->name.c_str()), mesh->get_total_size_in_bytes()));
-  }
-}
-
-bool Mesh::need_attribute(Scene *scene, AttributeStandard std)
-{
-  if (std == ATTR_STD_NONE)
-    return false;
-
-  if (scene->need_global_attribute(std))
-    return true;
-
-  foreach (Shader *shader, used_shaders)
-    if (shader->attributes.find(std))
-      return true;
-
-  return false;
-}
-
-bool Mesh::need_attribute(Scene * /*scene*/, ustring name)
-{
-  if (name == ustring())
-    return false;
-
-  foreach (Shader *shader, used_shaders)
-    if (shader->attributes.find(name))
-      return true;
-
-  return false;
-}
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index c5be0ba60b9..d0cf4d557aa 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -21,6 +21,7 @@
 
 #include "bvh/bvh_params.h"
 #include "render/attribute.h"
+#include "render/geometry.h"
 #include "render/shader.h"
 
 #include "util/util_array.h"
@@ -29,7 +30,6 @@
 #include "util/util_map.h"
 #include "util/util_param.h"
 #include "util/util_set.h"
-#include "util/util_transform.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
 
@@ -51,7 +51,7 @@ struct PackedPatchTable;
 
 /* Mesh */
 
-class Mesh : public Node {
+class Mesh : public Geometry {
  public:
   NODE_DECLARE
 
@@ -91,94 +91,6 @@ class Mesh : public Node {
     return triangles.size() / 3;
   }
 
-  /* Mesh Curve */
-  struct Curve {
-    int first_key;
-    int num_keys;
-
-    int num_segments() const
-    {
-      return num_keys - 1;
-    }
-
-    void bounds_grow(const int k,
-                     const float3 *curve_keys,
-                     const float *curve_radius,
-                     BoundBox &bounds) const;
-    void bounds_grow(float4 keys[4], BoundBox &bounds) const;
-    void bounds_grow(const int k,
-                     const float3 *curve_keys,
-                     const float *curve_radius,
-                     const Transform &aligned_space,
-                     BoundBox &bounds) const;
-
-    void motion_keys(const float3 *curve_keys,
-                     const float *curve_radius,
-                     const float3 *key_steps,
-                     size_t num_curve_keys,
-                     size_t num_steps,
-                     float time,
-                     size_t k0,
-                     size_t k1,
-                     float4 r_keys[2]) const;
-    void cardinal_motion_keys(const float3 *curve_keys,
-                              const float *curve_radius,
-                              const float3 *key_steps,
-                              size_t num_curve_keys,
-                              size_t num_steps,
-                              float time,
-                              size_t k0,
-                              size_t k1,
-                              size_t k2,
-                              size_t k3,
-                              float4 r_keys[4]) const;
-
-    void keys_for_step(const float3 *curve_keys,
-                       const float *curve_radius,
-                       const float3 *key_steps,
-                       size_t num_curve_keys,
-                       size_t num_steps,
-                       size_t step,
-                       size_t k0,
-                       size_t k1,
-                       float4 r_keys[2]) const;
-    void cardinal_keys_for_step(const float3 *curve_keys,
-                                const float *curve_radius,
-                                const float3 *key_steps,
-                                size_t num_curve_keys,
-                                size_t num_steps,
-                                size_t step,
-                                size_t k0,
-                                size_t k1,
-                                size_t k2,
-                                size_t k3,
-                                float4 r_keys[4]) const;
-  };
-
-  Curve get_curve(size_t i) const
-  {
-    int first = curve_first_key[i];
-    int next_first = (i + 1 < curve_first_key.size()) ? curve_first_key[i + 1] : curve_keys.size();
-
-    Curve curve = {first, next_first - first};
-    return curve;
-  }
-
-  size_t num_curves() const
-  {
-    return curve_first_key.size();
-  }
-
-  size_t num_segments() const
-  {
-    return curve_keys.size() - curve_first_key.size();
-  }
-
-  size_t num_primitives() const
-  {
-    return num_triangles() + num_segments();
-  }
-
   /* Mesh SubdFace */
   struct SubdFace {
     int start_corner;
@@ -212,14 +124,6 @@ class Mesh : public Node {
   SubdivisionType subdivision_type;
 
   /* Mesh Data */
-  enum GeometryFlags {
-    GEOMETRY_NONE = 0,
-    GEOMETRY_TRIANGLES = (1 << 0),
-    GEOMETRY_CURVES = (1 << 1),
-  };
-  int geometry_flags; /* used to distinguish meshes with no verts
-                          and meshed for which geometry is not created */
-
   array<int> triangles;
   array<float3> verts;
   array<int> shader;
@@ -229,14 +133,9 @@ class Mesh : public Node {
   array<int> triangle_patch; /* must be < 0 for non subd triangles */
   array<float2> vert_patch_uv;
 
-  float volume_isovalue;
-  bool has_volume;         /* Set in the device_update_flags(). */
-  bool has_surface_bssrdf; /* Set in the device_update_flags(). */
-
-  array<float3> curve_keys;
-  array<float> curve_radius;
-  array<int> curve_first_key;
-  array<int> curve_shader;
+  float volume_clipping;
+  float volume_step_size;
+  bool volume_object_space;
 
   array<SubdFace> subd_faces;
   array<int> subd_face_corners;
@@ -246,42 +145,18 @@ class Mesh : public Node {
 
   SubdParams *subd_params;
 
-  vector<Shader *> used_shaders;
-  AttributeSet attributes;
-  AttributeSet curve_attributes;
   AttributeSet subd_attributes;
 
-  BoundBox bounds;
-  bool transform_applied;
-  bool transform_negative_scaled;
-  Transform transform_normal;
-
   PackedPatchTable *patch_table;
 
-  uint motion_steps;
-  bool use_motion_blur;
-
-  /* Update Flags */
-  bool need_update;
-  bool need_update_rebuild;
-
   /* BVH */
-  BVH *bvh;
-  size_t tri_offset;
   size_t vert_offset;
 
-  size_t curve_offset;
-  size_t curvekey_offset;
-
   size_t patch_offset;
   size_t patch_table_offset;
   size_t face_offset;
   size_t corner_offset;
 
-  size_t attr_map_offset;
-
-  size_t prim_offset;
-
   size_t num_subd_verts;
 
  private:
@@ -289,7 +164,7 @@ class Mesh : public Node {
   unordered_multimap<int, int>
       vert_stitching_map; /* stitching index -> multiple real vert indices */
   friend class DiagSplit;
-  friend class MeshManager;
+  friend class GeometryManager;
 
  public:
   /* Functions */
@@ -298,24 +173,24 @@ class Mesh : public Node {
 
   void resize_mesh(int numverts, int numfaces);
   void reserve_mesh(int numverts, int numfaces);
-  void resize_curves(int numcurves, int numkeys);
-  void reserve_curves(int numcurves, int numkeys);
   void resize_subd_faces(int numfaces, int num_ngons, int numcorners);
   void reserve_subd_faces(int numfaces, int num_ngons, int numcorners);
-  void clear(bool preserve_voxel_data = false);
+  void clear(bool preserve_voxel_data);
+  void clear() override;
   void add_vertex(float3 P);
   void add_vertex_slow(float3 P);
   void add_triangle(int v0, int v1, int v2, int shader, bool smooth);
-  void add_curve_key(float3 loc, float radius);
-  void add_curve(int first_key, int shader);
   void add_subd_face(int *corners, int num_corners, int shader_, bool smooth_);
 
-  void compute_bounds();
+  void copy_center_to_motion_step(const int motion_step);
+
+  void compute_bounds() override;
+  void apply_transform(const Transform &tfm, const bool apply_to_motion) override;
   void add_face_normals();
   void add_vertex_normals();
   void add_undisplaced();
 
-  void get_uv_tiles(ustring map, unordered_set<int> &tiles);
+  void get_uv_tiles(ustring map, unordered_set<int> &tiles) override;
 
   void pack_shaders(Scene *scene, uint *shader);
   void pack_normals(float4 *vnormal);
@@ -325,103 +200,11 @@ class Mesh : public Node {
                   float2 *tri_patch_uv,
                   size_t vert_offset,
                   size_t tri_offset);
-  void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
   void pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset);
 
-  void compute_bvh(Device *device,
-                   DeviceScene *dscene,
-                   SceneParams *params,
-                   Progress *progress,
-                   int n,
-                   int total);
-
-  bool need_attribute(Scene *scene, AttributeStandard std);
-  bool need_attribute(Scene *scene, ustring name);
-
-  void tag_update(Scene *scene, bool rebuild);
-
-  bool has_motion_blur() const;
-  bool has_true_displacement() const;
-  bool has_voxel_attributes() const;
-
-  /* Convert between normalized -1..1 motion time and index
-   * in the VERTEX_MOTION attribute. */
-  float motion_time(int step) const;
-  int motion_step(float time) const;
-
-  /* Check whether the mesh should have own BVH built separately. Briefly,
-   * own BVH is needed for mesh, if:
-   *
-   * - It is instanced multiple times, so each instance object should share the
-   *   same BVH tree.
-   * - Special ray intersection is needed, for example to limit subsurface rays
-   *   to only the mesh itself.
-   * - The BVH layout requires the top level to only contain instances.
-   */
-  bool need_build_bvh(BVHLayout layout) const;
-
-  /* Check if the mesh should be treated as instanced. */
-  bool is_instanced() const;
-
   void tessellate(DiagSplit *split);
 };
 
-/* Mesh Manager */
-
-class MeshManager {
- public:
-  bool need_update;
-  bool need_flags_update;
-
-  MeshManager();
-  ~MeshManager();
-
-  bool displace(Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress);
-
-  /* attributes */
-  void update_osl_attributes(Device *device,
-                             Scene *scene,
-                             vector<AttributeRequestSet> &mesh_attributes);
-  void update_svm_attributes(Device *device,
-                             DeviceScene *dscene,
-                             Scene *scene,
-                             vector<AttributeRequestSet> &mesh_attributes);
-
-  void device_update_preprocess(Device *device, Scene *scene, Progress &progress);
-  void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-
-  void device_free(Device *device, DeviceScene *dscene);
-
-  void tag_update(Scene *scene);
-
-  void create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progress);
-
-  void collect_statistics(const Scene *scene, RenderStats *stats);
-
- protected:
-  /* Calculate verts/triangles/curves offsets in global arrays. */
-  void mesh_calc_offset(Scene *scene);
-
-  void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-
-  void device_update_mesh(Device *device,
-                          DeviceScene *dscene,
-                          Scene *scene,
-                          bool for_displacement,
-                          Progress &progress);
-
-  void device_update_attributes(Device *device,
-                                DeviceScene *dscene,
-                                Scene *scene,
-                                Progress &progress);
-
-  void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
-
-  void device_update_displacement_images(Device *device, Scene *scene, Progress &progress);
-
-  void device_update_volume_images(Device *device, Scene *scene, Progress &progress);
-};
-
 CCL_NAMESPACE_END
 
 #endif /* __MESH_H__ */
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 6a6c2fbb3eb..467810f9273 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -43,7 +43,7 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts)
   return norm / normlen;
 }
 
-bool MeshManager::displace(
+bool GeometryManager::displace(
     Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
 {
   /* verify if we have a displacement shader */
@@ -58,7 +58,7 @@ bool MeshManager::displace(
   size_t object_index = OBJECT_NONE;
 
   for (size_t i = 0; i < scene->objects.size(); i++) {
-    if (scene->objects[i]->mesh == mesh) {
+    if (scene->objects[i]->geometry == mesh) {
       object_index = i;
       break;
     }
@@ -91,7 +91,7 @@ bool MeshManager::displace(
 
       /* set up object, primitive and barycentric coordinates */
       int object = object_index;
-      int prim = mesh->tri_offset + i;
+      int prim = mesh->prim_offset + i;
       float u, v;
 
       switch (j) {
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
index 40dd658eadd..3d72b2fab91 100644
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "render/mesh.h"
 #include "render/attribute.h"
 #include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd/subd_split.h"
 #include "subd/subd_patch.h"
 #include "subd/subd_patch_table.h"
+#include "subd/subd_split.h"
 
-#include "util/util_foreach.h"
 #include "util/util_algorithm.h"
+#include "util/util_foreach.h"
 #include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
@@ -32,10 +32,10 @@ CCL_NAMESPACE_BEGIN
 
 CCL_NAMESPACE_END
 
-#  include <opensubdiv/far/topologyRefinerFactory.h>
-#  include <opensubdiv/far/primvarRefiner.h>
-#  include <opensubdiv/far/patchTableFactory.h>
 #  include <opensubdiv/far/patchMap.h>
+#  include <opensubdiv/far/patchTableFactory.h>
+#  include <opensubdiv/far/primvarRefiner.h>
+#  include <opensubdiv/far/topologyRefinerFactory.h>
 
 /* specializations of TopologyRefinerFactory for ccl::Mesh */
 
diff --git a/intern/cycles/render/mesh_volume.cpp b/intern/cycles/render/mesh_volume.cpp
index f451b58e92a..d73ba3b06dd 100644
--- a/intern/cycles/render/mesh_volume.cpp
+++ b/intern/cycles/render/mesh_volume.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "render/mesh.h"
 #include "render/attribute.h"
+#include "render/mesh.h"
 #include "render/scene.h"
 
 #include "util/util_foreach.h"
@@ -362,7 +362,7 @@ struct VoxelAttributeGrid {
   int channels;
 };
 
-void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progress)
+void GeometryManager::create_volume_mesh(Mesh *mesh, Progress &progress)
 {
   string msg = string_printf("Computing Volume Mesh %s", mesh->name.c_str());
   progress.set_status("Updating Mesh", msg);
@@ -373,13 +373,15 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
   VolumeParams volume_params;
   volume_params.resolution = make_int3(0, 0, 0);
 
+  Transform transform = transform_identity();
+
   foreach (Attribute &attr, mesh->attributes.attributes) {
     if (attr.element != ATTR_ELEMENT_VOXEL) {
       continue;
     }
 
-    VoxelAttribute *voxel = attr.data_voxel();
-    device_memory *image_memory = scene->image_manager->image_memory(voxel->slot);
+    ImageHandle &handle = attr.data_voxel();
+    device_texture *image_memory = handle.image_memory();
     int3 resolution = make_int3(
         image_memory->data_width, image_memory->data_height, image_memory->data_depth);
 
@@ -387,14 +389,20 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
       volume_params.resolution = resolution;
     }
     else if (volume_params.resolution != resolution) {
-      VLOG(1) << "Can't create volume mesh, all voxel grid resolutions must be equal\n";
-      return;
+      /* TODO: support mismatched resolutions, common for OpenVDB. For now skip the grid. */
+      VLOG(1) << "Can't create accurate volume mesh, all voxel grid resolutions must be equal\n";
+      continue;
     }
 
     VoxelAttributeGrid voxel_grid;
     voxel_grid.data = static_cast<float *>(image_memory->host_pointer);
     voxel_grid.channels = image_memory->data_elements;
     voxel_grids.push_back(voxel_grid);
+
+    /* TODO: support multiple transforms. For now the last grid's 3D transform is used. */
+    if (image_memory->info.use_transform_3d) {
+      transform = image_memory->info.transform_3d;
+    }
   }
 
   if (voxel_grids.empty()) {
@@ -427,17 +435,14 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
   }
 
   /* Compute start point and cell size from transform. */
-  Attribute *attr = mesh->attributes.find(ATTR_STD_GENERATED_TRANSFORM);
   const int3 resolution = volume_params.resolution;
   float3 start_point = make_float3(0.0f, 0.0f, 0.0f);
   float3 cell_size = make_float3(1.0f / resolution.x, 1.0f / resolution.y, 1.0f / resolution.z);
 
-  if (attr) {
-    const Transform *tfm = attr->data_transform();
-    const Transform itfm = transform_inverse(*tfm);
-    start_point = transform_point(&itfm, start_point);
-    cell_size = transform_direction(&itfm, cell_size);
-  }
+  /* TODO: support arbitrary transforms, not just scale + translate. */
+  const Transform itfm = transform_inverse(transform);
+  start_point = transform_point(&itfm, start_point);
+  cell_size = transform_direction(&itfm, cell_size);
 
   volume_params.start_point = start_point;
   volume_params.cell_size = cell_size;
@@ -445,7 +450,7 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
 
   /* Build bounding mesh around non-empty volume cells. */
   VolumeMeshBuilder builder(&volume_params);
-  const float isovalue = mesh->volume_isovalue;
+  const float clipping = mesh->volume_clipping;
 
   for (int z = 0; z < resolution.z; ++z) {
     for (int y = 0; y < resolution.y; ++y) {
@@ -457,7 +462,7 @@ void MeshManager::create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progres
           const int channels = voxel_grid.channels;
 
           for (int c = 0; c < channels; c++) {
-            if (voxel_grid.data[voxel_index * channels + c] >= isovalue) {
+            if (voxel_grid.data[voxel_index * channels + c] >= clipping) {
               builder.add_node_with_padding(x, y, z);
               break;
             }
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index bdab2a99897..ac07d91c4ca 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -14,27 +14,28 @@
  * limitations under the License.
  */
 
+#include "render/nodes.h"
 #include "render/colorspace.h"
+#include "render/constant_fold.h"
 #include "render/film.h"
 #include "render/image.h"
 #include "render/integrator.h"
 #include "render/light.h"
 #include "render/mesh.h"
-#include "render/nodes.h"
+#include "render/osl.h"
 #include "render/scene.h"
 #include "render/svm.h"
-#include "kernel/svm/svm_color_util.h"
-#include "kernel/svm/svm_ramp_util.h"
-#include "kernel/svm/svm_math_util.h"
-#include "kernel/svm/svm_mapping_util.h"
-#include "render/osl.h"
-#include "render/constant_fold.h"
 
-#include "util/util_sky_model.h"
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
+#include "util/util_sky_model.h"
 #include "util/util_transform.h"
 
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_mapping_util.h"
+#include "kernel/svm/svm_math_util.h"
+#include "kernel/svm/svm_ramp_util.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Texture Mapping */
@@ -205,27 +206,6 @@ void TextureMapping::compile(OSLCompiler &compiler)
 
 /* Image Texture */
 
-ImageSlotTextureNode::~ImageSlotTextureNode()
-{
-  if (image_manager) {
-    foreach (int slot, slots) {
-      if (slot != -1) {
-        image_manager->remove_image(slot);
-      }
-    }
-  }
-}
-
-void ImageSlotTextureNode::add_image_user() const
-{
-  /* Increase image user count for new node. */
-  foreach (int slot, slots) {
-    if (slot != -1) {
-      image_manager->add_image_user(slot);
-    }
-  }
-}
-
 NODE_DEFINE(ImageTextureNode)
 {
   NodeType *type = NodeType::add("image_texture", create, NodeType::SHADER);
@@ -275,18 +255,27 @@ NODE_DEFINE(ImageTextureNode)
 
 ImageTextureNode::ImageTextureNode() : ImageSlotTextureNode(node_type)
 {
-  is_float = false;
-  compress_as_srgb = false;
   colorspace = u_colorspace_raw;
-  builtin_data = NULL;
   animated = false;
   tiles.push_back(1001);
 }
 
 ShaderNode *ImageTextureNode::clone() const
 {
-  add_image_user();
-  return new ImageTextureNode(*this);
+  ImageTextureNode *node = new ImageTextureNode(*this);
+  node->handle = handle;
+  return node;
+}
+
+ImageParams ImageTextureNode::image_params() const
+{
+  ImageParams params;
+  params.animated = animated;
+  params.interpolation = interpolation;
+  params.extension = extension;
+  params.alpha_type = alpha_type;
+  params.colorspace = colorspace;
+  return params;
 }
 
 void ImageTextureNode::cull_tiles(Scene *scene, ShaderGraph *graph)
@@ -333,10 +322,10 @@ void ImageTextureNode::cull_tiles(Scene *scene, ShaderGraph *graph)
   /* TODO(lukas): This is quite inefficient. A fairly simple improvement would
    * be to have a cache in each mesh that is indexed by attribute.
    * Additionally, building a graph-to-meshes list once could help. */
-  foreach (Mesh *mesh, scene->meshes) {
-    foreach (Shader *shader, mesh->used_shaders) {
+  foreach (Geometry *geom, scene->geometry) {
+    foreach (Shader *shader, geom->used_shaders) {
       if (shader->graph == graph) {
-        mesh->get_uv_tiles(attribute, used_tiles);
+        geom->get_uv_tiles(attribute, used_tiles);
       }
     }
   }
@@ -371,123 +360,80 @@ void ImageTextureNode::compile(SVMCompiler &compiler)
   ShaderOutput *color_out = output("Color");
   ShaderOutput *alpha_out = output("Alpha");
 
-  image_manager = compiler.scene->image_manager;
-  if (slots.empty()) {
+  if (handle.empty()) {
     cull_tiles(compiler.scene, compiler.current_graph);
-    slots.reserve(tiles.size());
-
-    bool have_metadata = false;
-    foreach (int tile, tiles) {
-      string tile_name = filename.string();
-      string_replace(tile_name, "<UDIM>", string_printf("%04d", tile));
-
-      ImageMetaData metadata;
-      int slot = image_manager->add_image(tile_name,
-                                          builtin_data,
-                                          animated,
-                                          0,
-                                          interpolation,
-                                          extension,
-                                          alpha_type,
-                                          colorspace,
-                                          metadata);
-      slots.push_back(slot);
-
-      /* We assume that all tiles have the same metadata. */
-      if (!have_metadata) {
-        is_float = metadata.is_float;
-        compress_as_srgb = metadata.compress_as_srgb;
-        known_colorspace = metadata.colorspace;
-        have_metadata = true;
-      }
-    }
+    ImageManager *image_manager = compiler.scene->image_manager;
+    handle = image_manager->add_image(filename.string(), image_params(), tiles);
   }
 
-  bool has_image = false;
-  foreach (int slot, slots) {
-    if (slot != -1) {
-      has_image = true;
-      break;
-    }
-  }
+  /* All tiles have the same metadata. */
+  const ImageMetaData metadata = handle.metadata();
+  const bool compress_as_srgb = metadata.compress_as_srgb;
+  const ustring known_colorspace = metadata.colorspace;
 
-  if (has_image) {
-    int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
-    uint flags = 0;
+  int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+  uint flags = 0;
 
-    if (compress_as_srgb) {
-      flags |= NODE_IMAGE_COMPRESS_AS_SRGB;
+  if (compress_as_srgb) {
+    flags |= NODE_IMAGE_COMPRESS_AS_SRGB;
+  }
+  if (!alpha_out->links.empty()) {
+    const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
+                                     alpha_type == IMAGE_ALPHA_CHANNEL_PACKED ||
+                                     alpha_type == IMAGE_ALPHA_IGNORE);
+
+    if (unassociate_alpha) {
+      flags |= NODE_IMAGE_ALPHA_UNASSOCIATE;
     }
-    if (!alpha_out->links.empty()) {
-      const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
-                                       alpha_type == IMAGE_ALPHA_CHANNEL_PACKED ||
-                                       alpha_type == IMAGE_ALPHA_IGNORE);
+  }
 
-      if (unassociate_alpha) {
-        flags |= NODE_IMAGE_ALPHA_UNASSOCIATE;
-      }
+  if (projection != NODE_IMAGE_PROJ_BOX) {
+    /* If there only is one image (a very common case), we encode it as a negative value. */
+    int num_nodes;
+    if (handle.num_tiles() == 1) {
+      num_nodes = -handle.svm_slot();
+    }
+    else {
+      num_nodes = divide_up(handle.num_tiles(), 2);
     }
 
-    if (projection != NODE_IMAGE_PROJ_BOX) {
-      /* If there only is one image (a very common case), we encode it as a negative value. */
-      int num_nodes;
-      if (slots.size() == 1) {
-        num_nodes = -slots[0];
-      }
-      else {
-        num_nodes = divide_up(slots.size(), 2);
-      }
+    compiler.add_node(NODE_TEX_IMAGE,
+                      num_nodes,
+                      compiler.encode_uchar4(vector_offset,
+                                             compiler.stack_assign_if_linked(color_out),
+                                             compiler.stack_assign_if_linked(alpha_out),
+                                             flags),
+                      projection);
 
-      compiler.add_node(NODE_TEX_IMAGE,
-                        num_nodes,
-                        compiler.encode_uchar4(vector_offset,
-                                               compiler.stack_assign_if_linked(color_out),
-                                               compiler.stack_assign_if_linked(alpha_out),
-                                               flags),
-                        projection);
-
-      if (num_nodes > 0) {
-        for (int i = 0; i < num_nodes; i++) {
-          int4 node;
-          node.x = tiles[2 * i];
-          node.y = slots[2 * i];
-          if (2 * i + 1 < slots.size()) {
-            node.z = tiles[2 * i + 1];
-            node.w = slots[2 * i + 1];
-          }
-          else {
-            node.z = -1;
-            node.w = -1;
-          }
-          compiler.add_node(node.x, node.y, node.z, node.w);
+    if (num_nodes > 0) {
+      for (int i = 0; i < num_nodes; i++) {
+        int4 node;
+        node.x = tiles[2 * i];
+        node.y = handle.svm_slot(2 * i);
+        if (2 * i + 1 < tiles.size()) {
+          node.z = tiles[2 * i + 1];
+          node.w = handle.svm_slot(2 * i + 1);
         }
+        else {
+          node.z = -1;
+          node.w = -1;
+        }
+        compiler.add_node(node.x, node.y, node.z, node.w);
       }
     }
-    else {
-      assert(slots.size() == 1);
-      compiler.add_node(NODE_TEX_IMAGE_BOX,
-                        slots[0],
-                        compiler.encode_uchar4(vector_offset,
-                                               compiler.stack_assign_if_linked(color_out),
-                                               compiler.stack_assign_if_linked(alpha_out),
-                                               flags),
-                        __float_as_int(projection_blend));
-    }
-
-    tex_mapping.compile_end(compiler, vector_in, vector_offset);
   }
   else {
-    /* image not found */
-    if (!color_out->links.empty()) {
-      compiler.add_node(NODE_VALUE_V, compiler.stack_assign(color_out));
-      compiler.add_node(
-          NODE_VALUE_V,
-          make_float3(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B));
-    }
-    if (!alpha_out->links.empty())
-      compiler.add_node(
-          NODE_VALUE_F, __float_as_int(TEX_IMAGE_MISSING_A), compiler.stack_assign(alpha_out));
+    assert(handle.num_tiles() == 1);
+    compiler.add_node(NODE_TEX_IMAGE_BOX,
+                      handle.svm_slot(),
+                      compiler.encode_uchar4(vector_offset,
+                                             compiler.stack_assign_if_linked(color_out),
+                                             compiler.stack_assign_if_linked(alpha_out),
+                                             flags),
+                      __float_as_int(projection_blend));
   }
+
+  tex_mapping.compile_end(compiler, vector_in, vector_offset);
 }
 
 void ImageTextureNode::compile(OSLCompiler &compiler)
@@ -496,38 +442,22 @@ void ImageTextureNode::compile(OSLCompiler &compiler)
 
   tex_mapping.compile(compiler);
 
-  image_manager = compiler.scene->image_manager;
-  if (slots.size() == 0) {
-    ImageMetaData metadata;
-    if (builtin_data == NULL) {
-      string tile_name = filename.string();
-      string_replace(tile_name, "<UDIM>", "1001");
-      image_manager->get_image_metadata(tile_name, NULL, colorspace, metadata);
-      slots.push_back(-1);
-    }
-    else {
-      int slot = image_manager->add_image(filename.string(),
-                                          builtin_data,
-                                          animated,
-                                          0,
-                                          interpolation,
-                                          extension,
-                                          alpha_type,
-                                          colorspace,
-                                          metadata);
-      slots.push_back(slot);
-    }
-    is_float = metadata.is_float;
-    compress_as_srgb = metadata.compress_as_srgb;
-    known_colorspace = metadata.colorspace;
+  if (handle.empty()) {
+    ImageManager *image_manager = compiler.scene->image_manager;
+    handle = image_manager->add_image(filename.string(), image_params());
   }
 
-  if (slots[0] == -1) {
+  const ImageMetaData metadata = handle.metadata();
+  const bool is_float = metadata.is_float();
+  const bool compress_as_srgb = metadata.compress_as_srgb;
+  const ustring known_colorspace = metadata.colorspace;
+
+  if (handle.svm_slot() == -1) {
     compiler.parameter_texture(
         "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
   }
   else {
-    compiler.parameter_texture("filename", slots[0]);
+    compiler.parameter_texture("filename", handle.svm_slot());
   }
 
   const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
@@ -589,17 +519,26 @@ NODE_DEFINE(EnvironmentTextureNode)
 
 EnvironmentTextureNode::EnvironmentTextureNode() : ImageSlotTextureNode(node_type)
 {
-  is_float = false;
-  compress_as_srgb = false;
   colorspace = u_colorspace_raw;
-  builtin_data = NULL;
   animated = false;
 }
 
 ShaderNode *EnvironmentTextureNode::clone() const
 {
-  add_image_user();
-  return new EnvironmentTextureNode(*this);
+  EnvironmentTextureNode *node = new EnvironmentTextureNode(*this);
+  node->handle = handle;
+  return node;
+}
+
+ImageParams EnvironmentTextureNode::image_params() const
+{
+  ImageParams params;
+  params.animated = animated;
+  params.interpolation = interpolation;
+  params.extension = EXTENSION_REPEAT;
+  params.alpha_type = alpha_type;
+  params.colorspace = colorspace;
+  return params;
 }
 
 void EnvironmentTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
@@ -621,93 +560,53 @@ void EnvironmentTextureNode::compile(SVMCompiler &compiler)
   ShaderOutput *color_out = output("Color");
   ShaderOutput *alpha_out = output("Alpha");
 
-  image_manager = compiler.scene->image_manager;
-  if (slots.empty()) {
-    ImageMetaData metadata;
-    int slot = image_manager->add_image(filename.string(),
-                                        builtin_data,
-                                        animated,
-                                        0,
-                                        interpolation,
-                                        EXTENSION_REPEAT,
-                                        alpha_type,
-                                        colorspace,
-                                        metadata);
-    slots.push_back(slot);
-    is_float = metadata.is_float;
-    compress_as_srgb = metadata.compress_as_srgb;
-    known_colorspace = metadata.colorspace;
-  }
-
-  if (slots[0] != -1) {
-    int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
-    uint flags = 0;
-
-    if (compress_as_srgb) {
-      flags |= NODE_IMAGE_COMPRESS_AS_SRGB;
-    }
+  if (handle.empty()) {
+    ImageManager *image_manager = compiler.scene->image_manager;
+    handle = image_manager->add_image(filename.string(), image_params());
+  }
 
-    compiler.add_node(NODE_TEX_ENVIRONMENT,
-                      slots[0],
-                      compiler.encode_uchar4(vector_offset,
-                                             compiler.stack_assign_if_linked(color_out),
-                                             compiler.stack_assign_if_linked(alpha_out),
-                                             flags),
-                      projection);
+  const ImageMetaData metadata = handle.metadata();
+  const bool compress_as_srgb = metadata.compress_as_srgb;
+  const ustring known_colorspace = metadata.colorspace;
 
-    tex_mapping.compile_end(compiler, vector_in, vector_offset);
-  }
-  else {
-    /* image not found */
-    if (!color_out->links.empty()) {
-      compiler.add_node(NODE_VALUE_V, compiler.stack_assign(color_out));
-      compiler.add_node(
-          NODE_VALUE_V,
-          make_float3(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B));
-    }
-    if (!alpha_out->links.empty())
-      compiler.add_node(
-          NODE_VALUE_F, __float_as_int(TEX_IMAGE_MISSING_A), compiler.stack_assign(alpha_out));
+  int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
+  uint flags = 0;
+
+  if (compress_as_srgb) {
+    flags |= NODE_IMAGE_COMPRESS_AS_SRGB;
   }
+
+  compiler.add_node(NODE_TEX_ENVIRONMENT,
+                    handle.svm_slot(),
+                    compiler.encode_uchar4(vector_offset,
+                                           compiler.stack_assign_if_linked(color_out),
+                                           compiler.stack_assign_if_linked(alpha_out),
+                                           flags),
+                    projection);
+
+  tex_mapping.compile_end(compiler, vector_in, vector_offset);
 }
 
 void EnvironmentTextureNode::compile(OSLCompiler &compiler)
 {
+  if (handle.empty()) {
+    ImageManager *image_manager = compiler.scene->image_manager;
+    handle = image_manager->add_image(filename.string(), image_params());
+  }
+
   tex_mapping.compile(compiler);
 
-  /* See comments in ImageTextureNode::compile about support
-   * of builtin images.
-   */
-  image_manager = compiler.scene->image_manager;
-  if (slots.empty()) {
-    ImageMetaData metadata;
-    if (builtin_data == NULL) {
-      image_manager->get_image_metadata(filename.string(), NULL, colorspace, metadata);
-      slots.push_back(-1);
-    }
-    else {
-      int slot = image_manager->add_image(filename.string(),
-                                          builtin_data,
-                                          animated,
-                                          0,
-                                          interpolation,
-                                          EXTENSION_REPEAT,
-                                          alpha_type,
-                                          colorspace,
-                                          metadata);
-      slots.push_back(slot);
-    }
-    is_float = metadata.is_float;
-    compress_as_srgb = metadata.compress_as_srgb;
-    known_colorspace = metadata.colorspace;
-  }
+  const ImageMetaData metadata = handle.metadata();
+  const bool is_float = metadata.is_float();
+  const bool compress_as_srgb = metadata.compress_as_srgb;
+  const ustring known_colorspace = metadata.colorspace;
 
-  if (slots[0] == -1) {
+  if (handle.svm_slot() == -1) {
     compiler.parameter_texture(
         "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
   }
   else {
-    compiler.parameter_texture("filename", slots[0]);
+    compiler.parameter_texture("filename", handle.svm_slot());
   }
 
   compiler.parameter(this, "projection");
@@ -1350,7 +1249,7 @@ NODE_DEFINE(MusgraveTextureNode)
   SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
   SOCKET_IN_FLOAT(detail, "Detail", 2.0f);
   SOCKET_IN_FLOAT(dimension, "Dimension", 2.0f);
-  SOCKET_IN_FLOAT(lacunarity, "Lacunarity", 1.0f);
+  SOCKET_IN_FLOAT(lacunarity, "Lacunarity", 2.0f);
   SOCKET_IN_FLOAT(offset, "Offset", 0.0f);
   SOCKET_IN_FLOAT(gain, "Gain", 1.0f);
 
@@ -1422,15 +1321,33 @@ NODE_DEFINE(WaveTextureNode)
   type_enum.insert("rings", NODE_WAVE_RINGS);
   SOCKET_ENUM(type, "Type", type_enum, NODE_WAVE_BANDS);
 
+  static NodeEnum bands_direction_enum;
+  bands_direction_enum.insert("x", NODE_WAVE_BANDS_DIRECTION_X);
+  bands_direction_enum.insert("y", NODE_WAVE_BANDS_DIRECTION_Y);
+  bands_direction_enum.insert("z", NODE_WAVE_BANDS_DIRECTION_Z);
+  bands_direction_enum.insert("diagonal", NODE_WAVE_BANDS_DIRECTION_DIAGONAL);
+  SOCKET_ENUM(
+      bands_direction, "Bands Direction", bands_direction_enum, NODE_WAVE_BANDS_DIRECTION_X);
+
+  static NodeEnum rings_direction_enum;
+  rings_direction_enum.insert("x", NODE_WAVE_RINGS_DIRECTION_X);
+  rings_direction_enum.insert("y", NODE_WAVE_RINGS_DIRECTION_Y);
+  rings_direction_enum.insert("z", NODE_WAVE_RINGS_DIRECTION_Z);
+  rings_direction_enum.insert("spherical", NODE_WAVE_RINGS_DIRECTION_SPHERICAL);
+  SOCKET_ENUM(
+      rings_direction, "Rings Direction", rings_direction_enum, NODE_WAVE_RINGS_DIRECTION_X);
+
   static NodeEnum profile_enum;
   profile_enum.insert("sine", NODE_WAVE_PROFILE_SIN);
   profile_enum.insert("saw", NODE_WAVE_PROFILE_SAW);
+  profile_enum.insert("tri", NODE_WAVE_PROFILE_TRI);
   SOCKET_ENUM(profile, "Profile", profile_enum, NODE_WAVE_PROFILE_SIN);
 
   SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
   SOCKET_IN_FLOAT(distortion, "Distortion", 0.0f);
   SOCKET_IN_FLOAT(detail, "Detail", 2.0f);
   SOCKET_IN_FLOAT(detail_scale, "Detail Scale", 0.0f);
+  SOCKET_IN_FLOAT(phase, "Phase Offset", 0.0f);
   SOCKET_IN_POINT(
       vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED);
 
@@ -1446,32 +1363,36 @@ WaveTextureNode::WaveTextureNode() : TextureNode(node_type)
 
 void WaveTextureNode::compile(SVMCompiler &compiler)
 {
+  ShaderInput *vector_in = input("Vector");
   ShaderInput *scale_in = input("Scale");
   ShaderInput *distortion_in = input("Distortion");
-  ShaderInput *dscale_in = input("Detail Scale");
   ShaderInput *detail_in = input("Detail");
-  ShaderInput *vector_in = input("Vector");
-  ShaderOutput *fac_out = output("Fac");
+  ShaderInput *dscale_in = input("Detail Scale");
+  ShaderInput *phase_in = input("Phase Offset");
   ShaderOutput *color_out = output("Color");
+  ShaderOutput *fac_out = output("Fac");
 
   int vector_offset = tex_mapping.compile_begin(compiler, vector_in);
 
   compiler.add_node(NODE_TEX_WAVE,
-                    compiler.encode_uchar4(type,
-                                           compiler.stack_assign_if_linked(color_out),
-                                           compiler.stack_assign_if_linked(fac_out),
-                                           compiler.stack_assign_if_linked(dscale_in)),
+                    compiler.encode_uchar4(type, bands_direction, rings_direction, profile),
                     compiler.encode_uchar4(vector_offset,
                                            compiler.stack_assign_if_linked(scale_in),
-                                           compiler.stack_assign_if_linked(detail_in),
-                                           compiler.stack_assign_if_linked(distortion_in)),
-                    profile);
+                                           compiler.stack_assign_if_linked(distortion_in),
+                                           compiler.stack_assign_if_linked(detail_in)),
+                    compiler.encode_uchar4(compiler.stack_assign_if_linked(dscale_in),
+                                           compiler.stack_assign_if_linked(phase_in),
+                                           compiler.stack_assign_if_linked(color_out),
+                                           compiler.stack_assign_if_linked(fac_out)));
 
   compiler.add_node(__float_as_int(scale),
                     __float_as_int(detail),
                     __float_as_int(distortion),
                     __float_as_int(detail_scale));
 
+  compiler.add_node(
+      __float_as_int(phase), SVM_STACK_INVALID, SVM_STACK_INVALID, SVM_STACK_INVALID);
+
   tex_mapping.compile_end(compiler, vector_in, vector_offset);
 }
 
@@ -1480,6 +1401,8 @@ void WaveTextureNode::compile(OSLCompiler &compiler)
   tex_mapping.compile(compiler);
 
   compiler.parameter(this, "type");
+  compiler.parameter(this, "bands_direction");
+  compiler.parameter(this, "rings_direction");
   compiler.parameter(this, "profile");
 
   compiler.add(this, "node_wave_texture");
@@ -1722,21 +1645,10 @@ NODE_DEFINE(PointDensityTextureNode)
 
 PointDensityTextureNode::PointDensityTextureNode() : ShaderNode(node_type)
 {
-  image_manager = NULL;
-  slot = -1;
-  builtin_data = NULL;
 }
 
 PointDensityTextureNode::~PointDensityTextureNode()
 {
-  if (image_manager) {
-    image_manager->remove_image(filename.string(),
-                                builtin_data,
-                                interpolation,
-                                EXTENSION_CLIP,
-                                IMAGE_ALPHA_AUTO,
-                                ustring());
-  }
 }
 
 ShaderNode *PointDensityTextureNode::clone() const
@@ -1744,10 +1656,9 @@ ShaderNode *PointDensityTextureNode::clone() const
   /* Increase image user count for new node. We need to ensure to not call
    * add_image again, to work around access of freed data on the Blender
    * side. A better solution should be found to avoid this. */
-  if (slot != -1) {
-    image_manager->add_image_user(slot);
-  }
-  return new PointDensityTextureNode(*this);
+  PointDensityTextureNode *node = new PointDensityTextureNode(*this);
+  node->handle = handle; /* TODO: not needed? */
+  return node;
 }
 
 void PointDensityTextureNode::attributes(Shader *shader, AttributeRequestSet *attributes)
@@ -1758,20 +1669,11 @@ void PointDensityTextureNode::attributes(Shader *shader, AttributeRequestSet *at
   ShaderNode::attributes(shader, attributes);
 }
 
-void PointDensityTextureNode::add_image()
+ImageParams PointDensityTextureNode::image_params() const
 {
-  if (slot == -1) {
-    ImageMetaData metadata;
-    slot = image_manager->add_image(filename.string(),
-                                    builtin_data,
-                                    false,
-                                    0,
-                                    interpolation,
-                                    EXTENSION_CLIP,
-                                    IMAGE_ALPHA_AUTO,
-                                    u_colorspace_raw,
-                                    metadata);
-  }
+  ImageParams params;
+  params.interpolation = interpolation;
+  return params;
 }
 
 void PointDensityTextureNode::compile(SVMCompiler &compiler)
@@ -1783,11 +1685,13 @@ void PointDensityTextureNode::compile(SVMCompiler &compiler)
   const bool use_density = !density_out->links.empty();
   const bool use_color = !color_out->links.empty();
 
-  image_manager = compiler.scene->image_manager;
-
   if (use_density || use_color) {
-    add_image();
+    if (handle.empty()) {
+      ImageManager *image_manager = compiler.scene->image_manager;
+      handle = image_manager->add_image(filename.string(), image_params());
+    }
 
+    const int slot = handle.svm_slot();
     if (slot != -1) {
       compiler.stack_assign(vector_in);
       compiler.add_node(NODE_TEX_VOXEL,
@@ -1824,12 +1728,13 @@ void PointDensityTextureNode::compile(OSLCompiler &compiler)
   const bool use_density = !density_out->links.empty();
   const bool use_color = !color_out->links.empty();
 
-  image_manager = compiler.scene->image_manager;
-
   if (use_density || use_color) {
-    add_image();
+    if (handle.empty()) {
+      ImageManager *image_manager = compiler.scene->image_manager;
+      handle = image_manager->add_image(filename.string(), image_params());
+    }
 
-    compiler.parameter_texture("filename", slot);
+    compiler.parameter_texture("filename", handle.svm_slot());
     if (space == NODE_TEX_VOXEL_SPACE_WORLD) {
       compiler.parameter("mapping", tfm);
       compiler.parameter("use_mapping", 1);
@@ -3343,7 +3248,7 @@ NODE_DEFINE(PrincipledVolumeNode)
   SOCKET_IN_COLOR(emission_color, "Emission Color", make_float3(1.0f, 1.0f, 1.0f));
   SOCKET_IN_FLOAT(blackbody_intensity, "Blackbody Intensity", 0.0f);
   SOCKET_IN_COLOR(blackbody_tint, "Blackbody Tint", make_float3(1.0f, 1.0f, 1.0f));
-  SOCKET_IN_FLOAT(temperature, "Temperature", 1500.0f);
+  SOCKET_IN_FLOAT(temperature, "Temperature", 1000.0f);
   SOCKET_IN_FLOAT(volume_mix_weight, "VolumeMixWeight", 0.0f, SocketType::SVM_INTERNAL);
 
   SOCKET_OUT_CLOSURE(volume, "Volume");
@@ -3354,6 +3259,8 @@ NODE_DEFINE(PrincipledVolumeNode)
 PrincipledVolumeNode::PrincipledVolumeNode() : VolumeNode(node_type)
 {
   closure = CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID;
+  density_attribute = ustring("density");
+  temperature_attribute = ustring("temperature");
 }
 
 void PrincipledVolumeNode::attributes(Shader *shader, AttributeRequestSet *attributes)
@@ -4495,7 +4402,10 @@ VertexColorNode::VertexColorNode() : ShaderNode(node_type)
 void VertexColorNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
   if (!(output("Color")->links.empty() && output("Alpha")->links.empty())) {
-    attributes->add_standard(layer_name);
+    if (layer_name.empty())
+      attributes->add(ATTR_STD_VERTEX_COLOR);
+    else
+      attributes->add_standard(layer_name);
   }
   ShaderNode::attributes(shader, attributes);
 }
@@ -4504,7 +4414,14 @@ void VertexColorNode::compile(SVMCompiler &compiler)
 {
   ShaderOutput *color_out = output("Color");
   ShaderOutput *alpha_out = output("Alpha");
-  int layer_id = compiler.attribute(layer_name);
+  int layer_id = 0;
+
+  if (layer_name != "") {
+    layer_id = compiler.attribute(layer_name);
+  }
+  else {
+    layer_id = compiler.attribute(ATTR_STD_VERTEX_COLOR);
+  }
 
   ShaderNodeType node;
 
@@ -4531,7 +4448,19 @@ void VertexColorNode::compile(OSLCompiler &compiler)
   else {
     compiler.parameter("bump_offset", "center");
   }
-  compiler.parameter("layer_name", layer_name.c_str());
+
+  /* Always pass the attribute through "layer_name": an empty name selects the default
+   * vertex color, and names of standard attributes are addressed with a "geom:" prefix. */
+  if (layer_name.empty()) {
+    compiler.parameter("layer_name", ustring("geom:vertex_color"));
+  }
+  else if (Attribute::name_standard(layer_name.c_str()) != ATTR_STD_NONE) {
+    compiler.parameter("layer_name", (string("geom:") + layer_name.c_str()).c_str());
+  }
+  else {
+    compiler.parameter("layer_name", layer_name.c_str());
+  }
+
   compiler.add(this, "node_vertex_color");
 }
 
@@ -6022,14 +5951,20 @@ NODE_DEFINE(VectorMathNode)
   type_enum.insert("floor", NODE_VECTOR_MATH_FLOOR);
   type_enum.insert("ceil", NODE_VECTOR_MATH_CEIL);
   type_enum.insert("modulo", NODE_VECTOR_MATH_MODULO);
+  type_enum.insert("wrap", NODE_VECTOR_MATH_WRAP);
   type_enum.insert("fraction", NODE_VECTOR_MATH_FRACTION);
   type_enum.insert("absolute", NODE_VECTOR_MATH_ABSOLUTE);
   type_enum.insert("minimum", NODE_VECTOR_MATH_MINIMUM);
   type_enum.insert("maximum", NODE_VECTOR_MATH_MAXIMUM);
+
+  type_enum.insert("sine", NODE_VECTOR_MATH_SINE);
+  type_enum.insert("cosine", NODE_VECTOR_MATH_COSINE);
+  type_enum.insert("tangent", NODE_VECTOR_MATH_TANGENT);
   SOCKET_ENUM(type, "Type", type_enum, NODE_VECTOR_MATH_ADD);
 
   SOCKET_IN_VECTOR(vector1, "Vector1", make_float3(0.0f, 0.0f, 0.0f));
   SOCKET_IN_VECTOR(vector2, "Vector2", make_float3(0.0f, 0.0f, 0.0f));
+  SOCKET_IN_VECTOR(vector3, "Vector3", make_float3(0.0f, 0.0f, 0.0f));
   SOCKET_IN_FLOAT(scale, "Scale", 1.0f);
 
   SOCKET_OUT_FLOAT(value, "Value");
@@ -6048,7 +5983,7 @@ void VectorMathNode::constant_fold(const ConstantFolder &folder)
   float3 vector = make_float3(0.0f, 0.0f, 0.0f);
 
   if (folder.all_inputs_constant()) {
-    svm_vector_math(&value, &vector, type, vector1, vector2, scale);
+    svm_vector_math(&value, &vector, type, vector1, vector2, vector3, scale);
     if (folder.output == output("Value")) {
       folder.make_constant(value);
     }
@@ -6075,11 +6010,24 @@ void VectorMathNode::compile(SVMCompiler &compiler)
   int value_stack_offset = compiler.stack_assign_if_linked(value_out);
   int vector_stack_offset = compiler.stack_assign_if_linked(vector_out);
 
-  compiler.add_node(
-      NODE_VECTOR_MATH,
-      type,
-      compiler.encode_uchar4(vector1_stack_offset, vector2_stack_offset, scale_stack_offset),
-      compiler.encode_uchar4(value_stack_offset, vector_stack_offset));
+  /* 3 Vector Operators */
+  if (type == NODE_VECTOR_MATH_WRAP) {
+    ShaderInput *vector3_in = input("Vector3");
+    int vector3_stack_offset = compiler.stack_assign(vector3_in);
+    compiler.add_node(
+        NODE_VECTOR_MATH,
+        type,
+        compiler.encode_uchar4(vector1_stack_offset, vector2_stack_offset, scale_stack_offset),
+        compiler.encode_uchar4(value_stack_offset, vector_stack_offset));
+    compiler.add_node(vector3_stack_offset);
+  }
+  else {
+    compiler.add_node(
+        NODE_VECTOR_MATH,
+        type,
+        compiler.encode_uchar4(vector1_stack_offset, vector2_stack_offset, scale_stack_offset),
+        compiler.encode_uchar4(value_stack_offset, vector_stack_offset));
+  }
 }
 
 void VectorMathNode::compile(OSLCompiler &compiler)
@@ -6088,6 +6036,62 @@ void VectorMathNode::compile(OSLCompiler &compiler)
   compiler.add(this, "node_vector_math");
 }
 
+/* Vector Rotate */
+
+NODE_DEFINE(VectorRotateNode)
+{
+  NodeType *type = NodeType::add("vector_rotate", create, NodeType::SHADER);
+
+  static NodeEnum type_enum;
+  type_enum.insert("axis", NODE_VECTOR_ROTATE_TYPE_AXIS);
+  type_enum.insert("x_axis", NODE_VECTOR_ROTATE_TYPE_AXIS_X);
+  type_enum.insert("y_axis", NODE_VECTOR_ROTATE_TYPE_AXIS_Y);
+  type_enum.insert("z_axis", NODE_VECTOR_ROTATE_TYPE_AXIS_Z);
+  type_enum.insert("euler_xyz", NODE_VECTOR_ROTATE_TYPE_EULER_XYZ);
+  SOCKET_ENUM(type, "Type", type_enum, NODE_VECTOR_ROTATE_TYPE_AXIS);
+
+  SOCKET_BOOLEAN(invert, "Invert", false);
+
+  SOCKET_IN_VECTOR(vector, "Vector", make_float3(0.0f, 0.0f, 0.0f));
+  SOCKET_IN_POINT(rotation, "Rotation", make_float3(0.0f, 0.0f, 0.0f));
+  SOCKET_IN_POINT(center, "Center", make_float3(0.0f, 0.0f, 0.0f));
+  SOCKET_IN_VECTOR(axis, "Axis", make_float3(0.0f, 0.0f, 1.0f));
+  SOCKET_IN_FLOAT(angle, "Angle", 0.0f);
+  SOCKET_OUT_VECTOR(vector, "Vector");
+
+  return type;
+}
+
+VectorRotateNode::VectorRotateNode() : ShaderNode(node_type)
+{
+}
+
+void VectorRotateNode::compile(SVMCompiler &compiler)
+{
+  ShaderInput *vector_in = input("Vector");
+  ShaderInput *rotation_in = input("Rotation");
+  ShaderInput *center_in = input("Center");
+  ShaderInput *axis_in = input("Axis");
+  ShaderInput *angle_in = input("Angle");
+  ShaderOutput *vector_out = output("Vector");
+
+  compiler.add_node(
+      NODE_VECTOR_ROTATE,
+      compiler.encode_uchar4(
+          type, compiler.stack_assign(vector_in), compiler.stack_assign(rotation_in), invert),
+      compiler.encode_uchar4(compiler.stack_assign(center_in),
+                             compiler.stack_assign(axis_in),
+                             compiler.stack_assign(angle_in)),
+      compiler.stack_assign(vector_out));
+}
+
+void VectorRotateNode::compile(OSLCompiler &compiler)
+{
+  compiler.parameter(this, "type");
+  compiler.parameter(this, "invert");
+  compiler.add(this, "node_vector_rotate");
+}
+
 /* VectorTransform */
 
 NODE_DEFINE(VectorTransformNode)
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index a8fe7644957..e201118574b 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -17,8 +17,9 @@
 #ifndef __NODES_H__
 #define __NODES_H__
 
-#include "render/graph.h"
 #include "graph/node.h"
+#include "render/graph.h"
+#include "render/image.h"
 
 #include "util/util_array.h"
 #include "util/util_string.h"
@@ -77,12 +78,15 @@ class ImageSlotTextureNode : public TextureNode {
   explicit ImageSlotTextureNode(const NodeType *node_type) : TextureNode(node_type)
   {
     special_type = SHADER_SPECIAL_TYPE_IMAGE_SLOT;
-    image_manager = NULL;
   }
-  ~ImageSlotTextureNode();
-  void add_image_user() const;
-  ImageManager *image_manager;
-  vector<int> slots;
+
+  virtual bool equals(const ShaderNode &other)
+  {
+    /* Nodes match when the base class comparison passes and both hold the same image handle. */
+    return TextureNode::equals(other) && handle == ((const ImageSlotTextureNode &)other).handle;
+  }
+
+  ImageHandle handle;
 };
 
 class ImageTextureNode : public ImageSlotTextureNode {
@@ -97,14 +101,14 @@ class ImageTextureNode : public ImageSlotTextureNode {
 
   virtual bool equals(const ShaderNode &other)
   {
-    const ImageTextureNode &image_node = (const ImageTextureNode &)other;
-    return ImageSlotTextureNode::equals(other) && builtin_data == image_node.builtin_data &&
-           animated == image_node.animated;
+    const ImageTextureNode &other_node = (const ImageTextureNode &)other;
+    return ImageSlotTextureNode::equals(other) && animated == other_node.animated;
   }
 
+  ImageParams image_params() const;
+
   /* Parameters. */
   ustring filename;
-  void *builtin_data;
   ustring colorspace;
   ImageAlphaType alpha_type;
   NodeImageProjection projection;
@@ -115,11 +119,6 @@ class ImageTextureNode : public ImageSlotTextureNode {
   float3 vector;
   ccl::vector<int> tiles;
 
-  /* Runtime. */
-  bool is_float;
-  bool compress_as_srgb;
-  ustring known_colorspace;
-
  protected:
   void cull_tiles(Scene *scene, ShaderGraph *graph);
 };
@@ -140,25 +139,20 @@ class EnvironmentTextureNode : public ImageSlotTextureNode {
 
   virtual bool equals(const ShaderNode &other)
   {
-    const EnvironmentTextureNode &env_node = (const EnvironmentTextureNode &)other;
-    return ImageSlotTextureNode::equals(other) && builtin_data == env_node.builtin_data &&
-           animated == env_node.animated;
+    const EnvironmentTextureNode &other_node = (const EnvironmentTextureNode &)other;
+    return ImageSlotTextureNode::equals(other) && animated == other_node.animated;
   }
 
+  ImageParams image_params() const;
+
   /* Parameters. */
   ustring filename;
-  void *builtin_data;
   ustring colorspace;
   ImageAlphaType alpha_type;
   NodeEnvironmentProjection projection;
   InterpolationType interpolation;
   bool animated;
   float3 vector;
-
-  /* Runtime. */
-  bool is_float;
-  bool compress_as_srgb;
-  ustring known_colorspace;
 };
 
 class SkyTextureNode : public TextureNode {
@@ -203,6 +197,11 @@ class OutputAOVNode : public ShaderNode {
 
   ustring name;
 
+  virtual int get_group()
+  {
+    return NODE_GROUP_LEVEL_4;
+  }
+
   /* Don't allow output node de-duplication. */
   virtual bool equals(const ShaderNode & /*other*/)
   {
@@ -288,9 +287,11 @@ class WaveTextureNode : public TextureNode {
   }
 
   NodeWaveType type;
+  NodeWaveBandsDirection bands_direction;
+  NodeWaveRingsDirection rings_direction;
   NodeWaveProfile profile;
 
-  float scale, distortion, detail, detail_scale;
+  float scale, distortion, detail, detail_scale, phase;
   float3 vector;
 };
 
@@ -343,7 +344,7 @@ class PointDensityTextureNode : public ShaderNode {
   SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
   virtual int get_group()
   {
-    return NODE_GROUP_LEVEL_3;
+    return NODE_GROUP_LEVEL_4;
   }
 
   ~PointDensityTextureNode();
@@ -363,24 +364,22 @@ class PointDensityTextureNode : public ShaderNode {
     return true;
   }
 
-  void add_image();
-
   /* Parameters. */
   ustring filename;
   NodeTexVoxelSpace space;
   InterpolationType interpolation;
   Transform tfm;
   float3 vector;
-  void *builtin_data;
 
   /* Runtime. */
-  ImageManager *image_manager;
-  int slot;
+  ImageHandle handle;
+
+  ImageParams image_params() const;
 
   virtual bool equals(const ShaderNode &other)
   {
-    const PointDensityTextureNode &point_dendity_node = (const PointDensityTextureNode &)other;
-    return ShaderNode::equals(other) && builtin_data == point_dendity_node.builtin_data;
+    const PointDensityTextureNode &other_node = (const PointDensityTextureNode &)other;
+    return ShaderNode::equals(other) && handle == other_node.handle;
   }
 };
 
@@ -1377,10 +1376,28 @@ class VectorMathNode : public ShaderNode {
 
   float3 vector1;
   float3 vector2;
+  float3 vector3;
   float scale;
   NodeVectorMathType type;
 };
 
+class VectorRotateNode : public ShaderNode {
+ public:
+  SHADER_NODE_CLASS(VectorRotateNode)
+
+  virtual int get_group()
+  {
+    return NODE_GROUP_LEVEL_3;
+  }
+  NodeVectorRotateType type; /* "Type" enum socket, NODE_VECTOR_ROTATE_TYPE_AXIS by default. */
+  bool invert;               /* "Invert" boolean socket, false by default. */
+  float3 vector;             /* "Vector" input, zero by default. */
+  float3 center;             /* "Center" input, zero by default. */
+  float3 axis;               /* "Axis" input, (0, 0, 1) by default. */
+  float angle;               /* "Angle" input, 0 by default. */
+  float3 rotation;           /* "Rotation" input, zero by default. */
+};
+
 class VectorTransformNode : public ShaderNode {
  public:
   SHADER_NODE_CLASS(VectorTransformNode)
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 849329a086d..90a1d90019d 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -14,22 +14,24 @@
  * limitations under the License.
  */
 
-#include "render/camera.h"
+#include "render/object.h"
 #include "device/device.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "render/hair.h"
+#include "render/integrator.h"
 #include "render/light.h"
 #include "render/mesh.h"
-#include "render/curves.h"
-#include "render/object.h"
 #include "render/particles.h"
 #include "render/scene.h"
 
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
+#include "util/util_murmurhash.h"
 #include "util/util_progress.h"
 #include "util/util_set.h"
 #include "util/util_vector.h"
-#include "util/util_murmurhash.h"
 
 #include "subd/subd_patch_table.h"
 
@@ -64,6 +66,7 @@ struct UpdateObjectTransformState {
   KernelObject *objects;
   Transform *object_motion_pass;
   DecomposedTransform *object_motion;
+  float *object_volume_step;
 
   /* Flags which will be synchronized to Integrator. */
   bool have_motion;
@@ -87,7 +90,7 @@ NODE_DEFINE(Object)
 {
   NodeType *type = NodeType::add("object", create);
 
-  SOCKET_NODE(mesh, "Mesh", &Mesh::node_type);
+  SOCKET_NODE(geometry, "Geometry", &Geometry::node_base_type);
   SOCKET_TRANSFORM(tfm, "Transform", transform_identity());
   SOCKET_UINT(visibility, "Visibility", ~0);
   SOCKET_COLOR(color, "Color", make_float3(0.0f, 0.0f, 0.0f));
@@ -152,7 +155,7 @@ void Object::update_motion()
 
 void Object::compute_bounds(bool motion_blur)
 {
-  BoundBox mbounds = mesh->bounds;
+  BoundBox mbounds = geometry->bounds;
 
   if (motion_blur && use_motion()) {
     array<DecomposedTransform> decomp(motion.size());
@@ -172,7 +175,7 @@ void Object::compute_bounds(bool motion_blur)
   }
   else {
     /* No motion blur case. */
-    if (mesh->transform_applied) {
+    if (geometry->transform_applied) {
       bounds = mbounds;
     }
     else {
@@ -183,89 +186,18 @@ void Object::compute_bounds(bool motion_blur)
 
 void Object::apply_transform(bool apply_to_motion)
 {
-  if (!mesh || tfm == transform_identity())
+  if (!geometry || tfm == transform_identity())
     return;
 
-  /* triangles */
-  if (mesh->verts.size()) {
-    /* store matrix to transform later. when accessing these as attributes we
-     * do not want the transform to be applied for consistency between static
-     * and dynamic BVH, so we do it on packing. */
-    mesh->transform_normal = transform_transposed_inverse(tfm);
-
-    /* apply to mesh vertices */
-    for (size_t i = 0; i < mesh->verts.size(); i++)
-      mesh->verts[i] = transform_point(&tfm, mesh->verts[i]);
-
-    if (apply_to_motion) {
-      Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-      if (attr) {
-        size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
-        float3 *vert_steps = attr->data_float3();
-
-        for (size_t i = 0; i < steps_size; i++)
-          vert_steps[i] = transform_point(&tfm, vert_steps[i]);
-      }
-
-      Attribute *attr_N = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
-
-      if (attr_N) {
-        Transform ntfm = mesh->transform_normal;
-        size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
-        float3 *normal_steps = attr_N->data_float3();
-
-        for (size_t i = 0; i < steps_size; i++)
-          normal_steps[i] = normalize(transform_direction(&ntfm, normal_steps[i]));
-      }
-    }
-  }
-
-  /* curves */
-  if (mesh->curve_keys.size()) {
-    /* compute uniform scale */
-    float3 c0 = transform_get_column(&tfm, 0);
-    float3 c1 = transform_get_column(&tfm, 1);
-    float3 c2 = transform_get_column(&tfm, 2);
-    float scalar = powf(fabsf(dot(cross(c0, c1), c2)), 1.0f / 3.0f);
-
-    /* apply transform to curve keys */
-    for (size_t i = 0; i < mesh->curve_keys.size(); i++) {
-      float3 co = transform_point(&tfm, mesh->curve_keys[i]);
-      float radius = mesh->curve_radius[i] * scalar;
-
-      /* scale for curve radius is only correct for uniform scale */
-      mesh->curve_keys[i] = co;
-      mesh->curve_radius[i] = radius;
-    }
-
-    if (apply_to_motion) {
-      Attribute *curve_attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-      if (curve_attr) {
-        /* apply transform to motion curve keys */
-        size_t steps_size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
-        float4 *key_steps = curve_attr->data_float4();
-
-        for (size_t i = 0; i < steps_size; i++) {
-          float3 co = transform_point(&tfm, float4_to_float3(key_steps[i]));
-          float radius = key_steps[i].w * scalar;
-
-          /* scale for curve radius is only correct for uniform scale */
-          key_steps[i] = float3_to_float4(co);
-          key_steps[i].w = radius;
-        }
-      }
-    }
-  }
+  geometry->apply_transform(tfm, apply_to_motion);
 
   /* we keep normals pointing in same direction on negative scale, notify
-   * mesh about this in it (re)calculates normals */
+   * geometry about this so it can account for it when it (re)calculates normals */
   if (transform_negative_scale(tfm))
-    mesh->transform_negative_scaled = true;
+    geometry->transform_negative_scaled = true;
 
   if (bounds.valid()) {
-    mesh->compute_bounds();
+    geometry->compute_bounds();
     compute_bounds(false);
   }
 
@@ -275,11 +207,11 @@ void Object::apply_transform(bool apply_to_motion)
 
 void Object::tag_update(Scene *scene)
 {
-  if (mesh) {
-    if (mesh->transform_applied)
-      mesh->need_update = true;
+  if (geometry) {
+    if (geometry->transform_applied)
+      geometry->need_update = true;
 
-    foreach (Shader *shader, mesh->used_shaders) {
+    foreach (Shader *shader, geometry->used_shaders) {
       if (shader->use_mis && shader->has_surface_emission)
         scene->light_manager->need_update = true;
     }
@@ -287,7 +219,7 @@ void Object::tag_update(Scene *scene)
 
   scene->camera->need_flags_update = true;
   scene->curve_system_manager->need_update = true;
-  scene->mesh_manager->need_update = true;
+  scene->geometry_manager->need_update = true;
   scene->object_manager->need_update = true;
 }
 
@@ -336,6 +268,82 @@ uint Object::visibility_for_tracing() const
   return trace_visibility;
 }
 
+/* Volume step size for this object: the smallest voxel grid step found on its mesh, multiplied
+ * by the smallest step rate of its volume shaders. Returns FLT_MAX when the geometry is not a
+ * mesh with a volume, or when no shader contributes a step rate. */
+float Object::compute_volume_step_size() const
+{
+  if (geometry->type != Geometry::MESH) {
+    return FLT_MAX;
+  }
+
+  Mesh *volume_mesh = static_cast<Mesh *>(geometry);
+  if (!volume_mesh->has_volume) {
+    return FLT_MAX;
+  }
+
+  /* Smallest step rate among volume shaders that are heterogeneous and spatially varying, or
+   * that have a volume attribute dependency. */
+  float min_step_rate = FLT_MAX;
+  foreach (Shader *shader, volume_mesh->used_shaders) {
+    if (!shader->has_volume) {
+      continue;
+    }
+    const bool spatially_varying = shader->heterogeneous_volume &&
+                                   shader->has_volume_spatial_varying;
+    if (spatially_varying || shader->has_volume_attribute_dependency) {
+      min_step_rate = fminf(shader->volume_step_rate, min_step_rate);
+    }
+  }
+
+  if (min_step_rate == FLT_MAX) {
+    return FLT_MAX;
+  }
+
+  /* Smallest step size over all voxel grid attributes with non-zero dimensions. */
+  float min_step_size = FLT_MAX;
+  foreach (Attribute &attr, volume_mesh->attributes.attributes) {
+    if (attr.element != ATTR_ELEMENT_VOXEL) {
+      continue;
+    }
+
+    const ImageMetaData &metadata = attr.data_voxel().metadata();
+    if (metadata.width == 0 || metadata.height == 0 || metadata.depth == 0) {
+      continue;
+    }
+
+    /* Start from the user specified step size, zero means it is detected automatically. */
+    float grid_step_size = volume_mesh->volume_step_size;
+
+    if (grid_step_size == 0.0f) {
+      /* Size of one voxel, transformed from voxel to world space. */
+      const float3 voxel = make_float3(
+          1.0f / metadata.width, 1.0f / metadata.height, 1.0f / metadata.depth);
+
+      Transform voxel_to_world = tfm;
+      if (metadata.use_transform_3d) {
+        voxel_to_world = tfm * transform_inverse(metadata.transform_3d);
+      }
+      grid_step_size = min3(fabs(transform_direction(&voxel_to_world, voxel)));
+    }
+    else if (volume_mesh->volume_object_space) {
+      /* User specified step size in object space, transformed by the object transform. */
+      const float3 step = make_float3(grid_step_size, grid_step_size, grid_step_size);
+      grid_step_size = min3(fabs(transform_direction(&tfm, step)));
+    }
+
+    if (grid_step_size > 0.0f) {
+      min_step_size = fminf(grid_step_size, min_step_size);
+    }
+  }
+
+  /* No voxel grid gave a step size (e.g. procedural volumes): use 1/10th of the bounds. */
+  if (min_step_size == FLT_MAX) {
+    min_step_size = 0.1f * average(bounds.size());
+  }
+  return min_step_size * min_step_rate;
+}
+
 int Object::get_device_index() const
 {
   return index;
@@ -353,32 +361,33 @@ ObjectManager::~ObjectManager()
 {
 }
 
-void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, Object *ob)
+static float object_surface_area(UpdateObjectTransformState *state,
+                                 const Transform &tfm,
+                                 Geometry *geom)
 {
-  KernelObject &kobject = state->objects[ob->index];
-  Transform *object_motion_pass = state->object_motion_pass;
-
-  Mesh *mesh = ob->mesh;
-  uint flag = 0;
+  if (geom->type != Geometry::MESH) {
+    return 0.0f;
+  }
 
-  /* Compute transformations. */
-  Transform tfm = ob->tfm;
-  Transform itfm = transform_inverse(tfm);
+  Mesh *mesh = static_cast<Mesh *>(geom);
+  if (mesh->has_volume) {
+    /* Volume density automatically adjusts to object scale. */
+    if (mesh->volume_object_space) {
+      const float3 unit = normalize(make_float3(1.0f, 1.0f, 1.0f));
+      return 1.0f / len(transform_direction(&tfm, unit));
+    }
+    else {
+      return 1.0f;
+    }
+  }
 
   /* Compute surface area. for uniform scale we can do avoid the many
    * transform calls and share computation for instances.
    *
    * TODO(brecht): Correct for displacement, and move to a better place.
    */
-  float uniform_scale;
   float surface_area = 0.0f;
-  float3 color = ob->color;
-  float pass_id = ob->pass_id;
-  float random_number = (float)ob->random_id * (1.0f / (float)0xFFFFFFFF);
-  int particle_index = (ob->particle_system) ?
-                           ob->particle_index + state->particle_offset[ob->particle_system] :
-                           0;
-
+  float uniform_scale;
   if (transform_uniform_scale(tfm, uniform_scale)) {
     map<Mesh *, float>::iterator it;
 
@@ -424,9 +433,31 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
     }
   }
 
+  return surface_area;
+}
+
+void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, Object *ob)
+{
+  KernelObject &kobject = state->objects[ob->index];
+  Transform *object_motion_pass = state->object_motion_pass;
+
+  Geometry *geom = ob->geometry;
+  uint flag = 0;
+
+  /* Compute transformations. */
+  Transform tfm = ob->tfm;
+  Transform itfm = transform_inverse(tfm);
+
+  float3 color = ob->color;
+  float pass_id = ob->pass_id;
+  float random_number = (float)ob->random_id * (1.0f / (float)0xFFFFFFFF);
+  int particle_index = (ob->particle_system) ?
+                           ob->particle_index + state->particle_offset[ob->particle_system] :
+                           0;
+
   kobject.tfm = tfm;
   kobject.itfm = itfm;
-  kobject.surface_area = surface_area;
+  kobject.surface_area = object_surface_area(state, tfm, geom);
   kobject.color[0] = color.x;
   kobject.color[1] = color.y;
   kobject.color[2] = color.z;
@@ -435,11 +466,16 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
   kobject.particle_index = particle_index;
   kobject.motion_offset = 0;
 
-  if (mesh->use_motion_blur) {
+  if (geom->use_motion_blur) {
     state->have_motion = true;
   }
-  if (mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
-    flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+
+  if (geom->type == Geometry::MESH) {
+    /* TODO: why only mesh? */
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    if (mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+      flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+    }
   }
 
   if (state->need_motion == Scene::MOTION_PASS) {
@@ -460,7 +496,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
     /* Motion transformations, is world/object space depending if mesh
      * comes with deformed position in object space, or if we transform
      * the shading point in world space. */
-    if (!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+    if (!(flag & SD_OBJECT_HAS_VERTEX_MOTION)) {
       tfm_pre = tfm_pre * itfm;
       tfm_post = tfm_post * itfm;
     }
@@ -485,12 +521,13 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
   kobject.dupli_generated[0] = ob->dupli_generated[0];
   kobject.dupli_generated[1] = ob->dupli_generated[1];
   kobject.dupli_generated[2] = ob->dupli_generated[2];
-  kobject.numkeys = mesh->curve_keys.size();
+  kobject.numkeys = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom)->curve_keys.size() :
+                                                     0;
   kobject.dupli_uv[0] = ob->dupli_uv[0];
   kobject.dupli_uv[1] = ob->dupli_uv[1];
-  int totalsteps = mesh->motion_steps;
+  int totalsteps = geom->motion_steps;
   kobject.numsteps = (totalsteps - 1) / 2;
-  kobject.numverts = mesh->verts.size();
+  kobject.numverts = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom)->verts.size() : 0;
   kobject.patch_map_offset = 0;
   kobject.attribute_map_offset = 0;
   uint32_t hash_name = util_murmur_hash3(ob->name.c_str(), ob->name.length(), 0);
@@ -503,9 +540,10 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
     flag |= SD_OBJECT_HOLDOUT_MASK;
   }
   state->object_flag[ob->index] = flag;
+  state->object_volume_step[ob->index] = FLT_MAX;
 
   /* Have curves. */
-  if (mesh->num_curves()) {
+  if (geom->type == Geometry::HAIR) {
     state->have_curves = true;
   }
 }
@@ -556,6 +594,7 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene,
 
   state.objects = dscene->objects.alloc(scene->objects.size());
   state.object_flag = dscene->object_flag.alloc(scene->objects.size());
+  state.object_volume_step = dscene->object_volume_step.alloc(scene->objects.size());
   state.object_motion = NULL;
   state.object_motion_pass = NULL;
 
@@ -676,25 +715,30 @@ void ObjectManager::device_update_flags(
 
   /* Object info flag. */
   uint *object_flag = dscene->object_flag.data();
+  float *object_volume_step = dscene->object_volume_step.data();
 
   /* Object volume intersection. */
   vector<Object *> volume_objects;
   bool has_volume_objects = false;
   foreach (Object *object, scene->objects) {
-    if (object->mesh->has_volume) {
+    if (object->geometry->has_volume) {
       if (bounds_valid) {
         volume_objects.push_back(object);
       }
       has_volume_objects = true;
+      object_volume_step[object->index] = object->compute_volume_step_size();
+    }
+    else {
+      object_volume_step[object->index] = FLT_MAX;
     }
   }
 
   foreach (Object *object, scene->objects) {
-    if (object->mesh->has_volume) {
+    if (object->geometry->has_volume) {
       object_flag[object->index] |= SD_OBJECT_HAS_VOLUME;
       object_flag[object->index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
 
-      foreach (Attribute &attr, object->mesh->attributes.attributes) {
+      foreach (Attribute &attr, object->geometry->attributes.attributes) {
         if (attr.element == ATTR_ELEMENT_VOXEL) {
           object_flag[object->index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
         }
@@ -703,6 +747,7 @@ void ObjectManager::device_update_flags(
     else {
       object_flag[object->index] &= ~(SD_OBJECT_HAS_VOLUME | SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
     }
+
     if (object->is_shadow_catcher) {
       object_flag[object->index] |= SD_OBJECT_SHADOW_CATCHER;
     }
@@ -731,6 +776,7 @@ void ObjectManager::device_update_flags(
 
   /* Copy object flag. */
   dscene->object_flag.copy_to_device();
+  dscene->object_volume_step.copy_to_device();
 }
 
 void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Scene *scene)
@@ -744,21 +790,24 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc
   bool update = false;
 
   foreach (Object *object, scene->objects) {
-    Mesh *mesh = object->mesh;
-
-    if (mesh->patch_table) {
-      uint patch_map_offset = 2 * (mesh->patch_table_offset + mesh->patch_table->total_size() -
-                                   mesh->patch_table->num_nodes * PATCH_NODE_SIZE) -
-                              mesh->patch_offset;
-
-      if (kobjects[object->index].patch_map_offset != patch_map_offset) {
-        kobjects[object->index].patch_map_offset = patch_map_offset;
-        update = true;
+    Geometry *geom = object->geometry;
+
+    if (geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+      if (mesh->patch_table) {
+        uint patch_map_offset = 2 * (mesh->patch_table_offset + mesh->patch_table->total_size() -
+                                     mesh->patch_table->num_nodes * PATCH_NODE_SIZE) -
+                                mesh->patch_offset;
+
+        if (kobjects[object->index].patch_map_offset != patch_map_offset) {
+          kobjects[object->index].patch_map_offset = patch_map_offset;
+          update = true;
+        }
       }
     }
 
-    if (kobjects[object->index].attribute_map_offset != mesh->attr_map_offset) {
-      kobjects[object->index].attribute_map_offset = mesh->attr_map_offset;
+    if (kobjects[object->index].attribute_map_offset != geom->attr_map_offset) {
+      kobjects[object->index].attribute_map_offset = geom->attr_map_offset;
       update = true;
     }
   }
@@ -774,15 +823,16 @@ void ObjectManager::device_free(Device *, DeviceScene *dscene)
   dscene->object_motion_pass.free();
   dscene->object_motion.free();
   dscene->object_flag.free();
+  dscene->object_volume_step.free();
 }
 
 void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress &progress)
 {
   /* todo: normals and displacement should be done before applying transform! */
-  /* todo: create objects/meshes in right order! */
+  /* todo: create objects/geometry in right order! */
 
-  /* counter mesh users */
-  map<Mesh *, int> mesh_users;
+  /* counter geometry users */
+  map<Geometry *, int> geometry_users;
   Scene::MotionType need_motion = scene->need_motion();
   bool motion_blur = need_motion == Scene::MOTION_BLUR;
   bool apply_to_motion = need_motion != Scene::MOTION_PASS;
@@ -790,10 +840,10 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P
   bool have_instancing = false;
 
   foreach (Object *object, scene->objects) {
-    map<Mesh *, int>::iterator it = mesh_users.find(object->mesh);
+    map<Geometry *, int>::iterator it = geometry_users.find(object->geometry);
 
-    if (it == mesh_users.end())
-      mesh_users[object->mesh] = 1;
+    if (it == geometry_users.end())
+      geometry_users[object->geometry] = 1;
     else
       it->second++;
   }
@@ -803,27 +853,34 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P
 
   uint *object_flag = dscene->object_flag.data();
 
-  /* apply transforms for objects with single user meshes */
+  /* apply transforms for objects with single user geometry */
   foreach (Object *object, scene->objects) {
     /* Annoying feedback loop here: we can't use is_instanced() because
      * it'll use uninitialized transform_applied flag.
      *
-     * Could be solved by moving reference counter to Mesh.
+     * Could be solved by moving reference counter to Geometry.
      */
-    if ((mesh_users[object->mesh] == 1 && !object->mesh->has_surface_bssrdf) &&
-        !object->mesh->has_true_displacement() &&
-        object->mesh->subdivision_type == Mesh::SUBDIVISION_NONE) {
+    Geometry *geom = object->geometry;
+    bool apply = (geometry_users[geom] == 1) && !geom->has_surface_bssrdf &&
+                 !geom->has_true_displacement();
+
+    if (geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+      apply = apply && mesh->subdivision_type == Mesh::SUBDIVISION_NONE;
+    }
+
+    if (apply) {
       if (!(motion_blur && object->use_motion())) {
-        if (!object->mesh->transform_applied) {
+        if (!geom->transform_applied) {
           object->apply_transform(apply_to_motion);
-          object->mesh->transform_applied = true;
+          geom->transform_applied = true;
 
           if (progress.get_cancel())
             return;
         }
 
         object_flag[i] |= SD_OBJECT_TRANSFORM_APPLIED;
-        if (object->mesh->transform_negative_scaled)
+        if (geom->transform_negative_scaled)
           object_flag[i] |= SD_OBJECT_NEGATIVE_SCALE_APPLIED;
       }
       else
@@ -842,7 +899,7 @@ void ObjectManager::tag_update(Scene *scene)
 {
   need_update = true;
   scene->curve_system_manager->need_update = true;
-  scene->mesh_manager->need_update = true;
+  scene->geometry_manager->need_update = true;
   scene->light_manager->need_update = true;
 }
 
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index cbbff0d4c6d..7c84c2de4fb 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -23,8 +23,8 @@
 #include "util/util_array.h"
 #include "util/util_boundbox.h"
 #include "util/util_param.h"
-#include "util/util_transform.h"
 #include "util/util_thread.h"
+#include "util/util_transform.h"
 #include "util/util_types.h"
 #include "util/util_vector.h"
 
@@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN
 
 class Device;
 class DeviceScene;
-class Mesh;
+class Geometry;
 class ParticleSystem;
 class Progress;
 class Scene;
@@ -46,7 +46,7 @@ class Object : public Node {
  public:
   NODE_DECLARE
 
-  Mesh *mesh;
+  Geometry *geometry;
   Transform tfm;
   BoundBox bounds;
   uint random_id;
@@ -81,6 +81,9 @@ class Object : public Node {
   int motion_step(float time) const;
   void update_motion();
 
+  /* Maximum number of motion steps supported (due to Embree). */
+  static const uint MAX_MOTION_STEPS = 129;
+
   /* Check whether object is traceable and it worth adding it to
    * kernel scene.
    */
@@ -94,6 +97,9 @@ class Object : public Node {
   /* Returns the index that is used in the kernel for this object. */
   int get_device_index() const;
 
+  /* Compute step size from attributes, shaders, transforms. */
+  float compute_volume_step_size() const;
+
  protected:
   /* Specifies the position of the object in scene->objects and
    * in the device vectors. Gets set in device_update. */
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 1f0a243e6c1..06d832a29ca 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -20,10 +20,10 @@
 #include "render/colorspace.h"
 #include "render/graph.h"
 #include "render/light.h"
+#include "render/nodes.h"
 #include "render/osl.h"
 #include "render/scene.h"
 #include "render/shader.h"
-#include "render/nodes.h"
 
 #ifdef WITH_OSL
 
@@ -102,8 +102,8 @@ void OSLShaderManager::device_update(Device *device,
 
   device_free(device, dscene, scene);
 
-  /* determine which shaders are in use */
-  device_update_shaders_used(scene);
+  /* set texture system */
+  scene->image_manager->set_osl_texture_system((void *)ts);
 
   /* create shaders */
   OSLGlobals *og = (OSLGlobals *)device->osl_memory();
@@ -142,9 +142,6 @@ void OSLShaderManager::device_update(Device *device,
 
   need_update = false;
 
-  /* set texture system */
-  scene->image_manager->set_osl_texture_system((void *)ts);
-
   /* add special builtin texture types */
   services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
   services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
@@ -319,7 +316,7 @@ bool OSLShaderManager::osl_compile(const string &inputfile, const string &output
   string include_path_arg = string("-I") + shader_path;
   options.push_back(include_path_arg);
 
-  stdosl_path = path_get("shader/stdosl.h");
+  stdosl_path = path_get("shader/stdcycles.h");
 
   /* compile */
   OSL::OSLCompiler *compiler = new OSL::OSLCompiler(&OSL::ErrorHandler::default_handler());
@@ -440,27 +437,35 @@ const char *OSLShaderManager::shader_load_bytecode(const string &hash, const str
   return loaded_shaders.find(hash)->first.c_str();
 }
 
-OSLNode *OSLShaderManager::osl_node(const std::string &filepath,
+/* This is a static function to avoid RTTI link errors with only this
+ * file being compiled without RTTI to match OSL and LLVM libraries. */
+OSLNode *OSLShaderManager::osl_node(ShaderManager *manager,
+                                    const std::string &filepath,
                                     const std::string &bytecode_hash,
                                     const std::string &bytecode)
 {
+  if (!manager->use_osl()) {
+    return NULL;
+  }
+
   /* create query */
+  OSLShaderManager *osl_manager = static_cast<OSLShaderManager *>(manager);
   const char *hash;
 
   if (!filepath.empty()) {
-    hash = shader_load_filepath(filepath);
+    hash = osl_manager->shader_load_filepath(filepath);
   }
   else {
-    hash = shader_test_loaded(bytecode_hash);
+    hash = osl_manager->shader_test_loaded(bytecode_hash);
     if (!hash)
-      hash = shader_load_bytecode(bytecode_hash, bytecode);
+      hash = osl_manager->shader_load_bytecode(bytecode_hash, bytecode);
   }
 
   if (!hash) {
     return NULL;
   }
 
-  OSLShaderInfo *info = shader_loaded_info(hash);
+  OSLShaderInfo *info = osl_manager->shader_loaded_info(hash);
 
   /* count number of inputs */
   size_t num_inputs = 0;
@@ -755,16 +760,14 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
   else if (current_type == SHADER_TYPE_VOLUME) {
     if (node->has_spatial_varying())
       current_shader->has_volume_spatial_varying = true;
+    if (node->has_attribute_dependency())
+      current_shader->has_volume_attribute_dependency = true;
   }
 
   if (node->has_object_dependency()) {
     current_shader->has_object_dependency = true;
   }
 
-  if (node->has_attribute_dependency()) {
-    current_shader->has_attribute_dependency = true;
-  }
-
   if (node->has_integrator_dependency()) {
     current_shader->has_integrator_dependency = true;
   }
@@ -1138,8 +1141,8 @@ void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
     shader->has_displacement = false;
     shader->has_surface_spatial_varying = false;
     shader->has_volume_spatial_varying = false;
+    shader->has_volume_attribute_dependency = false;
     shader->has_object_dependency = false;
-    shader->has_attribute_dependency = false;
     shader->has_integrator_dependency = false;
 
     /* generate surface shader */
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index 62cbfebf7eb..4dd9f6630f2 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -93,9 +93,10 @@ class OSLShaderManager : public ShaderManager {
   OSLShaderInfo *shader_loaded_info(const string &hash);
 
   /* create OSL node using OSLQuery */
-  OSLNode *osl_node(const std::string &filepath,
-                    const std::string &bytecode_hash = "",
-                    const std::string &bytecode = "");
+  static OSLNode *osl_node(ShaderManager *manager,
+                           const std::string &filepath,
+                           const std::string &bytecode_hash = "",
+                           const std::string &bytecode = "");
 
  protected:
   void texture_system_init();
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 8335404b197..ec9276eff86 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "device/device.h"
 #include "render/particles.h"
+#include "device/device.h"
 #include "render/scene.h"
 
 #include "util/util_foreach.h"
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 1e75fa0f99b..f5b68d5a4fe 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -16,11 +16,11 @@
 
 #include <stdlib.h>
 
+#include "device/device.h"
 #include "render/background.h"
 #include "render/bake.h"
 #include "render/camera.h"
 #include "render/curves.h"
-#include "device/device.h"
 #include "render/film.h"
 #include "render/integrator.h"
 #include "render/light.h"
@@ -41,50 +41,59 @@
 CCL_NAMESPACE_BEGIN
 
 DeviceScene::DeviceScene(Device *device)
-    : bvh_nodes(device, "__bvh_nodes", MEM_TEXTURE),
-      bvh_leaf_nodes(device, "__bvh_leaf_nodes", MEM_TEXTURE),
-      object_node(device, "__object_node", MEM_TEXTURE),
-      prim_tri_index(device, "__prim_tri_index", MEM_TEXTURE),
-      prim_tri_verts(device, "__prim_tri_verts", MEM_TEXTURE),
-      prim_type(device, "__prim_type", MEM_TEXTURE),
-      prim_visibility(device, "__prim_visibility", MEM_TEXTURE),
-      prim_index(device, "__prim_index", MEM_TEXTURE),
-      prim_object(device, "__prim_object", MEM_TEXTURE),
-      prim_time(device, "__prim_time", MEM_TEXTURE),
-      tri_shader(device, "__tri_shader", MEM_TEXTURE),
-      tri_vnormal(device, "__tri_vnormal", MEM_TEXTURE),
-      tri_vindex(device, "__tri_vindex", MEM_TEXTURE),
-      tri_patch(device, "__tri_patch", MEM_TEXTURE),
-      tri_patch_uv(device, "__tri_patch_uv", MEM_TEXTURE),
-      curves(device, "__curves", MEM_TEXTURE),
-      curve_keys(device, "__curve_keys", MEM_TEXTURE),
-      patches(device, "__patches", MEM_TEXTURE),
-      objects(device, "__objects", MEM_TEXTURE),
-      object_motion_pass(device, "__object_motion_pass", MEM_TEXTURE),
-      object_motion(device, "__object_motion", MEM_TEXTURE),
-      object_flag(device, "__object_flag", MEM_TEXTURE),
-      camera_motion(device, "__camera_motion", MEM_TEXTURE),
-      attributes_map(device, "__attributes_map", MEM_TEXTURE),
-      attributes_float(device, "__attributes_float", MEM_TEXTURE),
-      attributes_float2(device, "__attributes_float2", MEM_TEXTURE),
-      attributes_float3(device, "__attributes_float3", MEM_TEXTURE),
-      attributes_uchar4(device, "__attributes_uchar4", MEM_TEXTURE),
-      light_distribution(device, "__light_distribution", MEM_TEXTURE),
-      lights(device, "__lights", MEM_TEXTURE),
-      light_background_marginal_cdf(device, "__light_background_marginal_cdf", MEM_TEXTURE),
-      light_background_conditional_cdf(device, "__light_background_conditional_cdf", MEM_TEXTURE),
-      particles(device, "__particles", MEM_TEXTURE),
-      svm_nodes(device, "__svm_nodes", MEM_TEXTURE),
-      shaders(device, "__shaders", MEM_TEXTURE),
-      lookup_table(device, "__lookup_table", MEM_TEXTURE),
-      sobol_directions(device, "__sobol_directions", MEM_TEXTURE),
-      ies_lights(device, "__ies", MEM_TEXTURE)
+    : bvh_nodes(device, "__bvh_nodes", MEM_GLOBAL),
+      bvh_leaf_nodes(device, "__bvh_leaf_nodes", MEM_GLOBAL),
+      object_node(device, "__object_node", MEM_GLOBAL),
+      prim_tri_index(device, "__prim_tri_index", MEM_GLOBAL),
+      prim_tri_verts(device, "__prim_tri_verts", MEM_GLOBAL),
+      prim_type(device, "__prim_type", MEM_GLOBAL),
+      prim_visibility(device, "__prim_visibility", MEM_GLOBAL),
+      prim_index(device, "__prim_index", MEM_GLOBAL),
+      prim_object(device, "__prim_object", MEM_GLOBAL),
+      prim_time(device, "__prim_time", MEM_GLOBAL),
+      tri_shader(device, "__tri_shader", MEM_GLOBAL),
+      tri_vnormal(device, "__tri_vnormal", MEM_GLOBAL),
+      tri_vindex(device, "__tri_vindex", MEM_GLOBAL),
+      tri_patch(device, "__tri_patch", MEM_GLOBAL),
+      tri_patch_uv(device, "__tri_patch_uv", MEM_GLOBAL),
+      curves(device, "__curves", MEM_GLOBAL),
+      curve_keys(device, "__curve_keys", MEM_GLOBAL),
+      patches(device, "__patches", MEM_GLOBAL),
+      objects(device, "__objects", MEM_GLOBAL),
+      object_motion_pass(device, "__object_motion_pass", MEM_GLOBAL),
+      object_motion(device, "__object_motion", MEM_GLOBAL),
+      object_flag(device, "__object_flag", MEM_GLOBAL),
+      object_volume_step(device, "__object_volume_step", MEM_GLOBAL),
+      camera_motion(device, "__camera_motion", MEM_GLOBAL),
+      attributes_map(device, "__attributes_map", MEM_GLOBAL),
+      attributes_float(device, "__attributes_float", MEM_GLOBAL),
+      attributes_float2(device, "__attributes_float2", MEM_GLOBAL),
+      attributes_float3(device, "__attributes_float3", MEM_GLOBAL),
+      attributes_uchar4(device, "__attributes_uchar4", MEM_GLOBAL),
+      light_distribution(device, "__light_distribution", MEM_GLOBAL),
+      lights(device, "__lights", MEM_GLOBAL),
+      light_background_marginal_cdf(device, "__light_background_marginal_cdf", MEM_GLOBAL),
+      light_background_conditional_cdf(device, "__light_background_conditional_cdf", MEM_GLOBAL),
+      particles(device, "__particles", MEM_GLOBAL),
+      svm_nodes(device, "__svm_nodes", MEM_GLOBAL),
+      shaders(device, "__shaders", MEM_GLOBAL),
+      lookup_table(device, "__lookup_table", MEM_GLOBAL),
+      sample_pattern_lut(device, "__sample_pattern_lut", MEM_GLOBAL),
+      ies_lights(device, "__ies", MEM_GLOBAL)
 {
   memset((void *)&data, 0, sizeof(data));
 }
 
 Scene::Scene(const SceneParams &params_, Device *device)
-    : name("Scene"), device(device), dscene(device), params(params_)
+    : name("Scene"),
+      default_surface(NULL),
+      default_volume(NULL),
+      default_light(NULL),
+      default_background(NULL),
+      default_empty(NULL),
+      device(device),
+      dscene(device),
+      params(params_)
 {
   memset((void *)&dscene.data, 0, sizeof(dscene.data));
 
@@ -94,7 +103,7 @@ Scene::Scene(const SceneParams &params_, Device *device)
   film = new Film();
   background = new Background();
   light_manager = new LightManager();
-  mesh_manager = new MeshManager();
+  geometry_manager = new GeometryManager();
   object_manager = new ObjectManager();
   integrator = new Integrator();
   image_manager = new ImageManager(device->info);
@@ -104,9 +113,11 @@ Scene::Scene(const SceneParams &params_, Device *device)
 
   /* OSL only works on the CPU */
   if (device->info.has_osl)
-    shader_manager = ShaderManager::create(this, params.shadingsystem);
+    shader_manager = ShaderManager::create(params.shadingsystem);
   else
-    shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM);
+    shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
+
+  shader_manager->add_default(this);
 }
 
 Scene::~Scene()
@@ -118,8 +129,8 @@ void Scene::free_memory(bool final)
 {
   foreach (Shader *s, shaders)
     delete s;
-  foreach (Mesh *m, meshes)
-    delete m;
+  foreach (Geometry *g, geometry)
+    delete g;
   foreach (Object *o, objects)
     delete o;
   foreach (Light *l, lights)
@@ -128,7 +139,7 @@ void Scene::free_memory(bool final)
     delete p;
 
   shaders.clear();
-  meshes.clear();
+  geometry.clear();
   objects.clear();
   lights.clear();
   particle_systems.clear();
@@ -140,7 +151,7 @@ void Scene::free_memory(bool final)
     integrator->device_free(device, &dscene);
 
     object_manager->device_free(device, &dscene);
-    mesh_manager->device_free(device, &dscene);
+    geometry_manager->device_free(device, &dscene);
     shader_manager->device_free(device, &dscene, this);
     light_manager->device_free(device, &dscene);
 
@@ -165,7 +176,7 @@ void Scene::free_memory(bool final)
     delete background;
     delete integrator;
     delete object_manager;
-    delete mesh_manager;
+    delete geometry_manager;
     delete shader_manager;
     delete light_manager;
     delete particle_system_manager;
@@ -211,7 +222,7 @@ void Scene::device_update(Device *device_, Progress &progress)
   if (progress.get_cancel() || device->have_error())
     return;
 
-  mesh_manager->device_update_preprocess(device, this, progress);
+  geometry_manager->device_update_preprocess(device, this, progress);
 
   if (progress.get_cancel() || device->have_error())
     return;
@@ -235,7 +246,7 @@ void Scene::device_update(Device *device_, Progress &progress)
     return;
 
   progress.set_status("Updating Meshes");
-  mesh_manager->device_update(device, &dscene, this, progress);
+  geometry_manager->device_update(device, &dscene, this, progress);
 
   if (progress.get_cancel() || device->have_error())
     return;
@@ -356,8 +367,8 @@ bool Scene::need_update()
 bool Scene::need_data_update()
 {
   return (background->need_update || image_manager->need_update || object_manager->need_update ||
-          mesh_manager->need_update || light_manager->need_update || lookup_tables->need_update ||
-          integrator->need_update || shader_manager->need_update ||
+          geometry_manager->need_update || light_manager->need_update ||
+          lookup_tables->need_update || integrator->need_update || shader_manager->need_update ||
           particle_system_manager->need_update || curve_system_manager->need_update ||
           bake_manager->need_update || film->need_update);
 }
@@ -379,7 +390,7 @@ void Scene::reset()
   background->tag_update(this);
   integrator->tag_update(this);
   object_manager->tag_update(this);
-  mesh_manager->tag_update(this);
+  geometry_manager->tag_update(this);
   light_manager->tag_update(this);
   particle_system_manager->tag_update(this);
   curve_system_manager->tag_update(this);
@@ -392,7 +403,7 @@ void Scene::device_free()
 
 void Scene::collect_statistics(RenderStats *stats)
 {
-  mesh_manager->collect_statistics(this, stats);
+  geometry_manager->collect_statistics(this, stats);
   image_manager->collect_statistics(stats);
 }
 
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index f99510d2d42..6b10a901d7b 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -44,8 +44,8 @@ class Integrator;
 class Light;
 class LightManager;
 class LookupTables;
-class Mesh;
-class MeshManager;
+class Geometry;
+class GeometryManager;
 class Object;
 class ObjectManager;
 class ParticleSystemManager;
@@ -91,6 +91,7 @@ class DeviceScene {
   device_vector<Transform> object_motion_pass;
   device_vector<DecomposedTransform> object_motion;
   device_vector<uint> object_flag;
+  device_vector<float> object_volume_step;
 
   /* cameras */
   device_vector<DecomposedTransform> camera_motion;
@@ -119,7 +120,7 @@ class DeviceScene {
   device_vector<float> lookup_table;
 
   /* integrator */
-  device_vector<uint> sobol_directions;
+  device_vector<uint> sample_pattern_lut;
 
   /* ies lights */
   device_vector<float> ies_lights;
@@ -213,7 +214,7 @@ class Scene {
 
   /* data lists */
   vector<Object *> objects;
-  vector<Mesh *> meshes;
+  vector<Geometry *> geometry;
   vector<Shader *> shaders;
   vector<Light *> lights;
   vector<ParticleSystem *> particle_systems;
@@ -222,7 +223,7 @@ class Scene {
   ImageManager *image_manager;
   LightManager *light_manager;
   ShaderManager *shader_manager;
-  MeshManager *mesh_manager;
+  GeometryManager *geometry_manager;
   ObjectManager *object_manager;
   ParticleSystemManager *particle_system_manager;
   CurveSystemManager *curve_system_manager;
@@ -230,6 +231,7 @@ class Scene {
 
   /* default shaders */
   Shader *default_surface;
+  Shader *default_volume;
   Shader *default_light;
   Shader *default_background;
   Shader *default_empty;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index c77a20787f5..b1b30979b0e 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -14,12 +14,13 @@
  * limitations under the License.
  */
 
-#include <string.h>
 #include <limits.h>
+#include <string.h>
 
+#include "device/device.h"
+#include "render/bake.h"
 #include "render/buffers.h"
 #include "render/camera.h"
-#include "device/device.h"
 #include "render/graph.h"
 #include "render/integrator.h"
 #include "render/light.h"
@@ -27,7 +28,6 @@
 #include "render/object.h"
 #include "render/scene.h"
 #include "render/session.h"
-#include "render/bake.h"
 
 #include "util/util_foreach.h"
 #include "util/util_function.h"
@@ -183,7 +183,8 @@ bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
   if (gpu_draw_ready) {
     /* then verify the buffers have the expected size, so we don't
      * draw previous results in a resized window */
-    if (!buffer_params.modified(display->params)) {
+    if (buffer_params.width == display->params.width &&
+        buffer_params.height == display->params.height) {
       /* for CUDA we need to do tone-mapping still, since we can
        * only access GL buffers from the main thread. */
       if (gpu_need_display_buffer_update) {
@@ -211,6 +212,7 @@ void Session::run_gpu()
 
   reset_time = time_dt();
   last_update_time = time_dt();
+  last_display_time = last_update_time;
 
   progress.set_render_start_time();
 
@@ -291,11 +293,15 @@ void Session::run_gpu()
        * reset and draw in between */
       thread_scoped_lock buffers_lock(buffers_mutex);
 
+      /* avoid excessive denoising in viewport after reaching a certain number of samples */
+      bool need_denoise = tile_manager.schedule_denoising || tile_manager.state.sample < 20 ||
+                          (time_dt() - last_display_time) >= params.progressive_update_timeout;
+
       /* update status and timing */
       update_status_time();
 
       /* render */
-      render();
+      render(need_denoise);
 
       device->task_wait();
 
@@ -305,7 +311,7 @@ void Session::run_gpu()
       /* update status and timing */
       update_status_time();
 
-      gpu_need_display_buffer_update = true;
+      gpu_need_display_buffer_update = need_denoise || !params.run_denoising;
       gpu_draw_ready = true;
       progress.set_update();
 
@@ -359,7 +365,8 @@ bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
   if (display->draw_ready()) {
     /* then verify the buffers have the expected size, so we don't
      * draw previous results in a resized window */
-    if (!buffer_params.modified(display->params)) {
+    if (buffer_params.width == display->params.width &&
+        buffer_params.height == display->params.height) {
       display->draw(device, draw_params);
 
       if (display_outdated && (time_dt() - reset_time) > params.text_timeout)
@@ -372,7 +379,7 @@ bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_param
   return false;
 }
 
-bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
+bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types)
 {
   if (progress.get_cancel()) {
     if (params.progressive_refine == false) {
@@ -387,8 +394,14 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
   Tile *tile;
   int device_num = device->device_number(tile_device);
 
-  if (!tile_manager.next_tile(tile, device_num))
+  while (!tile_manager.next_tile(tile, device_num, tile_types)) {
+    /* Wait for denoising tiles to become available */
+    if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) {
+      denoising_cond.wait(tile_lock);
+      continue;
+    }
     return false;
+  }
 
   /* fill render tile */
   rtile.x = tile_manager.state.buffer.full_x + tile->x;
@@ -399,7 +412,7 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
   rtile.num_samples = tile_manager.state.num_samples;
   rtile.resolution = tile_manager.state.resolution_divider;
   rtile.tile_index = tile->index;
-  rtile.task = (tile->state == Tile::DENOISE) ? RenderTile::DENOISE : RenderTile::PATH_TRACE;
+  rtile.task = tile->state == Tile::DENOISE ? RenderTile::DENOISE : RenderTile::PATH_TRACE;
 
   tile_lock.unlock();
 
@@ -413,6 +426,9 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
 
     device->map_tile(tile_device, rtile);
 
+    /* Reset copy state, since buffer contents change after the tile was acquired */
+    buffers->map_neighbor_copied = false;
+
     return true;
   }
 
@@ -429,6 +445,8 @@ bool Session::acquire_tile(Device *tile_device, RenderTile &rtile)
     tile->buffers->reset(buffer_params);
   }
 
+  tile->buffers->map_neighbor_copied = false;
+
   tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
 
   rtile.buffer = tile->buffers->buffer.device_pointer;
@@ -484,45 +502,75 @@ void Session::release_tile(RenderTile &rtile)
   }
 
   update_status_time();
+
+  /* Notify denoising thread that a tile was finished. */
+  denoising_cond.notify_all();
 }
 
 void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device)
 {
   thread_scoped_lock tile_lock(tile_mutex);
 
-  int center_idx = tiles[4].tile_index;
-  assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
-  BufferParams buffer_params = tile_manager.params;
-  int4 image_region = make_int4(buffer_params.full_x,
-                                buffer_params.full_y,
-                                buffer_params.full_x + buffer_params.width,
-                                buffer_params.full_y + buffer_params.height);
-
-  for (int dy = -1, i = 0; dy <= 1; dy++) {
-    for (int dx = -1; dx <= 1; dx++, i++) {
-      int px = tiles[4].x + dx * params.tile_size.x;
-      int py = tiles[4].y + dy * params.tile_size.y;
-      if (px >= image_region.x && py >= image_region.y && px < image_region.z &&
-          py < image_region.w) {
-        int tile_index = center_idx + dy * tile_manager.state.tile_stride + dx;
-        Tile *tile = &tile_manager.state.tiles[tile_index];
-        assert(tile->buffers);
-
-        tiles[i].buffer = tile->buffers->buffer.device_pointer;
-        tiles[i].x = tile_manager.state.buffer.full_x + tile->x;
-        tiles[i].y = tile_manager.state.buffer.full_y + tile->y;
-        tiles[i].w = tile->w;
-        tiles[i].h = tile->h;
-        tiles[i].buffers = tile->buffers;
-
-        tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
-      }
-      else {
-        tiles[i].buffer = (device_ptr)NULL;
-        tiles[i].buffers = NULL;
-        tiles[i].x = clamp(px, image_region.x, image_region.z);
-        tiles[i].y = clamp(py, image_region.y, image_region.w);
-        tiles[i].w = tiles[i].h = 0;
+  const int4 image_region = make_int4(
+      tile_manager.state.buffer.full_x,
+      tile_manager.state.buffer.full_y,
+      tile_manager.state.buffer.full_x + tile_manager.state.buffer.width,
+      tile_manager.state.buffer.full_y + tile_manager.state.buffer.height);
+
+  if (!tile_manager.schedule_denoising) {
+    /* Fix up tile slices with overlap. */
+    if (tile_manager.slice_overlap != 0) {
+      int y = max(tiles[4].y - tile_manager.slice_overlap, image_region.y);
+      tiles[4].h = min(tiles[4].y + tiles[4].h + tile_manager.slice_overlap, image_region.w) - y;
+      tiles[4].y = y;
+    }
+
+    /* Tiles are not being denoised individually, which means the entire image is processed. */
+    tiles[3].x = tiles[4].x;
+    tiles[1].y = tiles[4].y;
+    tiles[5].x = tiles[4].x + tiles[4].w;
+    tiles[7].y = tiles[4].y + tiles[4].h;
+  }
+  else {
+    int center_idx = tiles[4].tile_index;
+    assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
+
+    for (int dy = -1, i = 0; dy <= 1; dy++) {
+      for (int dx = -1; dx <= 1; dx++, i++) {
+        int nindex = tile_manager.get_neighbor_index(center_idx, i);
+        if (nindex >= 0) {
+          Tile *tile = &tile_manager.state.tiles[nindex];
+
+          tiles[i].x = image_region.x + tile->x;
+          tiles[i].y = image_region.y + tile->y;
+          tiles[i].w = tile->w;
+          tiles[i].h = tile->h;
+
+          if (buffers) {
+            tile_manager.state.buffer.get_offset_stride(tiles[i].offset, tiles[i].stride);
+
+            tiles[i].buffer = buffers->buffer.device_pointer;
+            tiles[i].buffers = buffers;
+          }
+          else {
+            assert(tile->buffers);
+            tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride);
+
+            tiles[i].buffer = tile->buffers->buffer.device_pointer;
+            tiles[i].buffers = tile->buffers;
+          }
+        }
+        else {
+          int px = tiles[4].x + dx * params.tile_size.x;
+          int py = tiles[4].y + dy * params.tile_size.y;
+
+          tiles[i].x = clamp(px, image_region.x, image_region.z);
+          tiles[i].y = clamp(py, image_region.y, image_region.w);
+          tiles[i].w = tiles[i].h = 0;
+
+          tiles[i].buffer = (device_ptr)NULL;
+          tiles[i].buffers = NULL;
+        }
       }
     }
   }
@@ -545,6 +593,7 @@ void Session::run_cpu()
   bool tiles_written = false;
 
   last_update_time = time_dt();
+  last_display_time = last_update_time;
 
   {
     /* reset once to start */
@@ -575,7 +624,7 @@ void Session::run_cpu()
     }
 
     /* Don't go in pause mode when preview kernels are used
-     * When feature kernels become available the session will be resetted. */
+     * When feature kernels become available the session will be reset. */
     else if (no_tiles && kernel_state == DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL) {
       time_sleep(0.1);
     }
@@ -620,11 +669,6 @@ void Session::run_cpu()
     }
 
     if (!no_tiles) {
-      /* buffers mutex is locked entirely while rendering each
-       * sample, and released/reacquired on each iteration to allow
-       * reset and draw in between */
-      thread_scoped_lock buffers_lock(buffers_mutex);
-
       /* update scene */
       scoped_timer update_timer;
       if (update_scene()) {
@@ -638,17 +682,26 @@ void Session::run_cpu()
       if (progress.get_cancel())
         break;
 
+      /* buffers mutex is locked entirely while rendering each
+       * sample, and released/reacquired on each iteration to allow
+       * reset and draw in between */
+      thread_scoped_lock buffers_lock(buffers_mutex);
+
+      /* avoid excessive denoising in viewport after reaching a certain number of samples */
+      bool need_denoise = tile_manager.schedule_denoising || tile_manager.state.sample < 20 ||
+                          (time_dt() - last_display_time) >= params.progressive_update_timeout;
+
       /* update status and timing */
       update_status_time();
 
       /* render */
-      render();
+      render(need_denoise);
 
       /* update status and timing */
       update_status_time();
 
       if (!params.background)
-        need_copy_to_display_buffer = true;
+        need_copy_to_display_buffer = need_denoise || !params.run_denoising;
 
       if (!device->error_message().empty())
         progress.set_error(device->error_message());
@@ -701,23 +754,26 @@ DeviceRequestedFeatures Session::get_requested_device_features()
   requested_features.use_object_motion = false;
   requested_features.use_camera_motion = use_motion && scene->camera->use_motion();
   foreach (Object *object, scene->objects) {
-    Mesh *mesh = object->mesh;
-    if (mesh->num_curves()) {
-      requested_features.use_hair = true;
-    }
+    Geometry *geom = object->geometry;
     if (use_motion) {
-      requested_features.use_object_motion |= object->use_motion() | mesh->use_motion_blur;
-      requested_features.use_camera_motion |= mesh->use_motion_blur;
+      requested_features.use_object_motion |= object->use_motion() | geom->use_motion_blur;
+      requested_features.use_camera_motion |= geom->use_motion_blur;
     }
-#ifdef WITH_OPENSUBDIV
-    if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
-      requested_features.use_patch_evaluation = true;
-    }
-#endif
     if (object->is_shadow_catcher) {
       requested_features.use_shadow_tricks = true;
     }
-    requested_features.use_true_displacement |= mesh->has_true_displacement();
+    if (geom->type == Geometry::MESH) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+#ifdef WITH_OPENSUBDIV
+      if (mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
+        requested_features.use_patch_evaluation = true;
+      }
+#endif
+      requested_features.use_true_displacement |= mesh->has_true_displacement();
+    }
+    else if (geom->type == Geometry::HAIR) {
+      requested_features.use_hair = true;
+    }
   }
 
   requested_features.use_background_light = scene->light_manager->has_background_light(scene);
@@ -842,9 +898,6 @@ void Session::set_samples(int samples)
     params.samples = samples;
     tile_manager.set_samples(samples);
 
-    {
-      thread_scoped_lock pause_lock(pause_mutex);
-    }
     pause_cond.notify_all();
   }
 }
@@ -866,6 +919,29 @@ void Session::set_pause(bool pause_)
     pause_cond.notify_all();
 }
 
+void Session::set_denoising(bool denoising, bool optix_denoising)
+{
+  /* Lock buffers so no denoising operation is triggered while the settings are changed here. */
+  thread_scoped_lock buffers_lock(buffers_mutex);
+
+  params.run_denoising = denoising;
+  params.full_denoising = !optix_denoising;
+  params.optix_denoising = optix_denoising;
+
+  // TODO(pmours): Query the required overlap value for denoising from the device?
+  tile_manager.slice_overlap = denoising && !params.background ? 64 : 0;
+  tile_manager.schedule_denoising = denoising && !buffers;
+}
+
+void Session::set_denoising_start_sample(int sample)
+{
+  if (sample != params.denoising_start_sample) {
+    params.denoising_start_sample = sample;
+
+    pause_cond.notify_all();
+  }
+}
+
 void Session::wait()
 {
   if (session_thread) {
@@ -900,7 +976,7 @@ bool Session::update_scene()
   Integrator *integrator = scene->integrator;
   BakeManager *bake_manager = scene->bake_manager;
 
-  if (integrator->sampling_pattern == SAMPLING_PATTERN_CMJ || bake_manager->get_baking()) {
+  if (integrator->sampling_pattern != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) {
     int aa_samples = tile_manager.num_samples;
 
     if (aa_samples != integrator->aa_samples) {
@@ -911,7 +987,8 @@ bool Session::update_scene()
 
   /* update scene */
   if (scene->need_update()) {
-    bool new_kernels_needed = load_kernels(false);
+    /* Update used shader tags so we know which features are needed for the kernel. */
+    scene->shader_manager->update_shaders_used(scene);
 
     /* Update max_closures. */
     KernelIntegrator *kintegrator = &scene->dscene.data.integrator;
@@ -923,6 +1000,9 @@ bool Session::update_scene()
       kintegrator->max_closures = MAX_CLOSURE;
     }
 
+    /* Load render kernels, before device update where we upload data to the GPU. */
+    bool new_kernels_needed = load_kernels(false);
+
     progress.set_status("Updating Scene");
     MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
 
@@ -1003,17 +1083,21 @@ void Session::update_status_time(bool show_pause, bool show_done)
   progress.set_status(status, substatus);
 }
 
-void Session::render()
+void Session::render(bool with_denoising)
 {
-  /* Clear buffers. */
   if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) {
+    /* Clear buffers. */
     buffers->zero();
   }
 
+  if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) {
+    return; /* Avoid empty launches. */
+  }
+
   /* Add path trace task. */
   DeviceTask task(DeviceTask::RENDER);
 
-  task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2);
+  task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3);
   task.release_tile = function_bind(&Session::release_tile, this, _1);
   task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
   task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
@@ -1022,13 +1106,37 @@ void Session::render()
   task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
   task.need_finish_queue = params.progressive_refine;
   task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
-  task.requested_tile_size = params.tile_size;
-  task.passes_size = tile_manager.params.get_passes_size();
 
-  if (params.run_denoising) {
+  task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
+                               scene->dscene.data.film.pass_adaptive_aux_buffer;
+  task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+
+  /* Acquire render tiles by default. */
+  task.tile_types = RenderTile::PATH_TRACE;
+
+  with_denoising = params.run_denoising && with_denoising;
+  if (with_denoising) {
+    /* Do not denoise viewport until the sample at which denoising should start is reached. */
+    if (!params.background && tile_manager.state.sample < params.denoising_start_sample) {
+      with_denoising = false;
+    }
+
+    /* Cannot denoise with resolution divider and separate denoising devices.
+     * It breaks the copy in 'MultiDevice::map_neighbor_tiles' (which operates on the full buffer
+     * dimensions and not the scaled ones). */
+    if (!params.device.denoising_devices.empty() && tile_manager.state.resolution_divider > 1) {
+      with_denoising = false;
+    }
+
+    /* It can happen that denoising was already enabled, but the scene still needs an update. */
+    if (scene->film->need_update || !scene->film->denoising_data_offset) {
+      with_denoising = false;
+    }
+  }
+
+  if (with_denoising) {
     task.denoising = params.denoising;
 
-    assert(!scene->film->need_update);
     task.pass_stride = scene->film->pass_stride;
     task.target_pass_stride = task.pass_stride;
     task.pass_denoising_data = scene->film->denoising_data_offset;
@@ -1038,6 +1146,30 @@ void Session::render()
     task.denoising_do_filter = params.full_denoising;
     task.denoising_use_optix = params.optix_denoising;
     task.denoising_write_passes = params.write_denoising_passes;
+
+    if (tile_manager.schedule_denoising) {
+      /* Acquire denoising tiles during rendering. */
+      task.tile_types |= RenderTile::DENOISE;
+    }
+    else {
+      assert(buffers);
+
+      /* Schedule rendering and wait for it to finish. */
+      device->task_add(task);
+      device->task_wait();
+
+      /* Then run denoising on the whole image at once. */
+      task.type = DeviceTask::DENOISE_BUFFER;
+      task.x = tile_manager.state.buffer.full_x;
+      task.y = tile_manager.state.buffer.full_y;
+      task.w = tile_manager.state.buffer.width;
+      task.h = tile_manager.state.buffer.height;
+      task.buffer = buffers->buffer.device_pointer;
+      task.sample = tile_manager.state.sample;
+      task.num_samples = tile_manager.state.num_samples;
+      tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
+      task.buffers = buffers;
+    }
   }
 
   device->task_add(task);
@@ -1064,6 +1196,8 @@ void Session::copy_to_display_buffer(int sample)
 
     /* set display to new size */
     display->draw_set(task.w, task.h);
+
+    last_display_time = time_dt();
   }
 
   display_outdated = false;
@@ -1141,8 +1275,11 @@ int Session::get_max_closure_count()
 
   int max_closures = 0;
   for (int i = 0; i < scene->shaders.size(); i++) {
-    int num_closures = scene->shaders[i]->graph->get_num_closures();
-    max_closures = max(max_closures, num_closures);
+    Shader *shader = scene->shaders[i];
+    if (shader->used) {
+      int num_closures = shader->graph->get_num_closures();
+      max_closures = max(max_closures, num_closures);
+    }
   }
   max_closure_global = max(max_closure_global, max_closures);
 
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index ec465601541..61970d87e9c 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -17,8 +17,8 @@
 #ifndef __SESSION_H__
 #define __SESSION_H__
 
-#include "render/buffers.h"
 #include "device/device.h"
+#include "render/buffers.h"
 #include "render/shader.h"
 #include "render/stats.h"
 #include "render/tile.h"
@@ -53,8 +53,10 @@ class SessionParams {
   int2 tile_size;
   TileOrder tile_order;
   int start_resolution;
+  int denoising_start_sample;
   int pixel_size;
   int threads;
+  bool adaptive_sampling;
 
   bool use_profiling;
 
@@ -85,8 +87,10 @@ class SessionParams {
     samples = 1024;
     tile_size = make_int2(64, 64);
     start_resolution = INT_MAX;
+    denoising_start_sample = 0;
     pixel_size = 1;
     threads = 0;
+    adaptive_sampling = false;
 
     use_profiling = false;
 
@@ -109,11 +113,13 @@ class SessionParams {
   bool modified(const SessionParams &params)
   {
     return !(device == params.device && background == params.background &&
-             progressive_refine == params.progressive_refine
-             /* && samples == params.samples */
-             && progressive == params.progressive && experimental == params.experimental &&
+             progressive_refine == params.progressive_refine &&
+             /* samples == params.samples && denoising_start_sample ==
+                params.denoising_start_sample && */
+             progressive == params.progressive && experimental == params.experimental &&
              tile_size == params.tile_size && start_resolution == params.start_resolution &&
              pixel_size == params.pixel_size && threads == params.threads &&
+             adaptive_sampling == params.adaptive_sampling &&
              use_profiling == params.use_profiling &&
              display_buffer_linear == params.display_buffer_linear &&
              cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
@@ -152,8 +158,10 @@ class Session {
 
   bool ready_to_reset();
   void reset(BufferParams &params, int samples);
-  void set_samples(int samples);
   void set_pause(bool pause);
+  void set_samples(int samples);
+  void set_denoising(bool denoising, bool optix_denoising);
+  void set_denoising_start_sample(int sample);
 
   bool update_scene();
   bool load_kernels(bool lock_scene = true);
@@ -178,8 +186,9 @@ class Session {
 
   void update_status_time(bool show_pause = false, bool show_done = false);
 
+  void render(bool with_denoising);
   void copy_to_display_buffer(int sample);
-  void render();
+
   void reset_(BufferParams &params, int samples);
 
   void run_cpu();
@@ -190,7 +199,7 @@ class Session {
   bool draw_gpu(BufferParams &params, DeviceDrawParams &draw_params);
   void reset_gpu(BufferParams &params, int samples);
 
-  bool acquire_tile(Device *tile_device, RenderTile &tile);
+  bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types);
   void update_tile_sample(RenderTile &tile);
   void release_tile(RenderTile &tile);
 
@@ -213,14 +222,16 @@ class Session {
   thread_mutex tile_mutex;
   thread_mutex buffers_mutex;
   thread_mutex display_mutex;
+  thread_condition_variable denoising_cond;
 
   bool kernels_loaded;
   DeviceRequestedFeatures loaded_kernel_features;
 
   double reset_time;
+  double last_update_time;
+  double last_display_time;
 
   /* progressive refine */
-  double last_update_time;
   bool update_progressive_refine(bool cancel);
 
   DeviceRequestedFeatures get_requested_device_features();
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 661208c6463..747fc58f81a 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -168,7 +168,7 @@ NODE_DEFINE(Shader)
   SOCKET_ENUM(volume_sampling_method,
               "Volume Sampling Method",
               volume_sampling_method_enum,
-              VOLUME_SAMPLING_DISTANCE);
+              VOLUME_SAMPLING_MULTIPLE_IMPORTANCE);
 
   static NodeEnum volume_interpolation_method_enum;
   volume_interpolation_method_enum.insert("linear", VOLUME_INTERPOLATION_LINEAR);
@@ -178,6 +178,8 @@ NODE_DEFINE(Shader)
               volume_interpolation_method_enum,
               VOLUME_INTERPOLATION_LINEAR);
 
+  SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
+
   static NodeEnum displacement_method_enum;
   displacement_method_enum.insert("bump", DISPLACE_BUMP);
   displacement_method_enum.insert("true", DISPLACE_TRUE);
@@ -203,10 +205,11 @@ Shader::Shader() : Node(node_type)
   has_bssrdf_bump = false;
   has_surface_spatial_varying = false;
   has_volume_spatial_varying = false;
+  has_volume_attribute_dependency = false;
   has_object_dependency = false;
-  has_attribute_dependency = false;
   has_integrator_dependency = false;
   has_volume_connected = false;
+  prev_volume_step_rate = 0.0f;
 
   displacement_method = DISPLACE_BUMP;
 
@@ -214,7 +217,7 @@ Shader::Shader() : Node(node_type)
   used = false;
 
   need_update = true;
-  need_update_mesh = true;
+  need_update_geometry = true;
   need_sync_object = false;
 }
 
@@ -288,7 +291,7 @@ void Shader::set_graph(ShaderGraph *graph_)
     const char *new_hash = (graph_) ? graph_->displacement_hash.c_str() : "";
 
     if (strcmp(old_hash, new_hash) != 0) {
-      need_update_mesh = true;
+      need_update_geometry = true;
     }
   }
 
@@ -347,15 +350,16 @@ void Shader::tag_update(Scene *scene)
   }
 
   /* compare if the attributes changed, mesh manager will check
-   * need_update_mesh, update the relevant meshes and clear it. */
+   * need_update_geometry, update the relevant meshes and clear it. */
   if (attributes.modified(prev_attributes)) {
-    need_update_mesh = true;
-    scene->mesh_manager->need_update = true;
+    need_update_geometry = true;
+    scene->geometry_manager->need_update = true;
   }
 
-  if (has_volume != prev_has_volume) {
-    scene->mesh_manager->need_flags_update = true;
+  if (has_volume != prev_has_volume || volume_step_rate != prev_volume_step_rate) {
+    scene->geometry_manager->need_flags_update = true;
     scene->object_manager->need_flags_update = true;
+    prev_volume_step_rate = volume_step_rate;
   }
 }
 
@@ -415,7 +419,7 @@ ShaderManager::~ShaderManager()
 {
 }
 
-ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
+ShaderManager *ShaderManager::create(int shadingsystem)
 {
   ShaderManager *manager;
 
@@ -431,8 +435,6 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
     manager = new SVMShaderManager();
   }
 
-  add_default(scene);
-
   return manager;
 }
 
@@ -471,8 +473,12 @@ int ShaderManager::get_shader_id(Shader *shader, bool smooth)
   return id;
 }
 
-void ShaderManager::device_update_shaders_used(Scene *scene)
+void ShaderManager::update_shaders_used(Scene *scene)
 {
+  if (!need_update) {
+    return;
+  }
+
   /* figure out which shaders are in use, so SVM/OSL can skip compiling them
    * for speed and avoid loading image textures into memory */
   uint id = 0;
@@ -489,8 +495,8 @@ void ShaderManager::device_update_shaders_used(Scene *scene)
   if (scene->background->shader)
     scene->background->shader->used = true;
 
-  foreach (Mesh *mesh, scene->meshes)
-    foreach (Shader *shader, mesh->used_shaders)
+  foreach (Geometry *geom, scene->geometry)
+    foreach (Shader *shader, geom->used_shaders)
       shader->used = true;
 
   foreach (Light *light, scene->lights)
@@ -531,10 +537,12 @@ void ShaderManager::device_update_common(Device *device,
     /* in this case we can assume transparent surface */
     if (shader->has_volume_connected && !shader->has_surface)
       flag |= SD_HAS_ONLY_VOLUME;
-    if (shader->heterogeneous_volume && shader->has_volume_spatial_varying)
-      flag |= SD_HETEROGENEOUS_VOLUME;
-    if (shader->has_attribute_dependency)
-      flag |= SD_NEED_ATTRIBUTES;
+    if (shader->has_volume) {
+      if (shader->heterogeneous_volume && shader->has_volume_spatial_varying)
+        flag |= SD_HETEROGENEOUS_VOLUME;
+    }
+    if (shader->has_volume_attribute_dependency)
+      flag |= SD_NEED_VOLUME_ATTRIBUTES;
     if (shader->has_bssrdf_bump)
       flag |= SD_HAS_BSSRDF_BUMP;
     if (device->info.has_volume_decoupled) {
@@ -623,9 +631,27 @@ void ShaderManager::add_default(Scene *scene)
 
     Shader *shader = new Shader();
     shader->name = "default_surface";
-    shader->graph = graph;
+    shader->set_graph(graph);
     scene->shaders.push_back(shader);
     scene->default_surface = shader;
+    shader->tag_update(scene);
+  }
+
+  /* default volume */
+  {
+    ShaderGraph *graph = new ShaderGraph();
+
+    PrincipledVolumeNode *principled = new PrincipledVolumeNode();
+    graph->add(principled);
+
+    graph->connect(principled->output("Volume"), graph->output()->input("Volume"));
+
+    Shader *shader = new Shader();
+    shader->name = "default_volume";
+    shader->set_graph(graph);
+    scene->shaders.push_back(shader);
+    scene->default_volume = shader;
+    shader->tag_update(scene);
   }
 
   /* default light */
@@ -641,9 +667,10 @@ void ShaderManager::add_default(Scene *scene)
 
     Shader *shader = new Shader();
     shader->name = "default_light";
-    shader->graph = graph;
+    shader->set_graph(graph);
     scene->shaders.push_back(shader);
     scene->default_light = shader;
+    shader->tag_update(scene);
   }
 
   /* default background */
@@ -652,9 +679,10 @@ void ShaderManager::add_default(Scene *scene)
 
     Shader *shader = new Shader();
     shader->name = "default_background";
-    shader->graph = graph;
+    shader->set_graph(graph);
     scene->shaders.push_back(shader);
     scene->default_background = shader;
+    shader->tag_update(scene);
   }
 
   /* default empty */
@@ -663,9 +691,10 @@ void ShaderManager::add_default(Scene *scene)
 
     Shader *shader = new Shader();
     shader->name = "default_empty";
-    shader->graph = graph;
+    shader->set_graph(graph);
     scene->shaders.push_back(shader);
     scene->default_empty = shader;
+    shader->tag_update(scene);
   }
 }
 
@@ -704,6 +733,10 @@ void ShaderManager::get_requested_features(Scene *scene,
   requested_features->nodes_features = 0;
   for (int i = 0; i < scene->shaders.size(); i++) {
     Shader *shader = scene->shaders[i];
+    if (!shader->used) {
+      continue;
+    }
+
     /* Gather requested features from all the nodes from the graph nodes. */
     get_requested_graph_features(shader->graph, requested_features);
     ShaderNode *output_node = shader->graph->output();
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index f74204df355..7801fd29276 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -23,8 +23,8 @@
 #  include <OSL/oslexec.h>
 #endif
 
-#include "render/attribute.h"
 #include "kernel/kernel_types.h"
+#include "render/attribute.h"
 
 #include "graph/node.h"
 
@@ -92,10 +92,12 @@ class Shader : public Node {
   bool heterogeneous_volume;
   VolumeSampling volume_sampling_method;
   int volume_interpolation_method;
+  float volume_step_rate;
+  float prev_volume_step_rate;
 
   /* synchronization */
   bool need_update;
-  bool need_update_mesh;
+  bool need_update_geometry;
   bool need_sync_object;
 
   /* If the shader has only volume components, the surface is assumed to
@@ -118,8 +120,8 @@ class Shader : public Node {
   bool has_bssrdf_bump;
   bool has_surface_spatial_varying;
   bool has_volume_spatial_varying;
+  bool has_volume_attribute_dependency;
   bool has_object_dependency;
-  bool has_attribute_dependency;
   bool has_integrator_dependency;
 
   /* displacement */
@@ -163,7 +165,7 @@ class ShaderManager {
  public:
   bool need_update;
 
-  static ShaderManager *create(Scene *scene, int shadingsystem);
+  static ShaderManager *create(int shadingsystem);
   virtual ~ShaderManager();
 
   virtual void reset(Scene *scene) = 0;
@@ -180,7 +182,6 @@ class ShaderManager {
                              Progress &progress) = 0;
   virtual void device_free(Device *device, DeviceScene *dscene, Scene *scene) = 0;
 
-  void device_update_shaders_used(Scene *scene);
   void device_update_common(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
   void device_free_common(Device *device, DeviceScene *dscene, Scene *scene);
 
@@ -196,6 +197,7 @@ class ShaderManager {
   static void add_default(Scene *scene);
 
   /* Selective nodes compilation. */
+  void update_shaders_used(Scene *scene);
   void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features);
 
   static void free_memory();
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 7c33f6c04ae..b4858f488c3 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -25,8 +25,8 @@
 #include "render/shader.h"
 #include "render/svm.h"
 
-#include "util/util_logging.h"
 #include "util/util_foreach.h"
+#include "util/util_logging.h"
 #include "util/util_progress.h"
 #include "util/util_task.h"
 
@@ -85,9 +85,6 @@ void SVMShaderManager::device_update(Device *device,
   /* test if we need to update */
   device_free(device, dscene, scene);
 
-  /* determine which shaders are in use */
-  device_update_shaders_used(scene);
-
   /* Build all shaders. */
   TaskPool task_pool;
   vector<array<int4>> shader_svm_nodes(num_shaders);
@@ -447,16 +444,14 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done)
   else if (current_type == SHADER_TYPE_VOLUME) {
     if (node->has_spatial_varying())
       current_shader->has_volume_spatial_varying = true;
+    if (node->has_attribute_dependency())
+      current_shader->has_volume_attribute_dependency = true;
   }
 
   if (node->has_object_dependency()) {
     current_shader->has_object_dependency = true;
   }
 
-  if (node->has_attribute_dependency()) {
-    current_shader->has_attribute_dependency = true;
-  }
-
   if (node->has_integrator_dependency()) {
     current_shader->has_integrator_dependency = true;
   }
@@ -867,8 +862,8 @@ void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Sum
   shader->has_displacement = false;
   shader->has_surface_spatial_varying = false;
   shader->has_volume_spatial_varying = false;
+  shader->has_volume_attribute_dependency = false;
   shader->has_object_dependency = false;
-  shader->has_attribute_dependency = false;
   shader->has_integrator_dependency = false;
 
   /* generate bump shader */
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index d88925939e3..270e05abe29 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
+#include "render/tables.h"
 #include "device/device.h"
 #include "render/scene.h"
-#include "render/tables.h"
 
 #include "util/util_logging.h"
 
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 12b59bb0aeb..3ed2959ae59 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -18,6 +18,7 @@
 #define __TABLES_H__
 
 #include "util/util_list.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 9ef0c695667..1480b6d1aab 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -101,6 +101,7 @@ TileManager::TileManager(bool progressive_,
   tile_order = tile_order_;
   start_resolution = start_resolution_;
   pixel_size = pixel_size_;
+  slice_overlap = 0;
   num_samples = num_samples_;
   num_devices = num_devices_;
   preserve_tile_device = preserve_tile_device_;
@@ -201,8 +202,7 @@ int TileManager::gen_tiles(bool sliced)
   int image_h = max(1, params.height / resolution);
   int2 center = make_int2(image_w / 2, image_h / 2);
 
-  int num_logical_devices = preserve_tile_device ? num_devices : 1;
-  int num = min(image_h, num_logical_devices);
+  int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1;
   int slice_num = sliced ? num : 1;
   int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
 
@@ -216,7 +216,7 @@ int TileManager::gen_tiles(bool sliced)
   tile_list = state.render_tiles.begin();
 
   if (tile_order == TILE_HILBERT_SPIRAL) {
-    assert(!sliced);
+    assert(!sliced && slice_overlap == 0);
 
     int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
     state.tiles.resize(tile_w * tile_h);
@@ -319,6 +319,12 @@ int TileManager::gen_tiles(bool sliced)
     int slice_h = (slice == slice_num - 1) ? image_h - slice * (image_h / slice_num) :
                                              image_h / slice_num;
 
+    if (slice_overlap != 0) {
+      int slice_y_offset = max(slice_y - slice_overlap, 0);
+      slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset;
+      slice_y = slice_y_offset;
+    }
+
     int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
 
     int tiles_per_device = divide_up(tile_w * tile_h, num);
@@ -363,6 +369,7 @@ void TileManager::gen_render_tiles()
 {
   /* Regenerate just the render tiles for progressive render. */
   foreach (Tile &tile, state.tiles) {
+    tile.state = Tile::RENDER;
     state.render_tiles[tile.device].push_back(tile.index);
   }
 }
@@ -386,17 +393,29 @@ void TileManager::set_tiles()
 
 int TileManager::get_neighbor_index(int index, int neighbor)
 {
-  static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
+  /* Neighbor indices:
+   *   0 1 2
+   *   3 4 5
+   *   6 7 8
+   */
+  static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1};
+  static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1};
 
   int resolution = state.resolution_divider;
   int image_w = max(1, params.width / resolution);
   int image_h = max(1, params.height / resolution);
+
+  int num = min(image_h, num_devices);
+  int slice_num = !background ? num : 1;
+  int slice_h = image_h / slice_num;
+
   int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
-  int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
+  int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
 
-  int nx = state.tiles[index].x / tile_size.x + dx[neighbor],
-      ny = state.tiles[index].y / tile_size.y + dy[neighbor];
-  if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h)
+  /* Tiles in the state tile list are always indexed from left to right, top to bottom. */
+  int nx = (index % tile_w) + dx[neighbor];
+  int ny = (index / tile_w) + dy[neighbor];
+  if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num)
     return -1;
 
   return ny * state.tile_stride + nx;
@@ -426,15 +445,11 @@ bool TileManager::finish_tile(int index, bool &delete_tile)
 {
   delete_tile = false;
 
-  if (progressive) {
-    return true;
-  }
-
   switch (state.tiles[index].state) {
     case Tile::RENDER: {
       if (!schedule_denoising) {
         state.tiles[index].state = Tile::DONE;
-        delete_tile = true;
+        delete_tile = !progressive;
         return true;
       }
       state.tiles[index].state = Tile::RENDERED;
@@ -457,15 +472,18 @@ bool TileManager::finish_tile(int index, bool &delete_tile)
         int nindex = get_neighbor_index(index, neighbor);
         if (check_neighbor_state(nindex, Tile::DENOISED)) {
           state.tiles[nindex].state = Tile::DONE;
-          /* It can happen that the tile just finished denoising and already can be freed here.
-           * However, in that case it still has to be written before deleting, so we can't delete
-           * it yet. */
-          if (neighbor == 8) {
-            delete_tile = true;
-          }
-          else {
-            delete state.tiles[nindex].buffers;
-            state.tiles[nindex].buffers = NULL;
+          /* Do not delete finished tiles in progressive mode. */
+          if (!progressive) {
+            /* It can happen that the tile just finished denoising and already can be freed here.
+             * However, in that case it still has to be written before deleting, so we can't delete
+             * it yet. */
+            if (neighbor == 4) {
+              delete_tile = true;
+            }
+            else {
+              delete state.tiles[nindex].buffers;
+              state.tiles[nindex].buffers = NULL;
+            }
           }
         }
       }
@@ -477,27 +495,65 @@ bool TileManager::finish_tile(int index, bool &delete_tile)
   }
 }
 
-bool TileManager::next_tile(Tile *&tile, int device)
+bool TileManager::next_tile(Tile *&tile, int device, uint tile_types)
 {
-  int logical_device = preserve_tile_device ? device : 0;
+  /* Preserve device if requested, unless this is a separate denoising device that just wants to
+   * grab any available tile. */
+  const bool preserve_device = preserve_tile_device && device < num_devices;
 
-  if (logical_device >= state.render_tiles.size())
-    return false;
+  if (tile_types & RenderTile::DENOISE) {
+    int tile_index = -1;
+    int logical_device = preserve_device ? device : 0;
 
-  if (!state.denoising_tiles[logical_device].empty()) {
-    int idx = state.denoising_tiles[logical_device].front();
-    state.denoising_tiles[logical_device].pop_front();
-    tile = &state.tiles[idx];
-    return true;
+    while (logical_device < state.denoising_tiles.size()) {
+      if (state.denoising_tiles[logical_device].empty()) {
+        if (preserve_device) {
+          break;
+        }
+        else {
+          logical_device++;
+          continue;
+        }
+      }
+
+      tile_index = state.denoising_tiles[logical_device].front();
+      state.denoising_tiles[logical_device].pop_front();
+      break;
+    }
+
+    if (tile_index >= 0) {
+      tile = &state.tiles[tile_index];
+      return true;
+    }
   }
 
-  if (state.render_tiles[logical_device].empty())
-    return false;
+  if (tile_types & RenderTile::PATH_TRACE) {
+    int tile_index = -1;
+    int logical_device = preserve_device ? device : 0;
 
-  int idx = state.render_tiles[logical_device].front();
-  state.render_tiles[logical_device].pop_front();
-  tile = &state.tiles[idx];
-  return true;
+    while (logical_device < state.render_tiles.size()) {
+      if (state.render_tiles[logical_device].empty()) {
+        if (preserve_device) {
+          break;
+        }
+        else {
+          logical_device++;
+          continue;
+        }
+      }
+
+      tile_index = state.render_tiles[logical_device].front();
+      state.render_tiles[logical_device].pop_front();
+      break;
+    }
+
+    if (tile_index >= 0) {
+      tile = &state.tiles[tile_index];
+      return true;
+    }
+  }
+
+  return false;
 }
 
 bool TileManager::done()
@@ -508,6 +564,16 @@ bool TileManager::done()
          (state.sample + state.num_samples >= end_sample);
 }
 
+/* Returns true while at least one tile has not reached the DONE state. */
+bool TileManager::has_tiles()
+{
+  foreach (const Tile &candidate, state.tiles) {
+    if (candidate.state != Tile::DONE)
+      return true;
+  }
+  return false;
+}
+
 bool TileManager::next()
 {
   if (done())
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 017c1af0ead..9fb9c1ca782 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -89,6 +89,7 @@ class TileManager {
   } state;
 
   int num_samples;
+  int slice_overlap;
 
   TileManager(bool progressive,
               int num_samples,
@@ -105,15 +106,19 @@ class TileManager {
   void reset(BufferParams &params, int num_samples);
   void set_samples(int num_samples);
   bool next();
-  bool next_tile(Tile *&tile, int device = 0);
+  bool next_tile(Tile *&tile, int device, uint tile_types);
   bool finish_tile(int index, bool &delete_tile);
   bool done();
+  bool has_tiles();
 
   void set_tile_order(TileOrder tile_order_)
   {
     tile_order = tile_order_;
   }
 
+  int get_neighbor_index(int index, int neighbor);
+  bool check_neighbor_state(int index, Tile::State state);
+
   /* ** Sample range rendering. ** */
 
   /* Start sample in the range. */
@@ -160,9 +165,6 @@ class TileManager {
   /* Generate tile list, return number of tiles. */
   int gen_tiles(bool sliced);
   void gen_render_tiles();
-
-  int get_neighbor_index(int index, int neighbor);
-  bool check_neighbor_state(int index, Tile::State state);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 98fcc8cd15e..6dcc7f7b3dd 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -82,25 +82,33 @@ list(APPEND ALL_CYCLES_LIBRARIES
   ${TIFF_LIBRARY}
   ${OPENIMAGEIO_LIBRARIES}
   ${OPENEXR_LIBRARIES}
+  ${OPENVDB_LIBRARIES}
 )
 
 include_directories(${INC})
 
-link_directories(${OPENIMAGEIO_LIBPATH}
-                 ${BOOST_LIBPATH}
-                 ${PNG_LIBPATH}
-                 ${JPEG_LIBPATH}
-                 ${ZLIB_LIBPATH}
-                 ${TIFF_LIBPATH}
-                 ${OPENEXR_LIBPATH}
-                 ${OPENCOLORIO_LIBPATH})
+link_directories(
+  ${OPENIMAGEIO_LIBPATH}
+  ${BOOST_LIBPATH}
+  ${PNG_LIBPATH}
+  ${JPEG_LIBPATH}
+  ${ZLIB_LIBPATH}
+  ${TIFF_LIBPATH}
+  ${OPENEXR_LIBPATH}
+  ${OPENCOLORIO_LIBPATH}
+  ${OPENVDB_LIBPATH}
+)
 
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PLATFORM_LINKFLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} ${PLATFORM_LINKFLAGS_DEBUG}")
 
 CYCLES_TEST(render_graph_finalize "${ALL_CYCLES_LIBRARIES};bf_intern_numaapi")
 CYCLES_TEST(util_aligned_malloc "cycles_util")
-CYCLES_TEST(util_path "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
-CYCLES_TEST(util_string "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
-CYCLES_TEST(util_task "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES};bf_intern_numaapi")
-CYCLES_TEST(util_time "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
+CYCLES_TEST(util_path "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+CYCLES_TEST(util_string "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+CYCLES_TEST(util_task "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES};bf_intern_numaapi")
+CYCLES_TEST(util_time "cycles_util;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+CYCLES_TEST(util_avxf_avx "cycles_util;bf_intern_numaapi;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
+set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+CYCLES_TEST(util_avxf_avx2 "cycles_util;bf_intern_numaapi;${OPENIMAGEIO_LIBRARIES};${BOOST_LIBRARIES}")
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index ca93f8b02d0..87389ebfb16 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "testing/testing.h"
 #include "testing/mock_log.h"
+#include "testing/testing.h"
 
 #include "render/graph.h"
-#include "render/scene.h"
 #include "render/nodes.h"
+#include "render/scene.h"
 #include "util/util_array.h"
 #include "util/util_logging.h"
 #include "util/util_string.h"
diff --git a/intern/cycles/test/util_avxf_avx2_test.cpp b/intern/cycles/test/util_avxf_avx2_test.cpp
new file mode 100644
index 00000000000..9b466ddd3a0
--- /dev/null
+++ b/intern/cycles/test/util_avxf_avx2_test.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define __KERNEL_AVX2__ /* Makes util_avxf_test.h check for AVX2 support at runtime. */
+#define __KERNEL_CPU__
+
+#if defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  include "util_avxf_test.h" /* Shared test body, compiled for x86 / x86-64 only. */
+#endif
diff --git a/intern/cycles/test/util_avxf_avx_test.cpp b/intern/cycles/test/util_avxf_avx_test.cpp
new file mode 100644
index 00000000000..cea67649b80
--- /dev/null
+++ b/intern/cycles/test/util_avxf_avx_test.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define __KERNEL_AVX__ /* Makes util_avxf_test.h check for AVX support at runtime. */
+#define __KERNEL_CPU__
+
+#if defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  include "util_avxf_test.h" /* Shared test body, compiled for x86 / x86-64 only. */
+#endif
diff --git a/intern/cycles/test/util_avxf_test.h b/intern/cycles/test/util_avxf_test.h
new file mode 100644
index 00000000000..d93563fdb3f
--- /dev/null
+++ b/intern/cycles/test/util_avxf_test.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Whether the host CPU supports the instruction set this test file was built for. */
+static bool validate_cpu_capabilities()
+{
+#if defined(__KERNEL_AVX2__)
+  return system_cpu_support_avx2();
+#elif defined(__KERNEL_AVX__)
+  return system_cpu_support_avx();
+#else
+  return false; /* No AVX kernel selected: report unsupported so every test is skipped. */
+#endif
+}
+
+#define VALIDATECPU \
+  if (!validate_cpu_capabilities()) \
+    return;
+
+#define compare_vector_scalar(a, b) \
+  for (size_t index = 0; index < a.size; index++) \
+    EXPECT_FLOAT_EQ(a[index], b);
+
+#define compare_vector_vector(a, b) \
+  for (size_t index = 0; index < a.size; index++) \
+    EXPECT_FLOAT_EQ(a[index], b[index]);
+
+#define compare_vector_vector_near(a, b, abserror) \
+  for (size_t index = 0; index < a.size; index++) \
+    EXPECT_NEAR(a[index], b[index], abserror);
+
+#define basic_test_vv(a, b, op) \
+  VALIDATECPU \
+  avxf c = a op b; \
+  for (size_t i = 0; i < a.size; i++) \
+    EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
+
+/* vector op float tests */
+#define basic_test_vf(a, b, op) \
+  VALIDATECPU \
+  avxf c = a op b; \
+  for (size_t i = 0; i < a.size; i++) \
+    EXPECT_FLOAT_EQ(c[i], a[i] op b);
+
+const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
+const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
+const float float_b = 1.5f;
+
+TEST(util_avx, avxf_add_vv) { basic_test_vv(avxf_a, avxf_b, +) }
+TEST(util_avx, avxf_sub_vv) { basic_test_vv(avxf_a, avxf_b, -) }
+TEST(util_avx, avxf_mul_vv) { basic_test_vv(avxf_a, avxf_b, *) }
+TEST(util_avx, avxf_div_vv) { basic_test_vv(avxf_a, avxf_b, /) }
+TEST(util_avx, avxf_add_vf) { basic_test_vf(avxf_a, float_b, +) }
+TEST(util_avx, avxf_sub_vf) { basic_test_vf(avxf_a, float_b, -) }
+TEST(util_avx, avxf_mul_vf) { basic_test_vf(avxf_a, float_b, *) }
+TEST(util_avx, avxf_div_vf) { basic_test_vf(avxf_a, float_b, /) }
+
+TEST(util_avx, avxf_ctor) /* Constructor arguments are expected from lane 7 down to lane 0. */
+{
+  VALIDATECPU
+  compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
+                        static_cast<float>(index)); /* `index` is the macro's loop counter. */
+  compare_vector_scalar(avxf(1.0f), 1.0f); /* Single value is broadcast to all lanes. */
+  compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+  compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f),
+                        avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f));
+  compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)),
+                        avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f));
+}
+
+TEST(util_avx, avxf_sqrt)
+{
+  VALIDATECPU
+  compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
+                        avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
+}
+
+TEST(util_avx, avxf_min_max)
+{
+  VALIDATECPU
+  compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
+  compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
+}
+
+TEST(util_avx, avxf_set_sign)
+{
+  VALIDATECPU
+  avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
+  compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
+}
+
+TEST(util_avx, avxf_msub)
+{
+  VALIDATECPU
+  avxf res = msub(avxf_a, avxf_b, avxf_c);
+  avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
+                  (avxf_a[6] * avxf_b[6]) - avxf_c[6],
+                  (avxf_a[5] * avxf_b[5]) - avxf_c[5],
+                  (avxf_a[4] * avxf_b[4]) - avxf_c[4],
+                  (avxf_a[3] * avxf_b[3]) - avxf_c[3],
+                  (avxf_a[2] * avxf_b[2]) - avxf_c[2],
+                  (avxf_a[1] * avxf_b[1]) - avxf_c[1],
+                  (avxf_a[0] * avxf_b[0]) - avxf_c[0]);
+  compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_madd)
+{
+  VALIDATECPU
+  avxf res = madd(avxf_a, avxf_b, avxf_c);
+  avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
+                  (avxf_a[6] * avxf_b[6]) + avxf_c[6],
+                  (avxf_a[5] * avxf_b[5]) + avxf_c[5],
+                  (avxf_a[4] * avxf_b[4]) + avxf_c[4],
+                  (avxf_a[3] * avxf_b[3]) + avxf_c[3],
+                  (avxf_a[2] * avxf_b[2]) + avxf_c[2],
+                  (avxf_a[1] * avxf_b[1]) + avxf_c[1],
+                  (avxf_a[0] * avxf_b[0]) + avxf_c[0]);
+  compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_nmadd)
+{
+  VALIDATECPU
+  avxf res = nmadd(avxf_a, avxf_b, avxf_c);
+  avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
+                  avxf_c[6] - (avxf_a[6] * avxf_b[6]),
+                  avxf_c[5] - (avxf_a[5] * avxf_b[5]),
+                  avxf_c[4] - (avxf_a[4] * avxf_b[4]),
+                  avxf_c[3] - (avxf_a[3] * avxf_b[3]),
+                  avxf_c[2] - (avxf_a[2] * avxf_b[2]),
+                  avxf_c[1] - (avxf_a[1] * avxf_b[1]),
+                  avxf_c[0] - (avxf_a[0] * avxf_b[0]));
+  compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_compare)
+{
+  VALIDATECPU
+  avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
+  avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+  avxb res = a <= b;
+  int exp[8] = {
+      a[0] <= b[0] ? -1 : 0,
+      a[1] <= b[1] ? -1 : 0,
+      a[2] <= b[2] ? -1 : 0,
+      a[3] <= b[3] ? -1 : 0,
+      a[4] <= b[4] ? -1 : 0,
+      a[5] <= b[5] ? -1 : 0,
+      a[6] <= b[6] ? -1 : 0,
+      a[7] <= b[7] ? -1 : 0,
+  };
+  compare_vector_vector(res, exp);
+}
+
+TEST(util_avx, avxf_permute)
+{
+  VALIDATECPU
+  avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
+  compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
+}
+
+TEST(util_avx, avxf_blend)
+{
+  VALIDATECPU
+  avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
+  compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
+}
+
+TEST(util_avx, avxf_shuffle)
+{
+  VALIDATECPU
+  avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
+  compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
+}
+
+TEST(util_avx, avxf_cross)
+{
+  VALIDATECPU
+  avxf res = cross(avxf_b, avxf_c);
+  compare_vector_vector_near(res,
+                             avxf(0.0f,
+                                  -9.5367432e-07f,
+                                  0.0f,
+                                  4.7683716e-07f,
+                                  0.0f,
+                                  -3.8146973e-06f,
+                                  3.8146973e-06f,
+                                  3.8146973e-06f),
+                             0.000002000f);
+}
+
+TEST(util_avx, avxf_dot3)
+{
+  VALIDATECPU
+  float den, den2;
+  dot3(avxf_a, avxf_b, den, den2);
+  EXPECT_FLOAT_EQ(den, 14.9f); /* 0.8 * 8 + 0.7 * 7 + 0.6 * 6, matching lanes 0..2. */
+  EXPECT_FLOAT_EQ(den2, 2.9f); /* 0.4 * 4 + 0.3 * 3 + 0.2 * 2, matching lanes 4..6. */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index ef100c12453..c1f71461dfd 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -102,6 +102,7 @@ set(SRC_HEADERS
   util_sky_model_data.h
   util_avxf.h
   util_avxb.h
+  util_semaphore.h
   util_sseb.h
   util_ssef.h
   util_ssei.h
diff --git a/intern/cycles/util/util_algorithm.h b/intern/cycles/util/util_algorithm.h
index 62093039625..63abd4e92a3 100644
--- a/intern/cycles/util/util_algorithm.h
+++ b/intern/cycles/util/util_algorithm.h
@@ -25,6 +25,7 @@ using std::max;
 using std::min;
 using std::remove;
 using std::sort;
+using std::stable_sort;
 using std::swap;
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index a8ea1dc925e..13d177d2b25 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 #    define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
 #    define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
 #    define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+#    define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
 
 #    define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
 #    define ccl_barrier(flags) barrier(flags)
@@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 #    define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
 #    define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
 #    define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+#    define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
 
 ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
                                                       const float old_val,
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index b5c3f1a8954..7fab7bd5a15 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -17,8 +17,8 @@
 #ifndef __UTIL_BOUNDBOX_H__
 #define __UTIL_BOUNDBOX_H__
 
-#include <math.h>
 #include <float.h>
+#include <math.h>
 
 #include "util/util_math.h"
 #include "util/util_string.h"
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index b29d4163133..24a20a969ab 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -15,6 +15,11 @@
  * limitations under the License.
  */
 
+/* clang-format off */
+
+/* #define __forceinline triggers a bug in some clang-format versions, disable
+ * format for entire file to keep results consistent. */
+
 #ifndef __UTIL_DEFINES_H__
 #define __UTIL_DEFINES_H__
 
diff --git a/intern/cycles/util/util_disjoint_set.h b/intern/cycles/util/util_disjoint_set.h
index 80f3c714a29..946632371d2 100644
--- a/intern/cycles/util/util_disjoint_set.h
+++ b/intern/cycles/util/util_disjoint_set.h
@@ -17,8 +17,8 @@
 #ifndef __UTIL_DISJOINT_SET_H__
 #define __UTIL_DISJOINT_SET_H__
 
-#include <utility>
 #include "util_array.h"
+#include <utility>
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 647e9cf2fd6..8de62893ba8 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -17,8 +17,8 @@
 #ifndef __UTIL_HALF_H__
 #define __UTIL_HALF_H__
 
-#include "util/util_types.h"
 #include "util/util_math.h"
+#include "util/util_types.h"
 
 #ifdef __KERNEL_SSE2__
 #  include "util/util_simd.h"
diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp
index c11f495f785..0df521c2b58 100644
--- a/intern/cycles/util/util_md5.cpp
+++ b/intern/cycles/util/util_md5.cpp
@@ -26,8 +26,8 @@
 #include "util_md5.h"
 #include "util_path.h"
 
-#include <string.h>
 #include <stdio.h>
+#include <string.h>
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 77293c45f6b..8905c8bc7f0 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util/util_md5.h"
 #include "util/util_path.h"
+#include "util/util_md5.h"
 #include "util/util_string.h"
 
 #include <OpenImageIO/filesystem.h>
@@ -36,8 +36,8 @@ OIIO_NAMESPACE_USING
 #  define DIR_SEP '/'
 #  include <dirent.h>
 #  include <pwd.h>
-#  include <unistd.h>
 #  include <sys/types.h>
+#  include <unistd.h>
 #endif
 
 #ifdef HAVE_SHLWAPI_H
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
index bbefbadd0fe..073b09f719f 100644
--- a/intern/cycles/util/util_profiling.cpp
+++ b/intern/cycles/util/util_profiling.cpp
@@ -14,8 +14,9 @@
  * limitations under the License.
  */
 
-#include "util/util_algorithm.h"
 #include "util/util_profiling.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
 #include "util/util_set.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
index f5f500239f2..ceec08ed894 100644
--- a/intern/cycles/util/util_profiling.h
+++ b/intern/cycles/util/util_profiling.h
@@ -19,7 +19,6 @@
 
 #include <atomic>
 
-#include "util/util_foreach.h"
 #include "util/util_map.h"
 #include "util/util_thread.h"
 #include "util/util_vector.h"
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 379beaeeefa..26534a29dfe 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -25,8 +25,8 @@
 
 #include "util/util_function.h"
 #include "util/util_string.h"
-#include "util/util_time.h"
 #include "util/util_thread.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -204,6 +204,8 @@ class Progress {
 
   float get_progress()
   {
+    thread_scoped_lock lock(progress_mutex);
+
     if (total_pixel_samples > 0) {
       return ((float)pixel_samples) / total_pixel_samples;
     }
diff --git a/intern/cycles/util/util_semaphore.h b/intern/cycles/util/util_semaphore.h
new file mode 100644
index 00000000000..d995b0732b8
--- /dev/null
+++ b/intern/cycles/util/util_semaphore.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2020 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_SEMAPHORE_H__
+#define __UTIL_SEMAPHORE_H__
+
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Counting semaphore: restricts concurrent access to a resource to at most
+ * `count` threads, similar to std::counting_semaphore from C++20. acquire()
+ * takes a slot, blocking while none is free; release() gives one back. */
+
+class thread_counting_semaphore {
+ public:
+  explicit thread_counting_semaphore(const int count) : count(count)
+  {
+  }
+
+  thread_counting_semaphore(const thread_counting_semaphore &) = delete;
+
+  void acquire()
+  {
+    thread_scoped_lock lock(mutex);
+    /* Re-check after every wake-up; `<= 0` so a non-positive count always blocks. */
+    while (count <= 0) {
+      condition.wait(lock);
+    }
+    count--;
+  }
+
+  void release()
+  {
+    thread_scoped_lock lock(mutex);
+    count++;
+    condition.notify_one();
+  }
+
+ protected:
+  thread_mutex mutex;
+  thread_condition_variable condition;
+  int count;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_SEMAPHORE_H__ */
diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp
index 4a6a9f32607..8cdad8a90a4 100644
--- a/intern/cycles/util/util_sky_model.cpp
+++ b/intern/cycles/util/util_sky_model.cpp
@@ -101,9 +101,9 @@ All instructions on how to use this code are in the accompanying header file.
 #include "util/util_sky_model_data.h"
 
 #include <assert.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index fa525daf37c..e9f0efb4efb 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -18,6 +18,8 @@
 #ifndef __UTIL_SSEF_H__
 #define __UTIL_SSEF_H__
 
+#include "util_ssei.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index ceb52830319..d809f2e06d7 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -14,34 +14,20 @@
  * limitations under the License.
  */
 
+/* clang-format off */
+
+/* #define static_assert triggers a bug in some clang-format versions, disable
+ * format for entire file to keep results consistent. */
+
 #ifndef __UTIL_STATIC_ASSERT_H__
 #define __UTIL_STATIC_ASSERT_H__
 
 CCL_NAMESPACE_BEGIN
 
-/* TODO(sergey): In theory CUDA might work with own static assert
- * implementation since it's just pure C++.
- */
-#ifdef __KERNEL_GPU__
-#  ifndef static_assert
-#    define static_assert(statement, message)
-#  endif
-#endif /* __KERNEL_GPU__ */
-
-/* TODO(sergey): For until C++11 is a bare minimum for us,
- * we do a bit of a trickery to show meaningful message so
- * it's more or less clear what's wrong when building without
- * C++11.
- *
- * The thing here is: our non-C++11 implementation doesn't
- * have a way to print any message after preprocessor
- * substitution so we rely on the message which is passed to
- * static_assert() since that's the only message visible when
- * compilation fails.
- *
- * After C++11 bump it should be possible to glue structure
- * name to the error message,
- */
+#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC)
+#  define static_assert(statement, message)
+#endif /* __KERNEL_OPENCL__ || CYCLES_CUBIN_CC */
+
 #define static_assert_align(st, align) \
   static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned")  // NOLINT
 
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index f71145741c9..ce2d4acdde4 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -17,9 +17,9 @@
 #ifndef __UTIL_STRING_H__
 #define __UTIL_STRING_H__
 
+#include <sstream>
 #include <string.h>
 #include <string>
-#include <sstream>
 
 #include "util/util_vector.h"
 
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index f700f9bd277..6d32153209a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -17,8 +17,8 @@
 #include "util/util_system.h"
 
 #include "util/util_logging.h"
-#include "util/util_types.h"
 #include "util/util_string.h"
+#include "util/util_types.h"
 
 #include <numaapi.h>
 
@@ -35,8 +35,8 @@ OIIO_NAMESPACE_USING
 #  include <sys/sysctl.h>
 #  include <sys/types.h>
 #else
-#  include <unistd.h>
 #  include <sys/ioctl.h>
+#  include <unistd.h>
 #endif
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 24286116dfb..61aa28c6815 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
+#include "util/util_task.h"
 #include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_system.h"
-#include "util/util_task.h"
 #include "util/util_time.h"
 
 //#define THREADING_DEBUG_ENABLED
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index d43852480d1..863c2ea3124 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -17,6 +17,8 @@
 #ifndef __UTIL_TEXTURE_H__
 #define __UTIL_TEXTURE_H__
 
+#include "util_transform.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Texture limits on devices. */
@@ -91,12 +93,17 @@ typedef enum ExtensionType {
 typedef struct TextureInfo {
   /* Pointer, offset or texture depending on device. */
   uint64_t data;
+  /* Data type. */
+  uint data_type;
   /* Buffer number for OpenCL. */
   uint cl_buffer;
   /* Interpolation and extension type. */
   uint interpolation, extension;
   /* Dimensions. */
   uint width, height, depth;
+  /* Transform for 3D textures. */
+  uint use_transform_3d;
+  Transform transform_3d;
 } TextureInfo;
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 18ec5b32144..f6dbc9186b8 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -17,11 +17,11 @@
 #ifndef __UTIL_THREAD_H__
 #define __UTIL_THREAD_H__
 
-#include <thread>
-#include <mutex>
 #include <condition_variable>
 #include <functional>
+#include <mutex>
 #include <queue>
+#include <thread>
 
 #ifdef _WIN32
 #  include "util_windows.h"
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 302a8a386ac..101122740d7 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -46,8 +46,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util/util_projection.h"
 #include "util/util_transform.h"
+#include "util/util_projection.h"
 
 #include "util/util_boundbox.h"
 #include "util/util_math.h"
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index 407654245cb..d0a6264d5cf 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -344,10 +344,10 @@ ccl_device_inline Transform transform_empty()
 
 ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 {
-  /* use simpe nlerp instead of slerp. it's faster and almost the same */
+  /* OptiX uses lerp to interpolate motion transformations. */
+#ifdef __KERNEL_OPTIX__
   return normalize((1.0f - t) * q1 + t * q2);
-
-#if 0
+#else  /* __KERNEL_OPTIX__ */
   /* note: this does not ensure rotation around shortest angle, q1 and q2
    * are assumed to be matched already in transform_motion_decompose */
   float costheta = dot(q1, q2);
@@ -365,7 +365,7 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
     float thetap = theta * t;
     return q1 * cosf(thetap) + qperp * sinf(thetap);
   }
-#endif
+#endif /* __KERNEL_OPTIX__ */
 }
 
 ccl_device_inline Transform transform_quick_inverse(Transform M)
@@ -468,29 +468,6 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
 
 #ifndef __KERNEL_GPU__
 
-#  ifdef WITH_EMBREE
-ccl_device void transform_motion_array_interpolate_straight(
-    Transform *tfm, const ccl_global DecomposedTransform *motion, uint numsteps, float time)
-{
-  /* Figure out which steps we need to interpolate. */
-  int maxstep = numsteps - 1;
-  int step = min((int)(time * maxstep), maxstep - 1);
-  float t = time * maxstep - step;
-
-  const ccl_global DecomposedTransform *a = motion + step;
-  const ccl_global DecomposedTransform *b = motion + step + 1;
-  Transform step1, step2;
-
-  transform_compose(&step1, a);
-  transform_compose(&step2, b);
-
-  /* matrix lerp */
-  tfm->x = (1.0f - t) * step1.x + t * step2.x;
-  tfm->y = (1.0f - t) * step1.y + t * step2.y;
-  tfm->z = (1.0f - t) * step1.z + t * step2.z;
-}
-#  endif
-
 class BoundBox2D;
 
 ccl_device_inline bool operator==(const DecomposedTransform &A, const DecomposedTransform &B)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 48e9983ac8f..f6535848480 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -101,6 +101,11 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
   return (x / multiple) * multiple;
 }
 
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+  return x != 0 && (x & (x - 1)) == 0; /* Zero has no bits set, so reject it explicitly. */
+}
+
 CCL_NAMESPACE_END
 
 /* Vectorized types declaration. */
@@ -148,8 +153,8 @@ CCL_NAMESPACE_END
 /* SSE types. */
 #ifndef __KERNEL_GPU__
 #  include "util/util_sseb.h"
-#  include "util/util_ssei.h"
 #  include "util/util_ssef.h"
+#  include "util/util_ssei.h"
 #  if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
 #    include "util/util_avxb.h"
 #    include "util/util_avxf.h"
diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h
index 38829d3a29c..bb2c99cc6d7 100644
--- a/intern/cycles/util/util_version.h
+++ b/intern/cycles/util/util_version.h
@@ -22,7 +22,7 @@
 CCL_NAMESPACE_BEGIN
 
 #define CYCLES_VERSION_MAJOR 1
-#define CYCLES_VERSION_MINOR 9
+#define CYCLES_VERSION_MINOR 12
 #define CYCLES_VERSION_PATCH 0
 
 #define CYCLES_MAKE_VERSION_STRING2(a, b, c) #a "." #b "." #c
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index f23174fd6dc..9d9ff451b3b 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -134,7 +134,7 @@ static void view_display()
 
   glMatrixMode(GL_PROJECTION);
   glLoadIdentity();
-  gluOrtho2D(0, V.width, 0, V.height);
+  glOrtho(0, V.width, 0, V.height, -1, 1);
 
   glMatrixMode(GL_MODELVIEW);
   glLoadIdentity();