136 files changed, 4980 insertions, 1390 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 17096d441f0..2018c1d9648 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -297,6 +297,7 @@ endif()
 
 if(WITH_CYCLES_STANDALONE)
   set(WITH_CYCLES_DEVICE_CUDA TRUE)
+  set(WITH_CYCLES_DEVICE_HIP TRUE)
 endif()
 # TODO(sergey): Consider removing it, only causes confusion in interface.
 set(WITH_CYCLES_DEVICE_MULTI TRUE)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index f9dc5f00802..3ed3f54ef9f 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -64,6 +64,8 @@ if(WITH_CYCLES_STANDALONE)
     cycles_standalone.cpp
     cycles_xml.cpp
     cycles_xml.h
+    oiio_output_driver.cpp
+    oiio_output_driver.h
   )
   add_executable(cycles ${SRC} ${INC} ${INC_SYS})
   unset(SRC)
@@ -73,7 +75,7 @@ if(WITH_CYCLES_STANDALONE)
 
   if(APPLE)
     if(WITH_OPENCOLORIO)
-      set_property(TARGET cycles APPEND_STRING PROPERTY LINK_FLAGS " -framework IOKit")
+      set_property(TARGET cycles APPEND_STRING PROPERTY LINK_FLAGS " -framework IOKit -framework Carbon")
     endif()
     if(WITH_OPENIMAGEDENOISE AND "${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64")
       # OpenImageDenoise uses BNNS from the Accelerate framework.
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 270096d70b0..00dc140648a 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -36,6 +36,9 @@
 #include "util/util_unique_ptr.h"
 #include "util/util_version.h"
 
+#include "app/cycles_xml.h"
+#include "app/oiio_output_driver.h"
+
 #ifdef WITH_CYCLES_STANDALONE_GUI
 #  include "util/util_view.h"
 #endif
@@ -53,7 +56,8 @@ struct Options {
   SessionParams session_params;
   bool quiet;
   bool show_help, interactive, pause;
-  string output_path;
+  string output_filepath;
+  string output_pass;
 } options;
 
 static void session_print(const string &str)
@@ -89,30 +93,6 @@ static void session_print_status()
   session_print(status);
 }
 
-static bool write_render(const uchar *pixels, int w, int h, int channels)
-{
-  string msg = string_printf("Writing image %s", options.output_path.c_str());
-  session_print(msg);
-
-  unique_ptr<ImageOutput> out = unique_ptr<ImageOutput>(ImageOutput::create(options.output_path));
-  if (!out) {
-    return false;
-  }
-
-  ImageSpec spec(w, h, channels, TypeDesc::UINT8);
-  if (!out->open(options.output_path, spec)) {
-    return false;
-  }
-
-  /* conversion for different top/bottom convention */
-  out->write_image(
-      TypeDesc::UINT8, pixels + (h - 1) * w * channels, AutoStride, -w * channels, AutoStride);
-
-  out->close();
-
-  return true;
-}
-
 static BufferParams &session_buffer_params()
 {
   static BufferParams buffer_params;
@@ -147,9 +127,14 @@ static void scene_init()
 
 static void session_init()
 {
-  options.session_params.write_render_cb = write_render;
+  options.output_pass = "combined";
   options.session = new Session(options.session_params, options.scene_params);
 
+  if (!options.output_filepath.empty()) {
+    options.session->set_output_driver(make_unique<OIIOOutputDriver>(
+        options.output_filepath, options.output_pass, session_print));
+  }
+
   if (options.session_params.background && !options.quiet)
     options.session->progress.set_update_callback(function_bind(&session_print_status));
 #ifdef WITH_CYCLES_STANDALONE_GUI
@@ -160,7 +145,12 @@ static void session_init()
   /* load scene */
   scene_init();
 
-  options.session->reset(session_buffer_params(), options.session_params.samples);
+  /* add pass for output. */
+  Pass *pass = options.scene->create_node<Pass>();
+  pass->set_name(ustring(options.output_pass.c_str()));
+  pass->set_type(PASS_COMBINED);
+
+  options.session->reset(options.session_params, session_buffer_params());
   options.session->start();
 }
 
@@ -222,9 +212,7 @@ static void display_info(Progress &progress)
 
 static void display()
 {
-  static DeviceDrawParams draw_params = DeviceDrawParams();
-
-  options.session->draw(session_buffer_params(), draw_params);
+  options.session->draw();
 
   display_info(options.session->progress);
 }
@@ -254,7 +242,7 @@ static void motion(int x, int y, int button)
     options.session->scene->camera->need_flags_update = true;
     options.session->scene->camera->need_device_update = true;
 
-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
   }
 }
 
@@ -271,7 +259,7 @@ static void resize(int width, int height)
     options.session->scene->camera->need_flags_update = true;
     options.session->scene->camera->need_device_update = true;
 
-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
   }
 }
 
@@ -283,7 +271,7 @@ static void keyboard(unsigned char key)
 
   /* Reset */
   else if (key == 'r')
-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
 
   /* Cancel */
   else if (key == 27)  // escape
@@ -320,7 +308,7 @@ static void keyboard(unsigned char key)
     options.session->scene->camera->need_flags_update = true;
     options.session->scene->camera->need_device_update = true;
 
-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
   }
 
   /* Set Max Bounces */
@@ -346,7 +334,7 @@ static void keyboard(unsigned char key)
 
     options.session->scene->integrator->set_max_bounce(bounce);
 
-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
   }
 }
 #endif
@@ -361,11 +349,13 @@ static int files_parse(int argc, const char *argv[])
 
 static void options_parse(int argc, const char **argv)
 {
-  options.width = 0;
-  options.height = 0;
+  options.width = 1024;
+  options.height = 512;
   options.filepath = "";
   options.session = NULL;
   options.quiet = false;
+  options.session_params.use_auto_tile = false;
+  options.session_params.tile_size = 0;
 
   /* device names */
   string device_names = "";
@@ -411,7 +401,7 @@ static void options_parse(int argc, const char **argv)
              &options.session_params.samples,
              "Number of samples to render",
              "--output %s",
-             &options.output_path,
+             &options.output_filepath,
              "File path to write output image",
              "--threads %d",
              &options.session_params.threads,
@@ -422,12 +412,9 @@ static void options_parse(int argc, const char **argv)
              "--height %d",
              &options.height,
              "Window height in pixel",
-             "--tile-width %d",
-             &options.session_params.tile_size.x,
-             "Tile width in pixels",
-             "--tile-height %d",
-             &options.session_params.tile_size.y,
-             "Tile height in pixels",
+             "--tile-size %d",
+             &options.session_params.tile_size,
+             "Tile size in pixels",
              "--list-devices",
              &list,
              "List information about all available devices",
@@ -489,8 +476,9 @@ static void options_parse(int argc, const char **argv)
   options.session_params.background = true;
 #endif
 
-  /* Use progressive rendering */
-  options.session_params.progressive = true;
+  if (options.session_params.tile_size > 0) {
+    options.session_params.use_auto_tile = true;
+  }
 
   /* find matching device */
   DeviceType device_type = Device::type_from_string(devicename.c_str());
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 54f97fddbd9..0b83c60f32d 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -333,6 +333,7 @@ static void xml_read_shader_graph(XMLReadState &state, Shader *shader, xml_node
       }
 
       snode = (ShaderNode *)node_type->create(node_type);
+      snode->set_owner(graph);
     }
 
     xml_read_node(graph_reader, snode, node);
diff --git a/intern/cycles/app/oiio_output_driver.cpp b/intern/cycles/app/oiio_output_driver.cpp
new file mode 100644
index 00000000000..d791c89772f
--- /dev/null
+++ b/intern/cycles/app/oiio_output_driver.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "app/oiio_output_driver.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Store copies of the destination path, the pass name and the logging callback. */
+OIIOOutputDriver::OIIOOutputDriver(const string_view filepath,
+                                   const string_view pass,
+                                   LogFunction log)
+    : filepath_(filepath), pass_(pass), log_(log)
+{
+}
+
+OIIOOutputDriver::~OIIOOutputDriver()
+{
+}
+
+/* Write the pass named `pass_` of a full-frame tile to `filepath_` as a 4-channel float image.
+ *
+ * Tiles which do not cover the full frame are ignored. Failures are reported through `log_`
+ * and end the write early. */
+void OIIOOutputDriver::write_render_tile(const Tile &tile)
+{
+  /* Only write the full buffer, no intermediate tiles. */
+  if (!(tile.size == tile.full_size)) {
+    return;
+  }
+
+  log_(string_printf("Writing image %s", filepath_.c_str()));
+
+  unique_ptr<ImageOutput> image_output(ImageOutput::create(filepath_));
+  if (image_output == nullptr) {
+    log_("Failed to create image file");
+    return;
+  }
+
+  const int width = tile.size.x;
+  const int height = tile.size.y;
+
+  /* Read the pass before opening the file, so that a failed read does not leave a file without
+   * pixel data behind. Element counts use size_t because `width * height * 4` can exceed the
+   * range of int for large renders. */
+  const size_t row_floats = size_t(width) * 4;
+  vector<float> pixels(row_floats * height);
+  if (!tile.get_pass_pixels(pass_, 4, pixels.data())) {
+    log_("Failed to read render pass pixels");
+    return;
+  }
+
+  ImageSpec spec(width, height, 4, TypeDesc::FLOAT);
+  if (!image_output->open(filepath_, spec)) {
+    log_("Failed to create image file");
+    return;
+  }
+
+  /* Convert from bottom-up to top-down convention: start at the last row and walk backwards.
+   * The stride is negated as a signed value; negating inside an expression that contains
+   * sizeof() would be evaluated in unsigned arithmetic. */
+  const stride_t row_stride = stride_t(row_floats * sizeof(float));
+  const bool written = image_output->write_image(TypeDesc::FLOAT,
+                                                 pixels.data() + row_floats * (height - 1),
+                                                 AutoStride,
+                                                 -row_stride,
+                                                 AutoStride);
+  const bool closed = image_output->close();
+  if (!written || !closed) {
+    log_("Failed to write image file");
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/app/oiio_output_driver.h b/intern/cycles/app/oiio_output_driver.h
new file mode 100644
index 00000000000..cdc4085d962
--- /dev/null
+++ b/intern/cycles/app/oiio_output_driver.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/output_driver.h"
+
+#include "util/util_function.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Output driver which writes one render pass of the full frame to an image file using
+ * OpenImageIO. */
+class OIIOOutputDriver : public OutputDriver {
+ public:
+  /* Callback which receives human-readable status and error messages. */
+  typedef function<void(const string &)> LogFunction;
+
+  /* filepath: image file to write; pass: name of the pass to read from the tile;
+   * log: message sink. All three are copied into the driver. */
+  OIIOOutputDriver(const string_view filepath, const string_view pass, LogFunction log);
+  virtual ~OIIOOutputDriver();
+
+  /* Write `tile` to the file when it covers the full frame; smaller tiles are ignored. */
+  void write_render_tile(const Tile &tile) override;
+
+ protected:
+  string filepath_;
+  string pass_;
+  LogFunction log_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 5bdcfd56a4d..a0442b3394b 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -31,13 +31,14 @@ set(INC_SYS
 set(SRC
   blender_camera.cpp
   blender_device.cpp
+  blender_display_driver.cpp
   blender_image.cpp
   blender_geometry.cpp
-  blender_gpu_display.cpp
   blender_light.cpp
   blender_mesh.cpp
   blender_object.cpp
   blender_object_cull.cpp
+  blender_output_driver.cpp
   blender_particles.cpp
   blender_curves.cpp
   blender_logging.cpp
@@ -51,10 +52,11 @@ set(SRC
 
   CCL_api.h
   blender_device.h
-  blender_gpu_display.h
+  blender_display_driver.h
   blender_id_map.h
   blender_image.h
   blender_object_cull.h
+  blender_output_driver.h
   blender_sync.h
   blender_session.h
   blender_texture.h
@@ -95,6 +97,9 @@ set(ADDON_FILES
 
 add_definitions(${GL_DEFINITIONS})
 
+if(WITH_CYCLES_DEVICE_HIP)
+  add_definitions(-DWITH_HIP)
+endif()
 if(WITH_MOD_FLUID)
   add_definitions(-DWITH_FLUID)
 endif()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e0e8ca10bef..d729cb1ee69 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -28,7 +28,7 @@ def _configure_argument_parser():
                         action='store_true')
     parser.add_argument("--cycles-device",
                         help="Set the device to use for Cycles, overriding user preferences and the scene setting."
-                             "Valid options are 'CPU', 'CUDA' or 'OPTIX'."
+                             "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'. "
                              "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
                         default=None)
     return parser
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5fb0eeed925..cea70033784 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -111,6 +111,7 @@ enum_device_type = (
     ('CPU', "CPU", "CPU", 0),
     ('CUDA', "CUDA", "CUDA", 1),
     ('OPTIX', "OptiX", "OptiX", 3),
+    ("HIP", "HIP", "HIP", 4)
 )
 
 enum_texture_limit = (
@@ -123,7 +124,7 @@ enum_texture_limit = (
     ('4096', "4096", "Limit texture size to 4096 pixels", 6),
     ('8192', "8192", "Limit texture size to 8192 pixels", 7),
 )
- 
+
 # NOTE: Identifiers are expected to be an upper case version of identifiers from  `Pass::get_type_enum()`
 enum_view3d_shading_render_pass = (
     ('', "General", ""),
@@ -739,7 +740,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
 
     use_auto_tile: BoolProperty(
         name="Auto Tiles",
-        description="Automatically split image into tiles",
+        description="Automatically render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
         default=True,
     )
     tile_size: IntProperty(
@@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
     def get_device_types(self, context):
         import _cycles
-        has_cuda, has_optix = _cycles.get_device_types()
+        has_cuda, has_optix, has_hip = _cycles.get_device_types()
+
         list = [('NONE', "None", "Don't use compute device", 0)]
         if has_cuda:
             list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
         if has_optix:
             list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
+        if has_hip:
+            list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
+
         return list
 
     compute_device_type: EnumProperty(
@@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
     def update_device_entries(self, device_list):
         for device in device_list:
-            if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
+            if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
                 continue
             # Try to find existing Device entry
             entry = self.find_existing_device_entry(device)
@@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
             elif entry.type == 'CPU':
                 cpu_devices.append(entry)
         # Extend all GPU devices with CPU.
-        if compute_device_type != 'CPU':
+        if compute_device_type != 'CPU' and compute_device_type != 'HIP':
             devices.extend(cpu_devices)
         return devices
 
@@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         import _cycles
         # Ensure `self.devices` is not re-allocated when the second call to
         # get_devices_for_type is made, freeing items from the first list.
-        for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
+        for device_type in ('CUDA', 'OPTIX', 'HIP'):
             self.update_device_entries(_cycles.available_devices(device_type))
 
     # Deprecated: use refresh_devices instead.
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index d02627b9936..c4a1844480c 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -99,6 +99,11 @@ def use_cuda(context):
     return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
 
 
+def use_hip(context):
+    cscene = context.scene.cycles
+    # True only when the device type is 'HIP' and the scene's Cycles device is set to 'GPU'.
+    return (get_device_type(context) == 'HIP' and cscene.device == 'GPU')
+
 def use_optix(context):
     cscene = context.scene.cycles
 
@@ -613,8 +618,8 @@ class CYCLES_RENDER_PT_performance_threads(CyclesButtonsPanel, Panel):
         sub.prop(rd, "threads")
 
 
-class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
-    bl_label = "Tiles"
+class CYCLES_RENDER_PT_performance_memory(CyclesButtonsPanel, Panel):
+    bl_label = "Memory"
     bl_parent_id = "CYCLES_RENDER_PT_performance"
 
     def draw(self, context):
@@ -2107,7 +2112,7 @@ classes = (
     CYCLES_RENDER_PT_film_transparency,
     CYCLES_RENDER_PT_performance,
     CYCLES_RENDER_PT_performance_threads,
-    CYCLES_RENDER_PT_performance_tiles,
+    CYCLES_RENDER_PT_performance_memory,
     CYCLES_RENDER_PT_performance_acceleration_structure,
     CYCLES_RENDER_PT_performance_final_render,
     CYCLES_RENDER_PT_performance_viewport,
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 6fe5ea41fff..b6b4f206620 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -283,10 +283,13 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
     return;
 
   Attribute *attr_intercept = NULL;
+  Attribute *attr_length = NULL;
   Attribute *attr_random = NULL;
 
   if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
     attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH))
+    attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH);
   if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
     attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
 
@@ -336,6 +339,10 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
         num_curve_keys++;
       }
 
+      if (attr_length != NULL) {
+        attr_length->add(CData->curve_length[curve]);
+      }
+
       if (attr_random != NULL) {
         attr_random->add(hash_uint2_to_float(num_curves, 0));
       }
@@ -657,11 +664,15 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
 
   /* Add requested attributes. */
   Attribute *attr_intercept = NULL;
+  Attribute *attr_length = NULL;
   Attribute *attr_random = NULL;
 
   if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) {
     attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
   }
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH)) {
+    attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH);
+  }
   if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) {
     attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
   }
@@ -714,6 +725,10 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
       }
     }
 
+    if (attr_length) {
+      attr_length->add(length);
+    }
+
     /* Random number per curve. */
     if (attr_random != NULL) {
       attr_random->add(hash_uint2_to_float(b_curve.index(), 0));
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index ce1770f18a3..7bed33855c2 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -26,6 +26,7 @@ enum ComputeDevice {
   COMPUTE_DEVICE_CPU = 0,
   COMPUTE_DEVICE_CUDA = 1,
   COMPUTE_DEVICE_OPTIX = 3,
+  COMPUTE_DEVICE_HIP = 4,
 
   COMPUTE_DEVICE_NUM
 };
@@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
       else if (compute_device == COMPUTE_DEVICE_OPTIX) {
         mask |= DEVICE_MASK_OPTIX;
       }
+      else if (compute_device == COMPUTE_DEVICE_HIP) {
+        mask |= DEVICE_MASK_HIP;
+      }
       vector<DeviceInfo> devices = Device::available_devices(mask);
 
       /* Match device preferences and available devices. */
diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_display_driver.cpp
index c5c3a2bd155..f55a8ce8c4e 100644
--- a/intern/cycles/blender/blender_gpu_display.cpp
+++ b/intern/cycles/blender/blender_display_driver.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "blender/blender_gpu_display.h"
+#include "blender/blender_display_driver.h"
 
 #include "device/device.h"
 #include "util/util_logging.h"
@@ -273,17 +273,17 @@ uint BlenderDisplaySpaceShader::get_shader_program()
 }
 
 /* --------------------------------------------------------------------
- * BlenderGPUDisplay.
+ * BlenderDisplayDriver.
  */
 
-BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene)
+BlenderDisplayDriver::BlenderDisplayDriver(BL::RenderEngine &b_engine, BL::Scene &b_scene)
     : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene))
 {
   /* Create context while on the main thread. */
   gl_context_create();
 }
 
-BlenderGPUDisplay::~BlenderGPUDisplay()
+BlenderDisplayDriver::~BlenderDisplayDriver()
 {
   gl_resources_destroy();
 }
@@ -292,19 +292,18 @@ BlenderGPUDisplay::~BlenderGPUDisplay()
  * Update procedure.
  */
 
-bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams &params,
+bool BlenderDisplayDriver::update_begin(const Params &params,
                                         int texture_width,
                                         int texture_height)
 {
-  /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing
+  /* Note that it's the responsibility of BlenderDisplayDriver to ensure updating and drawing
    * the texture does not happen at the same time. This is achieved indirectly.
    *
    * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock.
    * This same lock is also held when do_draw() is called, which together ensure mutual
    * exclusion.
    *
-   * This locking is not performed at the GPU display level, because that would cause lock
-   * inversion. */
+   * This locking is not performed on the Cycles side, because that would cause lock inversion. */
   if (!gl_context_enable()) {
     return false;
   }
@@ -361,7 +360,7 @@ bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams &params,
   return true;
 }
 
-void BlenderGPUDisplay::do_update_end()
+void BlenderDisplayDriver::update_end()
 {
   gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
   glFlush();
@@ -370,53 +369,17 @@ void BlenderGPUDisplay::do_update_end()
 }
 
 /* --------------------------------------------------------------------
- * Texture update from CPU buffer.
- */
-
-void BlenderGPUDisplay::do_copy_pixels_to_texture(
-    const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
-{
-  /* This call copies pixels to a Pixel Buffer Object (PBO) which is much cheaper from CPU time
-   * point of view than to copy data directly to the OpenGL texture.
-   *
-   * The possible downside of this approach is that it might require a higher peak memory when
-   * doing partial updates of the texture (although, in practice even partial updates might peak
-   * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */
-
-  half4 *mapped_rgba_pixels = map_texture_buffer();
-  if (!mapped_rgba_pixels) {
-    return;
-  }
-
-  if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width &&
-      pixels_height == texture_.height) {
-    const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height;
-    memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
-  }
-  else {
-    const half4 *rgba_row = rgba_pixels;
-    half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x;
-    for (int y = 0; y < pixels_height;
-         ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) {
-      memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
-    }
-  }
-
-  unmap_texture_buffer();
-}
-
-/* --------------------------------------------------------------------
  * Texture buffer mapping.
  */
 
-half4 *BlenderGPUDisplay::do_map_texture_buffer()
+half4 *BlenderDisplayDriver::map_texture_buffer()
 {
   glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
 
   half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>(
       glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY));
   if (!mapped_rgba_pixels) {
-    LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object.";
+    LOG(ERROR) << "Error mapping BlenderDisplayDriver pixel buffer object.";
   }
 
   if (texture_.need_clear) {
@@ -431,7 +394,7 @@ half4 *BlenderGPUDisplay::do_map_texture_buffer()
   return mapped_rgba_pixels;
 }
 
-void BlenderGPUDisplay::do_unmap_texture_buffer()
+void BlenderDisplayDriver::unmap_texture_buffer()
 {
   glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
 
@@ -442,9 +405,9 @@ void BlenderGPUDisplay::do_unmap_texture_buffer()
  * Graphics interoperability.
  */
 
-DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get()
+BlenderDisplayDriver::GraphicsInterop BlenderDisplayDriver::graphics_interop_get()
 {
-  DeviceGraphicsInteropDestination interop_dst;
+  GraphicsInterop interop_dst;
 
   interop_dst.buffer_width = texture_.buffer_width;
   interop_dst.buffer_height = texture_.buffer_height;
@@ -456,12 +419,12 @@ DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get()
   return interop_dst;
 }
 
-void BlenderGPUDisplay::graphics_interop_activate()
+void BlenderDisplayDriver::graphics_interop_activate()
 {
   gl_context_enable();
 }
 
-void BlenderGPUDisplay::graphics_interop_deactivate()
+void BlenderDisplayDriver::graphics_interop_deactivate()
 {
   gl_context_disable();
 }
@@ -470,27 +433,21 @@ void BlenderGPUDisplay::graphics_interop_deactivate()
  * Drawing.
  */
 
-void BlenderGPUDisplay::clear()
+void BlenderDisplayDriver::clear()
 {
   texture_.need_clear = true;
 }
 
-void BlenderGPUDisplay::set_zoom(float zoom_x, float zoom_y)
+void BlenderDisplayDriver::set_zoom(float zoom_x, float zoom_y)
 {
   zoom_ = make_float2(zoom_x, zoom_y);
 }
 
-void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
+void BlenderDisplayDriver::draw(const Params &params)
 {
   /* See do_update_begin() for why no locking is required here. */
   const bool transparent = true;  // TODO(sergey): Derive this from Film.
 
-  if (texture_.need_clear) {
-    /* Texture is requested to be cleared and was not yet cleared.
-     * Do early return which should be equivalent of drawing all-zero texture. */
-    return;
-  }
-
   if (!gl_draw_resources_ensure()) {
     return;
   }
@@ -499,6 +456,16 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
     gl_context_mutex_.lock();
   }
 
+  if (texture_.need_clear) {
+    /* Texture is requested to be cleared and was not yet cleared: return early, which should
+     * be equivalent of drawing all-zero texture. The lock above is taken conditionally, so
+     * release it under the same condition (presumably use_gl_context_ -- confirm). */
+    if (use_gl_context_) {
+      gl_context_mutex_.unlock();
+    }
+    return;
+  }
+
   if (gl_upload_sync_) {
     glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
   }
@@ -524,7 +491,7 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
   const float zoomed_width = params.size.x * zoom_.x;
   const float zoomed_height = params.size.y * zoom_.y;
   if (texture_.width != params.size.x || texture_.height != params.size.y) {
-    /* Resolution divider is different from 1, force enarest interpolation. */
+    /* Resolution divider is different from 1, force nearest interpolation. */
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
   }
   else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
@@ -580,7 +547,7 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
   }
 }
 
-void BlenderGPUDisplay::gl_context_create()
+void BlenderDisplayDriver::gl_context_create()
 {
   /* When rendering in viewport there is no render context available via engine.
    * Check whether own context is to be created here.
@@ -609,7 +576,7 @@ void BlenderGPUDisplay::gl_context_create()
   }
 }
 
-bool BlenderGPUDisplay::gl_context_enable()
+bool BlenderDisplayDriver::gl_context_enable()
 {
   if (use_gl_context_) {
     if (!gl_context_) {
@@ -624,7 +591,7 @@ bool BlenderGPUDisplay::gl_context_enable()
   return true;
 }
 
-void BlenderGPUDisplay::gl_context_disable()
+void BlenderDisplayDriver::gl_context_disable()
 {
   if (use_gl_context_) {
     if (gl_context_) {
@@ -637,7 +604,7 @@ void BlenderGPUDisplay::gl_context_disable()
   RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
 }
 
-void BlenderGPUDisplay::gl_context_dispose()
+void BlenderDisplayDriver::gl_context_dispose()
 {
   if (gl_context_) {
     const bool drw_state = DRW_opengl_context_release();
@@ -649,7 +616,7 @@ void BlenderGPUDisplay::gl_context_dispose()
   }
 }
 
-bool BlenderGPUDisplay::gl_draw_resources_ensure()
+bool BlenderDisplayDriver::gl_draw_resources_ensure()
 {
   if (!texture_.gl_id) {
     /* If there is no texture allocated, there is nothing to draw. Inform the draw call that it can
@@ -676,7 +643,7 @@ bool BlenderGPUDisplay::gl_draw_resources_ensure()
   return true;
 }
 
-void BlenderGPUDisplay::gl_resources_destroy()
+void BlenderDisplayDriver::gl_resources_destroy()
 {
   gl_context_enable();
 
@@ -699,7 +666,7 @@ void BlenderGPUDisplay::gl_resources_destroy()
   gl_context_dispose();
 }
 
-bool BlenderGPUDisplay::gl_texture_resources_ensure()
+bool BlenderDisplayDriver::gl_texture_resources_ensure()
 {
   if (texture_.creation_attempted) {
     return texture_.is_created;
@@ -736,7 +703,7 @@ bool BlenderGPUDisplay::gl_texture_resources_ensure()
   return true;
 }
 
-void BlenderGPUDisplay::texture_update_if_needed()
+void BlenderDisplayDriver::texture_update_if_needed()
 {
   if (!texture_.need_update) {
     return;
@@ -750,7 +717,7 @@ void BlenderGPUDisplay::texture_update_if_needed()
   texture_.need_update = false;
 }
 
-void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams &params)
+void BlenderDisplayDriver::vertex_buffer_update(const Params &params)
 {
   /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
    * rendered. */
@@ -763,23 +730,23 @@ void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams &params)
 
   vpointer[0] = 0.0f;
   vpointer[1] = 0.0f;
-  vpointer[2] = params.offset.x;
-  vpointer[3] = params.offset.y;
+  vpointer[2] = params.full_offset.x;
+  vpointer[3] = params.full_offset.y;
 
   vpointer[4] = 1.0f;
   vpointer[5] = 0.0f;
-  vpointer[6] = (float)params.size.x + params.offset.x;
-  vpointer[7] = params.offset.y;
+  vpointer[6] = (float)params.size.x + params.full_offset.x;
+  vpointer[7] = params.full_offset.y;
 
   vpointer[8] = 1.0f;
   vpointer[9] = 1.0f;
-  vpointer[10] = (float)params.size.x + params.offset.x;
-  vpointer[11] = (float)params.size.y + params.offset.y;
+  vpointer[10] = (float)params.size.x + params.full_offset.x;
+  vpointer[11] = (float)params.size.y + params.full_offset.y;
 
   vpointer[12] = 0.0f;
   vpointer[13] = 1.0f;
-  vpointer[14] = params.offset.x;
-  vpointer[15] = (float)params.size.y + params.offset.y;
+  vpointer[14] = params.full_offset.x;
+  vpointer[15] = (float)params.size.y + params.full_offset.y;
 
   glUnmapBuffer(GL_ARRAY_BUFFER);
 }
diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_display_driver.h
index 89420567037..558997c6b4f 100644
--- a/intern/cycles/blender/blender_gpu_display.h
+++ b/intern/cycles/blender/blender_display_driver.h
@@ -22,12 +22,14 @@
 
 #include "RNA_blender_cpp.h"
 
-#include "render/gpu_display.h"
+#include "render/display_driver.h"
+
+#include "util/util_thread.h"
 #include "util/util_unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* Base class of shader used for GPU display rendering. */
+/* Base class of shader used for display driver rendering. */
 class BlenderDisplayShader {
  public:
   static constexpr const char *position_attribute_name = "pos";
@@ -96,11 +98,11 @@ class BlenderDisplaySpaceShader : public BlenderDisplayShader {
   uint shader_program_ = 0;
 };
 
-/* GPU display implementation which is specific for Blender viewport integration. */
-class BlenderGPUDisplay : public GPUDisplay {
+/* Display driver implementation which is specific for Blender viewport integration. */
+class BlenderDisplayDriver : public DisplayDriver {
  public:
-  BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene);
-  ~BlenderGPUDisplay();
+  BlenderDisplayDriver(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+  ~BlenderDisplayDriver();
 
   virtual void graphics_interop_activate() override;
   virtual void graphics_interop_deactivate() override;
@@ -110,22 +112,15 @@ class BlenderGPUDisplay : public GPUDisplay {
   void set_zoom(float zoom_x, float zoom_y);
 
  protected:
-  virtual bool do_update_begin(const GPUDisplayParams &params,
-                               int texture_width,
-                               int texture_height) override;
-  virtual void do_update_end() override;
+  virtual bool update_begin(const Params &params, int texture_width, int texture_height) override;
+  virtual void update_end() override;
 
-  virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
-                                         int texture_x,
-                                         int texture_y,
-                                         int pixels_width,
-                                         int pixels_height) override;
-  virtual void do_draw(const GPUDisplayParams &params) override;
+  virtual half4 *map_texture_buffer() override;
+  virtual void unmap_texture_buffer() override;
 
-  virtual half4 *do_map_texture_buffer() override;
-  virtual void do_unmap_texture_buffer() override;
+  virtual GraphicsInterop graphics_interop_get() override;
 
-  virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override;
+  virtual void draw(const Params &params) override;
 
   /* Helper function which allocates new GPU context. */
   void gl_context_create();
@@ -152,13 +147,13 @@ class BlenderGPUDisplay : public GPUDisplay {
    * This buffer is used to render texture in the viewport.
    *
    * NOTE: The buffer needs to be bound. */
-  void vertex_buffer_update(const GPUDisplayParams &params);
+  void vertex_buffer_update(const Params &params);
 
   BL::RenderEngine b_engine_;
 
   /* OpenGL context which is used the render engine doesn't have its own. */
   void *gl_context_ = nullptr;
-  /* The when Blender RenderEngine side context is not available and the GPUDisplay is to create
+  /* Set when Blender RenderEngine side context is not available and the DisplayDriver is to create
    * its own context. */
   bool use_gl_context_ = false;
   /* Mutex used to guard the `gl_context_`. */
diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp
index fca8cb9eda3..7b49bb7fbb7 100644
--- a/intern/cycles/blender/blender_geometry.cpp
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -80,8 +80,10 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
 {
   /* Test if we can instance or if the object is modified. */
   Geometry::Type geom_type = determine_geom_type(b_ob_info, use_particle_hair);
-  BL::ID b_key_id = (BKE_object_is_modified(b_ob_info.real_object)) ? b_ob_info.real_object :
-                                                                      b_ob_info.object_data;
+  BL::ID b_key_id = (b_ob_info.is_real_object_data() &&
+                     BKE_object_is_modified(b_ob_info.real_object)) ?
+                        b_ob_info.real_object :
+                        b_ob_info.object_data;
   GeometryKey key(b_key_id.ptr.data, geom_type);
 
   /* Find shader indices. */
diff --git a/intern/cycles/blender/blender_output_driver.cpp b/intern/cycles/blender/blender_output_driver.cpp
new file mode 100644
index 00000000000..f380b7b3bb1
--- /dev/null
+++ b/intern/cycles/blender/blender_output_driver.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "blender/blender_output_driver.h"
+
+CCL_NAMESPACE_BEGIN
+
+BlenderOutputDriver::BlenderOutputDriver(BL::RenderEngine &b_engine) : b_engine_(b_engine)
+{
+}
+
+BlenderOutputDriver::~BlenderOutputDriver()
+{
+}
+
+bool BlenderOutputDriver::read_render_tile(const Tile &tile)
+{
+  /* Get render result. */
+  BL::RenderResult b_rr = b_engine_.begin_result(tile.offset.x,
+                                                 tile.offset.y,
+                                                 tile.size.x,
+                                                 tile.size.y,
+                                                 tile.layer.c_str(),
+                                                 tile.view.c_str());
+
+  /* Can happen if the intersected rectangle gives 0 width or height. */
+  if (b_rr.ptr.data == NULL) {
+    return false;
+  }
+
+  BL::RenderResult::layers_iterator b_single_rlay;
+  b_rr.layers.begin(b_single_rlay);
+
+  /* Layer will be missing if it was disabled in the UI. */
+  if (b_single_rlay == b_rr.layers.end()) {
+    return false;
+  }
+
+  BL::RenderLayer b_rlay = *b_single_rlay;
+
+  vector<float> pixels(tile.size.x * tile.size.y * 4);
+
+  /* Copy each pass.
+   * TODO: copy only the required ones for better performance? */
+  for (BL::RenderPass &b_pass : b_rlay.passes) {
+    tile.set_pass_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
+  }
+
+  b_engine_.end_result(b_rr, false, false, false);
+
+  return true;
+}
+
+bool BlenderOutputDriver::update_render_tile(const Tile &tile)
+{
+  /* Use final write for preview renders, otherwise render result wouldn't be updated
+   * quickly on Blender side. For all other cases we use the display driver. */
+  if (b_engine_.is_preview()) {
+    write_render_tile(tile);
+    return true;
+  }
+  else {
+    /* Don't highlight full-frame tile. */
+    if (!(tile.size == tile.full_size)) {
+      b_engine_.tile_highlight_clear_all();
+      b_engine_.tile_highlight_set(tile.offset.x, tile.offset.y, tile.size.x, tile.size.y, true);
+    }
+
+    return false;
+  }
+}
+
+void BlenderOutputDriver::write_render_tile(const Tile &tile)
+{
+  b_engine_.tile_highlight_clear_all();
+
+  /* Get render result. */
+  BL::RenderResult b_rr = b_engine_.begin_result(tile.offset.x,
+                                                 tile.offset.y,
+                                                 tile.size.x,
+                                                 tile.size.y,
+                                                 tile.layer.c_str(),
+                                                 tile.view.c_str());
+
+  /* Can happen if the intersected rectangle gives 0 width or height. */
+  if (b_rr.ptr.data == NULL) {
+    return;
+  }
+
+  BL::RenderResult::layers_iterator b_single_rlay;
+  b_rr.layers.begin(b_single_rlay);
+
+  /* Layer will be missing if it was disabled in the UI. */
+  if (b_single_rlay == b_rr.layers.end()) {
+    return;
+  }
+
+  BL::RenderLayer b_rlay = *b_single_rlay;
+
+  vector<float> pixels(tile.size.x * tile.size.y * 4);
+
+  /* Copy each pass. */
+  for (BL::RenderPass &b_pass : b_rlay.passes) {
+    if (!tile.get_pass_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) {
+      memset(&pixels[0], 0, pixels.size() * sizeof(float));
+    }
+
+    b_pass.rect(&pixels[0]);
+  }
+
+  b_engine_.end_result(b_rr, true, false, true);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_output_driver.h b/intern/cycles/blender/blender_output_driver.h
new file mode 100644
index 00000000000..8a1cf92d7c7
--- /dev/null
+++ b/intern/cycles/blender/blender_output_driver.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "MEM_guardedalloc.h"
+
+#include "RNA_blender_cpp.h"
+
+#include "render/output_driver.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BlenderOutputDriver : public OutputDriver {
+ public:
+  BlenderOutputDriver(BL::RenderEngine &b_engine);
+  ~BlenderOutputDriver();
+
+  virtual void write_render_tile(const Tile &tile) override;
+  virtual bool update_render_tile(const Tile &tile) override;
+  virtual bool read_render_tile(const Tile &tile) override;
+
+ protected:
+  BL::RenderEngine b_engine_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 694d8454422..d681517c9e1 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
   vector<DeviceType> device_types = Device::available_types();
-  bool has_cuda = false, has_optix = false;
+  bool has_cuda = false, has_optix = false, has_hip = false;
   foreach (DeviceType device_type, device_types) {
     has_cuda |= (device_type == DEVICE_CUDA);
     has_optix |= (device_type == DEVICE_OPTIX);
+    has_hip |= (device_type == DEVICE_HIP);
   }
-  PyObject *list = PyTuple_New(2);
+  PyObject *list = PyTuple_New(3);
   PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
   PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
+  PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
   return list;
 }
 
@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
   else if (override == "OPTIX") {
     BlenderSession::device_override = DEVICE_MASK_OPTIX;
   }
+  else if (override == "HIP") {
+    BlenderSession::device_override = DEVICE_MASK_HIP;
+  }
   else {
     printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
     Py_RETURN_FALSE;
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index d65d89a7ddd..3be7ff32bd8 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -42,7 +42,8 @@
 #include "util/util_progress.h"
 #include "util/util_time.h"
 
-#include "blender/blender_gpu_display.h"
+#include "blender/blender_display_driver.h"
+#include "blender/blender_output_driver.h"
 #include "blender/blender_session.h"
 #include "blender/blender_sync.h"
 #include "blender/blender_util.h"
@@ -71,7 +72,8 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
       width(0),
       height(0),
       preview_osl(preview_osl),
-      python_thread_state(NULL)
+      python_thread_state(NULL),
+      use_developer_ui(false)
 {
   /* offline render */
   background = true;
@@ -156,11 +158,13 @@ void BlenderSession::create_session()
       b_v3d, b_rv3d, scene->camera, width, height);
   session->reset(session_params, buffer_params);
 
-  /* Create GPU display. */
+  /* Create display driver.
+   * TODO(sergey): Investigate whether DisplayDriver can be used for the preview as well. */
   if (!b_engine.is_preview() && !headless) {
-    unique_ptr<BlenderGPUDisplay> gpu_display = make_unique<BlenderGPUDisplay>(b_engine, b_scene);
-    gpu_display_ = gpu_display.get();
-    session->set_gpu_display(move(gpu_display));
+    unique_ptr<BlenderDisplayDriver> display_driver = make_unique<BlenderDisplayDriver>(b_engine,
+                                                                                        b_scene);
+    display_driver_ = display_driver.get();
+    session->set_display_driver(move(display_driver));
   }
 
   /* Viewport and preview (as in, material preview) does not do tiled rendering, so can inform
@@ -277,94 +281,6 @@ void BlenderSession::free_session()
   session = nullptr;
 }
 
-void BlenderSession::read_render_tile()
-{
-  const int2 tile_offset = session->get_render_tile_offset();
-  const int2 tile_size = session->get_render_tile_size();
-
-  /* get render result */
-  BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
-                                                tile_offset.y,
-                                                tile_size.x,
-                                                tile_size.y,
-                                                b_rlay_name.c_str(),
-                                                b_rview_name.c_str());
-
-  /* can happen if the intersected rectangle gives 0 width or height */
-  if (b_rr.ptr.data == NULL) {
-    return;
-  }
-
-  BL::RenderResult::layers_iterator b_single_rlay;
-  b_rr.layers.begin(b_single_rlay);
-
-  /* layer will be missing if it was disabled in the UI */
-  if (b_single_rlay == b_rr.layers.end())
-    return;
-
-  BL::RenderLayer b_rlay = *b_single_rlay;
-
-  vector<float> pixels(tile_size.x * tile_size.y * 4);
-
-  /* Copy each pass.
-   * TODO:copy only the required ones for better performance? */
-  for (BL::RenderPass &b_pass : b_rlay.passes) {
-    session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
-  }
-}
-
-void BlenderSession::write_render_tile()
-{
-  const int2 tile_offset = session->get_render_tile_offset();
-  const int2 tile_size = session->get_render_tile_size();
-
-  const string_view render_layer_name = session->get_render_tile_layer();
-  const string_view render_view_name = session->get_render_tile_view();
-
-  b_engine.tile_highlight_clear_all();
-
-  /* get render result */
-  BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
-                                                tile_offset.y,
-                                                tile_size.x,
-                                                tile_size.y,
-                                                render_layer_name.c_str(),
-                                                render_view_name.c_str());
-
-  /* can happen if the intersected rectangle gives 0 width or height */
-  if (b_rr.ptr.data == NULL) {
-    return;
-  }
-
-  BL::RenderResult::layers_iterator b_single_rlay;
-  b_rr.layers.begin(b_single_rlay);
-
-  /* layer will be missing if it was disabled in the UI */
-  if (b_single_rlay == b_rr.layers.end()) {
-    return;
-  }
-
-  BL::RenderLayer b_rlay = *b_single_rlay;
-
-  write_render_result(b_rlay);
-
-  b_engine.end_result(b_rr, true, false, true);
-}
-
-void BlenderSession::update_render_tile()
-{
-  if (!session->has_multiple_render_tiles()) {
-    /* Don't highlight full-frame tile. */
-    return;
-  }
-
-  const int2 tile_offset = session->get_render_tile_offset();
-  const int2 tile_size = session->get_render_tile_size();
-
-  b_engine.tile_highlight_clear_all();
-  b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true);
-}
-
 void BlenderSession::full_buffer_written(string_view filename)
 {
   full_buffer_files_.emplace_back(filename);
@@ -438,18 +354,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
     return;
   }
 
-  /* set callback to write out render results */
-  session->write_render_tile_cb = [&]() { write_render_tile(); };
-
-  /* Use final write for preview renders, otherwise render result wouldn't be be updated on Blender
-   * side. */
-  /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. */
-  if (b_engine.is_preview()) {
-    session->update_render_tile_cb = [&]() { write_render_tile(); };
-  }
-  else {
-    session->update_render_tile_cb = [&]() { update_render_tile(); };
-  }
+  /* Create driver to write out render results. */
+  session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine));
 
   session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); };
 
@@ -557,6 +463,11 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
   /* free result without merging */
   b_engine.end_result(b_rr, true, false, false);
 
+  /* When tiled rendering is used there will be no "write" done for the tile. Forcefully clear
+   * highlighted tiles now, so that the highlight will be removed while processing full frame from
+   * file. */
+  b_engine.tile_highlight_clear_all();
+
   double total_time, render_time;
   session->progress.get_time(total_time, render_time);
   VLOG(1) << "Total render time: " << total_time;
@@ -581,12 +492,17 @@ void BlenderSession::render_frame_finish()
 
   for (string_view filename : full_buffer_files_) {
     session->process_full_buffer_from_disk(filename);
+    if (check_and_report_session_error()) {
+      break;
+    }
+  }
+
+  for (string_view filename : full_buffer_files_) {
     path_remove(filename);
   }
 
-  /* clear callback */
-  session->write_render_tile_cb = function_null;
-  session->update_render_tile_cb = function_null;
+  /* Clear driver. */
+  session->set_output_driver(nullptr);
   session->full_buffer_written_cb = function_null;
 }
 
@@ -692,9 +608,8 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
   pass->set_type(bake_type_to_pass(bake_type, bake_filter));
   pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));
 
-  session->read_render_tile_cb = [&]() { read_render_tile(); };
-  session->write_render_tile_cb = [&]() { write_render_tile(); };
-  session->set_gpu_display(nullptr);
+  session->set_display_driver(nullptr);
+  session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine));
 
   if (!session->progress.get_cancel()) {
     /* Sync scene. */
@@ -737,43 +652,7 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
     session->wait();
   }
 
-  session->read_render_tile_cb = function_null;
-  session->write_render_tile_cb = function_null;
-}
-
-void BlenderSession::write_render_result(BL::RenderLayer &b_rlay)
-{
-  if (!session->copy_render_tile_from_device()) {
-    return;
-  }
-
-  const int2 tile_size = session->get_render_tile_size();
-  vector<float> pixels(tile_size.x * tile_size.y * 4);
-
-  /* Copy each pass. */
-  for (BL::RenderPass &b_pass : b_rlay.passes) {
-    if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) {
-      memset(&pixels[0], 0, pixels.size() * sizeof(float));
-    }
-
-    b_pass.rect(&pixels[0]);
-  }
-}
-
-void BlenderSession::update_render_result(BL::RenderLayer &b_rlay)
-{
-  if (!session->copy_render_tile_from_device()) {
-    return;
-  }
-
-  const int2 tile_size = session->get_render_tile_size();
-  vector<float> pixels(tile_size.x * tile_size.y * 4);
-
-  /* Copy combined pass. */
-  BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
-  if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) {
-    b_combined_pass.rect(&pixels[0]);
-  }
+  session->set_output_driver(nullptr);
 }
 
 void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
@@ -881,7 +760,7 @@ void BlenderSession::draw(BL::SpaceImageEditor &space_image)
   }
 
   BL::Array<float, 2> zoom = space_image.zoom();
-  gpu_display_->set_zoom(zoom[0], zoom[1]);
+  display_driver_->set_zoom(zoom[0], zoom[1]);
 
   session->draw();
 }
@@ -988,8 +867,9 @@ void BlenderSession::update_status_progress()
   get_status(status, substatus);
   get_progress(progress, total_time, render_time);
 
-  if (progress > 0)
-    remaining_time = (1.0 - (double)progress) * (render_time / (double)progress);
+  if (progress > 0) {
+    remaining_time = session->get_estimated_remaining_time();
+  }
 
   if (background) {
     if (scene)
@@ -1027,20 +907,27 @@ void BlenderSession::update_status_progress()
     last_progress = progress;
   }
 
-  if (session->progress.get_error()) {
-    string error = session->progress.get_error_message();
-    if (error != last_error) {
-      /* TODO(sergey): Currently C++ RNA API doesn't let us to
-       * use mnemonic name for the variable. Would be nice to
-       * have this figured out.
-       *
-       * For until then, 1 << 5 means RPT_ERROR.
-       */
-      b_engine.report(1 << 5, error.c_str());
-      b_engine.error_set(error.c_str());
-      last_error = error;
-    }
+  check_and_report_session_error();
+}
+
+bool BlenderSession::check_and_report_session_error()
+{
+  if (!session->progress.get_error()) {
+    return false;
   }
+
+  const string error = session->progress.get_error_message();
+  if (error != last_error) {
+    /* TODO(sergey): Currently the C++ RNA API doesn't let us use a mnemonic name for the variable.
+     * Would be nice to have this figured out.
+     *
+     * Until then, 1 << 5 means RPT_ERROR. */
+    b_engine.report(1 << 5, error.c_str());
+    b_engine.error_set(error.c_str());
+    last_error = error;
+  }
+
+  return true;
 }
 
 void BlenderSession::tag_update()
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 11e2657a325..fef6ad1adfc 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -29,7 +29,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-class BlenderGPUDisplay;
+class BlenderDisplayDriver;
 class BlenderSync;
 class ImageMetaData;
 class Scene;
@@ -70,20 +70,7 @@ class BlenderSession {
             const int bake_width,
             const int bake_height);
 
-  void write_render_result(BL::RenderLayer &b_rlay);
-  void write_render_tile();
-
-  void update_render_tile();
-
   void full_buffer_written(string_view filename);
-
-  /* update functions are used to update display buffer only after sample was rendered
-   * only needed for better visual feedback */
-  void update_render_result(BL::RenderLayer &b_rlay);
-
-  /* read functions for baking input */
-  void read_render_tile();
-
   /* interactive updates */
   void synchronize(BL::Depsgraph &b_depsgraph);
 
@@ -110,8 +97,7 @@ class BlenderSession {
   BL::RenderSettings b_render;
   BL::Depsgraph b_depsgraph;
   /* NOTE: Blender's scene might become invalid after call
-   * free_blender_memory_if_possible().
-   */
+   * #free_blender_memory_if_possible(). */
   BL::Scene b_scene;
   BL::SpaceView3D b_v3d;
   BL::RegionView3D b_rv3d;
@@ -147,6 +133,11 @@ class BlenderSession {
  protected:
   void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);
 
+  /* Check whether session error happened.
+   * If so, it is reported to the render engine and true is returned.
+   * Otherwise false is returned. */
+  bool check_and_report_session_error();
+
   void builtin_images_load();
 
   /* Is used after each render layer synchronization is done with the goal
@@ -160,8 +151,8 @@ class BlenderSession {
     int last_pass_index = -1;
   } draw_state_;
 
-  /* NOTE: The BlenderSession references the GPU display. */
-  BlenderGPUDisplay *gpu_display_ = nullptr;
+  /* NOTE: The BlenderSession references the display driver. */
+  BlenderDisplayDriver *display_driver_ = nullptr;
 
   vector<string> full_buffer_files_;
 };
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 8c4f789ffd0..0b8aea15d6c 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -279,7 +279,7 @@ static ShaderNode *add_node(Scene *scene,
     array<float3> curve_mapping_curves;
     float min_x, max_x;
     curvemapping_color_to_array(mapping, curve_mapping_curves, RAMP_TABLE_SIZE, true);
-    curvemapping_minmax(mapping, true, &min_x, &max_x);
+    curvemapping_minmax(mapping, 4, &min_x, &max_x);
     curves->set_min_x(min_x);
     curves->set_max_x(max_x);
     curves->set_curves(curve_mapping_curves);
@@ -292,12 +292,25 @@ static ShaderNode *add_node(Scene *scene,
     array<float3> curve_mapping_curves;
     float min_x, max_x;
     curvemapping_color_to_array(mapping, curve_mapping_curves, RAMP_TABLE_SIZE, false);
-    curvemapping_minmax(mapping, false, &min_x, &max_x);
+    curvemapping_minmax(mapping, 3, &min_x, &max_x);
     curves->set_min_x(min_x);
     curves->set_max_x(max_x);
     curves->set_curves(curve_mapping_curves);
     node = curves;
   }
+  else if (b_node.is_a(&RNA_ShaderNodeFloatCurve)) {
+    BL::ShaderNodeFloatCurve b_curve_node(b_node);
+    BL::CurveMapping mapping(b_curve_node.mapping());
+    FloatCurveNode *curve = graph->create_node<FloatCurveNode>();
+    array<float> curve_mapping_curve;
+    float min_x, max_x;
+    curvemapping_float_to_array(mapping, curve_mapping_curve, RAMP_TABLE_SIZE);
+    curvemapping_minmax(mapping, 1, &min_x, &max_x);
+    curve->set_min_x(min_x);
+    curve->set_max_x(max_x);
+    curve->set_curve(curve_mapping_curve);
+    node = curve;
+  }
   else if (b_node.is_a(&RNA_ShaderNodeValToRGB)) {
     RGBRampNode *ramp = graph->create_node<RGBRampNode>();
     BL::ShaderNodeValToRGB b_ramp_node(b_node);
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 04008d77d89..77b2bd5ac4f 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -90,26 +90,27 @@ static inline BL::Mesh object_to_mesh(BL::BlendData & /*data*/,
   }
 #endif
 
-  BL::Mesh mesh(PointerRNA_NULL);
-  if (b_ob_info.object_data.is_a(&RNA_Mesh)) {
-    /* TODO: calc_undeformed is not used. */
-    mesh = BL::Mesh(b_ob_info.object_data);
-
-    /* Make a copy to split faces if we use autosmooth, otherwise not needed.
-     * Also in edit mode do we need to make a copy, to ensure data layers like
-     * UV are not empty. */
-    if (mesh.is_editmode() ||
-        (mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) {
+  BL::Mesh mesh = (b_ob_info.object_data.is_a(&RNA_Mesh)) ? BL::Mesh(b_ob_info.object_data) :
+                                                            BL::Mesh(PointerRNA_NULL);
+
+  if (b_ob_info.is_real_object_data()) {
+    if (mesh) {
+      /* Make a copy to split faces if we use autosmooth, otherwise not needed.
+       * Also in edit mode do we need to make a copy, to ensure data layers like
+       * UV are not empty. */
+      if (mesh.is_editmode() ||
+          (mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) {
+        BL::Depsgraph depsgraph(PointerRNA_NULL);
+        mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
+      }
+    }
+    else {
       BL::Depsgraph depsgraph(PointerRNA_NULL);
-      assert(b_ob_info.is_real_object_data());
       mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
     }
   }
   else {
-    BL::Depsgraph depsgraph(PointerRNA_NULL);
-    if (b_ob_info.is_real_object_data()) {
-      mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
-    }
+    /* TODO: what to do about non-mesh geometry instances? */
   }
 
 #if 0
@@ -170,12 +171,11 @@ static inline void curvemap_minmax_curve(/*const*/ BL::CurveMap &curve, float *m
 }
 
 static inline void curvemapping_minmax(/*const*/ BL::CurveMapping &cumap,
-                                       bool rgb_curve,
+                                       int num_curves,
                                        float *min_x,
                                        float *max_x)
 {
   // const int num_curves = cumap.curves.length(); /* Gives linking error so far. */
-  const int num_curves = rgb_curve ? 4 : 3;
   *min_x = FLT_MAX;
   *max_x = -FLT_MAX;
   for (int i = 0; i < num_curves; ++i) {
@@ -195,6 +195,28 @@ static inline void curvemapping_to_array(BL::CurveMapping &cumap, array<float> &
   }
 }
 
+static inline void curvemapping_float_to_array(BL::CurveMapping &cumap,
+                                               array<float> &data,
+                                               int size)
+{
+  float min = 0.0f, max = 1.0f;
+
+  curvemapping_minmax(cumap, 1, &min, &max);
+
+  const float range = max - min;
+
+  cumap.update();
+
+  BL::CurveMap map = cumap.curves[0];
+
+  data.resize(size);
+
+  for (int i = 0; i < size; i++) {
+    float t = min + (float)i / (float)(size - 1) * range;
+    data[i] = cumap.evaluate(map, t);
+  }
+}
+
 static inline void curvemapping_color_to_array(BL::CurveMapping &cumap,
                                                array<float3> &data,
                                                int size,
@@ -213,7 +235,8 @@ static inline void curvemapping_color_to_array(BL::CurveMapping &cumap,
    *
    * There might be some better estimations here tho.
    */
-  curvemapping_minmax(cumap, rgb_curve, &min_x, &max_x);
+  const int num_curves = rgb_curve ? 4 : 3;
+  curvemapping_minmax(cumap, num_curves, &min_x, &max_x);
 
   const float range_x = max_x - min_x;
 
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 96852510b63..20430cb164c 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -213,7 +213,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
       if (ctx->num_hits < ctx->max_hits) {
         Intersection current_isect;
         kernel_embree_convert_hit(kg, ray, hit, &current_isect);
-        for (size_t i = 0; i < ctx->max_hits; ++i) {
+        for (size_t i = 0; i < ctx->num_hits; ++i) {
           if (current_isect.object == ctx->isect_s[i].object &&
               current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) {
             /* This intersection was already recorded, skip it. */
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index da259171844..b966edd4298 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -532,4 +532,13 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)
   endif()
 endif()
 
+
+###########################################################################
+# HIP
+###########################################################################
+
+if(NOT WITH_HIP_DYNLOAD)  # Net effect: WITH_HIP_DYNLOAD is always truthy after this block.
+  set(WITH_HIP_DYNLOAD ON)
+endif()
+
 unset(_cycles_lib_dir)
diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake
index 47196dfd1ce..a470fb9c574 100644
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -156,10 +156,16 @@ macro(cycles_target_link_libraries target)
     ${PLATFORM_LINKLIBS}
   )
 
-  if(WITH_CUDA_DYNLOAD)
-    target_link_libraries(${target} extern_cuew)
-  else()
-    target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
+  if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
+    if(WITH_CUDA_DYNLOAD)
+      target_link_libraries(${target} extern_cuew)
+    else()
+      target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
+    endif()
+  endif()
+
+  if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
+    target_link_libraries(${target} extern_hipew)
   endif()
 
   if(CYCLES_STANDALONE_REPOSITORY)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index d18f4360aef..6d33a6f107f 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -22,16 +22,25 @@ set(INC_SYS
   ../../../extern/clew/include
 )
 
-if(WITH_CUDA_DYNLOAD)
+if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
+  if(WITH_CUDA_DYNLOAD)
+    list(APPEND INC
+      ../../../extern/cuew/include
+    )
+    add_definitions(-DWITH_CUDA_DYNLOAD)
+  else()
+    list(APPEND INC_SYS
+      ${CUDA_TOOLKIT_INCLUDE}
+    )
+    add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
+  endif()
+endif()
+
+if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
   list(APPEND INC
-    ../../../extern/cuew/include
-  )
-  add_definitions(-DWITH_CUDA_DYNLOAD)
-else()
-  list(APPEND INC_SYS
-    ${CUDA_TOOLKIT_INCLUDE}
+    ../../../extern/hipew/include
   )
-  add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
+  add_definitions(-DWITH_HIP_DYNLOAD)
 endif()
 
 set(SRC
@@ -70,6 +79,21 @@ set(SRC_CUDA
   cuda/util.h
 )
 
+set(SRC_HIP  # HIP device sources; passed to cycles_add_library(cycles_device ...) below.
+  hip/device.cpp
+  hip/device.h
+  hip/device_impl.cpp
+  hip/device_impl.h
+  hip/graphics_interop.cpp
+  hip/graphics_interop.h
+  hip/kernel.cpp
+  hip/kernel.h
+  hip/queue.cpp
+  hip/queue.h
+  hip/util.cpp
+  hip/util.h
+)
+
 set(SRC_DUMMY
   dummy/device.cpp
   dummy/device.h
@@ -105,13 +129,21 @@ set(LIB
   ${CYCLES_GL_LIBRARIES}
 )
 
-if(WITH_CUDA_DYNLOAD)
-  list(APPEND LIB
-    extern_cuew
-  )
-else()
+if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
+  if(WITH_CUDA_DYNLOAD)
+    list(APPEND LIB
+      extern_cuew
+    )
+  else()
+    list(APPEND LIB
+      ${CUDA_CUDA_LIBRARY}
+    )
+  endif()
+endif()
+
+if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
   list(APPEND LIB
-    ${CUDA_CUDA_LIBRARY}
+    extern_hipew
   )
 endif()
 
@@ -120,6 +152,9 @@ add_definitions(${GL_DEFINITIONS})
 if(WITH_CYCLES_DEVICE_CUDA)
   add_definitions(-DWITH_CUDA)
 endif()
+if(WITH_CYCLES_DEVICE_HIP)
+  add_definitions(-DWITH_HIP)  # Enables the `#ifdef WITH_HIP` sections in device.cpp.
+endif()
 if(WITH_CYCLES_DEVICE_OPTIX)
   add_definitions(-DWITH_OPTIX)
 endif()
@@ -140,6 +175,7 @@ cycles_add_library(cycles_device "${LIB}"
   ${SRC}
   ${SRC_CPU}
   ${SRC_CUDA}
+  ${SRC_HIP}
   ${SRC_DUMMY}
   ${SRC_MULTI}
   ${SRC_OPTIX}
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index 3b0db6bdd0e..d02c18daee9 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -54,7 +54,6 @@
 #include "util/util_function.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
-#include "util/util_opengl.h"
 #include "util/util_openimagedenoise.h"
 #include "util/util_optimization.h"
 #include "util/util_progress.h"
@@ -170,7 +169,7 @@ void CPUDevice::mem_copy_to(device_memory &mem)
 }
 
 void CPUDevice::mem_copy_from(
-    device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+    device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/)
 {
   /* no-op */
 }
@@ -204,7 +203,7 @@ void CPUDevice::mem_free(device_memory &mem)
   }
 }
 
-device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
 {
   return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
 }
@@ -298,154 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
     Device::build_bvh(bvh, progress, refit);
 }
 
-#if 0
-void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
-{
-  const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
-  scoped_timer timer(&tile.buffers->render_time);
-
-  Coverage coverage(kg, tile);
-  if (use_coverage) {
-    coverage.init_path_trace();
-  }
-
-  float *render_buffer = (float *)tile.buffer;
-  int start_sample = tile.start_sample;
-  int end_sample = tile.start_sample + tile.num_samples;
-
-  /* Needed for Embree. */
-  SIMD_SET_FLUSH_TO_ZERO;
-
-  for (int sample = start_sample; sample < end_sample; sample++) {
-    if (task.get_cancel() || TaskPool::canceled()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-
-    if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
-      tile.stealing_state = RenderTile::WAS_STOLEN;
-      break;
-    }
-
-    if (tile.task == RenderTile::PATH_TRACE) {
-      for (int y = tile.y; y < tile.y + tile.h; y++) {
-        for (int x = tile.x; x < tile.x + tile.w; x++) {
-          if (use_coverage) {
-            coverage.init_pixel(x, y);
-          }
-          kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
-        }
-      }
-    }
-    else {
-      for (int y = tile.y; y < tile.y + tile.h; y++) {
-        for (int x = tile.x; x < tile.x + tile.w; x++) {
-          kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
-        }
-      }
-    }
-    tile.sample = sample + 1;
-
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
-      const bool stop = adaptive_sampling_filter(kg, tile, sample);
-      if (stop) {
-        const int num_progress_samples = end_sample - sample;
-        tile.sample = end_sample;
-        task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
-        break;
-      }
-    }
-
-    task.update_progress(&tile, tile.w * tile.h);
-  }
-  if (use_coverage) {
-    coverage.finalize();
-  }
-
-  if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
-    adaptive_sampling_post(tile, kg);
-  }
-}
-
-void CPUDevice::thread_render(DeviceTask &task)
-{
-  if (TaskPool::canceled()) {
-    if (task.need_finish_queue == false)
-      return;
-  }
-
-  /* allocate buffer for kernel globals */
-  CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
-
-  profiler.add_state(&kg.profiler);
-
-  /* NLM denoiser. */
-  DenoisingTask *denoising = NULL;
-
-  /* OpenImageDenoise: we can only denoise with one thread at a time, so to
-   * avoid waiting with mutex locks in the denoiser, we let only a single
-   * thread acquire denoising tiles. */
-  uint tile_types = task.tile_types;
-  bool hold_denoise_lock = false;
-  if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
-    if (!oidn_task_lock.try_lock()) {
-      tile_types &= ~RenderTile::DENOISE;
-      hold_denoise_lock = true;
-    }
-  }
-
-  RenderTile tile;
-  while (task.acquire_tile(this, tile, tile_types)) {
-    if (tile.task == RenderTile::PATH_TRACE) {
-      render(task, tile, &kg);
-    }
-    else if (tile.task == RenderTile::BAKE) {
-      render(task, tile, &kg);
-    }
-    else if (tile.task == RenderTile::DENOISE) {
-      denoise_openimagedenoise(task, tile);
-      task.update_progress(&tile, tile.w * tile.h);
-    }
-
-    task.release_tile(tile);
-
-    if (TaskPool::canceled()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  if (hold_denoise_lock) {
-    oidn_task_lock.unlock();
-  }
-
-  profiler.remove_state(&kg.profiler);
-
-  delete denoising;
-}
-
-void CPUDevice::thread_denoise(DeviceTask &task)
-{
-  RenderTile tile;
-  tile.x = task.x;
-  tile.y = task.y;
-  tile.w = task.w;
-  tile.h = task.h;
-  tile.buffer = task.buffer;
-  tile.sample = task.sample + task.num_samples;
-  tile.num_samples = task.num_samples;
-  tile.start_sample = task.sample;
-  tile.offset = task.offset;
-  tile.stride = task.stride;
-  tile.buffers = task.buffers;
-
-  denoise_openimagedenoise(task, tile);
-
-  task.update_progress(&tile, tile.w * tile.h);
-}
-#endif
-
 const CPUKernels *CPUDevice::get_cpu_kernels() const
 {
   return &kernels;
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index 7d222808652..371d2258104 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -72,10 +72,13 @@ class CPUDevice : public Device {
 
   virtual void mem_alloc(device_memory &mem) override;
   virtual void mem_copy_to(device_memory &mem) override;
-  virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+  virtual void mem_copy_from(
+      device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
   virtual void mem_zero(device_memory &mem) override;
   virtual void mem_free(device_memory &mem) override;
-  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem,
+                                       size_t offset,
+                                       size_t /*size*/) override;
 
   virtual void const_copy_to(const char *name, void *host, size_t size) override;
 
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 37fab8f8293..5e1a63c04df 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -31,7 +31,6 @@
 #  include "util/util_logging.h"
 #  include "util/util_map.h"
 #  include "util/util_md5.h"
-#  include "util/util_opengl.h"
 #  include "util/util_path.h"
 #  include "util/util_string.h"
 #  include "util/util_system.h"
@@ -837,7 +836,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
   }
 }
 
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
 {
   if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
     assert(!"mem_copy_from not supported for textures.");
@@ -891,7 +890,7 @@ void CUDADevice::mem_free(device_memory &mem)
   }
 }
 
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
 {
   return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
 }
@@ -1169,141 +1168,6 @@ void CUDADevice::tex_free(device_texture &mem)
   }
 }
 
-#  if 0
-void CUDADevice::render(DeviceTask &task,
-                        RenderTile &rtile,
-                        device_vector<KernelWorkTile> &work_tiles)
-{
-  scoped_timer timer(&rtile.buffers->render_time);
-
-  if (have_error())
-    return;
-
-  CUDAContextScope scope(this);
-  CUfunction cuRender;
-
-  /* Get kernel function. */
-  if (rtile.task == RenderTile::BAKE) {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
-  }
-  else {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
-  }
-
-  if (have_error()) {
-    return;
-  }
-
-  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
-  /* Allocate work tile. */
-  work_tiles.alloc(1);
-
-  KernelWorkTile *wtile = work_tiles.data();
-  wtile->x = rtile.x;
-  wtile->y = rtile.y;
-  wtile->w = rtile.w;
-  wtile->h = rtile.h;
-  wtile->offset = rtile.offset;
-  wtile->stride = rtile.stride;
-  wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
-  /* Prepare work size. More step samples render faster, but for now we
-   * remain conservative for GPUs connected to a display to avoid driver
-   * timeouts and display freezing. */
-  int min_blocks, num_threads_per_block;
-  cuda_assert(
-      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-  if (!info.display_device) {
-    min_blocks *= 8;
-  }
-
-  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-  /* Render all samples. */
-  uint start_sample = rtile.start_sample;
-  uint end_sample = rtile.start_sample + rtile.num_samples;
-
-  for (int sample = start_sample; sample < end_sample;) {
-    /* Setup and copy work tile to device. */
-    wtile->start_sample = sample;
-    wtile->num_samples = step_samples;
-    if (task.adaptive_sampling.use) {
-      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
-    }
-    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
-    work_tiles.copy_to_device();
-
-    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
-    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-    /* Launch kernel. */
-    void *args[] = {&d_work_tiles, &total_work_size};
-
-    cuda_assert(
-        cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
-    uint filter_sample = sample + wtile->num_samples - 1;
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
-      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
-    }
-
-    cuda_assert(cuCtxSynchronize());
-
-    /* Update progress. */
-    sample += wtile->num_samples;
-    rtile.sample = sample;
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
-    if (task.get_cancel()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  /* Finalize adaptive sampling. */
-  if (task.adaptive_sampling.use) {
-    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
-    adaptive_sampling_post(rtile, wtile, d_work_tiles);
-    cuda_assert(cuCtxSynchronize());
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-  }
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
-  CUDAContextScope scope(this);
-
-  if (task.type == DeviceTask::RENDER) {
-    device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-    /* keep rendering tiles until done */
-    RenderTile tile;
-    DenoisingTask denoising(this, task);
-
-    while (task.acquire_tile(this, tile, task.tile_types)) {
-      if (tile.task == RenderTile::PATH_TRACE) {
-        render(task, tile, work_tiles);
-      }
-      else if (tile.task == RenderTile::BAKE) {
-        render(task, tile, work_tiles);
-      }
-
-      task.release_tile(tile);
-
-      if (task.get_cancel()) {
-        if (task.need_finish_queue == false)
-          break;
-      }
-    }
-
-    work_tiles.free();
-  }
-}
-#  endif
-
 unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
 {
   return make_unique<CUDADeviceQueue>(this);
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index 6b27db54ab4..c0316d18ba0 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -26,7 +26,6 @@
 #  ifdef WITH_CUDA_DYNLOAD
 #    include "cuew.h"
 #  else
-#    include "util/util_opengl.h"
 #    include <cuda.h>
 #    include <cudaGL.h>
 #  endif
@@ -120,13 +119,13 @@ class CUDADevice : public Device {
 
   void mem_copy_to(device_memory &mem) override;
 
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
 
   void mem_zero(device_memory &mem) override;
 
   void mem_free(device_memory &mem) override;
 
-  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
 
   virtual void const_copy_to(const char *name, void *host, size_t size) override;
 
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
index e8ca8b90eae..30efefd9b6b 100644
--- a/intern/cycles/device/cuda/graphics_interop.cpp
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -37,14 +37,15 @@ CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
   }
 }
 
-void CUDADeviceGraphicsInterop::set_destination(
-    const DeviceGraphicsInteropDestination &destination)
+void CUDADeviceGraphicsInterop::set_display_interop(
+    const DisplayDriver::GraphicsInterop &display_interop)
 {
-  const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+  const int64_t new_buffer_area = int64_t(display_interop.buffer_width) *
+                                  display_interop.buffer_height;
 
-  need_clear_ = destination.need_clear;
+  need_clear_ = display_interop.need_clear;
 
-  if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+  if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
     return;
   }
 
@@ -55,12 +56,12 @@ void CUDADeviceGraphicsInterop::set_destination(
   }
 
   const CUresult result = cuGraphicsGLRegisterBuffer(
-      &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+      &cu_graphics_resource_, display_interop.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
   }
 
-  opengl_pbo_id_ = destination.opengl_pbo_id;
+  opengl_pbo_id_ = display_interop.opengl_pbo_id;
   buffer_area_ = new_buffer_area;
 }
 
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
index 8a70c8aa71d..ec480f20c86 100644
--- a/intern/cycles/device/cuda/graphics_interop.h
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -41,7 +41,7 @@ class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
   CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
   CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
 
-  virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+  virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) override;
 
   virtual device_ptr map() override;
   virtual void unmap() override;
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index b7f86c10553..1149a835b14 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
   }
 
   /* Launch kernel. */
-  cuda_device_assert(cuda_device_,
-                     cuLaunchKernel(cuda_kernel.function,
-                                    num_blocks,
-                                    1,
-                                    1,
-                                    num_threads_per_block,
-                                    1,
-                                    1,
-                                    shared_mem_bytes,
-                                    cuda_stream_,
-                                    args,
-                                    0));
+  assert_success(cuLaunchKernel(cuda_kernel.function,
+                                num_blocks,
+                                1,
+                                1,
+                                num_threads_per_block,
+                                1,
+                                1,
+                                shared_mem_bytes,
+                                cuda_stream_,
+                                args,
+                                0),
+                 "enqueue");
 
   return !(cuda_device_->have_error());
 }
@@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
   }
 
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
+
   debug_synchronize();
 
   return !(cuda_device_->have_error());
@@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
   assert(mem.device_pointer != 0);
 
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
-      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+  assert_success(
+      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
+      "zero_to_device");
 }
 
 void CUDADeviceQueue::copy_to_device(device_memory &mem)
@@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem)
 
   /* Copy memory to device. */
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
       cuMemcpyHtoDAsync(
-          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
+      "copy_to_device");
 }
 
 void CUDADeviceQueue::copy_from_device(device_memory &mem)
@@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem)
 
   /* Copy memory from device. */
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
       cuMemcpyDtoHAsync(
-          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
+      "copy_from_device");
+}
+
+void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
+{
+  if (result != CUDA_SUCCESS) { /* Only failures are reported; CUDA_SUCCESS is a no-op. */
+    const char *name = cuewErrorString(result); /* Error text used as the message prefix. */
+    cuda_device_->set_error(string_printf( /* `operation` labels the failing queue call. */
+        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+  }
+}
 
 unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 62e3aa3d6c2..4d1995ed69e 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
  protected:
   CUDADevice *cuda_device_;
   CUstream cuda_stream_;
+
+  void assert_success(CUresult result, const char *operation);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 6ccedcf54ef..81574e8b184 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -25,6 +25,7 @@
 #include "device/cpu/device.h"
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
+#include "device/hip/device.h"
 #include "device/multi/device.h"
 #include "device/optix/device.h"
 
@@ -32,7 +33,6 @@
 #include "util/util_half.h"
 #include "util/util_logging.h"
 #include "util/util_math.h"
-#include "util/util_opengl.h"
 #include "util/util_string.h"
 #include "util/util_system.h"
 #include "util/util_time.h"
@@ -47,6 +47,7 @@ thread_mutex Device::device_mutex;
 vector<DeviceInfo> Device::cuda_devices;
 vector<DeviceInfo> Device::optix_devices;
 vector<DeviceInfo> Device::cpu_devices;
+vector<DeviceInfo> Device::hip_devices;
 uint Device::devices_initialized_mask = 0;
 
 /* Device */
@@ -97,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
         device = device_optix_create(info, stats, profiler);
       break;
 #endif
+
+#ifdef WITH_HIP
+    case DEVICE_HIP:
+      if (device_hip_init())
+        device = device_hip_create(info, stats, profiler);
+      break;
+#endif
+
     default:
       break;
   }
@@ -118,6 +127,8 @@ DeviceType Device::type_from_string(const char *name)
     return DEVICE_OPTIX;
   else if (strcmp(name, "MULTI") == 0)
     return DEVICE_MULTI;
+  else if (strcmp(name, "HIP") == 0)
+    return DEVICE_HIP;
 
   return DEVICE_NONE;
 }
@@ -132,6 +143,8 @@ string Device::string_from_type(DeviceType type)
     return "OPTIX";
   else if (type == DEVICE_MULTI)
     return "MULTI";
+  else if (type == DEVICE_HIP)
+    return "HIP";
 
   return "";
 }
@@ -146,6 +159,10 @@ vector<DeviceType> Device::available_types()
 #ifdef WITH_OPTIX
   types.push_back(DEVICE_OPTIX);
 #endif
+#ifdef WITH_HIP
+  types.push_back(DEVICE_HIP);
+#endif
+
   return types;
 }
 
@@ -187,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
   }
 #endif
 
+#ifdef WITH_HIP
+  if (mask & DEVICE_MASK_HIP) {
+    if (!(devices_initialized_mask & DEVICE_MASK_HIP)) {
+      if (device_hip_init()) {
+        device_hip_info(hip_devices);
+      }
+      devices_initialized_mask |= DEVICE_MASK_HIP;
+    }
+    foreach (DeviceInfo &info, hip_devices) {
+      devices.push_back(info);
+    }
+  }
+#endif
+
   if (mask & DEVICE_MASK_CPU) {
     if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
       device_cpu_info(cpu_devices);
@@ -227,6 +258,15 @@ string Device::device_capabilities(uint mask)
   }
 #endif
 
+#ifdef WITH_HIP
+  if (mask & DEVICE_MASK_HIP) {
+    if (device_hip_init()) {
+      capabilities += "\nHIP device capabilities:\n";
+      capabilities += device_hip_capabilities();
+    }
+  }
+#endif
+
   return capabilities;
 }
 
@@ -315,6 +355,7 @@ void Device::free_memory()
   devices_initialized_mask = 0;
   cuda_devices.free_memory();
   optix_devices.free_memory();
+  hip_devices.free_memory();
   cpu_devices.free_memory();
 }
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 399d5eb91df..c73d74cdccc 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -51,6 +51,7 @@ enum DeviceType {
   DEVICE_CUDA,
   DEVICE_MULTI,
   DEVICE_OPTIX,
+  DEVICE_HIP,
   DEVICE_DUMMY,
 };
 
@@ -58,6 +59,7 @@ enum DeviceTypeMask {
   DEVICE_MASK_CPU = (1 << DEVICE_CPU),
   DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
   DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
+  DEVICE_MASK_HIP = (1 << DEVICE_HIP),
   DEVICE_MASK_ALL = ~0
 };
 
@@ -119,7 +121,7 @@ class Device {
 
   string error_msg;
 
-  virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
+  virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, size_t /*offset*/, size_t /*size*/)
   {
     /* Only required for devices that implement denoising. */
     assert(false);
@@ -273,7 +275,7 @@ class Device {
 
   virtual void mem_alloc(device_memory &mem) = 0;
   virtual void mem_copy_to(device_memory &mem) = 0;
-  virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0;
+  virtual void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) = 0;
   virtual void mem_zero(device_memory &mem) = 0;
   virtual void mem_free(device_memory &mem) = 0;
 
@@ -284,6 +286,7 @@ class Device {
   static vector<DeviceInfo> cuda_devices;
   static vector<DeviceInfo> optix_devices;
   static vector<DeviceInfo> cpu_devices;
+  static vector<DeviceInfo> hip_devices;
   static uint devices_initialized_mask;
 };
 
diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h
index 671b1c189d7..eaf76077141 100644
--- a/intern/cycles/device/device_graphics_interop.h
+++ b/intern/cycles/device/device_graphics_interop.h
@@ -16,25 +16,12 @@
 
 #pragma once
 
+#include "render/display_driver.h"
+
 #include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* Information about interoperability destination.
- * Is provided by the GPUDisplay. */
-class DeviceGraphicsInteropDestination {
- public:
-  /* Dimensions of the buffer, in pixels. */
-  int buffer_width = 0;
-  int buffer_height = 0;
-
-  /* OpenGL pixel buffer object. */
-  int opengl_pbo_id = 0;
-
-  /* Clear the entire destination before doing partial write to it. */
-  bool need_clear = false;
-};
-
 /* Device-side graphics interoperability support.
  *
  * Takes care of holding all the handlers needed by the device to implement interoperability with
@@ -46,7 +33,7 @@ class DeviceGraphicsInterop {
 
   /* Update this device-side graphics interoperability object with the given destination resource
    * information. */
-  virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0;
+  virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) = 0;
 
   virtual device_ptr map() = 0;
   virtual void unmap() = 0;
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c4d45829b83..c0ab2e17cae 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -136,7 +136,7 @@ void device_memory::device_copy_to()
   }
 }
 
-void device_memory::device_copy_from(int y, int w, int h, int elem)
+void device_memory::device_copy_from(size_t y, size_t w, size_t h, size_t elem)
 {
   assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
   device->mem_copy_from(*this, y, w, h, elem);
@@ -181,7 +181,7 @@ bool device_memory::is_resident(Device *sub_device) const
 
 /* Device Sub Ptr */
 
-device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
+device_sub_ptr::device_sub_ptr(device_memory &mem, size_t offset, size_t size) : device(mem.device)
 {
   ptr = device->mem_alloc_sub_ptr(mem, offset, size);
 }
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index c51594b8580..be6123e09b2 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -81,154 +81,154 @@ static constexpr size_t datatype_size(DataType datatype)
 
 template<typename T> struct device_type_traits {
   static const DataType data_type = TYPE_UNKNOWN;
-  static const int num_elements_cpu = sizeof(T);
-  static const int num_elements_gpu = sizeof(T);
+  static const size_t num_elements_cpu = sizeof(T);
+  static const size_t num_elements_gpu = sizeof(T);
 };
 
 template<> struct device_type_traits<uchar> {
   static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar2> {
   static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
   static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar3> {
   static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 3;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 3;
+  static const size_t num_elements_gpu = 3;
   static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar4> {
   static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
   static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint> {
   static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint2> {
   static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
   static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint3> {
   static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 3;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 3;
+  static const size_t num_elements_gpu = 3;
   static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint4> {
   static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
   static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int> {
   static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int2> {
   static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
   static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int3> {
   static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 3;
   static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int4> {
   static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
   static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float> {
   static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float2> {
   static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
   static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float3> {
   static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 3;
   static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float4> {
   static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
   static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<half> {
   static const DataType data_type = TYPE_HALF;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<ushort4> {
   static const DataType data_type = TYPE_UINT16;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
   static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint16_t> {
   static const DataType data_type = TYPE_UINT16;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<half4> {
   static const DataType data_type = TYPE_HALF;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
   static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint64_t> {
   static const DataType data_type = TYPE_UINT64;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
   static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
 };
 
@@ -277,6 +277,7 @@ class device_memory {
  protected:
   friend class CUDADevice;
   friend class OptiXDevice;
+  friend class HIPDevice;
 
   /* Only create through subclasses. */
   device_memory(Device *device, const char *name, MemoryType type);
@@ -296,7 +297,7 @@ class device_memory {
   void device_alloc();
   void device_free();
   void device_copy_to();
-  void device_copy_from(int y, int w, int h, int elem);
+  void device_copy_from(size_t y, size_t w, size_t h, size_t elem);
   void device_zero();
 
   bool device_is_cpu();
@@ -565,7 +566,7 @@ template<typename T> class device_vector : public device_memory {
     device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
   }
 
-  void copy_from_device(int y, int w, int h)
+  void copy_from_device(size_t y, size_t w, size_t h)
   {
     device_copy_from(y, w, h, sizeof(T));
   }
@@ -601,7 +602,7 @@ template<typename T> class device_vector : public device_memory {
 
 class device_sub_ptr {
  public:
-  device_sub_ptr(device_memory &mem, int offset, int size);
+  device_sub_ptr(device_memory &mem, size_t offset, size_t size);
   ~device_sub_ptr();
 
   device_ptr operator*() const
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
index a89ba68d62c..f2b2f3496e0 100644
--- a/intern/cycles/device/device_queue.cpp
+++ b/intern/cycles/device/device_queue.cpp
@@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution()
 {
   if (VLOG_IS_ON(3)) {
     last_sync_time_ = time_dt();
-    last_kernels_enqueued_ = 0;
   }
+
+  last_kernels_enqueued_ = 0;
 }
 
 void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
@@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
   if (VLOG_IS_ON(3)) {
     VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
             << work_size;
-    last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
   }
+
+  last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
 }
 
 void DeviceQueue::debug_synchronize()
@@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize()
     stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
 
     last_sync_time_ = new_time;
-    last_kernels_enqueued_ = 0;
   }
+
+  last_kernels_enqueued_ = 0;
+}
+
+string DeviceQueue::debug_active_kernels()
+{
+  return device_kernel_mask_as_string(last_kernels_enqueued_);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index edda3e61d51..e6835b787cf 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -21,6 +21,7 @@
 #include "device/device_graphics_interop.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
+#include "util/util_string.h"
 #include "util/util_unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
@@ -101,6 +102,7 @@ class DeviceQueue {
   void debug_init_execution();
   void debug_enqueue(DeviceKernel kernel, const int work_size);
   void debug_synchronize();
+  string debug_active_kernels();
 
   /* Combination of kernels enqueued together sync last synchronize. */
   DeviceKernelMask last_kernels_enqueued_;
diff --git a/intern/cycles/device/dummy/device.cpp b/intern/cycles/device/dummy/device.cpp
index 678276ed025..e3cea272300 100644
--- a/intern/cycles/device/dummy/device.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -48,7 +48,7 @@ class DummyDevice : public Device {
   {
   }
 
-  virtual void mem_copy_from(device_memory &, int, int, int, int) override
+  virtual void mem_copy_from(device_memory &, size_t, size_t, size_t, size_t) override
   {
   }
 
diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp
new file mode 100644
index 00000000000..90028ac7f10
--- /dev/null
+++ b/intern/cycles/device/hip/device.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/hip/device.h"
+
+#include "util/util_logging.h"
+
+#ifdef WITH_HIP
+#  include "device/device.h"
+#  include "device/hip/device_impl.h"
+
+#  include "util/util_string.h"
+#  include "util/util_windows.h"
+#endif /* WITH_HIP */
+
+CCL_NAMESPACE_BEGIN
+
+bool device_hip_init()
+{
+#if !defined(WITH_HIP)
+  return false; /* Built without HIP support. */
+#elif defined(WITH_HIP_DYNLOAD)
+  static bool initialized = false;
+  static bool result = false;
+  /* The HIPEW loader is run at most once; later calls return the cached result. */
+  if (initialized)
+    return result;
+
+  initialized = true;
+  int hipew_result = hipewInit(HIPEW_INIT_HIP);
+  if (hipew_result == HIPEW_SUCCESS) {
+    VLOG(1) << "HIPEW initialization succeeded";
+    if (HIPDevice::have_precompiled_kernels()) {
+      VLOG(1) << "Found precompiled kernels";
+      result = true;
+    }
+    else if (hipewCompilerPath() != NULL) {
+      VLOG(1) << "Found HIPCC " << hipewCompilerPath();
+      result = true;
+    }
+    else {
+      VLOG(1) << "Neither precompiled kernels nor HIPCC was found,"
+              << " unable to use HIP";
+    }
+  }
+  else {
+    VLOG(1) << "HIPEW initialization failed: "
+            << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
+                                                              "Error opening the library");
+  }
+
+  return result;
+#else  /* WITH_HIP_DYNLOAD */
+  return true; /* No dynamic loading configured, nothing to initialize here. */
+#endif /* WITH_HIP_DYNLOAD */
+}
+
+Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_HIP
+  return new HIPDevice(info, stats, profiler); /* Heap-allocated, returned as a raw pointer. */
+#else
+  (void)info; /* The arguments are unused when HIP support is compiled out. */
+  (void)stats;
+  (void)profiler;
+
+  LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen.";
+
+  return nullptr;
+#endif
+}
+
+#ifdef WITH_HIP
+static hipError_t device_hip_safe_init()
+{
+#  ifdef _WIN32
+  __try {
+    return hipInit(0);
+  }
+  __except (EXCEPTION_EXECUTE_HANDLER) {
+    /* Ignore crashes inside the HIP driver and hope we can
+     * survive even with corrupted HIP installs. */
+    fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n");
+  }
+  /* Reached only after a caught crash: report it to callers as "no device". */
+  return hipErrorNoDevice;
+#  else
+  return hipInit(0);
+#  endif
+}
+#endif /* WITH_HIP */
+
+void device_hip_info(vector<DeviceInfo> &devices)
+{
+#ifdef WITH_HIP
+  hipError_t result = device_hip_safe_init();
+  if (result != hipSuccess) {
+    if (result != hipErrorNoDevice)
+      fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result));
+    return;
+  }
+
+  int count = 0;
+  result = hipGetDeviceCount(&count);
+  if (result != hipSuccess) {
+    fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result));
+    return;
+  }
+  /* Display devices are collected separately and appended after all other devices. */
+  vector<DeviceInfo> display_devices;
+
+  for (int num = 0; num < count; num++) {
+    char name[256];
+
+    result = hipDeviceGetName(name, 256, num);
+    if (result != hipSuccess) {
+      fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result));
+      continue;
+    }
+    /* Default to 0: the query result is unchecked and must not leave `major` indeterminate. */
+    int major = 0;
+    hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num);
+    // TODO : (Arya) What is the last major version we are supporting?
+
+    DeviceInfo info;
+
+    info.type = DEVICE_HIP;
+    info.description = string(name);
+    info.num = num;
+
+    info.has_half_images = (major >= 3);
+    info.has_nanovdb = true;
+    info.denoisers = 0;
+
+    info.has_gpu_queue = true;
+    /* Check if the device has P2P access to any other device in the system. */
+    for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
+      if (num != peer_num) {
+        int can_access = 0;
+        hipDeviceCanAccessPeer(&can_access, num, peer_num);
+        info.has_peer_memory = (can_access != 0);
+      }
+    }
+
+    int pci_location[3] = {0, 0, 0};
+    hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num);
+    hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num);
+    hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num);
+    info.id = string_printf("HIP_%s_%04x:%02x:%02x",
+                            name,
+                            (unsigned int)pci_location[0],
+                            (unsigned int)pci_location[1],
+                            (unsigned int)pci_location[2]);
+
+    /* If device has a kernel timeout and no compute preemption, we assume it is
+     * connected to a display and will freeze the display while doing computations.
+     * NOTE: `preempt_attr` is never queried here, so only the timeout attribute decides. */
+    int timeout_attr = 0, preempt_attr = 0;
+    hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num);
+
+    if (timeout_attr && !preempt_attr) {
+      VLOG(1) << "Device is recognized as display.";
+      info.description += " (Display)";
+      info.display_device = true;
+      display_devices.push_back(info);
+    }
+    else {
+      VLOG(1) << "Device has compute preemption or is not used for display.";
+      devices.push_back(info);
+    }
+    VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+  }
+
+  if (!display_devices.empty())
+    devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else  /* WITH_HIP */
+  (void)devices;
+#endif /* WITH_HIP */
+}
+
+string device_hip_capabilities()
+{
+#ifdef WITH_HIP
+  hipError_t result = device_hip_safe_init();
+  if (result != hipSuccess) {
+    if (result != hipErrorNoDevice) {
+      return string("Error initializing HIP: ") + hipewErrorString(result);
+    }
+    return "No HIP device found\n";
+  }
+
+  int count;
+  result = hipGetDeviceCount(&count);
+  if (result != hipSuccess) {
+    return string("Error getting devices: ") + hipewErrorString(result);
+  }
+  /* One entry per device: its name, then every attribute that could be queried. */
+  string capabilities = "";
+  for (int num = 0; num < count; num++) {
+    char name[256];
+    if (hipDeviceGetName(name, 256, num) != hipSuccess) {
+      continue;
+    }
+    capabilities += string("\t") + name + "\n";
+    int value; /* Output slot written by each GET_ATTR query below. */
+#  define GET_ATTR(attr) \
+    { \
+      if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \
+        capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \
+      } \
+    } \
+    (void)0
+    /* TODO(sergey): Strip all attributes which are not useful for us
+     * or does not depend on the driver.
+     */
+    GET_ATTR(MaxThreadsPerBlock);
+    GET_ATTR(MaxBlockDimX);
+    GET_ATTR(MaxBlockDimY);
+    GET_ATTR(MaxBlockDimZ);
+    GET_ATTR(MaxGridDimX);
+    GET_ATTR(MaxGridDimY);
+    GET_ATTR(MaxGridDimZ);
+    GET_ATTR(MaxSharedMemoryPerBlock);
+    GET_ATTR(TotalConstantMemory);
+    GET_ATTR(WarpSize);
+    GET_ATTR(MaxPitch);
+    GET_ATTR(MaxRegistersPerBlock);
+    GET_ATTR(ClockRate);
+    GET_ATTR(TextureAlignment);
+    GET_ATTR(MultiprocessorCount);
+    GET_ATTR(KernelExecTimeout);
+    GET_ATTR(Integrated);
+    GET_ATTR(CanMapHostMemory);
+    GET_ATTR(ComputeMode);
+    GET_ATTR(MaxTexture1DWidth);
+    GET_ATTR(MaxTexture2DWidth);
+    GET_ATTR(MaxTexture2DHeight);
+    GET_ATTR(MaxTexture3DWidth);
+    GET_ATTR(MaxTexture3DHeight);
+    GET_ATTR(MaxTexture3DDepth);
+    GET_ATTR(ConcurrentKernels);
+    GET_ATTR(EccEnabled);
+    GET_ATTR(MemoryClockRate);
+    GET_ATTR(MemoryBusWidth);
+    GET_ATTR(L2CacheSize);
+    GET_ATTR(MaxThreadsPerMultiProcessor);
+    GET_ATTR(ComputeCapabilityMajor);
+    GET_ATTR(ComputeCapabilityMinor);
+    GET_ATTR(MaxSharedMemoryPerMultiprocessor);
+    GET_ATTR(ManagedMemory);
+    GET_ATTR(IsMultiGpuBoard);
+#  undef GET_ATTR
+    capabilities += "\n";
+  }
+
+  return capabilities;
+
+#else  /* WITH_HIP */
+  return "";
+#endif /* WITH_HIP */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/hip/device.h b/intern/cycles/device/hip/device.h
new file mode 100644
index 00000000000..965fd9e484b
--- /dev/null
+++ b/intern/cycles/device/hip/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_hip_init();
+
+Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_hip_info(vector<DeviceInfo> &devices);
+
+string device_hip_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
new file mode 100644
index 00000000000..0e5ac6ce401
--- /dev/null
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -0,0 +1,1343 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include <climits>
+#  include <limits.h>
+#  include <stdio.h>
+#  include <stdlib.h>
+#  include <string.h>
+
+#  include "device/hip/device_impl.h"
+
+#  include "render/buffers.h"
+
+#  include "util/util_debug.h"
+#  include "util/util_foreach.h"
+#  include "util/util_logging.h"
+#  include "util/util_map.h"
+#  include "util/util_md5.h"
+#  include "util/util_opengl.h"
+#  include "util/util_path.h"
+#  include "util/util_string.h"
+#  include "util/util_system.h"
+#  include "util/util_time.h"
+#  include "util/util_types.h"
+#  include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+bool HIPDevice::have_precompiled_kernels()
+{
+  /* Precompiled kernels count as available when the "lib" directory exists. */
+  return path_exists(path_get("lib"));
+}
+
+bool HIPDevice::show_samples() const
+{
+  /* Always show samples: the HIPDevice only processes one tile at a time. */
+  return true;
+}
+
+BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
+{
+  return BVH_LAYOUT_BVH2; /* BVH2 is the only layout this device reports. */
+}
+
+void HIPDevice::set_error(const string &error)
+{
+  Device::set_error(error);
+  if (!first_error) {
+    return;
+  }
+  first_error = false; /* The documentation hint below is printed only once. */
+  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+  fprintf(stderr,
+          "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+}
+
+HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+    : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+  first_error = true;
+
+  hipDevId = info.num;
+  hipDevice = 0;
+  hipContext = 0;
+
+  hipModule = 0;
+  hipDevArchitecture = 0; /* Keep it defined on the early-return error paths below. */
+  need_texture_info = false;
+
+  device_texture_headroom = 0;
+  device_working_headroom = 0;
+  move_texture_to_host = false;
+  map_host_limit = 0;
+  map_host_used = 0;
+  can_map_host = 0;
+  pitch_alignment = 0;
+
+  /* Initialize HIP. */
+  hipError_t result = hipInit(0);
+  if (result != hipSuccess) {
+    set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result)));
+    return;
+  }
+
+  /* Setup device and context. */
+  result = hipGetDevice(&hipDevice, hipDevId);
+  if (result != hipSuccess) {
+    set_error(string_printf("Failed to get HIP device handle from ordinal (%s)",
+                            hipewErrorString(result)));
+    return;
+  }
+
+  hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
+
+  hip_assert(
+      hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
+
+  unsigned int ctx_flags = hipDeviceLmemResizeToMax;
+  if (can_map_host) {
+    ctx_flags |= hipDeviceMapHost;
+    init_host_memory();
+  }
+
+  /* Create context. */
+  result = hipCtxCreate(&hipContext, ctx_flags, hipDevice);
+
+  if (result != hipSuccess) {
+    set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result)));
+    return;
+  }
+  /* Default to 0 so an unchecked, failed query cannot feed garbage into the architecture. */
+  int major = 0, minor = 0;
+  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
+  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
+  hipDevArchitecture = major * 100 + minor * 10;
+
+  /* Pop context set by hipCtxCreate. */
+  hipCtxPopCurrent(NULL);
+}
+
+HIPDevice::~HIPDevice()
+{
+  texture_info.free();
+  /* NOTE(review): hipContext is still 0 if the constructor returned early -- confirm OK. */
+  hip_assert(hipCtxDestroy(hipContext));
+}
+
+bool HIPDevice::support_device(const uint /*kernel_features*/)
+{
+  int major, minor;
+  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
+  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
+  /* The queried version is not checked yet: every device is reported as supported. */
+  // TODO : (Arya) What versions do we plan to support?
+  return true;
+}
+
+bool HIPDevice::check_peer_access(Device *peer_device)
+{
+  if (peer_device == this) {
+    return false;
+  }
+  if (peer_device->info.type != DEVICE_HIP) {
+    return false;
+  }
+  /* Only DEVICE_HIP peers get here, so the downcast to HIPDevice below is valid. */
+  HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device);
+
+  int can_access = 0;
+  hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice));
+  if (can_access == 0) {
+    return false;
+  }
+
+  // Ensure array access over the link is possible as well (for 3D textures)
+  hip_assert(hipDeviceGetP2PAttribute(
+      &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice));
+  if (can_access == 0) {
+    return false;
+  }
+
+  // Enable peer access in both directions
+  {
+    const HIPContextScope scope(this);
+    hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0);
+    if (result != hipSuccess) {
+      set_error(string_printf("Failed to enable peer access on HIP context (%s)",
+                              hipewErrorString(result)));
+      return false;
+    }
+  }
+  {
+    const HIPContextScope scope(peer_device_hip);
+    hipError_t result = hipCtxEnablePeerAccess(hipContext, 0);
+    if (result != hipSuccess) {
+      set_error(string_printf("Failed to enable peer access on HIP context (%s)",
+                              hipewErrorString(result)));
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool HIPDevice::use_adaptive_compilation()
+{
+  return DebugFlags().hip.adaptive_compile; /* Read from the HIP debug flags. */
+}
+
+/* Common HIPCC flags which stay the same regardless of shading model and
+ * kernel sources md5, and only depend on compiler or compilation settings.
+ */
+string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+  const int machine = system_cpu_bits();
+  const string source_path = path_get("source");
+  const string include_path = source_path;
+  string cflags = string_printf(
+      "-m%d "
+      "--ptxas-options=\"-v\" "
+      "--use_fast_math "
+      "-DHIPCC "
+      "-I\"%s\"",
+      machine,
+      include_path.c_str());
+  if (use_adaptive_compilation()) {
+    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+  }
+  return cflags;
+}
+
+string HIPDevice::compile_kernel(const uint kernel_features,
+                                 const char *name,
+                                 const char *base,
+                                 bool force_ptx)
+{
+  /* Query the compute capability (0.0 if the query fails); it is part of the kernel names. */
+  int major = 0, minor = 0;
+  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
+  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
+
+  /* Attempt to use kernel provided with Blender. */
+  if (!use_adaptive_compilation()) {
+    if (!force_ptx) {
+      const string fatbin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+      VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
+      if (path_exists(fatbin)) {
+        VLOG(1) << "Using precompiled kernel.";
+        return fatbin;
+      }
+    }
+
+    /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
+    int ptx_major = major, ptx_minor = minor;
+    while (ptx_major >= 3) {
+      const string ptx = path_get(
+          string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+      VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+      if (path_exists(ptx)) {
+        VLOG(1) << "Using precompiled kernel.";
+        return ptx;
+      }
+
+      if (ptx_minor > 0) {
+        ptx_minor--;
+      }
+      else {
+        ptx_major--;
+        ptx_minor = 9;
+      }
+    }
+  }
+
+  /* Try to use locally compiled kernel. */
+  string source_path = path_get("source");
+  const string source_md5 = path_files_md5_hash(source_path);
+
+  /* We include cflags into md5 so changing hip toolkit or changing other
+   * compiler command line arguments makes sure fatbin gets re-built.
+   */
+  string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+  const char *const kernel_ext = "genco";
+#  ifdef _WIN32
+  const char *const options =
+      "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp";
+#  else
+  const char *const options =
+      "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ggdb";
+#  endif
+  const string include_path = source_path;
+  const char *const kernel_arch = force_ptx ? "compute" : "sm";
+  const string fatbin_file = string_printf(
+      "cycles_%s_%s_%d%d_%s", name, kernel_arch, major, minor, kernel_md5.c_str());
+  const string fatbin = path_cache_get(path_join("kernels", fatbin_file));
+  VLOG(1) << "Testing for locally compiled kernel " << fatbin << ".";
+  if (path_exists(fatbin)) {
+    VLOG(1) << "Using locally compiled kernel.";
+    return fatbin;
+  }
+
+#  ifdef _WIN32
+  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+    if (major < 3) {
+      set_error(
+          string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. "
+                        "Your GPU is not supported.",
+                        major,
+                        minor));
+    }
+    else {
+      set_error(
+          string_printf("HIP binary kernel for this graphics card compute "
+                        "capability (%d.%d) not found.",
+                        major,
+                        minor));
+    }
+    return string();
+  }
+#  endif
+
+  /* Compile. */
+  const char *const hipcc = hipewCompilerPath();
+  if (hipcc == NULL) {
+    set_error(
+        "HIP hipcc compiler not found. "
+        "Install HIP toolkit in default location.");
+    return string();
+  }
+
+  const int hipcc_hip_version = hipewCompilerVersion();
+  VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << ".";
+  if (hipcc_hip_version < 40) {
+    printf(
+        "Unsupported HIP version %d.%d detected, "
+        "you need HIP 4.0 or newer.\n",
+        hipcc_hip_version / 10,
+        hipcc_hip_version % 10);
+    return string();
+  }
+
+  double starttime = time_dt();
+
+  path_create_directories(fatbin);
+
+  source_path = path_join(path_join(source_path, "kernel"),
+                          path_join("device", path_join(base, string_printf("%s.cpp", name))));
+  /* Quote every path so that locations containing spaces survive the shell. */
+  string command = string_printf("\"%s\" -%s -I \"%s\" --%s \"%s\" -o \"%s\"",
+                                 hipcc,
+                                 options,
+                                 include_path.c_str(),
+                                 kernel_ext,
+                                 source_path.c_str(),
+                                 fatbin.c_str());
+
+  printf("Compiling HIP kernel ...\n%s\n", command.c_str());
+
+#  ifdef _WIN32
+  command = "call " + command;
+#  endif
+  if (system(command.c_str()) != 0) {
+    set_error(
+        "Failed to execute compilation command, "
+        "see console for details.");
+    return string();
+  }
+
+  /* Verify if compilation succeeded */
+  if (!path_exists(fatbin)) {
+    set_error(
+        "HIP kernel compilation failed, "
+        "see console for details.");
+    return string();
+  }
+
+  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+  return fatbin;
+}
+
+/* Compile (or fetch from cache) the kernel module and load it into the HIP context.
+ *
+ * kernel_features: feature mask forwarded to the device support check, the kernel
+ *                  compilation and the local memory reservation.
+ *
+ * Returns true when the module is available (including when it was already loaded),
+ * false otherwise. */
+bool HIPDevice::load_kernels(const uint kernel_features)
+{
+  /* TODO(sergey): Support kernels re-load for HIP devices.
+   *
+   * Currently re-loading kernel will invalidate memory pointers,
+   * causing problems in hipCtxSynchronize.
+   */
+  if (hipModule) {
+    VLOG(1) << "Skipping kernel reload, not currently supported.";
+    return true;
+  }
+
+  /* HIP initialization must have succeeded. */
+  if (hipContext == 0) {
+    return false;
+  }
+
+  /* The GPU must be supported for the requested features. */
+  if (!support_device(kernel_features)) {
+    return false;
+  }
+
+  /* Get the path of the compiled kernel binary. */
+  const char *kernel_name = "kernel";
+  const string fatbin_path = compile_kernel(kernel_features, kernel_name);
+  if (fatbin_path.empty()) {
+    return false;
+  }
+
+  /* Open the module inside this device's context. */
+  HIPContextScope scope(this);
+
+  string fatbin_data;
+  hipError_t result = hipErrorFileNotFound;
+  if (path_read_text(fatbin_path, fatbin_data)) {
+    result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
+  }
+
+  if (result != hipSuccess) {
+    set_error(string_printf("Failed to load HIP kernel from '%s' (%s)",
+                            fatbin_path.c_str(),
+                            hipewErrorString(result)));
+    return false;
+  }
+
+  kernels.load(this);
+  reserve_local_memory(kernel_features);
+
+  return true;
+}
+
+/* Launch a single block of a test kernel and log how much free device memory
+ * disappeared as a result. The kernel features argument is unused. */
+void HIPDevice::reserve_local_memory(const uint)
+{
+  /* Together with hipDeviceLmemResizeToMax, this reserves local memory
+   * needed for kernel launches, so that we can reliably figure out when
+   * to allocate scene data in mapped host memory. */
+  size_t total = 0, free_before = 0, free_after = 0;
+
+  /* Free memory before the launch. */
+  {
+    HIPContextScope scope(this);
+    hipMemGetInfo(&free_before, &total);
+  }
+
+  {
+    /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
+     * multiprocessors. It would be good to do this in parallel for the multi GPU case
+     * still to make it faster. */
+    HIPDeviceQueue probe_queue(this);
+
+    /* Dummy kernel arguments. */
+    void *d_path_index = nullptr;
+    void *d_render_buffer = nullptr;
+    int d_work_size = 0;
+    void *kernel_args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+    /* Use the biggest kernel for estimation. */
+    const DeviceKernel largest_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+    probe_queue.init_execution();
+    probe_queue.enqueue(largest_kernel, 1, kernel_args);
+    probe_queue.synchronize();
+  }
+
+  /* Free memory after the launch. */
+  {
+    HIPContextScope scope(this);
+    hipMemGetInfo(&free_after, &total);
+  }
+
+  const size_t reserved = free_before - free_after;
+  VLOG(1) << "Local memory reserved " << string_human_readable_number(reserved) << " bytes. ("
+          << string_human_readable_size(reserved) << ")";
+
+#  if 0
+  /* For testing mapped host memory, fill up device memory. */
+  const size_t keep_mb = 1024;
+
+  while (free_after > keep_mb * 1024 * 1024LL) {
+    hipDeviceptr_t tmp;
+    hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL));
+    hipMemGetInfo(&free_after, &total);
+  }
+#  endif
+}
+
+/* Initialize the mapped host memory limit and the device memory headroom values. */
+void HIPDevice::init_host_memory()
+{
+  /* Limit amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system
+   * memory free, whichever is smaller. */
+  const size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  const size_t system_ram = system_physical_ram();
+
+  if (system_ram == 0) {
+    VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+  else {
+    const size_t half_ram = system_ram / 2;
+    map_host_limit = (half_ram > default_limit) ? system_ram - default_limit : half_ram;
+  }
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations respectively. We set the working
+   * memory limit headroom lower so that some space is left after all
+   * texture memory allocations. */
+  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
+  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
+
+  VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+          << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+/* Upload the texture info array to the device if it was tagged for (re-)upload. */
+void HIPDevice::load_texture_info()
+{
+  if (!need_texture_info) {
+    return;
+  }
+
+  /* Clear the flag before copying, so this does not loop indefinitely if the copy below
+   * calls into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+  need_texture_info = false;
+  texture_info.copy_to_device();
+}
+
+/* Move allocations of this device out of device memory to make room.
+ *
+ * While `move_texture_to_host` is set, repeatedly picks the largest movable allocation
+ * (preferring images, i.e. data_height > 1) and calls device_copy_to() on it, until at
+ * least `size` bytes were moved or no candidate is left. Afterwards the texture info
+ * array is reloaded.
+ *
+ * size: number of bytes that should be freed.
+ * for_texture: when true, only image textures are considered for moving. */
+void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    /* The allocation map is only locked while searching for a candidate. */
+    thread_scoped_lock lock(hip_mem_map_mutex);
+    foreach (HIPMemMap::value_type &pair, hip_mem_map) {
+      device_memory &mem = *pair.first;
+      HIPMem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
+      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+                        (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* For other textures, only move image textures. */
+      if (for_texture && !is_image) {
+        continue;
+      }
+
+      /* Try to move largest allocation, prefer moving images. */
+      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+        max_is_image = is_image;
+        max_size = mem.device_size;
+        max_mem = &mem;
+      }
+    }
+    lock.unlock();
+
+    /* Move to host memory. This part is mutex protected since
+     * multiple HIP devices could be moving the memory. The
+     * first one will do it, and the rest will adopt the pointer. */
+    if (max_mem) {
+      VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+      static thread_mutex move_mutex;
+      thread_scoped_lock lock(move_mutex);
+
+      any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well). */
+      max_mem->device_copy_to();
+      /* Subtract the moved size from the remaining amount, clamping at zero. */
+      size = (max_size >= size) ? 0 : size - max_size;
+
+      any_device_moving_textures_to_host = false;
+    }
+    else {
+      /* Nothing left that can be moved. */
+      break;
+    }
+  }
+
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
+  /* Update texture info array with new pointers. */
+  load_texture_info();
+}
+
+/* Allocate memory for `mem`, in device memory when there is enough room and otherwise in
+ * mapped host memory (only when `can_map_host` is set).
+ *
+ * mem: the allocation to back. Its device_pointer and device_size are updated, and also
+ *      host_pointer / shared_pointer / shared_counter when mapped host memory is used.
+ * pitch_padding: extra bytes to allocate on top of mem.memory_size().
+ *
+ * Returns the HIPMem entry registered in hip_mem_map, or NULL when no memory could be
+ * allocated (an error is set in that case). */
+HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+  HIPContextScope scope(this);
+
+  hipDeviceptr_t device_pointer = 0;
+  size_t size = mem.memory_size() + pitch_padding;
+
+  /* Start out as "failed", so the host memory fallback below is attempted whenever the
+   * device allocation is skipped or fails. */
+  hipError_t mem_alloc_result = hipErrorOutOfMemory;
+  const char *status = "";
+
+  /* First try allocating in device memory, respecting headroom. We make
+   * an exception for texture info. It is small and frequently accessed,
+   * so treat it as working memory.
+   *
+   * If there is not enough room for working memory, we will try to move
+   * textures to host memory, assuming the performance impact would have
+   * been worse for working memory. */
+  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+  bool is_image = is_texture && (mem.data_height > 1);
+
+  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+  size_t total = 0, free = 0;
+  hipMemGetInfo(&free, &total);
+
+  /* Move textures to host memory if needed. */
+  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+    move_textures_to_host(size + headroom - free, is_texture);
+    hipMemGetInfo(&free, &total);
+  }
+
+  /* Allocate in device memory. */
+  if (!move_texture_to_host && (size + headroom) < free) {
+    mem_alloc_result = hipMalloc(&device_pointer, size);
+    if (mem_alloc_result == hipSuccess) {
+      status = " in device memory";
+    }
+  }
+
+  /* Fall back to mapped host memory if needed and possible. */
+
+  void *shared_pointer = 0;
+
+  if (mem_alloc_result != hipSuccess && can_map_host) {
+    if (mem.shared_pointer) {
+      /* Another device already allocated host memory. */
+      mem_alloc_result = hipSuccess;
+      shared_pointer = mem.shared_pointer;
+    }
+    else if (map_host_used + size < map_host_limit) {
+      /* Allocate host memory ourselves. */
+      mem_alloc_result = hipHostMalloc(&shared_pointer, size);
+
+      assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
+             (mem_alloc_result != hipSuccess && shared_pointer == 0));
+    }
+
+    if (mem_alloc_result == hipSuccess) {
+      /* Get the device side address of the host allocation. */
+      hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
+      map_host_used += size;
+      status = " in host memory";
+    }
+  }
+
+  if (mem_alloc_result != hipSuccess) {
+    status = " failed, out of device and host memory";
+    set_error("System is out of GPU and shared host memory");
+  }
+
+  if (mem.name) {
+    VLOG(1) << "Buffer allocate: " << mem.name << ", "
+            << string_human_readable_number(mem.memory_size()) << " bytes. ("
+            << string_human_readable_size(mem.memory_size()) << ")" << status;
+  }
+
+  /* NOTE(review): device_size and the statistics are updated even when the allocation
+   * failed and NULL is returned below - confirm this is intended. */
+  mem.device_pointer = (device_ptr)device_pointer;
+  mem.device_size = size;
+  stats.mem_alloc(size);
+
+  if (!mem.device_pointer) {
+    return NULL;
+  }
+
+  /* Insert into map of allocations. */
+  thread_scoped_lock lock(hip_mem_map_mutex);
+  HIPMem *cmem = &hip_mem_map[&mem];
+  if (shared_pointer != 0) {
+    /* Replace host pointer with our host allocation. Only works if
+     * HIP memory layout is the same and has no pitch padding. Also
+     * does not work if we move textures to host during a render,
+     * since other devices might be using the memory. */
+
+    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+        mem.host_pointer != shared_pointer) {
+      memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A Call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device */
+
+      mem.host_free();
+      mem.host_pointer = shared_pointer;
+    }
+    /* Count this device as a user of the shared host allocation. */
+    mem.shared_pointer = shared_pointer;
+    mem.shared_counter++;
+    cmem->use_mapped_host = true;
+  }
+  else {
+    cmem->use_mapped_host = false;
+  }
+
+  return cmem;
+}
+
+/* Copy the host data of `mem` to its device allocation, unless the device allocation is
+ * the mapped host memory that host_pointer already refers to. */
+void HIPDevice::generic_copy_to(device_memory &mem)
+{
+  /* Both a source and a destination are required. */
+  if (!mem.host_pointer || !mem.device_pointer) {
+    return;
+  }
+
+  thread_scoped_lock lock(hip_mem_map_mutex);
+
+  /* If use_mapped_host of mem is false, the current device only uses device memory allocated
+   * by hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data
+   * from mem.host_pointer. */
+  const bool uses_mapped_host = hip_mem_map[&mem].use_mapped_host;
+  if (uses_mapped_host && mem.host_pointer == mem.shared_pointer) {
+    return;
+  }
+
+  const HIPContextScope scope(this);
+  hip_assert(
+      hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+}
+
+/* Release the allocation backing `mem` on this device and remove it from hip_mem_map.
+ * Does nothing when `mem` has no device pointer. */
+void HIPDevice::generic_free(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    return;
+  }
+
+  HIPContextScope scope(this);
+  thread_scoped_lock lock(hip_mem_map_mutex);
+  const HIPMem &entry = hip_mem_map[&mem];
+
+  if (!entry.use_mapped_host) {
+    /* Free device memory. */
+    hip_assert(hipFree(mem.device_pointer));
+  }
+  else {
+    /* Mapped host memory is reference counted, so that it is only freed
+     * once the last user releases it. */
+    assert(mem.shared_pointer);
+    if (mem.shared_pointer) {
+      assert(mem.shared_counter > 0);
+      mem.shared_counter--;
+      if (mem.shared_counter == 0) {
+        if (mem.host_pointer == mem.shared_pointer) {
+          mem.host_pointer = 0;
+        }
+        hipHostFree(mem.shared_pointer);
+        mem.shared_pointer = 0;
+      }
+    }
+    map_host_used -= mem.device_size;
+  }
+
+  stats.mem_free(mem.device_size);
+  mem.device_pointer = 0;
+  mem.device_size = 0;
+
+  hip_mem_map.erase(hip_mem_map.find(&mem));
+}
+
+/* Allocate device memory for `mem`. Texture and global memory are not supported here
+ * and only trigger an assert. */
+void HIPDevice::mem_alloc(device_memory &mem)
+{
+  switch (mem.type) {
+    case MEM_TEXTURE:
+      assert(!"mem_alloc not supported for textures.");
+      break;
+    case MEM_GLOBAL:
+      assert(!"mem_alloc not supported for global memory.");
+      break;
+    default:
+      generic_alloc(mem);
+      break;
+  }
+}
+
+/* Copy `mem` to the device. Global and texture memory are freed and allocated again,
+ * other memory is allocated on first use and then copied. */
+void HIPDevice::mem_copy_to(device_memory &mem)
+{
+  switch (mem.type) {
+    case MEM_GLOBAL:
+      global_free(mem);
+      global_alloc(mem);
+      break;
+    case MEM_TEXTURE:
+      tex_free((device_texture &)mem);
+      tex_alloc((device_texture &)mem);
+      break;
+    default:
+      if (!mem.device_pointer) {
+        generic_alloc(mem);
+      }
+      generic_copy_to(mem);
+      break;
+  }
+}
+
+/* Copy a range of `mem` from the device back into its host buffer.
+ *
+ * The range starts at byte offset elem * y * w and is elem * w * h bytes long. When there
+ * is no device allocation, the host range is zeroed instead. Texture and global memory are
+ * not supported and only trigger an assert. */
+void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
+{
+  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+    assert(!"mem_copy_from not supported for textures.");
+    return;
+  }
+
+  /* Nothing to copy into. */
+  if (!mem.host_pointer) {
+    return;
+  }
+
+  const size_t offset = elem * y * w;
+  const size_t size = elem * w * h;
+  char *host_dst = (char *)mem.host_pointer + offset;
+
+  if (!mem.device_pointer) {
+    memset(host_dst, 0, size);
+    return;
+  }
+
+  const HIPContextScope scope(this);
+  hip_assert(hipMemcpyDtoH(host_dst, (hipDeviceptr_t)mem.device_pointer + offset, size));
+}
+
+/* Fill `mem` with zeros, allocating it first when it has no device pointer yet. */
+void HIPDevice::mem_zero(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    mem_alloc(mem);
+    /* Allocation did not produce a device pointer, nothing to zero. */
+    if (!mem.device_pointer) {
+      return;
+    }
+  }
+
+  thread_scoped_lock lock(hip_mem_map_mutex);
+
+  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+   * regardless of mem.host_pointer and mem.shared_pointer. */
+  const bool host_is_mapped = hip_mem_map[&mem].use_mapped_host &&
+                              mem.host_pointer == mem.shared_pointer;
+
+  if (host_is_mapped) {
+    if (mem.host_pointer) {
+      memset(mem.host_pointer, 0, mem.memory_size());
+    }
+  }
+  else {
+    const HIPContextScope scope(this);
+    hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
+  }
+}
+
+/* Free `mem`, dispatching on its memory type. */
+void HIPDevice::mem_free(device_memory &mem)
+{
+  switch (mem.type) {
+    case MEM_GLOBAL:
+      global_free(mem);
+      break;
+    case MEM_TEXTURE:
+      tex_free((device_texture &)mem);
+      break;
+    default:
+      generic_free(mem);
+      break;
+  }
+}
+
+/* Return a pointer into the allocation of `mem`, `offset` elements past its start.
+ * The size argument is unused. */
+device_ptr HIPDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
+{
+  char *base = (char *)mem.device_pointer;
+  return (device_ptr)(base + mem.memory_elements_size(offset));
+}
+
+/* Copy `size` bytes from `host` into the module global variable called `name`. */
+void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+  HIPContextScope scope(this);
+
+  /* Look up the address and size of the global in the loaded module. */
+  hipDeviceptr_t global_ptr;
+  size_t global_size;
+  hip_assert(hipModuleGetGlobal(&global_ptr, &global_size, hipModule, name));
+
+  /* The host data is expected to match the size of the global. */
+  assert(global_size == size);
+
+  hip_assert(hipMemcpyHtoD(global_ptr, host, size));
+}
+
+/* Allocate and upload global memory (only when resident on this device), then write its
+ * device pointer into the module global named after the memory. */
+void HIPDevice::global_alloc(device_memory &mem)
+{
+  const bool resident_here = mem.is_resident(this);
+  if (resident_here) {
+    generic_alloc(mem);
+    generic_copy_to(mem);
+  }
+
+  /* The kernel side accesses the data through this pointer. */
+  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+/* Free global memory, but only when it is resident on this device and allocated. */
+void HIPDevice::global_free(device_memory &mem)
+{
+  if (!mem.is_resident(this) || !mem.device_pointer) {
+    return;
+  }
+
+  generic_free(mem);
+}
+
+/* Allocate and upload a texture, and register it in the texture info array.
+ *
+ * 3D textures (data_depth > 1) are stored in a HIP array, 2D textures in pitch aligned
+ * linear memory and 1D textures in linear memory. For everything except NanoVDB data a
+ * texture object is created and its handle is stored in texture_info[slot].data; for
+ * NanoVDB data the device pointer is stored there directly.
+ *
+ * If `mem` is not resident on this device, no memory is allocated here and the existing
+ * mem.device_pointer is used.
+ *
+ * Returns early without registering the texture when the data type is unsupported or an
+ * allocation failed. */
+void HIPDevice::tex_alloc(device_texture &mem)
+{
+  HIPContextScope scope(this);
+
+  /* General variables for both architectures */
+  string bind_name = mem.name;
+  size_t dsize = datatype_size(mem.data_type);
+  size_t size = mem.memory_size();
+
+  /* Translate the extension mode into a HIP address mode. */
+  hipTextureAddressMode address_mode = hipAddressModeWrap;
+  switch (mem.info.extension) {
+    case EXTENSION_REPEAT:
+      address_mode = hipAddressModeWrap;
+      break;
+    case EXTENSION_EXTEND:
+      address_mode = hipAddressModeClamp;
+      break;
+    case EXTENSION_CLIP:
+      // TODO : (Arya) setting this to Mode Clamp instead of Mode Border because it's unsupported
+      // in hip
+      address_mode = hipAddressModeClamp;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* Closest interpolation uses point filtering, everything else linear filtering. */
+  hipTextureFilterMode filter_mode;
+  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+    filter_mode = hipFilterModePoint;
+  }
+  else {
+    filter_mode = hipFilterModeLinear;
+  }
+
+  /* Image Texture Storage */
+  hipArray_Format format;
+  switch (mem.data_type) {
+    case TYPE_UCHAR:
+      format = HIP_AD_FORMAT_UNSIGNED_INT8;
+      break;
+    case TYPE_UINT16:
+      format = HIP_AD_FORMAT_UNSIGNED_INT16;
+      break;
+    case TYPE_UINT:
+      format = HIP_AD_FORMAT_UNSIGNED_INT32;
+      break;
+    case TYPE_INT:
+      format = HIP_AD_FORMAT_SIGNED_INT32;
+      break;
+    case TYPE_FLOAT:
+      format = HIP_AD_FORMAT_FLOAT;
+      break;
+    case TYPE_HALF:
+      format = HIP_AD_FORMAT_HALF;
+      break;
+    default:
+      /* Unsupported data type, nothing is allocated. */
+      assert(0);
+      return;
+  }
+
+  HIPMem *cmem = NULL;
+  hArray array_3d = NULL;
+  /* Size in bytes of one row in host memory, and of one row in device memory. */
+  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+  size_t dst_pitch = src_pitch;
+
+  if (!mem.is_resident(this)) {
+    /* Memory is not resident on this device: only register it in the map and reuse
+     * the existing device pointer. */
+    thread_scoped_lock lock(hip_mem_map_mutex);
+    cmem = &hip_mem_map[&mem];
+    cmem->texobject = 0;
+
+    if (mem.data_depth > 1) {
+      array_3d = (hArray)mem.device_pointer;
+      cmem->array = array_3d;
+    }
+    else if (mem.data_height > 0) {
+      dst_pitch = align_up(src_pitch, pitch_alignment);
+    }
+  }
+  else if (mem.data_depth > 1) {
+    /* 3D texture using array, there is no API for linear memory. */
+    HIP_ARRAY3D_DESCRIPTOR desc;
+
+    desc.Width = mem.data_width;
+    desc.Height = mem.data_height;
+    desc.Depth = mem.data_depth;
+    desc.Format = format;
+    desc.NumChannels = mem.data_elements;
+    desc.Flags = 0;
+
+    VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+            << string_human_readable_number(mem.memory_size()) << " bytes. ("
+            << string_human_readable_size(mem.memory_size()) << ")";
+
+    hip_assert(hipArray3DCreate(&array_3d, &desc));
+
+    if (!array_3d) {
+      return;
+    }
+
+    /* Copy the host data into the array. */
+    HIP_MEMCPY3D param;
+    memset(&param, 0, sizeof(param));
+    param.dstMemoryType = hipMemoryTypeArray;
+    param.dstArray = &array_3d;
+    param.srcMemoryType = hipMemoryTypeHost;
+    param.srcHost = mem.host_pointer;
+    param.srcPitch = src_pitch;
+    param.WidthInBytes = param.srcPitch;
+    param.Height = mem.data_height;
+    param.Depth = mem.data_depth;
+
+    hip_assert(hipDrvMemcpy3D(&param));
+
+    mem.device_pointer = (device_ptr)array_3d;
+    mem.device_size = size;
+    stats.mem_alloc(size);
+
+    thread_scoped_lock lock(hip_mem_map_mutex);
+    cmem = &hip_mem_map[&mem];
+    cmem->texobject = 0;
+    cmem->array = array_3d;
+  }
+  else if (mem.data_height > 0) {
+    /* 2D texture, using pitch aligned linear memory. */
+    dst_pitch = align_up(src_pitch, pitch_alignment);
+    size_t dst_size = dst_pitch * mem.data_height;
+
+    /* The difference to the unpadded size is requested as pitch padding. */
+    cmem = generic_alloc(mem, dst_size - mem.memory_size());
+    if (!cmem) {
+      return;
+    }
+
+    hip_Memcpy2D param;
+    memset(&param, 0, sizeof(param));
+    param.dstMemoryType = hipMemoryTypeDevice;
+    param.dstDevice = mem.device_pointer;
+    param.dstPitch = dst_pitch;
+    param.srcMemoryType = hipMemoryTypeHost;
+    param.srcHost = mem.host_pointer;
+    param.srcPitch = src_pitch;
+    param.WidthInBytes = param.srcPitch;
+    param.Height = mem.data_height;
+
+    hip_assert(hipDrvMemcpy2DUnaligned(&param));
+  }
+  else {
+    /* 1D texture, using linear memory. */
+    cmem = generic_alloc(mem);
+    if (!cmem) {
+      return;
+    }
+
+    hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+  }
+
+  /* Resize once */
+  const uint slot = mem.slot;
+  if (slot >= texture_info.size()) {
+    /* Allocate some slots in advance, to reduce amount
+     * of re-allocations. */
+    texture_info.resize(slot + 128);
+  }
+
+  /* Set Mapping and tag that we need to (re-)upload to device */
+  texture_info[slot] = mem.info;
+  need_texture_info = true;
+
+  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+    /* Kepler+, bindless textures. */
+    hipResourceDesc resDesc;
+    memset(&resDesc, 0, sizeof(resDesc));
+
+    /* Describe the resource according to how it was stored above. */
+    if (array_3d) {
+      resDesc.resType = hipResourceTypeArray;
+      resDesc.res.array.h_Array = &array_3d;
+      resDesc.flags = 0;
+    }
+    else if (mem.data_height > 0) {
+      resDesc.resType = hipResourceTypePitch2D;
+      resDesc.res.pitch2D.devPtr = mem.device_pointer;
+      resDesc.res.pitch2D.format = format;
+      resDesc.res.pitch2D.numChannels = mem.data_elements;
+      resDesc.res.pitch2D.height = mem.data_height;
+      resDesc.res.pitch2D.width = mem.data_width;
+      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+    }
+    else {
+      resDesc.resType = hipResourceTypeLinear;
+      resDesc.res.linear.devPtr = mem.device_pointer;
+      resDesc.res.linear.format = format;
+      resDesc.res.linear.numChannels = mem.data_elements;
+      resDesc.res.linear.sizeInBytes = mem.device_size;
+    }
+
+    hipTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));
+    texDesc.addressMode[0] = address_mode;
+    texDesc.addressMode[1] = address_mode;
+    texDesc.addressMode[2] = address_mode;
+    texDesc.filterMode = filter_mode;
+    texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
+
+    thread_scoped_lock lock(hip_mem_map_mutex);
+    cmem = &hip_mem_map[&mem];
+
+    hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+    /* The texture info entry refers to the texture object. */
+    texture_info[slot].data = (uint64_t)cmem->texobject;
+  }
+  else {
+    /* NanoVDB data: the texture info entry refers to the device memory directly. */
+    texture_info[slot].data = (uint64_t)mem.device_pointer;
+  }
+}
+
+/* Destroy the texture object of `mem` (if any) and release its storage.
+ * Does nothing when `mem` has no device pointer. */
+void HIPDevice::tex_free(device_texture &mem)
+{
+  if (!mem.device_pointer) {
+    return;
+  }
+
+  HIPContextScope scope(this);
+  thread_scoped_lock lock(hip_mem_map_mutex);
+  const HIPMem &entry = hip_mem_map[&mem];
+
+  /* Free bindless texture. */
+  if (entry.texobject) {
+    hipTexObjectDestroy(entry.texobject);
+  }
+
+  if (!mem.is_resident(this)) {
+    /* Do not free memory here, since it was allocated on a different device. */
+    hip_mem_map.erase(hip_mem_map.find(&mem));
+    return;
+  }
+
+  if (entry.array) {
+    /* Free array. */
+    hipArrayDestroy(entry.array);
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    hip_mem_map.erase(hip_mem_map.find(&mem));
+    return;
+  }
+
+  /* Linear memory: generic_free() takes the map lock itself, so release it first. */
+  lock.unlock();
+  generic_free(mem);
+}
+
+#  if 0
+/* Tile rendering code, currently compiled out by the surrounding `#if 0`.
+ *
+ * Renders all samples of `rtile` in steps: for each step the work tile is copied to the
+ * device and the path trace or bake kernel is launched, followed by the adaptive sampling
+ * filter when enabled. Progress is reported through `task` after every step. */
+void HIPDevice::render(DeviceTask &task,
+                        RenderTile &rtile,
+                        device_vector<KernelWorkTile> &work_tiles)
+{
+  scoped_timer timer(&rtile.buffers->render_time);
+
+  if (have_error())
+    return;
+
+  HIPContextScope scope(this);
+  hipFunction_t hipRender;
+
+  /* Get kernel function. */
+  if (rtile.task == RenderTile::BAKE) {
+    hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake"));
+  }
+  else {
+    hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace"));
+  }
+
+  if (have_error()) {
+    return;
+  }
+
+  hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1));
+
+  /* Allocate work tile. */
+  work_tiles.alloc(1);
+
+  KernelWorkTile *wtile = work_tiles.data();
+  wtile->x = rtile.x;
+  wtile->y = rtile.y;
+  wtile->w = rtile.w;
+  wtile->h = rtile.h;
+  wtile->offset = rtile.offset;
+  wtile->stride = rtile.stride;
+  wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer;
+
+  /* Prepare work size. More step samples render faster, but for now we
+   * remain conservative for GPUs connected to a display to avoid driver
+   * timeouts and display freezing. */
+  int min_blocks, num_threads_per_block;
+  hip_assert(
+      hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0));
+  if (!info.display_device) {
+    min_blocks *= 8;
+  }
+
+  /* Number of samples per kernel launch. */
+  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+  /* Render all samples. */
+  uint start_sample = rtile.start_sample;
+  uint end_sample = rtile.start_sample + rtile.num_samples;
+
+  for (int sample = start_sample; sample < end_sample;) {
+    /* Setup and copy work tile to device. */
+    wtile->start_sample = sample;
+    wtile->num_samples = step_samples;
+    if (task.adaptive_sampling.use) {
+      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+    }
+    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+    work_tiles.copy_to_device();
+
+    hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
+    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+    /* Launch kernel. */
+    void *args[] = {&d_work_tiles, &total_work_size};
+
+    hip_assert(
+        hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+    uint filter_sample = sample + wtile->num_samples - 1;
+    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+    }
+
+    hip_assert(hipDeviceSynchronize());
+
+    /* Update progress. */
+    sample += wtile->num_samples;
+    rtile.sample = sample;
+    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+    /* Stop early on cancel, unless the task asks to finish the queue. */
+    if (task.get_cancel()) {
+      if (task.need_finish_queue == false)
+        break;
+    }
+  }
+
+  /* Finalize adaptive sampling. */
+  if (task.adaptive_sampling.use) {
+    hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer;
+    adaptive_sampling_post(rtile, wtile, d_work_tiles);
+    hip_assert(hipDeviceSynchronize());
+    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+  }
+}
+
+/* Task execution code, currently compiled out by the surrounding `#if 0`.
+ *
+ * For RENDER tasks, acquires tiles from `task` and renders them one by one until no tile
+ * is left or the task is cancelled. Other task types are ignored. */
+void HIPDevice::thread_run(DeviceTask &task)
+{
+  HIPContextScope scope(this);
+
+  if (task.type == DeviceTask::RENDER) {
+    /* Single work tile buffer that is reused for all tiles. */
+    device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+    /* keep rendering tiles until done */
+    RenderTile tile;
+    DenoisingTask denoising(this, task);
+
+    while (task.acquire_tile(this, tile, task.tile_types)) {
+      /* Path tracing and baking both go through render(). */
+      if (tile.task == RenderTile::PATH_TRACE) {
+        render(task, tile, work_tiles);
+      }
+      else if (tile.task == RenderTile::BAKE) {
+        render(task, tile, work_tiles);
+      }
+
+      task.release_tile(tile);
+
+      /* Stop early on cancel, unless the task asks to finish the queue. */
+      if (task.get_cancel()) {
+        if (task.need_finish_queue == false)
+          break;
+      }
+    }
+
+    work_tiles.free();
+  }
+}
+#  endif
+
+/* Create a new HIP queue bound to this device. */
+unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
+{
+  unique_ptr<DeviceQueue> queue = make_unique<HIPDeviceQueue>(this);
+  return queue;
+}
+
+/* Tell whether graphics interoperability should be used with this device.
+ *
+ * Returns true only when this HIP device is among the devices reported by
+ * hipGLGetDevices() for the current OpenGL context. */
+bool HIPDevice::should_use_graphics_interop()
+{
+  /* Check whether this device is part of OpenGL context.
+   *
+   * Using HIP device for graphics interoperability which is not part of the OpenGL context is
+   * possible, but from the empiric measurements it can be considerably slower than using naive
+   * pixels copy. */
+
+  HIPContextScope scope(this);
+
+  int num_all_devices = 0;
+  hip_assert(hipGetDeviceCount(&num_all_devices));
+
+  /* Also covers a negative count, which must not be used to size the vector below. */
+  if (num_all_devices <= 0) {
+    return false;
+  }
+
+  vector<hipDevice_t> gl_devices(num_all_devices);
+  uint num_gl_devices = 0;
+  hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll);
+
+  /* Only the first num_gl_devices entries were filled in. The remaining entries are still
+   * value-initialized (zero) and would wrongly match device 0, so they must be skipped. */
+  for (size_t i = 0; i < num_gl_devices && i < gl_devices.size(); i++) {
+    if (gl_devices[i] == hipDevice) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/* Return the multiprocessor count attribute of the device, or 0 when the query fails. */
+int HIPDevice::get_num_multiprocessors()
+{
+  const int fallback = 0;
+  return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, fallback);
+}
+
+/* Return the maximum threads per multiprocessor attribute of the device, or 0 when the
+ * query fails. */
+int HIPDevice::get_max_num_threads_per_multiprocessor()
+{
+  const int fallback = 0;
+  return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, fallback);
+}
+
+/* Query a device attribute into `value`. Returns true when the query succeeded. */
+bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value)
+{
+  HIPContextScope scope(this);
+
+  const hipError_t status = hipDeviceGetAttribute(value, attribute, hipDevice);
+  return status == hipSuccess;
+}
+
+/* Query a device attribute, returning `default_value` when the query fails. */
+int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value)
+{
+  int queried = 0;
+  const bool ok = get_device_attribute(attribute, &queried);
+  return ok ? queried : default_value;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h
new file mode 100644
index 00000000000..1d138ee9856
--- /dev/null
+++ b/intern/cycles/device/hip/device_impl.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/device.h"
+#  include "device/hip/kernel.h"
+#  include "device/hip/queue.h"
+#  include "device/hip/util.h"
+
+#  include "util/util_map.h"
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  else
+#    include "util/util_opengl.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class HIPDevice : public Device {
+  /* HIP implementation of the Device interface. */
+  friend class HIPContextScope;
+
+ public:
+  hipDevice_t hipDevice;
+  hipCtx_t hipContext;
+  hipModule_t hipModule;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  bool move_texture_to_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  int can_map_host;
+  int pitch_alignment;
+  int hipDevId;
+  int hipDevArchitecture;
+  bool first_error;
+  /* Per-allocation bookkeeping, keyed by the owning device_memory in `hip_mem_map`. */
+  struct HIPMem {
+    HIPMem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    hipTextureObject_t texobject;
+    hArray array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, HIPMem> HIPMemMap;
+  HIPMemMap hip_mem_map;
+  thread_mutex hip_mem_map_mutex;
+
+  /* Bindless Textures */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+  /* Kernel functions resolved from `hipModule` by HIPDeviceKernels::load(). */
+  HIPDeviceKernels kernels;
+
+  static bool have_precompiled_kernels();
+
+  virtual bool show_samples() const override;
+
+  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+  void set_error(const string &error) override;
+
+  HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+  virtual ~HIPDevice();
+
+  bool support_device(const uint /*kernel_features*/);
+
+  bool check_peer_access(Device *peer_device) override;
+
+  bool use_adaptive_compilation();
+
+  virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+  string compile_kernel(const uint kernel_features,
+                        const char *name,
+                        const char *base = "hip",
+                        bool force_ptx = false);
+
+  virtual bool load_kernels(const uint kernel_features) override;
+  void reserve_local_memory(const uint kernel_features);
+
+  void init_host_memory();
+
+  void load_texture_info();
+
+  void move_textures_to_host(size_t size, bool for_texture);
+
+  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+  void generic_copy_to(device_memory &mem);
+
+  void generic_free(device_memory &mem);
+
+  void mem_alloc(device_memory &mem) override;
+
+  void mem_copy_to(device_memory &mem) override;
+
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+  void mem_zero(device_memory &mem) override;
+
+  void mem_free(device_memory &mem) override;
+
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+  virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+  void global_alloc(device_memory &mem);
+
+  void global_free(device_memory &mem);
+
+  void tex_alloc(device_texture &mem);
+
+  void tex_free(device_texture &mem);
+
+  /* Graphics resources interoperability. */
+  virtual bool should_use_graphics_interop() override;
+
+  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+  /* Both queries return 0 when the attribute cannot be read. */
+  int get_num_multiprocessors();
+  int get_max_num_threads_per_multiprocessor();
+
+ protected:
+  bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
+  int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/graphics_interop.cpp b/intern/cycles/device/hip/graphics_interop.cpp
new file mode 100644
index 00000000000..0d5d71019b3
--- /dev/null
+++ b/intern/cycles/device/hip/graphics_interop.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/graphics_interop.h"
+
+#  include "device/hip/device_impl.h"
+#  include "device/hip/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue)
+    : queue_(queue), device_(static_cast<HIPDevice *>(queue->device))
+{ /* No HIP calls here: the OpenGL buffer gets registered in set_display_interop(). */
+}
+
+HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop()
+{
+  HIPContextScope scope(device_);
+  /* Release the OpenGL buffer registration, if one is still held. */
+  if (hip_graphics_resource_) {
+    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
+  }
+}
+
+void HIPDeviceGraphicsInterop::set_display_interop(
+    const DisplayDriver::GraphicsInterop &display_interop)
+{
+  const int64_t new_buffer_area = int64_t(display_interop.buffer_width) *
+                                  display_interop.buffer_height;
+
+  need_clear_ = display_interop.need_clear;
+
+  if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+    return; /* Same PBO and area: keep the existing registration. */
+  }
+
+  HIPContextScope scope(device_);
+  if (hip_graphics_resource_) {
+    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
+    hip_graphics_resource_ = nullptr; /* Never keep a handle that was unregistered. */
+  }
+
+  const hipError_t result = hipGraphicsGLRegisterBuffer(
+      &hip_graphics_resource_, display_interop.opengl_pbo_id, hipGraphicsRegisterFlagsNone);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result);
+    hip_graphics_resource_ = nullptr; /* Makes map() return 0 instead of using a bad handle. */
+  }
+  opengl_pbo_id_ = display_interop.opengl_pbo_id;
+  buffer_area_ = new_buffer_area;
+}
+
+device_ptr HIPDeviceGraphicsInterop::map()
+{
+  if (!hip_graphics_resource_) {
+    return 0;
+  }
+
+  HIPContextScope scope(device_);
+  /* hip_device_assert() only records errors, so give the outputs defined values up-front. */
+  hipDeviceptr_t hip_buffer = 0;
+  size_t bytes = 0;
+
+  hip_device_assert(device_,
+                    hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream()));
+  hip_device_assert(
+      device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_));
+
+  if (need_clear_) {
+    hip_device_assert(
+        device_,
+        hipMemsetD8Async(static_cast<hipDeviceptr_t>(hip_buffer), 0, bytes, queue_->stream()));
+
+    need_clear_ = false;
+  }
+
+  return static_cast<device_ptr>(hip_buffer);
+}
+
+void HIPDeviceGraphicsInterop::unmap()
+{
+  HIPContextScope scope(device_);
+  /* NOTE(review): unlike map(), a null resource is not checked here -- confirm callers. */
+  hip_device_assert(device_,
+                    hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h
new file mode 100644
index 00000000000..2b2d287ff6c
--- /dev/null
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/device_graphics_interop.h"
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+class HIPDeviceQueue;
+
+class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+  explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);
+  /* Owns a registered graphics resource, hence neither copyable nor movable. */
+  HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
+  HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;
+
+  ~HIPDeviceGraphicsInterop();
+
+  HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
+  HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;
+
+  virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) override;
+  /* map() returns 0 when no resource is registered. */
+  virtual device_ptr map() override;
+  virtual void unmap() override;
+
+ protected:
+  HIPDeviceQueue *queue_ = nullptr;
+  HIPDevice *device_ = nullptr;
+
+  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
+  uint opengl_pbo_id_ = 0;
+  /* Buffer area in pixels of the corresponding PBO. */
+  int64_t buffer_area_ = 0;
+
+  /* The destination was requested to be cleared. */
+  bool need_clear_ = false;
+  /* Handle of the OpenGL buffer registered with HIP; null until a buffer is registered. */
+  hipGraphicsResource hip_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/hip/kernel.cpp b/intern/cycles/device/hip/kernel.cpp
new file mode 100644
index 00000000000..9ede8507a0c
--- /dev/null
+++ b/intern/cycles/device/hip/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/kernel.h"
+#  include "device/hip/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void HIPDeviceKernels::load(HIPDevice *device)
+{
+  hipModule_t hipModule = device->hipModule;
+  /* Resolve every device kernel from the device's module and cache its occupancy data. */
+  for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+    HIPDeviceKernel &kernel = kernels_[i];
+
+    /* No mega-kernel used for GPU. */
+    if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+      continue;
+    }
+    /* Entry points are named "kernel_gpu_" followed by the kernel's string name. */
+    const std::string function_name = std::string("kernel_gpu_") +
+                                      device_kernel_as_string((DeviceKernel)i);
+    hip_device_assert(device,
+                      hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str()));
+
+    if (kernel.function) {
+      hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1));
+      /* Cache the suggested block size; HIPDeviceQueue::enqueue() launches with it. */
+      hip_device_assert(
+          device,
+          hipModuleOccupancyMaxPotentialBlockSize(
+              &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0));
+    }
+    else {
+      LOG(ERROR) << "Unable to load kernel " << function_name;
+    }
+  }
+
+  loaded = true;
+}
+
+const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const
+{
+  return kernels_[static_cast<int>(kernel)]; /* Entries are indexed by the enum value. */
+}
+
+bool HIPDeviceKernels::available(DeviceKernel kernel) const
+{
+  return get(kernel).function != nullptr; /* Null until load() resolves the function. */
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/kernel.h b/intern/cycles/device/hip/kernel.h
new file mode 100644
index 00000000000..3301731f56e
--- /dev/null
+++ b/intern/cycles/device/hip/kernel.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+#  include "device/device_kernel.h"
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+/* HIP kernel and associated occupancy information. */
+class HIPDeviceKernel {
+ public:
+  hipFunction_t function = nullptr;
+  /* Occupancy query results; both stay 0 until HIPDeviceKernels::load() fills them in. */
+  int num_threads_per_block = 0;
+  int min_blocks = 0;
+};
+
+/* Cache of HIP kernels for each DeviceKernel. */
+class HIPDeviceKernels {
+ public:
+  void load(HIPDevice *device);
+  const HIPDeviceKernel &get(DeviceKernel kernel) const;
+  bool available(DeviceKernel kernel) const;
+  /* One entry per DeviceKernel value, indexed by the enum. */
+ protected:
+  HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+  bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
new file mode 100644
index 00000000000..78c77e5fdae
--- /dev/null
+++ b/intern/cycles/device/hip/queue.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/queue.h"
+
+#  include "device/hip/device_impl.h"
+#  include "device/hip/graphics_interop.h"
+#  include "device/hip/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* HIPDeviceQueue */
+
+HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
+    : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
+{
+  const HIPContextScope scope(hip_device_); /* The stream is created in the device's context. */
+  hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
+}
+
+HIPDeviceQueue::~HIPDeviceQueue()
+{
+  const HIPContextScope scope(hip_device_);
+  hipStreamDestroy(hip_stream_); /* NOTE: the returned error code is not checked. */
+}
+
+int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+{
+  /* TODO: compute automatically (`state_size` is ignored; a fixed count is returned). */
+  /* TODO: must have at least num_threads_per_block. */
+  return 14416128;
+}
+
+int HIPDeviceQueue::num_concurrent_busy_states() const
+{
+  /* Four states per hardware thread, or a fixed 65536 when the thread count is unknown (0). */
+  const int num_multiprocessors = hip_device_->get_num_multiprocessors();
+  const int threads_per_multiprocessor = hip_device_->get_max_num_threads_per_multiprocessor();
+  const int thread_capacity = num_multiprocessors * threads_per_multiprocessor;
+  if (thread_capacity != 0) {
+    return 4 * thread_capacity;
+  }
+  return 65536;
+}
+
+void HIPDeviceQueue::init_execution()
+{
+  /* Synchronize all textures and memory copies (whole device) before executing the task. */
+  HIPContextScope scope(hip_device_);
+  hip_device_->load_texture_info();
+  hip_device_assert(hip_device_, hipDeviceSynchronize());
+
+  debug_init_execution();
+}
+
+bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+  return hip_device_->kernels.available(kernel); /* True when load() found the function. */
+}
+
+bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+  if (hip_device_->have_error()) {
+    return false;
+  }
+
+  debug_enqueue(kernel, work_size);
+
+  const HIPContextScope scope(hip_device_);
+  const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);
+  /* NOTE(review): block size is 0 for kernels load() did not resolve; confirm none get here. */
+  /* Compute kernel launch parameters. */
+  const int num_threads_per_block = hip_kernel.num_threads_per_block;
+  const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+  int shared_mem_bytes = 0;
+
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed. */
+      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+      break;
+    default:
+      break;
+  }
+
+  /* Launch kernel. */
+  hip_device_assert(hip_device_,
+                    hipModuleLaunchKernel(hip_kernel.function,
+                                          num_blocks,
+                                          1,
+                                          1,
+                                          num_threads_per_block,
+                                          1,
+                                          1,
+                                          shared_mem_bytes,
+                                          hip_stream_,
+                                          args,
+                                          0));
+  return !(hip_device_->have_error());
+}
+
+bool HIPDeviceQueue::synchronize()
+{
+  if (hip_device_->have_error()) {
+    return false;
+  }
+  /* Wait for the work submitted to this queue's stream; report whether it ran error-free. */
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+  debug_synchronize();
+
+  return !(hip_device_->have_error());
+}
+
+void HIPDeviceQueue::zero_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    hip_device_->mem_alloc(mem);
+  }
+
+  /* Zero memory on device. */
+  assert(mem.device_pointer != 0);
+  /* The memset is queued on this queue's stream; synchronize() waits for it. */
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(
+      hip_device_,
+      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+}
+
+void HIPDeviceQueue::copy_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+  /* Nothing to copy for empty memory. */
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    hip_device_->mem_alloc(mem);
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory to device; the copy is queued on this queue's stream. */
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(
+      hip_device_,
+      hipMemcpyHtoDAsync(
+          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+}
+
+void HIPDeviceQueue::copy_from_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+  /* Nothing to copy for empty memory. */
+  if (mem.memory_size() == 0) {
+    return;
+  }
+  /* Unlike copy_to_device(), nothing is allocated here: both pointers must already be set. */
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory from device; the copy is queued on this queue's stream. */
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(
+      hip_device_,
+      hipMemcpyDtoHAsync(
+          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+}
+
+/* TODO(Arya): note said to enable this only after stabilizing the dev branch -- confirm. */
+unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
+{
+  return make_unique<HIPDeviceGraphicsInterop>(this); /* The interop uses this queue's stream. */
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
new file mode 100644
index 00000000000..04c8a5982ce
--- /dev/null
+++ b/intern/cycles/device/hip/queue.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+#  include "device/device_kernel.h"
+#  include "device/device_memory.h"
+#  include "device/device_queue.h"
+
+#  include "device/hip/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+class device_memory;
+
+/* Base class for HIP queues. */
+class HIPDeviceQueue : public DeviceQueue {
+ public:
+  HIPDeviceQueue(HIPDevice *device);
+  ~HIPDeviceQueue();
+  /* The stream is created in the constructor and destroyed in the destructor. */
+  virtual int num_concurrent_states(const size_t state_size) const override;
+  virtual int num_concurrent_busy_states() const override;
+
+  virtual void init_execution() override;
+
+  virtual bool kernel_available(DeviceKernel kernel) const override;
+  /* enqueue() and synchronize() return false once the device has reported an error. */
+  virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+  virtual bool synchronize() override;
+  /* Memory operations are queued on this queue's stream. */
+  virtual void zero_to_device(device_memory &mem) override;
+  virtual void copy_to_device(device_memory &mem) override;
+  virtual void copy_from_device(device_memory &mem) override;
+  /* Stream that the work of this queue is submitted to. */
+  virtual hipStream_t stream()
+  {
+    return hip_stream_;
+  }
+
+  /* TODO(Arya): note said to enable this only after stabilizing the dev branch -- confirm. */
+  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+  HIPDevice *hip_device_;
+  hipStream_t hip_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/util.cpp b/intern/cycles/device/hip/util.cpp
new file mode 100644
index 00000000000..44f52c4e17b
--- /dev/null
+++ b/intern/cycles/device/hip/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/util.h"
+#  include "device/hip/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+HIPContextScope::HIPContextScope(HIPDevice *device) : device(device)
+{
+  hip_device_assert(device, hipCtxPushCurrent(device->hipContext)); /* Popped in destructor. */
+}
+
+HIPContextScope::~HIPContextScope()
+{
+  hip_device_assert(device, hipCtxPopCurrent(NULL)); /* Undo the push from the constructor. */
+}
+
+#  ifndef WITH_HIP_DYNLOAD
+const char *hipewErrorString(hipError_t result)
+{
+  /* We can only give the numeric error code here without major code duplication. That
+   * should be enough, since dynamic loading is only being disabled by folks who know
+   * what they're doing anyway.
+   *
+   * NOTE: The result points into a shared static string that the next call overwrites,
+   * so avoid calling this from several threads. */
+  static string error;
+  error = string_printf("%d", result);
+  return error.c_str();
+}
+
+const char *hipewCompilerPath()
+{
+  return CYCLES_HIP_HIPCC_EXECUTABLE; /* Presumably a build-time define -- TODO confirm. */
+}
+
+int hipewCompilerVersion()
+{ /* NOTE(review): formula looks tailored to CUDA_VERSION's encoding; confirm for HIP_VERSION. */
+  return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10);
+}
+#  endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h
new file mode 100644
index 00000000000..0db5174a3db
--- /dev/null
+++ b/intern/cycles/device/hip/util.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+/* Utility to push/pop HIP context. */
+class HIPContextScope {
+ public:
+  HIPContextScope(HIPDevice *device); /* Pushes device->hipContext. */
+  ~HIPContextScope(); /* Pops the current context again. */
+
+ private:
+  HIPDevice *device;
+};
+
+/* Checks the result of a HIP call; a failure is reported through hip_device->set_error(). */
+#  define hip_device_assert(hip_device, stmt) \
+    { \
+      hipError_t result = stmt; \
+      if (result != hipSuccess) { \
+        const char *name = hipewErrorString(result); \
+        hip_device->set_error( \
+            string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+      } \
+    } \
+    (void)0
+/* Same check, for use inside HIPDevice methods where `this` is the device. */
+#  define hip_assert(stmt) hip_device_assert(this, stmt)
+
+#  ifndef WITH_HIP_DYNLOAD
+/* Transparently implement some functions, so majority of the file does not need
+ * to worry about difference between dynamically loaded and linked HIP at all. */
+const char *hipewErrorString(hipError_t result);
+const char *hipewCompilerPath();
+int hipewCompilerVersion();
+#  endif /* WITH_HIP_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 6dbcce2d9a5..4f995abf2c4 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -315,14 +315,14 @@ class MultiDevice : public Device {
     stats.mem_alloc(mem.device_size - existing_size);
   }
 
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override
   {
     device_ptr key = mem.device_pointer;
-    int i = 0, sub_h = h / devices.size();
+    size_t i = 0, sub_h = h / devices.size();
 
     foreach (SubDevice &sub, devices) {
-      int sy = y + i * sub_h;
-      int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+      size_t sy = y + i * sub_h;
+      size_t sh = (i == (size_t)devices.size() - 1) ? h - sub_h * i : sub_h;
 
       SubDevice *owner_sub = find_matching_mem_device(key, sub);
       mem.device = owner_sub->device;
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index b54d423a183..49d4e22143f 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -315,6 +315,11 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
   group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
   group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+  group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+  group_descs[PG_HITV].hitgroup.moduleCH = optix_module;
+  group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+  group_descs[PG_HITV].hitgroup.moduleAH = optix_module;
+  group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test";
 
   if (kernel_features & KERNEL_FEATURE_HAIR) {
     if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
@@ -397,6 +402,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
   trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
   trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+  trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH);
   trace_css = std::max(trace_css,
                        stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
   trace_css = std::max(trace_css,
@@ -421,6 +427,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
+    pipeline_groups.push_back(groups[PG_HITV]);
     if (motion_blur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
@@ -459,6 +466,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
+    pipeline_groups.push_back(groups[PG_HITV]);
     if (motion_blur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
@@ -1390,25 +1398,33 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       /* Set user instance ID to object index (but leave low bit blank). */
       instance.instanceId = ob->get_device_index() << 1;
 
-      /* Have to have at least one bit in the mask, or else instance would always be culled. */
-      instance.visibilityMask = 1;
+      /* Add some of the object visibility bits to the mask.
+       * __prim_visibility contains the combined visibility bits of all instances, so is not
+       * reliable if they differ between instances. But the OptiX visibility mask can only contain
+       * 8 bits, so have to trade-off here and select just a few important ones.
+       */
+      instance.visibilityMask = ob->visibility_for_tracing() & 0xFF;
 
-      if (ob->get_geometry()->has_volume) {
-        /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
-         */
-        instance.visibilityMask |= 2;
+      /* Have to have at least one bit in the mask, or else instance would always be culled. */
+      if (0 == instance.visibilityMask) {
+        instance.visibilityMask = 0xFF;
       }
 
-      if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
-        /* Same applies to curves (so they can be skipped in local trace calls). */
-        instance.visibilityMask |= 4;
-
-        if (motion_blur && ob->get_geometry()->has_motion_blur() &&
-            static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+      if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
+          static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+        if (motion_blur && ob->get_geometry()->has_motion_blur()) {
           /* Select between motion blur and non-motion blur built-in intersection module. */
           instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
         }
       }
+      else {
+        /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
+         * since it needs to filter out end-caps there).
+         * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
+         * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
+         */
+        instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
+      }
 
       /* Insert motion traversable if object has motion. */
       if (motion_blur && ob->use_motion()) {
@@ -1474,7 +1490,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         delete[] reinterpret_cast<uint8_t *>(&motion_transform);
 
         /* Disable instance transform if object uses motion transform already. */
-        instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+        instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
 
         /* Get traversable handle to motion transform. */
         optixConvertPointerToTraversableHandle(context,
@@ -1491,7 +1507,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         }
         else {
           /* Disable instance transform if geometry already has it applied to vertex data. */
-          instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+          instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
           /* Non-instanced objects read ID from 'prim_object', so distinguish
            * them from instanced objects with the low bit set. */
           instance.instanceId |= 1;
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 91ef52e0a5a..3695ac6afc2 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -40,6 +40,7 @@ enum {
   PG_HITD, /* Default hit group. */
   PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
   PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+  PG_HITV, /* __VOLUME__ hit group. */
   PG_HITD_MOTION,
   PG_HITS_MOTION,
   PG_CALL_SVM_AO,
@@ -51,7 +52,7 @@ enum {
 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
 static const int NUM_MIS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
-static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int NUM_HIT_PROGRAM_GROUPS = 6;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
 
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
index bfabd35d7c3..949254606b8 100644
--- a/intern/cycles/integrator/CMakeLists.txt
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -27,6 +27,8 @@ set(SRC
   pass_accessor.cpp
   pass_accessor_cpu.cpp
   pass_accessor_gpu.cpp
+  path_trace_display.cpp
+  path_trace_tile.cpp
   path_trace_work.cpp
   path_trace_work_cpu.cpp
   path_trace_work_gpu.cpp
@@ -47,6 +49,8 @@ set(SRC_HEADERS
   pass_accessor.h
   pass_accessor_cpu.h
   pass_accessor_gpu.h
+  path_trace_display.h
+  path_trace_tile.h
   path_trace_work.h
   path_trace_work_cpu.h
   path_trace_work_gpu.h
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index b62a06aea43..7624b244175 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -19,8 +19,9 @@
 #include "device/cpu/device.h"
 #include "device/device.h"
 #include "integrator/pass_accessor.h"
+#include "integrator/path_trace_display.h"
+#include "integrator/path_trace_tile.h"
 #include "integrator/render_scheduler.h"
-#include "render/gpu_display.h"
 #include "render/pass.h"
 #include "render/scene.h"
 #include "render/tile.h"
@@ -67,11 +68,11 @@ PathTrace::PathTrace(Device *device,
 PathTrace::~PathTrace()
 {
   /* Destroy any GPU resource which was used for graphics interop.
-   * Need to have access to the GPUDisplay as it is the only source of drawing context which is
-   * used for interop. */
-  if (gpu_display_) {
+   * Need to have access to the PathTraceDisplay as it is the only source of drawing context which
+   * is used for interop. */
+  if (display_) {
     for (auto &&path_trace_work : path_trace_works_) {
-      path_trace_work->destroy_gpu_resources(gpu_display_.get());
+      path_trace_work->destroy_gpu_resources(display_.get());
     }
   }
 }
@@ -94,7 +95,7 @@ bool PathTrace::ready_to_reset()
 {
   /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
-   * display. Of there is no such display, the logic here will break. */
+   * display. If there is no such display, the logic here will break. */
-  DCHECK(gpu_display_);
+  DCHECK(display_);
 
   /* The logic here tries to provide behavior which feels the most interactive feel to artists.
    * General idea is to be able to reset as quickly as possible, while still providing interactive
@@ -126,8 +127,8 @@ void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_t
   /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
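-   * It is requires to inform about reset whenever it happens, so that the redraw state tracking is
+   * It is required to inform about reset whenever it happens, so that the redraw state tracking is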
    * properly updated. */
-  if (gpu_display_) {
-    gpu_display_->reset(full_params);
+  if (display_) {
+    display_->reset(full_params);
   }
 
   render_state_.has_denoised_result = false;
@@ -244,7 +245,7 @@ static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>>
     const int slice_height = max(lround(height * weight), 1);
 
     /* Disallow negative values to deal with situations when there are more compute devices than
-     * scanlines. */
+     * scan-lines. */
     const int remaining_height = max(0, height - current_y);
 
     BufferParams slide_params = buffer_params;
@@ -535,25 +536,35 @@ void PathTrace::denoise(const RenderWork &render_work)
   render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
 }
 
-void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+void PathTrace::set_output_driver(unique_ptr<OutputDriver> driver)
 {
-  gpu_display_ = move(gpu_display);
+  output_driver_ = move(driver);
 }
 
-void PathTrace::clear_gpu_display()
+void PathTrace::set_display_driver(unique_ptr<DisplayDriver> driver)
 {
-  if (gpu_display_) {
-    gpu_display_->clear();
+  if (driver) {
+    display_ = make_unique<PathTraceDisplay>(move(driver));
+  }
+  else {
+    display_ = nullptr;
+  }
+}
+
+void PathTrace::clear_display()
+{
+  if (display_) {
+    display_->clear();
   }
 }
 
 void PathTrace::draw()
 {
-  if (!gpu_display_) {
+  if (!display_) {
     return;
   }
 
-  did_draw_after_reset_ |= gpu_display_->draw();
+  did_draw_after_reset_ |= display_->draw();
 }
 
 void PathTrace::update_display(const RenderWork &render_work)
@@ -562,31 +573,32 @@ void PathTrace::update_display(const RenderWork &render_work)
     return;
   }
 
-  if (!gpu_display_ && !tile_buffer_update_cb) {
+  if (!display_ && !output_driver_) {
     VLOG(3) << "Ignore display update.";
     return;
   }
 
   if (full_params_.width == 0 || full_params_.height == 0) {
-    VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+    VLOG(3) << "Skipping PathTraceDisplay update due to 0 size of the render buffer.";
     return;
   }
 
   const double start_time = time_dt();
 
-  if (tile_buffer_update_cb) {
+  if (output_driver_) {
     VLOG(3) << "Invoke buffer update callback.";
 
-    tile_buffer_update_cb();
+    PathTraceTile tile(*this);
+    output_driver_->update_render_tile(tile);
   }
 
-  if (gpu_display_) {
-    VLOG(3) << "Perform copy to GPUDisplay work.";
+  if (display_) {
+    VLOG(3) << "Perform copy to PathTraceDisplay work.";
 
     const int resolution_divider = render_work.resolution_divider;
     const int texture_width = max(1, full_params_.width / resolution_divider);
     const int texture_height = max(1, full_params_.height / resolution_divider);
-    if (!gpu_display_->update_begin(texture_width, texture_height)) {
-      LOG(ERROR) << "Error beginning GPUDisplay update.";
+    if (!display_->update_begin(texture_width, texture_height)) {
+      LOG(ERROR) << "Error beginning PathTraceDisplay update.";
       return;
     }
@@ -600,10 +612,10 @@ void PathTrace::update_display(const RenderWork &render_work)
      * all works in parallel. */
     const int num_samples = get_num_samples_in_buffer();
     for (auto &&path_trace_work : path_trace_works_) {
-      path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+      path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
     }
 
-    gpu_display_->update_end();
+    display_->update_end();
   }
 
   render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
@@ -753,20 +765,26 @@ bool PathTrace::is_cancel_requested()
 
 void PathTrace::tile_buffer_write()
 {
-  if (!tile_buffer_write_cb) {
+  if (!output_driver_) {
     return;
   }
 
-  tile_buffer_write_cb();
+  PathTraceTile tile(*this);
+  output_driver_->write_render_tile(tile);
 }
 
 void PathTrace::tile_buffer_read()
 {
-  if (!tile_buffer_read_cb) {
+  if (!device_scene_->data.bake.use) {
     return;
   }
 
-  if (tile_buffer_read_cb()) {
+  if (!output_driver_) {
+    return;
+  }
+
+  PathTraceTile tile(*this);
+  if (output_driver_->read_render_tile(tile)) {
     tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
       path_trace_work->copy_render_buffers_to_device();
     });
@@ -801,7 +819,7 @@ void PathTrace::tile_buffer_write_to_disk()
   }
 
   if (!tile_manager_.write_tile(*buffers)) {
-    LOG(ERROR) << "Error writing tile to file.";
+    device_->set_error("Error writing tile to file");
   }
 }
 
@@ -894,7 +912,14 @@ void PathTrace::process_full_buffer_from_disk(string_view filename)
 
   DenoiseParams denoise_params;
   if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
-    LOG(ERROR) << "Error reading tiles from file.";
+    const string error_message = "Error reading tiles from file";
+    if (progress_) {
+      progress_->set_error(error_message);
+      progress_->set_cancel(error_message);
+    }
+    else {
+      LOG(ERROR) << error_message;
+    }
     return;
   }
 
@@ -998,6 +1023,11 @@ int2 PathTrace::get_render_tile_offset() const
   return make_int2(tile.x, tile.y);
 }
 
+int2 PathTrace::get_render_size() const
+{
+  return tile_manager_.get_size();
+}
+
 const BufferParams &PathTrace::get_render_tile_params() const
 {
   if (full_frame_state_.render_buffers) {
@@ -1028,6 +1058,8 @@ static const char *device_type_for_description(const DeviceType type)
       return "CUDA";
     case DEVICE_OPTIX:
       return "OptiX";
+    case DEVICE_HIP:
+      return "HIP";
     case DEVICE_DUMMY:
       return "Dummy";
     case DEVICE_MULTI:
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index fc7713e6df9..dbb22c204d9 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -31,12 +31,14 @@ CCL_NAMESPACE_BEGIN
 class AdaptiveSampling;
 class Device;
 class DeviceScene;
+class DisplayDriver;
 class Film;
 class RenderBuffers;
 class RenderScheduler;
 class RenderWork;
+class PathTraceDisplay;
+class OutputDriver;
 class Progress;
-class GPUDisplay;
 class TileManager;
 
 /* PathTrace class takes care of kernel graph and scheduling on a (multi)device. It takes care of
@@ -98,13 +100,16 @@ class PathTrace {
    * Use this to configure the adaptive sampler before rendering any samples. */
   void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
 
-  /* Set GPU display which takes care of drawing the render result. */
-  void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+  /* Sets output driver for render buffer output. */
+  void set_output_driver(unique_ptr<OutputDriver> driver);
 
-  /* Clear the GPU display by filling it in with all zeroes. */
-  void clear_gpu_display();
+  /* Set display driver for interactive render buffer display. */
+  void set_display_driver(unique_ptr<DisplayDriver> driver);
 
-  /* Perform drawing of the current state of the GPUDisplay. */
+  /* Clear the display buffer by filling it in with all zeroes. */
+  void clear_display();
+
+  /* Perform drawing of the current state of the DisplayDriver. */
   void draw();
 
   /* Cancel rendering process as soon as possible, without waiting for full tile to be sampled.
@@ -157,6 +162,7 @@ class PathTrace {
    * instead. */
   int2 get_render_tile_size() const;
   int2 get_render_tile_offset() const;
+  int2 get_render_size() const;
 
   /* Get buffer parameters of the current tile.
    *
@@ -168,18 +174,6 @@ class PathTrace {
    * times, and so on. */
   string full_report() const;
 
-  /* Callback which communicates an updates state of the render buffer of the current big tile.
-   * Is called during path tracing to communicate work-in-progress state of the final buffer. */
-  function<void(void)> tile_buffer_update_cb;
-
-  /* Callback which communicates final rendered buffer. Is called after path-tracing is done. */
-  function<void(void)> tile_buffer_write_cb;
-
-  /* Callback which initializes rendered buffer. Is called before path-tracing starts.
-   *
-   * This is used for baking. */
-  function<bool(void)> tile_buffer_read_cb;
-
   /* Callback which is called to report current rendering progress.
    *
    * It is supposed to be cheaper than buffer update/write, hence can be called more often.
@@ -252,7 +246,11 @@ class PathTrace {
   RenderScheduler &render_scheduler_;
   TileManager &tile_manager_;
 
-  unique_ptr<GPUDisplay> gpu_display_;
+  /* Display driver for interactive render buffer display. */
+  unique_ptr<PathTraceDisplay> display_;
+
+  /* Output driver to write render buffer to. */
+  unique_ptr<OutputDriver> output_driver_;
 
   /* Per-compute device descriptors of work which is responsible for path tracing on its configured
    * device. */
@@ -286,7 +284,7 @@ class PathTrace {
     /* Parameters of the big tile with the current resolution divider applied. */
     BufferParams effective_big_tile_params;
 
-    /* Denosier was run and there are denoised versions of the passes in the render buffers. */
+    /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
     bool has_denoised_result = false;
 
-    /* Current tile has been written (to either disk or callback.
+    /* Current tile has been written (to either disk or callback).
diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/integrator/path_trace_display.cpp
index a8f0cc50583..28f0a7f7745 100644
--- a/intern/cycles/render/gpu_display.cpp
+++ b/intern/cycles/integrator/path_trace_display.cpp
@@ -14,20 +14,25 @@
  * limitations under the License.
  */
 
-#include "render/gpu_display.h"
+#include "integrator/path_trace_display.h"
 
 #include "render/buffers.h"
+
 #include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
-void GPUDisplay::reset(const BufferParams &buffer_params)
+PathTraceDisplay::PathTraceDisplay(unique_ptr<DisplayDriver> driver) : driver_(move(driver))
+{
+}
+
+void PathTraceDisplay::reset(const BufferParams &buffer_params)
 {
   thread_scoped_lock lock(mutex_);
 
-  const GPUDisplayParams old_params = params_;
+  const DisplayDriver::Params old_params = params_;
 
-  params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y);
+  params_.full_offset = make_int2(buffer_params.full_x, buffer_params.full_y);
   params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height);
   params_.size = make_int2(buffer_params.width, buffer_params.height);
 
@@ -44,7 +49,7 @@ void GPUDisplay::reset(const BufferParams &buffer_params)
   texture_state_.is_outdated = true;
 }
 
-void GPUDisplay::mark_texture_updated()
+void PathTraceDisplay::mark_texture_updated()
 {
   texture_state_.is_outdated = false;
   texture_state_.is_usable = true;
@@ -54,7 +59,7 @@ void GPUDisplay::mark_texture_updated()
  * Update procedure.
  */
 
-bool GPUDisplay::update_begin(int texture_width, int texture_height)
+bool PathTraceDisplay::update_begin(int texture_width, int texture_height)
 {
   DCHECK(!update_state_.is_active);
 
@@ -66,15 +71,15 @@ bool GPUDisplay::update_begin(int texture_width, int texture_height)
   /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
    * The update itself is non-blocking however, for better performance and to avoid
    * potential deadlocks due to locks held by the subclass. */
-  GPUDisplayParams params;
+  DisplayDriver::Params params;
   {
     thread_scoped_lock lock(mutex_);
     params = params_;
     texture_state_.size = make_int2(texture_width, texture_height);
   }
 
-  if (!do_update_begin(params, texture_width, texture_height)) {
-    LOG(ERROR) << "GPUDisplay implementation could not begin update.";
+  if (!driver_->update_begin(params, texture_width, texture_height)) {
+    LOG(ERROR) << "PathTraceDisplay implementation could not begin update.";
     return false;
   }
 
@@ -83,7 +88,7 @@ bool GPUDisplay::update_begin(int texture_width, int texture_height)
   return true;
 }
 
-void GPUDisplay::update_end()
+void PathTraceDisplay::update_end()
 {
   DCHECK(update_state_.is_active);
 
@@ -92,12 +97,12 @@ void GPUDisplay::update_end()
     return;
   }
 
-  do_update_end();
+  driver_->update_end();
 
   update_state_.is_active = false;
 }
 
-int2 GPUDisplay::get_texture_size() const
+int2 PathTraceDisplay::get_texture_size() const
 {
   return texture_state_.size;
 }
@@ -106,25 +111,54 @@ int2 GPUDisplay::get_texture_size() const
  * Texture update from CPU buffer.
  */
 
-void GPUDisplay::copy_pixels_to_texture(
+void PathTraceDisplay::copy_pixels_to_texture(
     const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
 {
   DCHECK(update_state_.is_active);
 
   if (!update_state_.is_active) {
-    LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+    LOG(ERROR) << "Attempt to copy pixels data outside of PathTraceDisplay update.";
     return;
   }
 
   mark_texture_updated();
-  do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height);
+
+  /* This call copies pixels to a mapped texture buffer which is typically much cheaper from CPU
+   * time point of view than to copy data directly to a texture.
+   *
+   * The possible downside of this approach is that it might require a higher peak memory when
+   * doing partial updates of the texture (although, in practice even partial updates might peak
+   * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */
+  half4 *mapped_rgba_pixels = map_texture_buffer();
+  if (!mapped_rgba_pixels) {
+    return;
+  }
+
+  const int texture_width = texture_state_.size.x;
+  const int texture_height = texture_state_.size.y;
+
+  if (texture_x == 0 && texture_y == 0 && pixels_width == texture_width &&
+      pixels_height == texture_height) {
+    const size_t size_in_bytes = sizeof(half4) * texture_width * texture_height;
+    memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
+  }
+  else {
+    const half4 *rgba_row = rgba_pixels;
+    half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_width + texture_x;
+    for (int y = 0; y < pixels_height;
+         ++y, rgba_row += pixels_width, mapped_rgba_row += texture_width) {
+      memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
+    }
+  }
+
+  unmap_texture_buffer();
 }
 
 /* --------------------------------------------------------------------
  * Texture buffer mapping.
  */
 
-half4 *GPUDisplay::map_texture_buffer()
+half4 *PathTraceDisplay::map_texture_buffer()
 {
   DCHECK(!texture_buffer_state_.is_mapped);
   DCHECK(update_state_.is_active);
@@ -135,11 +169,11 @@ half4 *GPUDisplay::map_texture_buffer()
   }
 
   if (!update_state_.is_active) {
-    LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+    LOG(ERROR) << "Attempt to copy pixels data outside of PathTraceDisplay update.";
     return nullptr;
   }
 
-  half4 *mapped_rgba_pixels = do_map_texture_buffer();
+  half4 *mapped_rgba_pixels = driver_->map_texture_buffer();
 
   if (mapped_rgba_pixels) {
     texture_buffer_state_.is_mapped = true;
@@ -148,7 +182,7 @@ half4 *GPUDisplay::map_texture_buffer()
   return mapped_rgba_pixels;
 }
 
-void GPUDisplay::unmap_texture_buffer()
+void PathTraceDisplay::unmap_texture_buffer()
 {
   DCHECK(texture_buffer_state_.is_mapped);
 
@@ -160,14 +194,14 @@ void GPUDisplay::unmap_texture_buffer()
   texture_buffer_state_.is_mapped = false;
 
   mark_texture_updated();
-  do_unmap_texture_buffer();
+  driver_->unmap_texture_buffer();
 }
 
 /* --------------------------------------------------------------------
  * Graphics interoperability.
  */
 
-DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get()
+DisplayDriver::GraphicsInterop PathTraceDisplay::graphics_interop_get()
 {
   DCHECK(!texture_buffer_state_.is_mapped);
   DCHECK(update_state_.is_active);
@@ -175,38 +209,45 @@ DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get()
   if (texture_buffer_state_.is_mapped) {
     LOG(ERROR)
         << "Attempt to use graphics interoperability mode while the texture buffer is mapped.";
-    return DeviceGraphicsInteropDestination();
+    return DisplayDriver::GraphicsInterop();
   }
 
   if (!update_state_.is_active) {
-    LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update.";
-    return DeviceGraphicsInteropDestination();
+    LOG(ERROR) << "Attempt to use graphics interoperability outside of PathTraceDisplay update.";
+    return DisplayDriver::GraphicsInterop();
   }
 
   /* Assume that interop will write new values to the texture. */
   mark_texture_updated();
 
-  return do_graphics_interop_get();
+  return driver_->graphics_interop_get();
 }
 
-void GPUDisplay::graphics_interop_activate()
+void PathTraceDisplay::graphics_interop_activate()
 {
+  driver_->graphics_interop_activate();
 }
 
-void GPUDisplay::graphics_interop_deactivate()
+void PathTraceDisplay::graphics_interop_deactivate()
 {
+  driver_->graphics_interop_deactivate();
 }
 
 /* --------------------------------------------------------------------
  * Drawing.
  */
 
-bool GPUDisplay::draw()
+void PathTraceDisplay::clear()
+{
+  driver_->clear();
+}
+
+bool PathTraceDisplay::draw()
 {
   /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
    * The drawing itself is non-blocking however, for better performance and to avoid
    * potential deadlocks due to locks held by the subclass. */
-  GPUDisplayParams params;
+  DisplayDriver::Params params;
   bool is_usable;
   bool is_outdated;
 
@@ -218,7 +259,7 @@ bool GPUDisplay::draw()
   }
 
   if (is_usable) {
-    do_draw(params);
+    driver_->draw(params);
   }
 
   return !is_outdated;
diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/integrator/path_trace_display.h
index a01348d28d5..24aaa0df6b1 100644
--- a/intern/cycles/render/gpu_display.h
+++ b/intern/cycles/integrator/path_trace_display.h
@@ -16,52 +16,30 @@
 
 #pragma once
 
-#include "device/device_graphics_interop.h"
+#include "render/display_driver.h"
+
 #include "util/util_half.h"
 #include "util/util_thread.h"
 #include "util/util_types.h"
+#include "util/util_unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
 
 class BufferParams;
 
-/* GPUDisplay class takes care of drawing render result in a viewport. The render result is stored
- * in a GPU-side texture, which is updated from a path tracer and drawn by an application.
+/* PathTraceDisplay is used for efficient render buffer display.
  *
- * The base GPUDisplay does some special texture state tracking, which allows render Session to
- * make decisions on whether reset for an updated state is possible or not. This state should only
- * be tracked in a base class and a particular implementation should not worry about it.
+ * The host application implements a DisplayDriver, storing a render pass in a GPU-side
+ * texture. This texture is continuously updated by the path tracer and drawn by the host
+ * application.
  *
- * The subclasses should only implement the pure virtual methods, which allows them to not worry
- * about parent method calls, which helps them to be as small and reliable as possible. */
-
-class GPUDisplayParams {
- public:
-  /* Offset of the display within a viewport.
-   * For example, set to a lower-bottom corner of border render in Blender's viewport. */
-  int2 offset = make_int2(0, 0);
-
-  /* Full viewport size.
-   *
-   * NOTE: Is not affected by the resolution divider. */
-  int2 full_size = make_int2(0, 0);
-
-  /* Effective vieport size.
-   * In the case of border render, size of the border rectangle.
-   *
-   * NOTE: Is not affected by the resolution divider. */
-  int2 size = make_int2(0, 0);
-
-  bool modified(const GPUDisplayParams &other) const
-  {
-    return !(offset == other.offset && full_size == other.full_size && size == other.size);
-  }
-};
+ * PathTraceDisplay is a wrapper around the DisplayDriver, adding thread safety, state tracking
+ * and error checking. */
 
-class GPUDisplay {
+class PathTraceDisplay {
  public:
-  GPUDisplay() = default;
-  virtual ~GPUDisplay() = default;
+  PathTraceDisplay(unique_ptr<DisplayDriver> driver);
+  virtual ~PathTraceDisplay() = default;
 
   /* Reset the display for the new state of render session. Is called whenever session is reset,
    * which happens on changes like viewport navigation or viewport dimension change.
@@ -69,11 +47,6 @@ class GPUDisplay {
    * This call will configure parameters for a changed buffer and reset the texture state. */
   void reset(const BufferParams &buffer_params);
 
-  const GPUDisplayParams &get_params() const
-  {
-    return params_;
-  }
-
   /* --------------------------------------------------------------------
    * Update procedure.
    *
@@ -94,7 +67,8 @@ class GPUDisplay {
   /* --------------------------------------------------------------------
    * Texture update from CPU buffer.
    *
-   * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`.
+   * NOTE: The PathTraceDisplay should be marked for an update being in process with
+   * `update_begin()`.
    *
    * Most portable implementation, which must be supported by all platforms. Might not be the most
    * efficient one.
@@ -115,7 +89,8 @@ class GPUDisplay {
    * This functionality is used to update GPU-side texture content without need to maintain CPU
    * side buffer on the caller.
    *
-   * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`.
+   * NOTE: The PathTraceDisplay should be marked for an update being in process with
+   * `update_begin()`.
    *
    * NOTE: Texture buffer can not be mapped while graphics interoperability is active. This means
    * that `map_texture_buffer()` is not allowed between `graphics_interop_begin()` and
@@ -145,14 +120,14 @@ class GPUDisplay {
    * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and
    * `unmap_texture_buffer()` calls. */
 
-  /* Get GPUDisplay graphics interoperability information which acts as a destination for the
+  /* Get PathTraceDisplay graphics interoperability information which acts as a destination for the
    * device API. */
-  DeviceGraphicsInteropDestination graphics_interop_get();
+  DisplayDriver::GraphicsInterop graphics_interop_get();
 
   /* (De)activate GPU display for graphics interoperability outside of regular display update
    * routines. */
-  virtual void graphics_interop_activate();
-  virtual void graphics_interop_deactivate();
+  void graphics_interop_activate();
+  void graphics_interop_deactivate();
 
   /* --------------------------------------------------------------------
    * Drawing.
@@ -163,47 +138,26 @@ class GPUDisplay {
    * This call might happen in parallel with draw, but can never happen in parallel with the
    * update.
    *
-   * The actual zero-ing can be deferred to a later moment. What is important is that after clear
+   * The actual zeroing can be deferred to a later moment. What is important is that after clear
    * and before pixels update the drawing texture will be fully empty, and that partial update
    * after clear will write new pixel values for an updating area, leaving everything else zeroed.
    *
    * If the GPU display supports graphics interoperability then the zeroing the display is to be
-   * delegated to the device via the `DeviceGraphicsInteropDestination`. */
-  virtual void clear() = 0;
+   * delegated to the device via the `DisplayDriver::GraphicsInterop`. */
+  void clear();
 
   /* Draw the current state of the texture.
    *
    * Returns true if this call did draw an updated state of the texture. */
   bool draw();
 
- protected:
-  /* Implementation-specific calls which subclasses are to implement.
-   * These `do_foo()` method corresponds to their `foo()` calls, but they are purely virtual to
-   * simplify their particular implementation. */
-  virtual bool do_update_begin(const GPUDisplayParams &params,
-                               int texture_width,
-                               int texture_height) = 0;
-  virtual void do_update_end() = 0;
-
-  virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
-                                         int texture_x,
-                                         int texture_y,
-                                         int pixels_width,
-                                         int pixels_height) = 0;
-
-  virtual half4 *do_map_texture_buffer() = 0;
-  virtual void do_unmap_texture_buffer() = 0;
-
-  /* Note that this might be called in parallel to do_update_begin() and do_update_end(),
-   * the subclass is responsible for appropriate mutex locks to avoid multiple threads
-   * editing and drawing the texture at the same time. */
-  virtual void do_draw(const GPUDisplayParams &params) = 0;
-
-  virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0;
-
  private:
+  /* Display driver implemented by the host application. */
+  unique_ptr<DisplayDriver> driver_;
+
+  /* Current display parameters and the mutex which guards access to them. */
   thread_mutex mutex_;
-  GPUDisplayParams params_;
+  DisplayDriver::Params params_;
 
   /* Mark texture as its content has been updated.
    * Used from places which knows that the texture content has been brought up-to-date, so that the
diff --git a/intern/cycles/integrator/path_trace_tile.cpp b/intern/cycles/integrator/path_trace_tile.cpp
new file mode 100644
index 00000000000..540f4aa5f68
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_tile.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_tile.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "integrator/path_trace.h"
+
+#include "render/buffers.h"
+#include "render/film.h"
+#include "render/pass.h"
+#include "render/scene.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceTile::PathTraceTile(PathTrace &path_trace)
+    : OutputDriver::Tile(path_trace.get_render_tile_offset(),
+                         path_trace.get_render_tile_size(),
+                         path_trace.get_render_size(),
+                         path_trace.get_render_tile_params().layer,
+                         path_trace.get_render_tile_params().view),
+      path_trace_(path_trace),
+      copied_from_device_(false)
+{
+}
+
+bool PathTraceTile::get_pass_pixels(const string_view pass_name,
+                                    const int num_channels,
+                                    float *pixels) const
+{
+  /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification
+   * is happening while this function runs. */
+
+  if (!copied_from_device_) {
+    /* Copy from device on demand. */
+    path_trace_.copy_render_tile_from_device();
+    const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
+  }
+
+  const BufferParams &buffer_params = path_trace_.get_render_tile_params();
+
+  const BufferPass *pass = buffer_params.find_pass(pass_name);
+  if (pass == nullptr) {
+    return false;
+  }
+
+  const bool has_denoised_result = path_trace_.has_denoised_result();
+  if (pass->mode == PassMode::DENOISED && !has_denoised_result) {
+    pass = buffer_params.find_pass(pass->type);
+    if (pass == nullptr) {
+      /* Happens when denoised result pass is requested but is never written by the kernel. */
+      return false;
+    }
+  }
+
+  pass = buffer_params.get_actual_display_pass(pass);
+
+  const float exposure = buffer_params.exposure;
+  const int num_samples = path_trace_.get_num_render_tile_samples();
+
+  PassAccessor::PassAccessInfo pass_access_info(*pass);
+  pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher;
+  pass_access_info.use_approximate_shadow_catcher_background =
+      pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background;
+
+  const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+  const PassAccessor::Destination destination(pixels, num_channels);
+
+  return path_trace_.get_render_tile_pixels(pass_accessor, destination);
+}
+
+bool PathTraceTile::set_pass_pixels(const string_view pass_name,
+                                    const int num_channels,
+                                    const float *pixels) const
+{
+  /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification
+   * is happening while this function runs. */
+
+  const BufferParams &buffer_params = path_trace_.get_render_tile_params();
+  const BufferPass *pass = buffer_params.find_pass(pass_name);
+  if (!pass) {
+    return false;
+  }
+
+  const float exposure = buffer_params.exposure;
+  const int num_samples = 1;
+
+  const PassAccessor::PassAccessInfo pass_access_info(*pass);
+  PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+  PassAccessor::Source source(pixels, num_channels);
+
+  return path_trace_.set_render_tile_pixels(pass_accessor, source);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_tile.h b/intern/cycles/integrator/path_trace_tile.h
new file mode 100644
index 00000000000..fd3e2969f6c
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_tile.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/output_driver.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* PathTraceTile
+ *
+ * Implementation of OutputDriver::Tile interface for path tracer. */
+
+class PathTrace;
+
+class PathTraceTile : public OutputDriver::Tile {
+ public:
+  PathTraceTile(PathTrace &path_trace);
+  /* Read/write pixels of the pass `pass_name`; return false if it is not available. */
+  bool get_pass_pixels(const string_view pass_name, const int num_channels, float *pixels) const;
+  bool set_pass_pixels(const string_view pass_name,
+                       const int num_channels,
+                       const float *pixels) const;
+
+ private:
+  PathTrace &path_trace_;
+  bool copied_from_device_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
index d9634acac10..c29177907c9 100644
--- a/intern/cycles/integrator/path_trace_work.cpp
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -16,12 +16,12 @@
 
 #include "device/device.h"
 
+#include "integrator/path_trace_display.h"
 #include "integrator/path_trace_work.h"
 #include "integrator/path_trace_work_cpu.h"
 #include "integrator/path_trace_work_gpu.h"
 #include "render/buffers.h"
 #include "render/film.h"
-#include "render/gpu_display.h"
 #include "render/scene.h"
 
 #include "kernel/kernel_types.h"
@@ -185,12 +185,12 @@ PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMod
   return pass_access_info;
 }
 
-PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
-    const GPUDisplay *gpu_display) const
+PassAccessor::Destination PathTraceWork::get_display_destination_template(
+    const PathTraceDisplay *display) const
 {
   PassAccessor::Destination destination(film_->get_display_pass());
 
-  const int2 display_texture_size = gpu_display->get_texture_size();
+  const int2 display_texture_size = display->get_texture_size();
   const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
   const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
 
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index 8c9c8811199..404165b7c55 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -28,7 +28,7 @@ class BufferParams;
 class Device;
 class DeviceScene;
 class Film;
-class GPUDisplay;
+class PathTraceDisplay;
 class RenderBuffers;
 
 class PathTraceWork {
@@ -83,11 +83,9 @@ class PathTraceWork {
    * noisy pass mode will be passed here when it is known that the buffer does not have denoised
    * passes yet (because denoiser did not run). If the denoised pass is requested and denoiser is
    * not used then this function will fall-back to the noisy pass instead. */
-  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
-                                   PassMode pass_mode,
-                                   int num_samples) = 0;
+  virtual void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, int num_samples) = 0;
 
-  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+  virtual void destroy_gpu_resources(PathTraceDisplay *display) = 0;
 
   /* Copy data from/to given render buffers.
    * Will copy pixels from a corresponding place (from multi-device point of view) of the render
@@ -104,7 +102,7 @@ class PathTraceWork {
    * - Copies work's render buffer to its device. */
   void copy_from_render_buffers(const RenderBuffers *render_buffers);
 
-  /* Special version of the `copy_from_render_buffers()` which only copies denosied passes from the
+  /* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the
    * given render buffers, leaving rest of the passes.
    *
    * Same notes about device copying applies to this call as well. */
@@ -162,8 +160,8 @@ class PathTraceWork {
 
   /* Get destination which offset and stride are configured so that writing to it will write to a
    * proper location of GPU display texture, taking current tile and device slice into account. */
-  PassAccessor::Destination get_gpu_display_destination_template(
-      const GPUDisplay *gpu_display) const;
+  PassAccessor::Destination get_display_destination_template(
+      const PathTraceDisplay *display) const;
 
   /* Device which will be used for path tracing.
    * Note that it is an actual render device (and never is a multi-device). */
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index b9a33b64051..18a5365453d 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -19,10 +19,12 @@
 #include "device/cpu/kernel.h"
 #include "device/device.h"
 
+#include "kernel/kernel_path_state.h"
+
 #include "integrator/pass_accessor_cpu.h"
+#include "integrator/path_trace_display.h"
 
 #include "render/buffers.h"
-#include "render/gpu_display.h"
 #include "render/scene.h"
 
 #include "util/util_atomic.h"
@@ -116,13 +118,17 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
                                                     const KernelWorkTile &work_tile,
                                                     const int samples_num)
 {
-  const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
   const bool has_bake = device_scene_->data.bake.use;
 
-  IntegratorStateCPU integrator_states[2] = {};
+  IntegratorStateCPU integrator_states[2];
 
   IntegratorStateCPU *state = &integrator_states[0];
-  IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+  IntegratorStateCPU *shadow_catcher_state = nullptr;
+
+  if (device_scene_->data.integrator.has_shadow_catcher) {
+    shadow_catcher_state = &integrator_states[1];
+    path_state_init_queues(kernel_globals, shadow_catcher_state);
+  }
 
   KernelWorkTile sample_work_tile = work_tile;
   float *render_buffer = buffers_->buffer.data();
@@ -147,7 +153,7 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
 
     kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
 
-    if (has_shadow_catcher) {
+    if (shadow_catcher_state) {
       kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
     }
 
@@ -155,14 +161,14 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
   }
 }
 
-void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
-                                           PassMode pass_mode,
-                                           int num_samples)
+void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display,
+                                       PassMode pass_mode,
+                                       int num_samples)
 {
-  half4 *rgba_half = gpu_display->map_texture_buffer();
+  half4 *rgba_half = display->map_texture_buffer();
   if (!rgba_half) {
-    /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
-     * some implementations of GPUDisplay which can not map memory? */
+    /* TODO(sergey): Look into using copy_pixels_to_texture() if mapping failed. Might be needed
+     * for some implementations of PathTraceDisplay which cannot map memory? */
     return;
   }
 
@@ -172,7 +178,7 @@ void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
 
   const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
 
-  PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+  PassAccessor::Destination destination = get_display_destination_template(display);
   destination.pixels_half_rgba = rgba_half;
 
   tbb::task_arena local_arena = local_tbb_arena_create(device_);
@@ -180,10 +186,10 @@ void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
     pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
   });
 
-  gpu_display->unmap_texture_buffer();
+  display->unmap_texture_buffer();
 }
 
-void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+void PathTraceWorkCPU::destroy_gpu_resources(PathTraceDisplay * /*display*/)
 {
 }
 
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index ab729bbf879..d011e8d05bd 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -50,10 +50,10 @@ class PathTraceWorkCPU : public PathTraceWork {
                               int start_sample,
                               int samples_num) override;
 
-  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
-                                   PassMode pass_mode,
-                                   int num_samples) override;
-  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+  virtual void copy_to_display(PathTraceDisplay *display,
+                               PassMode pass_mode,
+                               int num_samples) override;
+  virtual void destroy_gpu_resources(PathTraceDisplay *display) override;
 
   virtual bool copy_render_buffers_from_device() override;
   virtual bool copy_render_buffers_to_device() override;
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 135466becc6..17c49f244d2 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -15,12 +15,12 @@
  */
 
 #include "integrator/path_trace_work_gpu.h"
+#include "integrator/path_trace_display.h"
 
 #include "device/device.h"
 
 #include "integrator/pass_accessor_gpu.h"
 #include "render/buffers.h"
-#include "render/gpu_display.h"
 #include "render/scene.h"
 #include "util/util_logging.h"
 #include "util/util_tbb.h"
@@ -46,7 +46,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       queued_paths_(device, "queued_paths", MEM_READ_WRITE),
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
-      gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+      display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
       max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
       min_num_active_paths_(queue_->num_concurrent_busy_states()),
       max_active_path_index_(0)
@@ -95,8 +95,8 @@ void PathTraceWorkGPU::alloc_integrator_soa()
 #define KERNEL_STRUCT_END(name) \
   break; \
   }
-#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
-  if (array_index == array_size - 1) { \
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
+  if (array_index == gpu_array_size - 1) { \
     break; \
   } \
   }
@@ -652,7 +652,7 @@ int PathTraceWorkGPU::get_num_active_paths()
 bool PathTraceWorkGPU::should_use_graphics_interop()
 {
   /* There are few aspects with the graphics interop when using multiple devices caused by the fact
-   * that the GPUDisplay has a single texture:
+   * that the PathTraceDisplay has a single texture:
    *
    *   CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
    *   attempting to register OpenGL PBO which has been mapped. Which makes sense, because
@@ -678,9 +678,9 @@ bool PathTraceWorkGPU::should_use_graphics_interop()
   return interop_use_;
 }
 
-void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
-                                           PassMode pass_mode,
-                                           int num_samples)
+void PathTraceWorkGPU::copy_to_display(PathTraceDisplay *display,
+                                       PassMode pass_mode,
+                                       int num_samples)
 {
   if (device_->have_error()) {
     /* Don't attempt to update GPU display if the device has errors: the error state will make
@@ -694,7 +694,7 @@ void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
   }
 
   if (should_use_graphics_interop()) {
-    if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+    if (copy_to_display_interop(display, pass_mode, num_samples)) {
       return;
     }
 
@@ -703,12 +703,12 @@ void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
     interop_use_ = false;
   }
 
-  copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+  copy_to_display_naive(display, pass_mode, num_samples);
 }
 
-void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
-                                                 PassMode pass_mode,
-                                                 int num_samples)
+void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
+                                             PassMode pass_mode,
+                                             int num_samples)
 {
   const int full_x = effective_buffer_params_.full_x;
   const int full_y = effective_buffer_params_.full_y;
@@ -725,43 +725,42 @@ void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
    * NOTE: allocation happens to the final resolution so that no re-allocation happens on every
    * change of the resolution divider. However, if the display becomes smaller, shrink the
    * allocated memory as well. */
-  if (gpu_display_rgba_half_.data_width != final_width ||
-      gpu_display_rgba_half_.data_height != final_height) {
-    gpu_display_rgba_half_.alloc(final_width, final_height);
+  if (display_rgba_half_.data_width != final_width ||
+      display_rgba_half_.data_height != final_height) {
+    display_rgba_half_.alloc(final_width, final_height);
     /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
      * transferring zeroes to the device. */
-    queue_->zero_to_device(gpu_display_rgba_half_);
+    queue_->zero_to_device(display_rgba_half_);
   }
 
   PassAccessor::Destination destination(film_->get_display_pass());
-  destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+  destination.d_pixels_half_rgba = display_rgba_half_.device_pointer;
 
   get_render_tile_film_pixels(destination, pass_mode, num_samples);
 
-  gpu_display_rgba_half_.copy_from_device();
+  queue_->copy_from_device(display_rgba_half_);
+  queue_->synchronize();
 
-  gpu_display->copy_pixels_to_texture(
-      gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+  display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
 }
 
-bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
-                                                   PassMode pass_mode,
-                                                   int num_samples)
+bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,
+                                               PassMode pass_mode,
+                                               int num_samples)
 {
   if (!device_graphics_interop_) {
     device_graphics_interop_ = queue_->graphics_interop_create();
   }
 
-  const DeviceGraphicsInteropDestination graphics_interop_dst =
-      gpu_display->graphics_interop_get();
-  device_graphics_interop_->set_destination(graphics_interop_dst);
+  const DisplayDriver::GraphicsInterop graphics_interop_dst = display->graphics_interop_get();
+  device_graphics_interop_->set_display_interop(graphics_interop_dst);
 
   const device_ptr d_rgba_half = device_graphics_interop_->map();
   if (!d_rgba_half) {
     return false;
   }
 
-  PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+  PassAccessor::Destination destination = get_display_destination_template(display);
   destination.d_pixels_half_rgba = d_rgba_half;
 
   get_render_tile_film_pixels(destination, pass_mode, num_samples);
@@ -771,14 +770,14 @@ bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
   return true;
 }
 
-void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+void PathTraceWorkGPU::destroy_gpu_resources(PathTraceDisplay *display)
 {
   if (!device_graphics_interop_) {
     return;
   }
-  gpu_display->graphics_interop_activate();
+  display->graphics_interop_activate();
   device_graphics_interop_ = nullptr;
-  gpu_display->graphics_interop_deactivate();
+  display->graphics_interop_deactivate();
 }
 
 void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 38788122b0d..9212537d2fd 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -48,10 +48,10 @@ class PathTraceWorkGPU : public PathTraceWork {
                               int start_sample,
                               int samples_num) override;
 
-  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
-                                   PassMode pass_mode,
-                                   int num_samples) override;
-  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+  virtual void copy_to_display(PathTraceDisplay *display,
+                               PassMode pass_mode,
+                               int num_samples) override;
+  virtual void destroy_gpu_resources(PathTraceDisplay *display) override;
 
   virtual bool copy_render_buffers_from_device() override;
   virtual bool copy_render_buffers_to_device() override;
@@ -88,16 +88,16 @@ class PathTraceWorkGPU : public PathTraceWork {
 
   int get_num_active_paths();
 
-  /* Check whether graphics interop can be used for the GPUDisplay update. */
+  /* Check whether graphics interop can be used for the PathTraceDisplay update. */
   bool should_use_graphics_interop();
 
-  /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the
-   * device, then copies pixels to the host and pushes them to the `gpu_display`. */
-  void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+  /* Naive implementation of the `copy_to_display()` which performs film conversion on the
+   * device, then copies pixels to the host and pushes them to the `display`. */
+  void copy_to_display_naive(PathTraceDisplay *display, PassMode pass_mode, int num_samples);
 
-  /* Implementation of `copy_to_gpu_display()` which uses driver's OpenGL/GPU interoperability
+  /* Implementation of `copy_to_display()` which uses driver's OpenGL/GPU interoperability
    * functionality, avoiding copy of pixels to the host. */
-  bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+  bool copy_to_display_interop(PathTraceDisplay *display, PassMode pass_mode, int num_samples);
 
   /* Synchronously run film conversion kernel and store display result in the given destination. */
   void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
@@ -139,9 +139,9 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Temporary buffer for passing work tiles to kernel. */
   device_vector<KernelWorkTile> work_tiles_;
 
-  /* Temporary buffer used by the copy_to_gpu_display() whenever graphics interoperability is not
+  /* Temporary buffer used by the copy_to_display() whenever graphics interoperability is not
    * available. Is allocated on-demand. */
-  device_vector<half4> gpu_display_rgba_half_;
+  device_vector<half4> display_rgba_half_;
 
   unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
 
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 3e5b3417a6a..322d3d5f94c 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -384,7 +384,7 @@ bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
   }
 
   if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
-    render_work->tile.denoise = true;
+    render_work->tile.denoise = !tile_manager_.has_multiple_tiles();
     any_scheduled = true;
   }
 
@@ -903,6 +903,12 @@ bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
     return false;
   }
 
+  /* When multiple tiles are used the full frame will be denoised.
+   * Avoid per-tile denoising to save render time. */
+  if (tile_manager_.has_multiple_tiles()) {
+    return false;
+  }
+
   if (done()) {
     /* Always denoise at the last sample. */
     return true;
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
index b7b598fb10c..c4ab15e54ba 100644
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -31,7 +31,7 @@ class RenderWork {
   int resolution_divider = 1;
 
   /* Initialize render buffers.
-   * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the
+   * Includes steps like zeroing the buffer on the device, and optional reading of pixels from the
    * baking target. */
   bool init_render_buffers = false;
 
@@ -344,7 +344,7 @@ class RenderScheduler {
     /* Number of rendered samples on top of the start sample. */
     int num_rendered_samples = 0;
 
-    /* Point in time the latest GPUDisplay work has been scheduled. */
+    /* Point in time the latest PathTraceDisplay work has been scheduled. */
     double last_display_update_time = 0.0;
     /* Value of -1 means display was never updated. */
     int last_display_update_sample = -1;
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index d35ff4cd03f..a14e41ec5be 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -149,14 +149,14 @@ bool ShaderEval::eval_gpu(Device *device,
 
   /* Execute work on GPU in chunk, so we can cancel.
    * TODO : query appropriate size from device.*/
-  const int chunk_size = 65536;
+  const int64_t chunk_size = 65536;
 
-  const int work_size = output.size();
+  const int64_t work_size = output.size();
   void *d_input = (void *)input.device_pointer;
   void *d_output = (void *)output.device_pointer;
 
-  for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
-    int d_work_size = min(chunk_size, work_size - d_offset);
+  for (int64_t d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+    int64_t d_work_size = std::min(chunk_size, work_size - d_offset);
     void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
 
     queue->enqueue(kernel, d_work_size, args);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 4196539a9b1..7b56216e887 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA
   device/cuda/kernel.cu
 )
 
+set(SRC_DEVICE_HIP
+  device/hip/kernel.cpp
+)
+
 set(SRC_DEVICE_OPTIX
   device/optix/kernel.cu
   device/optix/kernel_shader_raytrace.cu
@@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS
   device/cuda/globals.h
 )
 
+set(SRC_DEVICE_HIP_HEADERS
+  device/hip/compat.h
+  device/hip/config.h
+  device/hip/globals.h
+)
+
 set(SRC_DEVICE_OPTIX_HEADERS
   device/optix/compat.h
   device/optix/globals.h
@@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES)
   cycles_set_solution_folder(cycles_kernel_cuda)
 endif()
 
+####################################################### START
+
+# HIP module
+
+if(WITH_CYCLES_HIP_BINARIES)
+  # 64 bit only
+  set(HIP_BITS 64)
+
+  # HIP version. `hipcc --version` reports it as "HIP version: <major>.<minor>.<patch>-<hash>"
+  # (unlike nvcc, which prints "release <major>.<minor>"), so match on that prefix.
+  execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT)
+  string(REGEX REPLACE ".*HIP version: ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}")
+  string(REGEX REPLACE ".*HIP version: ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}")
+  set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}")
+
+  # Informational only, so report it as a status message rather than a warning.
+  message(STATUS "HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected")
+
+  # build for each arch
+  set(hip_sources device/hip/kernel.cpp
+    ${SRC_HEADERS}
+    ${SRC_DEVICE_HIP_HEADERS}
+    ${SRC_BVH_HEADERS}
+    ${SRC_SVM_HEADERS}
+    ${SRC_GEOM_HEADERS}
+    ${SRC_INTEGRATOR_HEADERS}
+    ${SRC_CLOSURE_HEADERS}
+    ${SRC_UTIL_HEADERS}
+  )
+  set(hip_fatbins)
+  # Collect per-arch compile settings. NOTE: only set() calls so far; no build command is added.
+  macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental)
+    if(${arch} MATCHES "compute_.*")
+      set(format "ptx")
+    else()
+      set(format "fatbin")
+    endif()
+    set(hip_file ${name}_${arch}.${format})
+    # Add the previous arch's output to the sources; prev_arch is "none" unless building serially.
+    set(kernel_sources ${sources})
+    if(NOT ${prev_arch} STREQUAL "none")
+      if(${prev_arch} MATCHES "compute_.*")
+        set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
+      else()
+        set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin)
+      endif()
+    endif()
+
+    set(hip_kernel_src "/device/hip/${name}.cpp")
+
+    set(hip_flags ${flags}
+      -D CCL_NAMESPACE_BEGIN=
+      -D CCL_NAMESPACE_END=
+      -D HIPCC
+      -m ${HIP_BITS}
+      -I ${CMAKE_CURRENT_SOURCE_DIR}/..
+      -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip
+      --use_fast_math
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
+
+    if(${experimental})
+      set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
+      set(name ${name}_experimental)
+    endif()
+
+    if(WITH_CYCLES_DEBUG)
+      set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+    endif()
+
+    if(WITH_NANOVDB)
+      set(hip_flags ${hip_flags}
+        -D WITH_NANOVDB
+        -I "${NANOVDB_INCLUDE_DIR}")
+    endif()
+  endmacro()
+
+  set(prev_arch "none")
+  foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
+      set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE})
+      set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR})
+    if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir)
+      # Compile regular kernel
+      CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE)
+
+      if(WITH_CYCLES_HIP_BUILD_SERIAL)
+        set(prev_arch ${arch})
+      endif()
+
+      unset(hip_hipcc_executable)
+      unset(hip_toolkit_root_dir)
+    endif()
+  endforeach()
+
+  add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins})
+  cycles_set_solution_folder(cycles_kernel_hip)
+endif()
+
+####################################################### END
 # OptiX PTX modules
 
 if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
@@ -602,11 +710,13 @@ endif()
 cycles_add_library(cycles_kernel "${LIB}"
   ${SRC_DEVICE_CPU}
   ${SRC_DEVICE_CUDA}
+  ${SRC_DEVICE_HIP}
   ${SRC_DEVICE_OPTIX}
   ${SRC_HEADERS}
   ${SRC_DEVICE_CPU_HEADERS}
   ${SRC_DEVICE_GPU_HEADERS}
   ${SRC_DEVICE_CUDA_HEADERS}
+  ${SRC_DEVICE_HIP_HEADERS}
   ${SRC_DEVICE_OPTIX_HEADERS}
   ${SRC_BVH_HEADERS}
   ${SRC_CLOSURE_HEADERS}
@@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_HEADERS})
 source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS})
 source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
 source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
 source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
@@ -632,14 +743,19 @@ endif()
 if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
   add_dependencies(cycles_kernel cycles_kernel_optix)
 endif()
+if(WITH_CYCLES_HIP_BINARIES)  # The `cycles_kernel_hip` target only exists for HIP binary builds.
+  add_dependencies(cycles_kernel cycles_kernel_hip)
+endif()
 
 # Install kernel source for runtime compilation
 
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 539e9fd05fb..0b44cc5db34 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -167,15 +167,25 @@ ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
   uint p4 = visibility;
   uint p5 = PRIMITIVE_NONE;
 
+  uint ray_mask = visibility & 0xFF;
+  uint ray_flags = OPTIX_RAY_FLAG_NONE;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+    ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    ray_flags = OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
+  }
+
   optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
              ray->P,
              ray->D,
              0.0f,
              ray->t,
              ray->time,
-             0xF,
-             OPTIX_RAY_FLAG_NONE,
-             0,  // SBT offset for PG_HITD
+             ray_mask,
+             ray_flags,
+             0, /* SBT offset for PG_HITD */
              0,
              0,
              p0,
@@ -251,11 +261,11 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
   uint p2 = ((uint64_t)local_isect) & 0xFFFFFFFF;
   uint p3 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF;
   uint p4 = local_object;
-  // Is set to zero on miss or if ray is aborted, so can be used as return value
+  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
   uint p5 = max_hits;
 
   if (local_isect) {
-    local_isect->num_hits = 0;  // Initialize hit count to zero
+    local_isect->num_hits = 0; /* Initialize hit count to zero. */
   }
   optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
              ray->P,
@@ -263,11 +273,10 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
-             // Skip curves
-             0x3,
-             // Need to always call into __anyhit__kernel_optix_local_hit
+             0xFF,
+             /* Need to always call into __anyhit__kernel_optix_local_hit. */
              OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2,  // SBT offset for PG_HITL
+             2, /* SBT offset for PG_HITL */
              0,
              0,
              p0,
@@ -365,17 +374,22 @@ ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
   uint p4 = visibility;
   uint p5 = false;
 
-  *num_hits = 0;  // Initialize hit count to zero
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  *num_hits = 0; /* Initialize hit count to zero. */
   optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
              ray->P,
              ray->D,
              0.0f,
              ray->t,
              ray->time,
-             0xF,
-             // Need to always call into __anyhit__kernel_optix_shadow_all_hit
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
              OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1,  // SBT offset for PG_HITS
+             1, /* SBT offset for PG_HITS */
              0,
              0,
              p0,
@@ -444,16 +458,21 @@ ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
   uint p4 = visibility;
   uint p5 = PRIMITIVE_NONE;
 
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
   optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
              ray->P,
              ray->D,
              0.0f,
              ray->t,
              ray->time,
-             // Skip everything but volumes
-             0x2,
-             OPTIX_RAY_FLAG_NONE,
-             0,  // SBT offset for PG_HITD
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_volume_test. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             3, /* SBT offset for PG_HITV */
              0,
              0,
              p0,
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 85500bf4d07..db4a4bf71e0 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -21,11 +21,15 @@ CCL_NAMESPACE_BEGIN
 /* Given an array of states, build an array of indices for which the states
  * are active.
  *
- * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
+ * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
 
 #include "util/util_atomic.h"
 
-#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
 
 template<uint blocksize, typename IsActiveOp>
 __device__ void gpu_parallel_active_index_array(const uint num_states,
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index f609520b8b4..a1349e82efb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN
 
 #include "util/util_atomic.h"
 
-#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
 
 template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
 {
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
index 65b1990dbb8..b60dceb2ed0 100644
--- a/intern/cycles/kernel/device/gpu/parallel_reduce.h
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
  * the overall cost of the algorithm while keeping the work complexity O(n) and
  * the step complexity O(log n). (Brent's Theorem optimization) */
 
-#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
 
 template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
 __device__ void gpu_parallel_sum(
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 99b35468517..9bca1fad22f 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
 
 #include "util/util_atomic.h"
 
-#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
 
 template<uint blocksize, typename GetKeyOp>
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
new file mode 100644
index 00000000000..95338fe7d6e
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_HIP__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+#  define ATTR_FALLTHROUGH
+#endif
+
+#ifdef __HIPCC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+#  include <stdint.h>
+#endif
+
+#ifdef CYCLES_HIPBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#define ccl_device_inline __device__ __inline__
+#define ccl_device_forceinline __device__ __forceinline__
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_addr_space
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+#define kernel_assert(cond)
+
+/* HIP runtime headers */
+#ifdef __HIP__
+#  include "hip/hip_fp16.h"
+#  include "hip/hip_runtime.h"
+#endif
+
+#ifdef _MSC_VER
+#  include <immintrin.h>
+#endif
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot(predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+typedef hipTextureObject_t ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
new file mode 100644
index 00000000000..2fde0d46015
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from HIP occupancy calculator.
+ *
+ * Terminology
+ * - HIP GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ *   used by each thread limits the number of threads per block.
+ */
+
+/* Launch Bound Definitions */
+#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
+#define GPU_BLOCK_MAX_THREADS 1024
+#define GPU_THREAD_MAX_REGISTERS 255
+
+#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
+#define GPU_KERNEL_MAX_REGISTERS 64
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads, \
+                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
+                                                   (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+    GPU_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h
new file mode 100644
index 00000000000..39978ae7899
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+  /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+  int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp
new file mode 100644
index 00000000000..c801320a2e1
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/kernel.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* HIP kernel entry points */
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+#  include "kernel/device/hip/compat.h"
+#  include "kernel/device/hip/config.h"
+#  include "kernel/device/hip/globals.h"
+
+#  include "kernel/device/gpu/image.h"
+#  include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index c1e36febfc0..7a79e0c4823 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -19,7 +19,7 @@
 #include "kernel/device/optix/compat.h"
 #include "kernel/device/optix/globals.h"
 
-#include "kernel/device/gpu/image.h"  // Texture lookup uses normal CUDA intrinsics
+#include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */
 
 #include "kernel/integrator/integrator_state.h"
 #include "kernel/integrator/integrator_state_flow.h"
@@ -44,18 +44,18 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
 template<bool always = false> ccl_device_forceinline uint get_object_id()
 {
 #ifdef __OBJECT_MOTION__
-  // Always get the the instance ID from the TLAS
-  // There might be a motion transform node between TLAS and BLAS which does not have one
+  /* Always get the instance ID from the TLAS.
+   * There might be a motion transform node between TLAS and BLAS which does not have one. */
   uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
 #else
   uint object = optixGetInstanceId();
 #endif
-  // Choose between always returning object ID or only for instances
+  /* Choose between always returning object ID or only for instances. */
   if (always || (object & 1) == 0)
-    // Can just remove the low bit since instance always contains object ID
+    /* Can just remove the low bit since instance always contains object ID. */
     return object >> 1;
   else
-    // Set to OBJECT_NONE if this is not an instanced object
+    /* Set to OBJECT_NONE if this is not an instanced object. */
     return OBJECT_NONE;
 }
 
@@ -93,23 +93,30 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st
 
 extern "C" __global__ void __miss__kernel_optix_miss()
 {
-  // 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss
+  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
   optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
   optixSetPayload_5(PRIMITIVE_NONE);
 }
 
 extern "C" __global__ void __anyhit__kernel_optix_local_hit()
 {
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
 #ifdef __BVH_LOCAL__
   const uint object = get_object_id<true>();
   if (object != optixGetPayload_4() /* local_object */) {
-    // Only intersect with matching object
+    /* Only intersect with matching object. */
     return optixIgnoreIntersection();
   }
 
   const uint max_hits = optixGetPayload_5();
   if (max_hits == 0) {
-    // Special case for when no hit information is requested, just report that something was hit
+    /* Special case for when no hit information is requested, just report that something was hit */
     optixSetPayload_5(true);
     return optixTerminateRay();
   }
@@ -136,8 +143,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
   }
   else {
     if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      // Record closest intersection only
-      // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
+      /* Record closest intersection only.
+       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+       */
       return optixIgnoreIntersection();
     }
 
@@ -154,14 +162,14 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
   isect->u = 1.0f - barycentrics.y - barycentrics.x;
   isect->v = barycentrics.x;
 
-  // Record geometric normal
+  /* Record geometric normal. */
   const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
   const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0));
   const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1));
   const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2));
   local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
 
-  // Continue tracing (without this the trace call would return after the first hit)
+  /* Continue tracing (without this the trace call would return after the first hit). */
   optixIgnoreIntersection();
 #endif
 }
@@ -190,7 +198,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
     u = __uint_as_float(optixGetAttribute_0());
     v = __uint_as_float(optixGetAttribute_1());
 
-    // Filter out curve endcaps
+    /* Filter out curve endcaps. */
     if (u == 0.0f || u == 1.0f) {
       ignore_intersection = true;
     }
@@ -241,10 +249,10 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
     isect->type = kernel_tex_fetch(__prim_type, prim);
 
 #  ifdef __TRANSPARENT_SHADOWS__
-    // Detect if this surface has a shader with transparent shadows
+    /* Detect if this surface has a shader with transparent shadows. */
     if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
 #  endif
-      // If no transparent shadows, all light is blocked and we can stop immediately
+      /* If no transparent shadows, all light is blocked and we can stop immediately. */
       optixSetPayload_5(true);
       return optixTerminateRay();
 #  ifdef __TRANSPARENT_SHADOWS__
@@ -252,24 +260,39 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 #  endif
   }
 
-  // Continue tracing
+  /* Continue tracing. */
   optixIgnoreIntersection();
 #endif
 }
 
-extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
 {
-  uint visibility = optixGetPayload_4();
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
 #ifdef __VISIBILITY_FLAG__
   const uint prim = optixGetPrimitiveIndex();
+  const uint visibility = optixGetPayload_4();
   if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
     return optixIgnoreIntersection();
   }
 #endif
 
+  const uint object = get_object_id<true>();
+  if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+    return optixIgnoreIntersection();
+  }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
 #ifdef __HAIR__
   if (!optixIsTriangleHit()) {
-    // Filter out curve endcaps
+    /* Filter out curve endcaps. */
     const float u = __uint_as_float(optixGetAttribute_0());
     if (u == 0.0f || u == 1.0f) {
       return optixIgnoreIntersection();
@@ -277,18 +300,26 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
   }
 #endif
 
-  // Shadow ray early termination
+#ifdef __VISIBILITY_FLAG__
+  const uint prim = optixGetPrimitiveIndex();
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+
+  /* Shadow ray early termination. */
   if (visibility & PATH_RAY_SHADOW_OPAQUE) {
     return optixTerminateRay();
   }
+#endif
 }
 
 extern "C" __global__ void __closesthit__kernel_optix_hit()
 {
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));  // Intersection distance
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
   optixSetPayload_3(optixGetPrimitiveIndex());
   optixSetPayload_4(get_object_id());
-  // Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index
+  /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. */
   optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex()));
 
   if (optixIsTriangleHit()) {
@@ -297,7 +328,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
     optixSetPayload_2(__float_as_uint(barycentrics.x));
   }
   else {
-    optixSetPayload_1(optixGetAttribute_0());  // Same as 'optixGetCurveParameter()'
+    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
     optixSetPayload_2(optixGetAttribute_1());
   }
 }
@@ -311,7 +342,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
   float3 P = optixGetObjectRayOrigin();
   float3 dir = optixGetObjectRayDirection();
 
-  // The direction is not normalized by default, but the curve intersection routine expects that
+  /* The direction is not normalized by default, but the curve intersection routine expects that */
   float len;
   dir = normalize_len(dir, &len);
 
@@ -323,15 +354,15 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
 
   Intersection isect;
   isect.t = optixGetRayTmax();
-  // Transform maximum distance into object space
+  /* Transform maximum distance into object space. */
   if (isect.t != FLT_MAX)
     isect.t *= len;
 
   if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
     optixReportIntersection(isect.t / len,
                             type & PRIMITIVE_ALL,
-                            __float_as_int(isect.u),   // Attribute_0
-                            __float_as_int(isect.v));  // Attribute_1
+                            __float_as_int(isect.u),  /* Attribute_0 */
+                            __float_as_int(isect.v)); /* Attribute_1 */
   }
 }
 
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index 213f3e62ee0..a068e93790a 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -713,7 +713,7 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
 
     P = transform_point(&tfm, P);
     D = transform_direction(&tfm, D * t);
-    D = normalize_len(D, &t);
+    D = safe_normalize_len(D, &t);
   }
 
   int prim = kernel_tex_fetch(__prim_index, isect_prim);
@@ -764,8 +764,10 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
     /* Thick curves, compute normal using direction from inside the curve.
      * This could be optimized by recording the normal in the intersection,
      * however for Optix this would go beyond the size of the payload. */
+    /* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small
+     * radius). In this case use the view direction to approximate the normal. */
     const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
-    const float3 Ng = normalize(P - P_inside);
+    const float3 Ng = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
 
     sd->N = Ng;
     sd->Ng = Ng;
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index eb4a39e062b..239bd0a37b2 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -41,7 +41,18 @@ ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
   uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 
   while (attr_map.x != id) {
-    attr_offset += ATTR_PRIM_TYPES;
+    if (UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
+      if (UNLIKELY(attr_map.y == 0)) {
+        return (int)ATTR_STD_NOT_FOUND;
+      }
+      else {
+        /* Chain jump to a different part of the table. */
+        attr_offset = attr_map.z;
+      }
+    }
+    else {
+      attr_offset += ATTR_PRIM_TYPES;
+    }
     attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
   }
 
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index 73b7cad32be..a24473addcc 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -365,19 +365,16 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
 #ifdef __VOLUME__
   if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
 #endif
+    const int path_flag = INTEGRATOR_STATE(path, flag);
 
-    {
-      const int path_flag = INTEGRATOR_STATE(path, flag);
 #ifdef __SUBSURFACE__
-      /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
-      if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
+    /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
+    if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
 #endif
-      {
-        /* Evaluate shader. */
-        PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
-        shader_eval_surface<node_feature_mask>(
-            INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
-      }
+    {
+      /* Evaluate shader. */
+      PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
+      shader_eval_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
     }
 
 #ifdef __SUBSURFACE__
@@ -417,17 +414,20 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
 
     /* Perform path termination. Most paths have already been terminated in
      * the intersect_closest kernel, this is just for emission and for dividing
-     * throughput by the probability at the right moment. */
-    const int path_flag = INTEGRATOR_STATE(path, flag);
-    const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
-                                  0.0f :
-                                  path_state_continuation_probability(INTEGRATOR_STATE_PASS,
-                                                                      path_flag);
-    if (probability == 0.0f) {
-      return false;
-    }
-    else if (probability != 1.0f) {
-      INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+     * throughput by the probability at the right moment.
+     *
+     * Also ensure we don't do it twice for SSS at both the entry and exit point. */
+    if (!(path_flag & PATH_RAY_SUBSURFACE)) {
+      const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+                                    0.0f :
+                                    path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+                                                                        path_flag);
+      if (probability == 0.0f) {
+        return false;
+      }
+      else if (probability != 1.0f) {
+        INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+      }
     }
 
 #ifdef __DENOISING_FEATURES__
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 095a28ac505..dac3efb3996 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -74,7 +74,7 @@ ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
                                                    ShaderData *ccl_restrict sd,
                                                    float3 *ccl_restrict extinction)
 {
-  shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
+  shader_eval_volume<true>(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
     return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
   });
 
@@ -93,7 +93,7 @@ ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
                                             VolumeShaderCoefficients *coeff)
 {
   const int path_flag = INTEGRATOR_STATE(path, flag);
-  shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
+  shader_eval_volume<false>(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
     return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
   });
 
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 094446be02c..f745ad3f4b9 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -60,7 +60,15 @@ CCL_NAMESPACE_BEGIN
  * TODO: these could be made dynamic depending on the features used in the scene. */
 
 #define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
-#define INTEGRATOR_SHADOW_ISECT_SIZE 4
+
+#define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024
+#define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4
+
+#ifdef __KERNEL_CPU__
+#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
+#else
+#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_GPU
+#endif
 
 /* Data structures */
 
@@ -74,9 +82,9 @@ typedef struct IntegratorStateCPU {
 #define KERNEL_STRUCT_END(name) \
   } \
   name;
-#define KERNEL_STRUCT_END_ARRAY(name, size) \
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
   } \
-  name[size];
+  name[cpu_size];
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
@@ -103,9 +111,9 @@ typedef struct IntegratorStateGPU {
 #define KERNEL_STRUCT_END(name) \
   } \
   name;
-#define KERNEL_STRUCT_END_ARRAY(name, size) \
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
   } \
-  name[size];
+  name[gpu_size];
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
index 41dd1bfcdbf..0d8126c64aa 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -107,7 +107,7 @@ KERNEL_STRUCT_END(subsurface)
 KERNEL_STRUCT_BEGIN(volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE)
 
 /********************************* Shadow Path State **************************/
 
@@ -153,11 +153,15 @@ KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACIN
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
 /* TODO: exclude for GPU. */
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
+KERNEL_STRUCT_END_ARRAY(shadow_isect,
+                        INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
+                        INTEGRATOR_SHADOW_ISECT_SIZE_GPU)
 
 /**************************** Shadow Volume Stack *****************************/
 
 KERNEL_STRUCT_BEGIN(shadow_volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
+                        INTEGRATOR_VOLUME_STACK_SIZE,
+                        INTEGRATOR_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index cdf412fe22f..08d6cb00114 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -217,10 +217,10 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
     while (false) \
       ;
 
-#  define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+#  define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
     ++index; \
     } \
-    while (index < array_size) \
+    while (index < gpu_array_size) \
       ;
 
 #  include "kernel/integrator/integrator_state_template.h"
@@ -264,7 +264,12 @@ ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_AR
 
   IntegratorStateCPU *ccl_restrict split_state = state + 1;
 
-  *split_state = *state;
+  /* Only copy the required subset, since shadow intersections are big and irrelevant here. */
+  split_state->path = state->path;
+  split_state->ray = state->ray;
+  split_state->isect = state->isect;
+  memcpy(split_state->volume_stack, state->volume_stack, sizeof(state->volume_stack));
+  split_state->shadow_path = state->shadow_path;
 
   split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
 #endif
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 9e12d24dcf4..f4d00e4c20c 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -386,7 +386,7 @@ ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
 {
   /* The throughput for shadow paths already contains the light shader evaluation. */
   float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
-  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
+  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce));
 
   ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
                                                               render_buffer);
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index e025bcd6674..abb1ba455e6 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -42,6 +42,16 @@ ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,
 
   object_inverse_dir_transform(kg, &sd, &D);
 
+#ifdef __KERNEL_DEBUG_NAN__
+  if (!isfinite3_safe(D)) {
+    kernel_assert(!"Cycles displacement with non-finite value detected");
+  }
+#endif
+
+  /* Ensure finite displacement, preventing BVH from becoming degenerate and avoiding possible
+   * traversal issues caused by non-finite math. */
+  D = ensure_finite3(D);
+
   /* Write output. */
   output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
 }
@@ -66,7 +76,16 @@ ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
   const int path_flag = PATH_RAY_EMISSION;
   shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
       INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
-  const float3 color = shader_background_eval(&sd);
+  float3 color = shader_background_eval(&sd);
+
+#ifdef __KERNEL_DEBUG_NAN__
+  if (!isfinite3_safe(color)) {
+    kernel_assert(!"Cycles background with non-finite value detected");
+  }
+#endif
+
+  /* Ensure finite color, avoiding possible numerical instabilities in the path tracing kernels. */
+  color = ensure_finite3(color);
 
   /* Write output. */
   output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index 715d764fb31..e8f4a21878e 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -394,7 +394,7 @@ film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_conver
 
   /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
    * shadow catcher objects in the scene. In this case there will be no auxiliary passes required
-   * for the devision (to save up memory). So delay the asserts to this point so that the number of
+   * for the division (to save up memory). So delay the asserts to this point so that the number of
    * samples check handles such configuration. */
   kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
   kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 354e8115538..1beaf3cc2b2 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -74,10 +74,6 @@ ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
 
 ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
 {
-  /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
-   *  the x part is used as the sample (TODO(@leesonw): Add using both x and y parts
-   * independently). */
-
   /* Perform Owen shuffle of the sample number to reorder the samples. */
 #ifdef _SIMPLE_HASH_
   const uint rv = cmj_hash_simple(dimension, rng_hash);
@@ -95,7 +91,10 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha
   const uint sample_set = s / NUM_PMJ_SAMPLES;
   const uint d = (dimension + sample_set);
   const uint dim = d % NUM_PMJ_PATTERNS;
-  int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
+
+  /* Each PMJ pattern stores NUM_PMJ_SAMPLES interleaved (x, y) pairs. For 1D sampling the x part
+   * is used for even dimensions and the y part for odd ones. */
+  int index = 2 * ((dim >> 1) * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)) + (dim & 1);
 
   float fx = kernel_tex_fetch(__sample_pattern_lut, index);
 
@@ -104,12 +103,11 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha
 #  ifdef _SIMPLE_HASH_
   float dx = cmj_randfloat_simple(d, rng_hash);
 #  else
-  /* Only jitter within the grid interval. */
   float dx = cmj_randfloat(d, rng_hash);
 #  endif
-  fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
+  /* Jitter sample locations and map back into [0 1]. */
+  fx = fx + dx;
   fx = fx - floorf(fx);
-
 #else
 #  warning "Not using Cranley-Patterson Rotation."
 #endif
@@ -136,7 +134,7 @@ ccl_device void pmj_sample_2D(
   /* Based on the sample number a sample pattern is selected and offset by the dimension. */
   const uint sample_set = s / NUM_PMJ_SAMPLES;
   const uint d = (dimension + sample_set);
-  const uint dim = d % NUM_PMJ_PATTERNS;
+  uint dim = d % NUM_PMJ_PATTERNS;
   int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
 
   float fx = kernel_tex_fetch(__sample_pattern_lut, index);
@@ -151,17 +149,19 @@ ccl_device void pmj_sample_2D(
   float dx = cmj_randfloat(d, rng_hash);
   float dy = cmj_randfloat(d + 1, rng_hash);
 #  endif
-  /* Only jitter within the grid cells. */
-  fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
-  fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
+  /* Jitter sample locations and wrap them back into the unit square. The result is kept in
+   * fx/fy rather than in new locals, so that the output assignment after the #endif below still
+   * compiles when Cranley-Patterson rotation is disabled. */
+  fx = fx + dx;
+  fy = fy + dy;
   fx = fx - floorf(fx);
   fy = fy - floorf(fy);
 #else
 #  warning "Not using Cranley Patterson Rotation."
 #endif
 
   (*x) = fx;
   (*y) = fy;
 }
 
 CCL_NAMESPACE_END
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 3052bb53040..e7133724c85 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -186,8 +186,8 @@ ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg,
                                                 float sum_sample_weight,
                                                 const uint light_shader_flags)
 {
-  /* this is the veach one-sample model with balance heuristic, some pdf
-   * factors drop out when using balance heuristic weighting */
+  /* This is the Veach one-sample model with balance heuristic,
+   * some PDF factors drop out when using balance heuristic weighting. */
   for (int i = 0; i < sd->num_closure; i++) {
     const ShaderClosure *sc = &sd->closure[i];
 
@@ -750,7 +750,7 @@ ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,
 
 /* Volume Evaluation */
 
-template<typename StackReadOp>
+template<const bool shadow, typename StackReadOp>
 ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
                                           ShaderData *ccl_restrict sd,
                                           const int path_flag,
@@ -815,8 +815,11 @@ ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
 #  endif
 
     /* Merge closures to avoid exceeding number of closures limit. */
-    if (i > 0)
-      shader_merge_volume_closures(sd);
+    if (!shadow) {
+      if (i > 0) {
+        shader_merge_volume_closures(sd);
+      }
+    }
   }
 }
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 66b7310ab65..3cc42bf7a85 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -572,6 +572,7 @@ typedef enum AttributeStandard {
   ATTR_STD_MOTION_VERTEX_NORMAL,
   ATTR_STD_PARTICLE,
   ATTR_STD_CURVE_INTERCEPT,
+  ATTR_STD_CURVE_LENGTH,
   ATTR_STD_CURVE_RANDOM,
   ATTR_STD_PTEX_FACE_ID,
   ATTR_STD_PTEX_UV,
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 396f42080e4..4fc46a255a8 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -107,6 +107,7 @@ ustring OSLRenderServices::u_geom_undisplaced("geom:undisplaced");
 ustring OSLRenderServices::u_is_smooth("geom:is_smooth");
 ustring OSLRenderServices::u_is_curve("geom:is_curve");
 ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness");
+ustring OSLRenderServices::u_curve_length("geom:curve_length");
 ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 ustring OSLRenderServices::u_curve_random("geom:curve_random");
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 58accb46e7d..2a5400282b3 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -294,6 +294,7 @@ class OSLRenderServices : public OSL::RendererServices {
   static ustring u_is_smooth;
   static ustring u_is_curve;
   static ustring u_curve_thickness;
+  static ustring u_curve_length;
   static ustring u_curve_tangent_normal;
   static ustring u_curve_random;
   static ustring u_path_ray_length;
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index 02be7813369..6b62e7bb52f 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -41,6 +41,7 @@ set(SRC_OSL
   node_vector_displacement.osl
   node_emission.osl
   node_environment_texture.osl
+  node_float_curve.osl
   node_fresnel.osl
   node_gamma.osl
   node_geometry.osl
diff --git a/intern/cycles/kernel/shaders/node_float_curve.osl b/intern/cycles/kernel/shaders/node_float_curve.osl
new file mode 100644
index 00000000000..f1f05fd88a9
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_float_curve.osl
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node_ramp_util.h"
+#include "stdcycles.h"
+
+shader node_float_curve(float ramp[] = {0.0},
+                        float min_x = 0.0,
+                        float max_x = 1.0,
+                        float ValueIn = 0.0,
+                        float Factor = 0.0,
+                        output float ValueOut = 0.0)
+{
+  float c = (ValueIn - min_x) / (max_x - min_x);
+
+  ValueOut = rgb_ramp_lookup(ramp, c, 1, 1);
+
+  ValueOut = mix(ValueIn, ValueOut, Factor);
+}
diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl
index ee08ea57e68..ddc2e28b83a 100644
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -18,12 +18,14 @@
 
 shader node_hair_info(output float IsStrand = 0.0,
                       output float Intercept = 0.0,
+                      output float Length = 0.0,
                       output float Thickness = 0.0,
                       output normal TangentNormal = N,
                       output float Random = 0)
 {
   getattribute("geom:is_curve", IsStrand);
   getattribute("geom:curve_intercept", Intercept);
+  getattribute("geom:curve_length", Length);
   getattribute("geom:curve_thickness", Thickness);
   getattribute("geom:curve_tangent_normal", TangentNormal);
   getattribute("geom:curve_random", Random);
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 4aee1ef11b3..ad609b15f86 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -493,11 +493,13 @@ ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS,
       case NODE_IES:
         svm_node_ies(kg, sd, stack, node);
         break;
-
       case NODE_RGB_CURVES:
       case NODE_VECTOR_CURVES:
         offset = svm_node_curves(kg, sd, stack, node, offset);
         break;
+      case NODE_FLOAT_CURVE:
+        offset = svm_node_curve(kg, sd, stack, node, offset);
+        break;
       case NODE_TANGENT:
         svm_node_tangent(kg, sd, stack, node);
         break;
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 10e9f291d0e..432529eb061 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -213,6 +213,8 @@ ccl_device_noinline void svm_node_hair_info(
     }
     case NODE_INFO_CURVE_INTERCEPT:
       break; /* handled as attribute */
+    case NODE_INFO_CURVE_LENGTH:
+      break; /* handled as attribute */
     case NODE_INFO_CURVE_RANDOM:
       break; /* handled as attribute */
     case NODE_INFO_CURVE_THICKNESS: {
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index e92df3c093c..563e5bcb5e4 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -21,6 +21,48 @@ CCL_NAMESPACE_BEGIN
 
 /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
 
+ccl_device_inline float fetch_float(const KernelGlobals *kg, int offset)
+{
+  uint4 node = kernel_tex_fetch(__svm_nodes, offset);
+  return __uint_as_float(node.x);
+}
+
+ccl_device_inline float float_ramp_lookup(const KernelGlobals *kg,
+                                          int offset,
+                                          float f,
+                                          bool interpolate,
+                                          bool extrapolate,
+                                          int table_size)
+{
+  if ((f < 0.0f || f > 1.0f) && extrapolate) {
+    float t0, dy;
+    if (f < 0.0f) {
+      t0 = fetch_float(kg, offset);
+      dy = t0 - fetch_float(kg, offset + 1);
+      f = -f;
+    }
+    else {
+      t0 = fetch_float(kg, offset + table_size - 1);
+      dy = t0 - fetch_float(kg, offset + table_size - 2);
+      f = f - 1.0f;
+    }
+    return t0 + dy * f * (table_size - 1);
+  }
+
+  f = saturate(f) * (table_size - 1);
+
+  /* clamp int as well in case of NaN */
+  int i = clamp(float_to_int(f), 0, table_size - 1);
+  float t = f - (float)i;
+
+  float a = fetch_float(kg, offset + i);
+
+  if (interpolate && t > 0.0f)
+    a = (1.0f - t) * a + t * fetch_float(kg, offset + i + 1);
+
+  return a;
+}
+
 ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg,
                                          int offset,
                                          float f,
@@ -105,6 +147,30 @@ ccl_device_noinline int svm_node_curves(
   return offset;
 }
 
+ccl_device_noinline int svm_node_curve(
+    const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
+{
+  uint fac_offset, value_in_offset, out_offset;
+  svm_unpack_node_uchar3(node.y, &fac_offset, &value_in_offset, &out_offset);
+
+  uint table_size = read_node(kg, &offset).x;
+
+  float fac = stack_load_float(stack, fac_offset);
+  float in = stack_load_float(stack, value_in_offset);
+
+  const float min = __int_as_float(node.z), max = __int_as_float(node.w);
+  const float range = max - min;
+  const float relpos = (in - min) / range;
+
+  float v = float_ramp_lookup(kg, offset, relpos, true, true, table_size);
+
+  in = (1.0f - fac) * in + fac * v;
+  stack_store_float(stack, out_offset, in);
+
+  offset += table_size;
+  return offset;
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __SVM_RAMP_H__ */
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index c053be96c51..59a0e33acbc 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -122,6 +122,7 @@ typedef enum ShaderNodeType {
   NODE_AOV_START,
   NODE_AOV_COLOR,
   NODE_AOV_VALUE,
+  NODE_FLOAT_CURVE,
   /* NOTE: for best OpenCL performance, item definition in the enum must
    * match the switch case order in svm.h. */
 } ShaderNodeType;
@@ -173,6 +174,7 @@ typedef enum NodeParticleInfo {
 typedef enum NodeHairInfo {
   NODE_INFO_CURVE_IS_STRAND,
   NODE_INFO_CURVE_INTERCEPT,
+  NODE_INFO_CURVE_LENGTH,
   NODE_INFO_CURVE_THICKNESS,
   /* Fade for minimum hair width transiency. */
   // NODE_INFO_CURVE_FADE,
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 6edb5261b32..323222b8c85 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -35,7 +35,6 @@ set(SRC
   denoising.cpp
   film.cpp
   geometry.cpp
-  gpu_display.cpp
   graph.cpp
   hair.cpp
   image.cpp
@@ -78,9 +77,10 @@ set(SRC_HEADERS
   colorspace.h
   constant_fold.h
   denoising.h
+  display_driver.h
+  output_driver.h
   film.h
   geometry.h
-  gpu_display.h
   graph.h
   hair.h
   image.h
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index ea5a5f50f2d..aaf21ad9fd2 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -342,6 +342,8 @@ const char *Attribute::standard_name(AttributeStandard std)
       return "particle";
     case ATTR_STD_CURVE_INTERCEPT:
       return "curve_intercept";
+    case ATTR_STD_CURVE_LENGTH:
+      return "curve_length";
     case ATTR_STD_CURVE_RANDOM:
       return "curve_random";
     case ATTR_STD_PTEX_FACE_ID:
@@ -586,6 +588,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
       case ATTR_STD_CURVE_INTERCEPT:
         attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
         break;
+      case ATTR_STD_CURVE_LENGTH:
+        attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
+        break;
       case ATTR_STD_CURVE_RANDOM:
         attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
         break;
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 1882510cd70..3682b55049a 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -22,7 +22,6 @@
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
 #include "util/util_math.h"
-#include "util/util_opengl.h"
 #include "util/util_time.h"
 #include "util/util_types.h"
 
diff --git a/intern/cycles/render/display_driver.h b/intern/cycles/render/display_driver.h
new file mode 100644
index 00000000000..85f305034d7
--- /dev/null
+++ b/intern/cycles/render/display_driver.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Display driver for efficient interactive display of renders.
+ *
+ * Host applications implement this interface for viewport rendering. For best performance, we
+ * recommend:
+ * - Allocating a texture on the GPU to be interactively updated
+ * - Using the graphics interop mechanism to avoid CPU-GPU copying overhead
+ * - Using a dedicated or thread-safe graphics API context for updates, to avoid
+ *   blocking the host application.
+ */
+class DisplayDriver {
+ public:
+  DisplayDriver() = default;
+  virtual ~DisplayDriver() = default;
+
+  /* Render buffer parameters. */
+  struct Params {
+   public:
+    /* Render resolution, ignoring progressive resolution changes.
+     * The texture buffer should be allocated with this size. */
+    int2 size = make_int2(0, 0);
+
+    /* For border rendering, the full resolution of the render, and the offset within that larger
+     * render. */
+    int2 full_size = make_int2(0, 0);
+    int2 full_offset = make_int2(0, 0);
+
+    bool modified(const Params &other) const
+    {
+      return !(full_offset == other.full_offset && full_size == other.full_size &&
+               size == other.size);
+    }
+  };
+
+  /* Update the render from the rendering thread.
+   *
+   * Cycles periodically updates the render to be displayed. For multithreaded updates with
+   * potentially multiple rendering devices, it will call these methods as follows.
+   *
+   * if (driver.update_begin(params, width, height)) {
+   *     parallel_for_each(rendering_device) {
+   *         buffer = driver.map_texture_buffer();
+   *         if (buffer) {
+   *             fill(buffer);
+   *             driver.unmap_texture_buffer();
+   *         }
+   *     }
+   *     driver.update_end();
+   * }
+   *
+   * The parameters may dynamically change due to camera changes in the scene, and resources should
+   * be re-allocated accordingly.
+   *
+   * The width and height passed to update_begin() are the effective render resolution taking into
+   * account progressive resolution changes, which may be equal to or smaller than the params.size.
+   * For efficiency, changes in this resolution should be handled without re-allocating resources,
+   * but rather by using a subset of the full resolution buffer. */
+  virtual bool update_begin(const Params &params, int width, int height) = 0;
+  virtual void update_end() = 0;
+
+  virtual half4 *map_texture_buffer() = 0;
+  virtual void unmap_texture_buffer() = 0;
+
+  /* Optionally return a handle to a native graphics API texture buffer. If supported,
+   * the rendering device may write directly to this buffer instead of calling
+   * map_texture_buffer() and unmap_texture_buffer(). */
+  class GraphicsInterop {
+   public:
+    /* Dimensions of the buffer, in pixels. */
+    int buffer_width = 0;
+    int buffer_height = 0;
+
+    /* OpenGL pixel buffer object. */
+    int opengl_pbo_id = 0;
+
+    /* Clear the entire buffer before doing partial write to it. */
+    bool need_clear = false;
+  };
+
+  virtual GraphicsInterop graphics_interop_get()
+  {
+    return GraphicsInterop();
+  }
+
+  /* (De)activate graphics context required for editing or deleting the graphics interop
+   * object.
+   *
+   * For example, destruction of the CUDA object associated with an OpenGL resource requires the
+   * OpenGL context to be active. */
+  virtual void graphics_interop_activate(){};
+  virtual void graphics_interop_deactivate(){};
+
+  /* Clear the display buffer by filling it with zeros. */
+  virtual void clear() = 0;
+
+  /* Draw the render using the native graphics API.
+   *
+   * Note that this may be called in parallel to updates. The implementation is responsible for
+   * mutex locking or other mechanisms to avoid conflicts.
+   *
+   * The parameters may have changed since the last update. The implementation is responsible for
+   * deciding to skip or adjust render display for such changes.
+   *
+   * Host application drawing the render buffer should use Session.draw(), which will
+   * call this method. */
+  virtual void draw(const Params &params) = 0;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 8e14b338bd3..ad3336ca089 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -434,7 +434,8 @@ void Film::update_passes(Scene *scene, bool add_sample_count_pass)
   const ObjectManager *object_manager = scene->object_manager;
   Integrator *integrator = scene->integrator;
 
-  if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) {
+  if (!is_modified() && !object_manager->need_update() && !integrator->is_modified() &&
+      !background->is_modified()) {
     return;
   }
 
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 32e108d62ca..5ad419e02ca 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -19,7 +19,7 @@
 
 #include "kernel/kernel_types.h"
 
-#include "device/device_denoise.h" /* For the paramaters and type enum. */
+#include "device/device_denoise.h" /* For the parameters and type enum. */
 #include "graph/node.h"
 #include "integrator/adaptive_sampling.h"
 
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 03b79d7de3e..1629895ff6e 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -4368,6 +4368,7 @@ NODE_DEFINE(HairInfoNode)
 
   SOCKET_OUT_FLOAT(is_strand, "Is Strand");
   SOCKET_OUT_FLOAT(intercept, "Intercept");
+  SOCKET_OUT_FLOAT(size, "Length");
   SOCKET_OUT_FLOAT(thickness, "Thickness");
   SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal");
 #if 0 /* Output for minimum hair width transparency - deactivated. */
@@ -4390,6 +4391,9 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
     if (!intercept_out->links.empty())
       attributes->add(ATTR_STD_CURVE_INTERCEPT);
 
+    if (!output("Length")->links.empty())
+      attributes->add(ATTR_STD_CURVE_LENGTH);
+
     if (!output("Random")->links.empty())
       attributes->add(ATTR_STD_CURVE_RANDOM);
   }
@@ -4412,6 +4416,12 @@ void HairInfoNode::compile(SVMCompiler &compiler)
     compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
   }
 
+  out = output("Length");
+  if (!out->links.empty()) {
+    int attr = compiler.attribute(ATTR_STD_CURVE_LENGTH);
+    compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
+  }
+
   out = output("Thickness");
   if (!out->links.empty()) {
     compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_THICKNESS, compiler.stack_assign(out));
@@ -6372,7 +6382,7 @@ void BumpNode::constant_fold(const ConstantFolder &folder)
   /* TODO(sergey): Ignore bump with zero strength. */
 }
 
-/* Curve node */
+/* Curves node */
 
 CurvesNode::CurvesNode(const NodeType *node_type) : ShaderNode(node_type)
 {
@@ -6521,6 +6531,83 @@ void VectorCurvesNode::compile(OSLCompiler &compiler)
   CurvesNode::compile(compiler, "node_vector_curves");
 }
 
+/* FloatCurveNode */
+
+NODE_DEFINE(FloatCurveNode)
+{
+  NodeType *type = NodeType::add("float_curve", create, NodeType::SHADER);
+
+  SOCKET_FLOAT_ARRAY(curve, "Curve", array<float>());
+  SOCKET_FLOAT(min_x, "Min X", 0.0f);
+  SOCKET_FLOAT(max_x, "Max X", 1.0f);
+
+  SOCKET_IN_FLOAT(fac, "Factor", 0.0f);
+  SOCKET_IN_FLOAT(value, "Value", 0.0f);
+
+  SOCKET_OUT_FLOAT(value, "Value");
+
+  return type;
+}
+
+FloatCurveNode::FloatCurveNode() : ShaderNode(get_node_type())
+{
+}
+
+void FloatCurveNode::constant_fold(const ConstantFolder &folder)
+{
+  ShaderInput *value_in = input("Value");
+  ShaderInput *fac_in = input("Factor");
+
+  /* evaluate fully constant node */
+  if (folder.all_inputs_constant()) {
+    if (curve.size() == 0) {
+      return;
+    }
+
+    float pos = (value - min_x) / (max_x - min_x);
+    float result = float_ramp_lookup(curve.data(), pos, true, true, curve.size());
+
+    folder.make_constant(value + fac * (result - value));
+  }
+  /* remove no-op node */
+  else if (!fac_in->link && fac == 0.0f) {
+    /* link is not null because otherwise all inputs are constant */
+    folder.bypass(value_in->link);
+  }
+}
+
+void FloatCurveNode::compile(SVMCompiler &compiler)
+{
+  if (curve.size() == 0)
+    return;
+
+  ShaderInput *value_in = input("Value");
+  ShaderInput *fac_in = input("Factor");
+  ShaderOutput *value_out = output("Value");
+
+  compiler.add_node(NODE_FLOAT_CURVE,
+                    compiler.encode_uchar4(compiler.stack_assign(fac_in),
+                                           compiler.stack_assign(value_in),
+                                           compiler.stack_assign(value_out)),
+                    __float_as_int(min_x),
+                    __float_as_int(max_x));
+
+  compiler.add_node(curve.size());
+  for (int i = 0; i < curve.size(); i++)
+    compiler.add_node(make_float4(curve[i]));
+}
+
+void FloatCurveNode::compile(OSLCompiler &compiler)
+{
+  if (curve.size() == 0)
+    return;
+
+  compiler.parameter_array("ramp", curve.data(), curve.size());
+  compiler.parameter(this, "min_x");
+  compiler.parameter(this, "max_x");
+  compiler.add(this, "node_float_curve");
+}
+
 /* RGBRampNode */
 
 NODE_DEFINE(RGBRampNode)
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 22bdb06b059..5ac72835ac5 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -1398,6 +1398,18 @@ class VectorCurvesNode : public CurvesNode {
   void constant_fold(const ConstantFolder &folder);
 };
 
+class FloatCurveNode : public ShaderNode {
+ public:
+  SHADER_NODE_CLASS(FloatCurveNode)
+  void constant_fold(const ConstantFolder &folder);
+
+  NODE_SOCKET_API_ARRAY(array<float>, curve)
+  NODE_SOCKET_API(float, min_x)
+  NODE_SOCKET_API(float, max_x)
+  NODE_SOCKET_API(float, fac)
+  NODE_SOCKET_API(float, value)
+};
+
 class RGBRampNode : public ShaderNode {
  public:
   SHADER_NODE_CLASS(RGBRampNode)
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index d28b222c10e..5a43b641872 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -727,8 +727,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
     }
   }
 
-  /* create shader of the appropriate type. OSL only distinguishes between "surface"
-   * and "displacement" atm */
+  /* Create shader of the appropriate type. OSL only distinguishes between "surface"
+   * and "displacement" at the moment. */
   if (current_type == SHADER_TYPE_SURFACE)
     ss->Shader("surface", name, id(node).c_str());
   else if (current_type == SHADER_TYPE_VOLUME)
diff --git a/intern/cycles/render/output_driver.h b/intern/cycles/render/output_driver.h
new file mode 100644
index 00000000000..b7e980d71d4
--- /dev/null
+++ b/intern/cycles/render/output_driver.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_math.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Output driver for reading render buffers.
+ *
+ * Host applications implement this interface for outputting render buffers for offline rendering.
+ * Drivers can be used to copy the buffers into the host application or write them directly to
+ * disk. This interface may also be used for interactive display, however the DisplayDriver is more
+ * efficient for that purpose.
+ */
+class OutputDriver {
+ public:
+  OutputDriver() = default;
+  virtual ~OutputDriver() = default;
+
+  class Tile {
+   public:
+    Tile(const int2 offset,
+         const int2 size,
+         const int2 full_size,
+         const string_view layer,
+         const string_view view)
+        : offset(offset), size(size), full_size(full_size), layer(layer), view(view)
+    {
+    }
+    virtual ~Tile() = default;
+
+    const int2 offset;
+    const int2 size;
+    const int2 full_size;
+    const string layer;
+    const string view;
+
+    virtual bool get_pass_pixels(const string_view pass_name,
+                                 const int num_channels,
+                                 float *pixels) const = 0;
+    virtual bool set_pass_pixels(const string_view pass_name,
+                                 const int num_channels,
+                                 const float *pixels) const = 0;
+  };
+
+  /* Write tile once it has finished rendering. */
+  virtual void write_render_tile(const Tile &tile) = 0;
+
+  /* Update tile while rendering is in progress. Return true if any update
+   * was performed. */
+  virtual bool update_render_tile(const Tile & /* tile */)
+  {
+    return false;
+  }
+
+  /* For baking, read render passes PASS_BAKE_PRIMITIVE and PASS_BAKE_DIFFERENTIAL
+   * to determine which shading points to use for baking at each pixel. Return
+   * true if any data was read. */
+  virtual bool read_render_tile(const Tile & /* tile */)
+  {
+    return false;
+  }
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 823c34ed519..550188b196a 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -25,12 +25,13 @@
 #include "render/bake.h"
 #include "render/buffers.h"
 #include "render/camera.h"
-#include "render/gpu_display.h"
+#include "render/display_driver.h"
 #include "render/graph.h"
 #include "render/integrator.h"
 #include "render/light.h"
 #include "render/mesh.h"
 #include "render/object.h"
+#include "render/output_driver.h"
 #include "render/scene.h"
 #include "render/session.h"
 
@@ -38,7 +39,6 @@
 #include "util/util_function.h"
 #include "util/util_logging.h"
 #include "util/util_math.h"
-#include "util/util_opengl.h"
 #include "util/util_task.h"
 #include "util/util_time.h"
 
@@ -65,25 +65,6 @@ Session::Session(const SessionParams &params_, const SceneParams &scene_params)
   path_trace_ = make_unique<PathTrace>(
       device, scene->film, &scene->dscene, render_scheduler_, tile_manager_);
   path_trace_->set_progress(&progress);
-  path_trace_->tile_buffer_update_cb = [&]() {
-    if (!update_render_tile_cb) {
-      return;
-    }
-    update_render_tile_cb();
-  };
-  path_trace_->tile_buffer_write_cb = [&]() {
-    if (!write_render_tile_cb) {
-      return;
-    }
-    write_render_tile_cb();
-  };
-  path_trace_->tile_buffer_read_cb = [&]() -> bool {
-    if (!read_render_tile_cb) {
-      return false;
-    }
-    read_render_tile_cb();
-    return true;
-  };
   path_trace_->progress_update_cb = [&]() { update_status_time(); };
 
   tile_manager_.full_buffer_written_cb = [&](string_view filename) {
@@ -98,24 +79,6 @@ Session::~Session()
 {
   cancel();
 
-  /* TODO(sergey): Bring the passes in viewport back.
-   * It is unclear why there is such an exception needed though. */
-#if 0
-  if (buffers && params.write_render_cb) {
-    /* Copy to display buffer and write out image if requested */
-    delete display;
-
-    display = new DisplayBuffer(device, false);
-    display->reset(buffers->params);
-    copy_to_display_buffer(params.samples);
-
-    int w = display->draw_width;
-    int h = display->draw_height;
-    uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h);
-    params.write_render_cb((uchar *)pixels, w, h, 4);
-  }
-#endif
-
   /* Make sure path tracer is destroyed before the device. This is needed because destruction might
    * need to access device for device memory free. */
   /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the
@@ -163,7 +126,7 @@ bool Session::ready_to_reset()
 
 void Session::run_main_render_loop()
 {
-  path_trace_->clear_gpu_display();
+  path_trace_->clear_display();
 
   while (true) {
     RenderWork render_work = run_update_for_next_iteration();
@@ -397,8 +360,8 @@ int2 Session::get_effective_tile_size() const
 
   /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile
    * and prefer optimal performance. */
-
-  return make_int2(params.tile_size, params.tile_size);
+  const int tile_size = tile_manager_.compute_render_tile_size(params.tile_size);
+  return make_int2(tile_size, tile_size);
 }
 
 void Session::do_delayed_reset()
@@ -515,9 +478,33 @@ void Session::set_pause(bool pause)
   }
 }
 
-void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+void Session::set_output_driver(unique_ptr<OutputDriver> driver)
 {
-  path_trace_->set_gpu_display(move(gpu_display));
+  path_trace_->set_output_driver(move(driver));
+}
+
+void Session::set_display_driver(unique_ptr<DisplayDriver> driver)
+{
+  path_trace_->set_display_driver(move(driver));
+}
+
+double Session::get_estimated_remaining_time() const
+{
+  const float completed = progress.get_progress();
+  if (completed == 0.0f) {
+    return 0.0;
+  }
+
+  double total_time, render_time;
+  progress.get_time(total_time, render_time);
+  double remaining = (1.0 - (double)completed) * (render_time / (double)completed);
+
+  const double time_limit = render_scheduler_.get_time_limit();
+  if (time_limit != 0.0) {
+    remaining = min(remaining, max(time_limit - render_time, 0.0));
+  }
+
+  return remaining;
 }
 
 void Session::wait()
@@ -619,101 +606,6 @@ void Session::collect_statistics(RenderStats *render_stats)
 }
 
 /* --------------------------------------------------------------------
- * Tile and tile pixels access.
- */
-
-bool Session::has_multiple_render_tiles() const
-{
-  return tile_manager_.has_multiple_tiles();
-}
-
-int2 Session::get_render_tile_size() const
-{
-  return path_trace_->get_render_tile_size();
-}
-
-int2 Session::get_render_tile_offset() const
-{
-  return path_trace_->get_render_tile_offset();
-}
-
-string_view Session::get_render_tile_layer() const
-{
-  const BufferParams &buffer_params = path_trace_->get_render_tile_params();
-  return buffer_params.layer;
-}
-
-string_view Session::get_render_tile_view() const
-{
-  const BufferParams &buffer_params = path_trace_->get_render_tile_params();
-  return buffer_params.view;
-}
-
-bool Session::copy_render_tile_from_device()
-{
-  return path_trace_->copy_render_tile_from_device();
-}
-
-bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels)
-{
-  /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification
-   * is happening while this function runs. */
-
-  const BufferParams &buffer_params = path_trace_->get_render_tile_params();
-
-  const BufferPass *pass = buffer_params.find_pass(pass_name);
-  if (pass == nullptr) {
-    return false;
-  }
-
-  const bool has_denoised_result = path_trace_->has_denoised_result();
-  if (pass->mode == PassMode::DENOISED && !has_denoised_result) {
-    pass = buffer_params.find_pass(pass->type);
-    if (pass == nullptr) {
-      /* Happens when denoised result pass is requested but is never written by the kernel. */
-      return false;
-    }
-  }
-
-  pass = buffer_params.get_actual_display_pass(pass);
-
-  const float exposure = buffer_params.exposure;
-  const int num_samples = path_trace_->get_num_render_tile_samples();
-
-  PassAccessor::PassAccessInfo pass_access_info(*pass);
-  pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher;
-  pass_access_info.use_approximate_shadow_catcher_background =
-      pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background;
-
-  const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
-  const PassAccessor::Destination destination(pixels, num_components);
-
-  return path_trace_->get_render_tile_pixels(pass_accessor, destination);
-}
-
-bool Session::set_render_tile_pixels(const string &pass_name,
-                                     int num_components,
-                                     const float *pixels)
-{
-  /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification
-   * is happening while this function runs. */
-
-  const BufferPass *pass = buffer_params_.find_pass(pass_name);
-  if (!pass) {
-    return false;
-  }
-
-  const float exposure = scene->film->get_exposure();
-  const int num_samples = render_scheduler_.get_num_rendered_samples();
-
-  const PassAccessor::PassAccessInfo pass_access_info(*pass);
-  PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
-  PassAccessor::Source source(pixels, num_components);
-
-  return path_trace_->set_render_tile_pixels(pass_accessor, source);
-}
-
-/* --------------------------------------------------------------------
  * Full-frame on-disk storage.
  */
 
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 5623604bfe8..46c964bc98c 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -35,9 +35,10 @@ CCL_NAMESPACE_BEGIN
 class BufferParams;
 class Device;
 class DeviceScene;
+class DisplayDriver;
+class OutputDriver;
 class PathTrace;
 class Progress;
-class GPUDisplay;
 class RenderBuffers;
 class Scene;
 class SceneParams;
@@ -67,8 +68,6 @@ class SessionParams {
 
   ShadingSystem shadingsystem;
 
-  function<bool(const uchar *pixels, int width, int height, int channels)> write_render_cb;
-
   SessionParams()
   {
     headless = false;
@@ -114,10 +113,6 @@ class Session {
   Stats stats;
   Profiler profiler;
 
-  function<void(void)> write_render_tile_cb;
-  function<void(void)> update_render_tile_cb;
-  function<void(void)> read_render_tile_cb;
-
   /* Callback is invoked by tile manager whenever on-dist tiles storage file is closed after
    * writing. Allows an engine integration to keep track of those files without worry about
    * transferring the information when it needs to re-create session during rendering. */
@@ -143,7 +138,10 @@ class Session {
   void set_samples(int samples);
   void set_time_limit(double time_limit);
 
-  void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+  void set_output_driver(unique_ptr<OutputDriver> driver);
+  void set_display_driver(unique_ptr<DisplayDriver> driver);
+
+  double get_estimated_remaining_time() const;
 
   void device_free();
 
@@ -154,24 +152,6 @@ class Session {
   void collect_statistics(RenderStats *stats);
 
   /* --------------------------------------------------------------------
-   * Tile and tile pixels access.
-   */
-
-  bool has_multiple_render_tiles() const;
-
-  /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. */
-  int2 get_render_tile_size() const;
-  int2 get_render_tile_offset() const;
-
-  string_view get_render_tile_layer() const;
-  string_view get_render_tile_view() const;
-
-  bool copy_render_tile_from_device();
-
-  bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels);
-  bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels);
-
-  /* --------------------------------------------------------------------
    * Full-frame on-disk storage.
    */
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 28910bffa7b..7e53a9d0911 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -307,8 +307,8 @@ static bool configure_image_spec_from_buffer(ImageSpec *image_spec,
     DCHECK_GT(tile_size.x, 0);
     DCHECK_GT(tile_size.y, 0);
 
-    image_spec->tile_width = tile_size.x;
-    image_spec->tile_height = tile_size.y;
+    image_spec->tile_width = min(TileManager::IMAGE_TILE_SIZE, tile_size.x);
+    image_spec->tile_height = min(TileManager::IMAGE_TILE_SIZE, tile_size.y);
   }
 
   return true;
@@ -335,6 +335,15 @@ TileManager::~TileManager()
 {
 }
 
+int TileManager::compute_render_tile_size(const int suggested_tile_size) const
+{
+  /* Sizes above IMAGE_TILE_SIZE must be a multiple of it so that render tiles are written into
+   * the image file aligned on image tile boundaries. We can't set IMAGE_TILE_SIZE equal to the
+   * render tile size because too big a tile size leads to integer overflow inside OpenEXR. */
+  return (suggested_tile_size <= IMAGE_TILE_SIZE) ? suggested_tile_size :
+                                                    align_up(suggested_tile_size, IMAGE_TILE_SIZE);
+}
+
 void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size)
 {
   VLOG(3) << "Using tile size of " << tile_size;
@@ -411,6 +420,11 @@ const Tile &TileManager::get_current_tile() const
   return tile_state_.current_tile;
 }
 
+const int2 TileManager::get_size() const
+{
+  return make_int2(buffer_params_.width, buffer_params_.height);
+}
+
 bool TileManager::open_tile_output()
 {
   write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
@@ -427,7 +441,12 @@ bool TileManager::open_tile_output()
     return false;
   }
 
-  write_state_.tile_out->open(write_state_.filename, write_state_.image_spec);
+  if (!write_state_.tile_out->open(write_state_.filename, write_state_.image_spec)) {
+    LOG(ERROR) << "Error opening tile file: " << write_state_.tile_out->geterror();
+    write_state_.tile_out = nullptr;
+    return false;
+  }
+
   write_state_.num_tiles_written = 0;
 
   VLOG(3) << "Opened tile file " << write_state_.filename;
@@ -466,33 +485,29 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers)
 
   const BufferParams &tile_params = tile_buffers.params;
 
-  vector<float> pixel_storage;
   const float *pixels = tile_buffers.buffer.data();
-
-  /* Tiled writing expects pixels to contain data for an entire tile. Pad the render buffers with
-   * empty pixels for tiles which are on the image boundary. */
-  if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) {
-    const int64_t pass_stride = tile_params.pass_stride;
-    const int64_t src_row_stride = tile_params.width * pass_stride;
-
-    const int64_t dst_row_stride = tile_size_.x * pass_stride;
-    pixel_storage.resize(dst_row_stride * tile_size_.y);
-
-    const float *src = tile_buffers.buffer.data();
-    float *dst = pixel_storage.data();
-    pixels = dst;
-
-    for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) {
-      memcpy(dst, src, src_row_stride * sizeof(float));
-    }
-  }
-
   const int tile_x = tile_params.full_x - buffer_params_.full_x;
   const int tile_y = tile_params.full_y - buffer_params_.full_y;
 
   VLOG(3) << "Write tile at " << tile_x << ", " << tile_y;
-  if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) {
+
+  /* The image tile sizes in the OpenEXR file are different from the size of our big tiles. The
+   * write_tiles() method expects a contiguous image region that will be split into tiles
+   * internally. OpenEXR expects the size of this region to be a multiple of the tile size,
+   * however OpenImageIO automatically adds the required padding.
+   *
+   * The only thing we have to ensure is that the tile_x and tile_y are a multiple of the
+   * image tile size, which happens in compute_render_tile_size. */
+  if (!write_state_.tile_out->write_tiles(tile_x,
+                                          tile_x + tile_params.width,
+                                          tile_y,
+                                          tile_y + tile_params.height,
+                                          0,
+                                          1,
+                                          TypeDesc::FLOAT,
+                                          pixels)) {
     LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror();
+    return false;
   }
 
   ++write_state_.num_tiles_written;
@@ -518,7 +533,14 @@ void TileManager::finish_write_tiles()
 
       VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y;
 
-      write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data());
+      write_state_.tile_out->write_tiles(tile.x,
+                                         tile.x + tile.width,
+                                         tile.y,
+                                         tile.y + tile.height,
+                                         0,
+                                         1,
+                                         TypeDesc::FLOAT,
+                                         pixel_storage.data());
     }
   }
 
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 71b9e966278..08eaa4034f0 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -82,6 +82,7 @@ class TileManager {
   bool done();
 
   const Tile &get_current_tile() const;
+  const int2 get_size() const;
 
   /* Write render buffer of a tile to a file on disk.
    *
@@ -107,6 +108,12 @@ class TileManager {
                                   RenderBuffers *buffers,
                                   DenoiseParams *denoise_params);
 
+  /* Compute valid tile size compatible with image saving. */
+  int compute_render_tile_size(const int suggested_tile_size) const;
+
+  /* Tile size in the image file. */
+  static const int IMAGE_TILE_SIZE = 128;
+
  protected:
   /* Get tile configuration for its index.
    * The tile index must be within [0, state_.tile_state_). */
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index de17efafcf2..faba411c769 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -34,7 +34,7 @@
 
 #else /* __KERNEL_GPU__ */
 
-#  ifdef __KERNEL_CUDA__
+#  if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
 
 #    define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
 
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 1d598725c84..2245668d02f 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false)
   reset();
 }
 
+DebugFlags::HIP::HIP() : adaptive_compile(false)
+{
+  reset();
+}
+
 void DebugFlags::CUDA::reset()
 {
   if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
     adaptive_compile = true;
 }
 
+void DebugFlags::HIP::reset()
+{
+  if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL)
+    adaptive_compile = true;
+}
+
 DebugFlags::OptiX::OptiX()
 {
   reset();
@@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
 
   os << "OptiX flags:\n"
      << "  Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
+
+  os << "HIP flags:\n"
+     << "  Adaptive Compile : " << string_from_bool(debug_flags.hip.adaptive_compile) << "\n";
+
   return os;
 }
 
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 99e2723180c..81677201790 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -89,7 +89,18 @@ class DebugFlags {
     void reset();
 
     /* Whether adaptive feature based runtime compile is enabled or not.
-     * Requires the CUDA Toolkit and only works on Linux atm. */
+     * Requires the CUDA Toolkit and only works on Linux at the moment. */
+    bool adaptive_compile;
+  };
+
+  /* Descriptor of HIP feature-set to be used. */
+  struct HIP {
+    HIP();
+
+    /* Reset flags to their defaults. */
+    void reset();
+
+    /* Whether adaptive feature based runtime compile is enabled or not. */
     bool adaptive_compile;
   };
 
@@ -124,6 +135,9 @@ class DebugFlags {
   /* Requested OptiX flags. */
   OptiX optix;
 
+  /* Requested HIP flags. */
+  HIP hip;
+
  private:
   DebugFlags();
 
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index d9edfec5da3..f36a492a1b0 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 /* Half Floats */
 
 /* CUDA has its own half data type, no need to define then */
-#ifndef __KERNEL_CUDA__
+#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
 /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
  * unsigned shorts. */
 class half {
@@ -59,7 +59,7 @@ struct half4 {
   half x, y, z, w;
 };
 
-#ifdef __KERNEL_CUDA__
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
 
 ccl_device_inline void float4_store_half(half *h, float4 f)
 {
@@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
 
 ccl_device_inline void float4_store_half(half *h, float4 f)
 {
+
 #  ifndef __KERNEL_SSE2__
   for (int i = 0; i < 4; i++) {
     /* optimized float to half for pixels:
@@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f)
 #  endif
 }
 
+#  ifndef __KERNEL_HIP__
+
 ccl_device_inline float half_to_float(half h)
 {
   float f;
@@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h)
 
   return f;
 }
+#  else
+
+ccl_device_inline float half_to_float(std::uint32_t a) noexcept
+{
+
+  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
+
+  std::uint32_t v = __float_as_uint(__uint_as_float(u) *
+                                    __uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
+                    0x38000000U;
+
+  u = (a & 0x7fff) != 0 ? v : u;
+
+  return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
+}
+
+#  endif /* __KERNEL_HIP__ */
 
 ccl_device_inline float4 half4_to_float4(half4 h)
 {
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 6d728dde679..cb1e94c838c 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -26,6 +26,10 @@
 #  include <cmath>
 #endif
 
+#ifdef __HIP__
+#  include <hip/hip_vector_types.h>
+#endif
+
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
@@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN
 
 /* Scalar */
 
-#ifdef _WIN32
+#ifndef __HIP__
+#  ifdef _WIN32
 ccl_device_inline float fmaxf(float a, float b)
 {
   return (a > b) ? a : b;
@@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b)
 {
   return (a < b) ? a : b;
 }
-#endif /* _WIN32 */
+
+#  endif /* _WIN32 */
+#endif   /* __HIP__ */
 
 #ifndef __KERNEL_GPU__
 using std::isfinite;
@@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f)
   return u.i;
 }
 
+#ifndef __HIP__
 ccl_device_inline int __float_as_int(float f)
 {
   union {
@@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i)
   u.i = i;
   return u.f;
 }
+#endif
 
 ccl_device_inline int4 __float4_as_int4(float4 f)
 {
@@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits)
 
 ccl_device_inline uint count_leading_zeros(uint x)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __clz(x);
 #else
   assert(x != 0);
@@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x)
 
 ccl_device_inline uint count_trailing_zeros(uint x)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return (__ffs(x) - 1);
 #else
   assert(x != 0);
@@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x)
 
 ccl_device_inline uint find_first_set(uint x)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __ffs(x);
 #else
 #  ifdef _MSC_VER
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
index fa3a541eea9..fd0c9124345 100644
--- a/intern/cycles/util/util_math_intersect.h
+++ b/intern/cycles/util/util_math_intersect.h
@@ -40,7 +40,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
       /* Ray  points away from sphere. */
       return false;
     }
-    const float dsq = tsq - tp * tp; /* pythagoras */
+    const float dsq = tsq - tp * tp; /* Pythagoras. */
     if (dsq > radiussq) {
       /* Closest point on ray outside sphere. */
       return false;
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index dca8d3d0ab5..176ee11e1e9 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -100,7 +100,7 @@ class Progress {
     cancel = true;
   }
 
-  bool get_cancel()
+  bool get_cancel() const
   {
     if (!cancel && cancel_cb)
       cancel_cb();
@@ -108,7 +108,7 @@ class Progress {
     return cancel;
   }
 
-  string get_cancel_message()
+  string get_cancel_message() const
   {
     thread_scoped_lock lock(progress_mutex);
     return cancel_message;
@@ -130,12 +130,12 @@ class Progress {
     cancel = true;
   }
 
-  bool get_error()
+  bool get_error() const
   {
     return error;
   }
 
-  string get_error_message()
+  string get_error_message() const
   {
     thread_scoped_lock lock(progress_mutex);
     return error_message;
@@ -168,7 +168,7 @@ class Progress {
     }
   }
 
-  void get_time(double &total_time_, double &render_time_)
+  void get_time(double &total_time_, double &render_time_) const
   {
     thread_scoped_lock lock(progress_mutex);
 
@@ -200,7 +200,7 @@ class Progress {
     total_pixel_samples = total_pixel_samples_;
   }
 
-  float get_progress()
+  float get_progress() const
   {
     thread_scoped_lock lock(progress_mutex);
 
@@ -236,7 +236,7 @@ class Progress {
     }
   }
 
-  int get_current_sample()
+  int get_current_sample() const
   {
     thread_scoped_lock lock(progress_mutex);
     /* Note that the value here always belongs to the last tile that updated,
@@ -244,13 +244,13 @@ class Progress {
     return current_tile_sample;
   }
 
-  int get_rendered_tiles()
+  int get_rendered_tiles() const
   {
     thread_scoped_lock lock(progress_mutex);
     return rendered_tiles;
   }
 
-  int get_denoised_tiles()
+  int get_denoised_tiles() const
   {
     thread_scoped_lock lock(progress_mutex);
     return denoised_tiles;
@@ -300,7 +300,7 @@ class Progress {
     set_update();
   }
 
-  void get_status(string &status_, string &substatus_)
+  void get_status(string &status_, string &substatus_) const
   {
     thread_scoped_lock lock(progress_mutex);
 
@@ -330,8 +330,8 @@ class Progress {
   }
 
  protected:
-  thread_mutex progress_mutex;
-  thread_mutex update_mutex;
+  mutable thread_mutex progress_mutex;
+  mutable thread_mutex update_mutex;
   function<void()> update_cb;
   function<void()> cancel_cb;
 
diff --git a/intern/ghost/GHOST_IWindow.h b/intern/ghost/GHOST_IWindow.h
index 5f9bd808c8c..91f576ca304 100644
--- a/intern/ghost/GHOST_IWindow.h
+++ b/intern/ghost/GHOST_IWindow.h
@@ -40,7 +40,7 @@
  * There are two coordinate systems:
  *
  * - The screen coordinate system. The origin of the screen is located in the
- *   upper left corner of the screen.</li>
+ *   upper left corner of the screen.
  * - The client rectangle coordinate system. The client rectangle of a window
  *   is the area that is drawable by the application (excluding title bars etc.).
  */
diff --git a/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp b/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp
index 5b026eb1632..09b2e4dfe2b 100644
--- a/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp
+++ b/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp
@@ -101,8 +101,7 @@ GHOST_TSuccess GHOST_DisplayManagerSDL::setCurrentDisplaySetting(
     uint8_t display, const GHOST_DisplaySetting &setting)
 {
   /*
-   * Mode switching code ported from Quake 2 version 3.21 and bzflag version
-   * 2.4.0:
+   * Mode switching code ported from Quake 2 version 3.21 and BZFLAG version 2.4.0:
    * ftp://ftp.idsoftware.com/idstuff/source/q2source-3.21.zip
    * See linux/gl_glx.c:GLimp_SetMode
    * http://wiki.bzflag.org/BZFlag_Source
diff --git a/intern/ghost/intern/GHOST_SystemSDL.cpp b/intern/ghost/intern/GHOST_SystemSDL.cpp
index 35c7a7ef463..5370d4df857 100644
--- a/intern/ghost/intern/GHOST_SystemSDL.cpp
+++ b/intern/ghost/intern/GHOST_SystemSDL.cpp
@@ -374,8 +374,8 @@ void GHOST_SystemSDL::processEvent(SDL_Event *sdl_event)
         if (window->getCursorGrabBounds(bounds) == GHOST_kFailure)
           window->getClientBounds(bounds);
 
-        /* Could also clamp to screen bounds wrap with a window outside the view will fail atm.
-         * Use offset of 8 in case the window is at screen bounds. */
+        /* Could also clamp to screen bounds; wrapping with a window outside the view will
+         * fail at the moment. Use offset of 8 in case the window is at screen bounds. */
         bounds.wrapPoint(x_new, y_new, 8, window->getCursorGrabAxis());
         window->getCursorGrabAccum(x_accum, y_accum);
 
diff --git a/intern/ghost/intern/GHOST_SystemWin32.cpp b/intern/ghost/intern/GHOST_SystemWin32.cpp
index f44107ee000..482f20f5cd1 100644
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -1100,8 +1100,8 @@ GHOST_EventCursor *GHOST_SystemWin32::processCursorEvent(GHOST_WindowWin32 *wind
       window->getClientBounds(bounds);
     }
 
-    /* Could also clamp to screen bounds wrap with a window outside the view will fail atm.
-     * Use inset in case the window is at screen bounds. */
+    /* Could also clamp to screen bounds, wrapping with a window outside the view will
+     * fail at the moment. Use inset in case the window is at screen bounds. */
     bounds.wrapPoint(x_new, y_new, 2, window->getCursorGrabAxis());
 
     window->getCursorGrabAccum(x_accum, y_accum);
diff --git a/intern/ghost/intern/GHOST_SystemX11.cpp b/intern/ghost/intern/GHOST_SystemX11.cpp
index 10ccb00cc15..86b4245ca67 100644
--- a/intern/ghost/intern/GHOST_SystemX11.cpp
+++ b/intern/ghost/intern/GHOST_SystemX11.cpp
@@ -973,8 +973,8 @@ void GHOST_SystemX11::processEvent(XEvent *xe)
         if (window->getCursorGrabBounds(bounds) == GHOST_kFailure)
           window->getClientBounds(bounds);
 
-        /* Could also clamp to screen bounds wrap with a window outside the view will fail atm.
-         * Use offset of 8 in case the window is at screen bounds. */
+        /* Could also clamp to screen bounds, wrapping with a window outside the view will
+         * fail at the moment. Use offset of 8 in case the window is at screen bounds. */
         bounds.wrapPoint(x_new, y_new, 8, window->getCursorGrabAxis());
 
         window->getCursorGrabAccum(x_accum, y_accum);
@@ -1528,13 +1528,13 @@ void GHOST_SystemX11::processEvent(XEvent *xe)
             window->GetTabletData().Pressure = axis_value / ((float)xtablet.PressureLevels);
           }
 
-          /* the (short) cast and the & 0xffff is bizarre and unexplained anywhere,
-           * but I got garbage data without it. Found it in the xidump.c source --matt
+          /* NOTE(@broken): the (short) cast and the & 0xffff is bizarre and unexplained anywhere,
+           * but I got garbage data without it. Found it in the `xidump.c` source.
            *
-           * The '& 0xffff' just truncates the value to its two lowest bytes, this probably means
-           * some drivers do not properly set the whole int value? Since we convert to float
-           * afterward, I don't think we need to cast to short here, but do not have a device to
-           * check this. --mont29
+           * NOTE(@mont29): The '& 0xffff' just truncates the value to its two lowest bytes,
+           * this probably means some drivers do not properly set the whole int value?
+           * Since we convert to float afterward,
+           * I don't think we need to cast to short here, but do not have a device to check this.
            */
           if (AXIS_VALUE_GET(3, axis_value)) {
             window->GetTabletData().Xtilt = (short)(axis_value & 0xffff) /
diff --git a/intern/ghost/intern/GHOST_WindowX11.cpp b/intern/ghost/intern/GHOST_WindowX11.cpp
index de389951613..8b44403c598 100644
--- a/intern/ghost/intern/GHOST_WindowX11.cpp
+++ b/intern/ghost/intern/GHOST_WindowX11.cpp
@@ -1092,9 +1092,9 @@ GHOST_TSuccess GHOST_WindowX11::setOrder(GHOST_TWindowOrder order)
     XWindowAttributes attr;
     Atom atom;
 
-    /* We use both XRaiseWindow and _NET_ACTIVE_WINDOW, since some
-     * window managers ignore the former (e.g. kwin from kde) and others
-     * don't implement the latter (e.g. fluxbox pre 0.9.9) */
+    /* We use both #XRaiseWindow and #_NET_ACTIVE_WINDOW, since some
+     * window managers ignore the former (e.g. KWIN from KDE) and others
+     * don't implement the latter (e.g. FLUXBOX before 0.9.9). */
 
     XRaiseWindow(m_display, m_window);
 
diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c
index 98a8553a3eb..bba72c907eb 100644
--- a/intern/guardedalloc/intern/mallocn_guarded_impl.c
+++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c
@@ -89,7 +89,7 @@ typedef struct localListBase {
   void *first, *last;
 } localListBase;
 
-/* note: keep this struct aligned (e.g., irix/gcc) - Hos */
+/* NOTE(@hos): keep this struct aligned (e.g., IRIX/GCC). */
 typedef struct MemHead {
   int tag1;
   size_t len;
@@ -98,9 +98,8 @@ typedef struct MemHead {
   const char *nextname;
   int tag2;
   short pad1;
-  short alignment; /* if non-zero aligned alloc was used
-                    * and alignment is stored here.
-                    */
+  /* If non-zero, an aligned allocation was used and the alignment is stored here. */
+  short alignment;
 #ifdef DEBUG_MEMCOUNTER
   int _count;
 #endif
diff --git a/intern/opensubdiv/internal/evaluator/evaluator_impl.cc b/intern/opensubdiv/internal/evaluator/evaluator_impl.cc
index b3fc021e1ee..4f4f332ff15 100644
--- a/intern/opensubdiv/internal/evaluator/evaluator_impl.cc
+++ b/intern/opensubdiv/internal/evaluator/evaluator_impl.cc
@@ -553,7 +553,7 @@ void convertPatchCoordsToArray(const OpenSubdiv_PatchCoord *patch_coords,
 
 }  // namespace
 
-// Note: Define as a class instead of typedcef to make it possible
+// Note: Define as a class instead of typedef to make it possible
 // to have anonymous class in opensubdiv_evaluator_internal.h
 class CpuEvalOutput : public VolatileEvalOutput<CpuVertexBuffer,
                                                 CpuVertexBuffer,
diff --git a/intern/opensubdiv/opensubdiv_capi_type.h b/intern/opensubdiv/opensubdiv_capi_type.h
index e759c5f43b0..e78842036be 100644
--- a/intern/opensubdiv/opensubdiv_capi_type.h
+++ b/intern/opensubdiv/opensubdiv_capi_type.h
@@ -23,7 +23,7 @@
 extern "C" {
 #endif
 
-// Keep this a bitmask os it's possible to pass available
+// Keep this a bitmask so it's possible to pass available
 // evaluators to Blender.
 typedef enum eOpenSubdivEvaluator {
   OPENSUBDIV_EVALUATOR_CPU = (1 << 0),