git.blender.org/blender.git
author    Brecht Van Lommel <brecht@blender.org>  2021-09-20 18:59:20 +0300
committer Brecht Van Lommel <brecht@blender.org>  2021-09-21 15:55:54 +0300
commit    08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree      6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/integrator
parent    fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards compatibility, including the removal of the OpenCL backend, for which alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together all the changes, since intermediate changes would often fail building or tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r--  intern/cycles/integrator/CMakeLists.txt  76
-rw-r--r--  intern/cycles/integrator/adaptive_sampling.cpp  71
-rw-r--r--  intern/cycles/integrator/adaptive_sampling.h  55
-rw-r--r--  intern/cycles/integrator/denoiser.cpp  204
-rw-r--r--  intern/cycles/integrator/denoiser.h  135
-rw-r--r--  intern/cycles/integrator/denoiser_device.cpp  106
-rw-r--r--  intern/cycles/integrator/denoiser_device.h  40
-rw-r--r--  intern/cycles/integrator/denoiser_oidn.cpp  628
-rw-r--r--  intern/cycles/integrator/denoiser_oidn.h  47
-rw-r--r--  intern/cycles/integrator/denoiser_optix.cpp  34
-rw-r--r--  intern/cycles/integrator/denoiser_optix.h  31
-rw-r--r--  intern/cycles/integrator/pass_accessor.cpp  318
-rw-r--r--  intern/cycles/integrator/pass_accessor.h  160
-rw-r--r--  intern/cycles/integrator/pass_accessor_cpu.cpp  183
-rw-r--r--  intern/cycles/integrator/pass_accessor_cpu.h  77
-rw-r--r--  intern/cycles/integrator/pass_accessor_gpu.cpp  118
-rw-r--r--  intern/cycles/integrator/pass_accessor_gpu.h  68
-rw-r--r--  intern/cycles/integrator/path_trace.cpp  1147
-rw-r--r--  intern/cycles/integrator/path_trace.h  324
-rw-r--r--  intern/cycles/integrator/path_trace_work.cpp  203
-rw-r--r--  intern/cycles/integrator/path_trace_work.h  194
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.cpp  281
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.h  82
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  933
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h  165
-rw-r--r--  intern/cycles/integrator/render_scheduler.cpp  1187
-rw-r--r--  intern/cycles/integrator/render_scheduler.h  466
-rw-r--r--  intern/cycles/integrator/shader_eval.cpp  173
-rw-r--r--  intern/cycles/integrator/shader_eval.h  61
-rw-r--r--  intern/cycles/integrator/tile.cpp  108
-rw-r--r--  intern/cycles/integrator/tile.h  56
-rw-r--r--  intern/cycles/integrator/work_balancer.cpp  99
-rw-r--r--  intern/cycles/integrator/work_balancer.h  42
-rw-r--r--  intern/cycles/integrator/work_tile_scheduler.cpp  138
-rw-r--r--  intern/cycles/integrator/work_tile_scheduler.h  98
35 files changed, 8108 insertions, 0 deletions
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
new file mode 100644
index 00000000000..bfabd35d7c3
--- /dev/null
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright 2011-2021 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(INC
+ ..
+)
+
+set(SRC
+ adaptive_sampling.cpp
+ denoiser.cpp
+ denoiser_device.cpp
+ denoiser_oidn.cpp
+ denoiser_optix.cpp
+ path_trace.cpp
+ tile.cpp
+ pass_accessor.cpp
+ pass_accessor_cpu.cpp
+ pass_accessor_gpu.cpp
+ path_trace_work.cpp
+ path_trace_work_cpu.cpp
+ path_trace_work_gpu.cpp
+ render_scheduler.cpp
+ shader_eval.cpp
+ work_balancer.cpp
+ work_tile_scheduler.cpp
+)
+
+set(SRC_HEADERS
+ adaptive_sampling.h
+ denoiser.h
+ denoiser_device.h
+ denoiser_oidn.h
+ denoiser_optix.h
+ path_trace.h
+ tile.h
+ pass_accessor.h
+ pass_accessor_cpu.h
+ pass_accessor_gpu.h
+ path_trace_work.h
+ path_trace_work_cpu.h
+ path_trace_work_gpu.h
+ render_scheduler.h
+ shader_eval.h
+ work_balancer.h
+ work_tile_scheduler.h
+)
+
+set(LIB
+ # NOTE: Is required for RenderBuffers access. Might consider moving files around a bit to
+ # avoid such cyclic dependency.
+ cycles_render
+
+ cycles_util
+)
+
+if(WITH_OPENIMAGEDENOISE)
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ )
+endif()
+
+include_directories(${INC})
+include_directories(SYSTEM ${INC_SYS})
+
+cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp
new file mode 100644
index 00000000000..23fbcfea5c2
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/adaptive_sampling.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+AdaptiveSampling::AdaptiveSampling()
+{
+}
+
+int AdaptiveSampling::align_samples(int start_sample, int num_samples) const
+{
+ if (!use) {
+ return num_samples;
+ }
+
+ /*
+ * The naive implementation goes as follows:
+ *
+ * int count = 1;
+ * while (!need_filter(start_sample + count - 1) && count < num_samples) {
+ * ++count;
+ * }
+ * return count;
+ */
+
+ /* 0-based sample index at which first filtering will happen. */
+ const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);
+
+ /* Allow as many samples as possible until the first filter sample. */
+ if (start_sample + num_samples <= first_filter_sample) {
+ return num_samples;
+ }
+
+ const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1));
+
+ const int num_samples_until_filter = next_filter_sample - start_sample + 1;
+
+ return min(num_samples_until_filter, num_samples);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (!use) {
+ return false;
+ }
+
+ if (sample <= min_samples) {
+ return false;
+ }
+
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+}
+
+CCL_NAMESPACE_END
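The bit tricks above rely on `adaptive_step` being a power of two. A small worked example (not part of this patch) of how `align_samples()` and `need_filter()` behave for a hypothetical configuration, assuming the class above is available:

  #include <cassert>

  AdaptiveSampling adaptive;
  adaptive.use = true;
  adaptive.adaptive_step = 4; /* Power of two, as the bit masks assume. */
  adaptive.min_samples = 0;

  /* First filter point is 0-based sample 3: (0 + 1) | (4 - 1) = 3. */
  assert(adaptive.align_samples(0, 16) == 4); /* Render samples 0..3, then filter. */
  assert(adaptive.need_filter(3) == true);
  assert(adaptive.need_filter(4) == false);
  assert(adaptive.align_samples(4, 12) == 4); /* Next filter point is sample 7. */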
diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h
new file mode 100644
index 00000000000..d98edd9894c
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ /* Align the number of samples so that they align with the adaptive filtering.
+ *
+ * Returns a new value for `num_samples` so that after rendering that many samples on top
+ * of `start_sample` filtering is required.
+ *
+ * The alignment happens in a way that allows rendering as many samples as possible without
+ * missing any filtering point. This means that the result is "clamped" by the nearest sample
+ * at which filtering is needed. This is part of the mechanism which ensures that all devices
+ * perform the exact same filtering and adaptive sampling, regardless of their performance.
+ *
+ * `start_sample` is the 0-based index of the sample.
+ *
+ * NOTE: The start sample is included in the number of samples to render. This means that
+ * if the number of samples is 1, then the path tracer will render samples [start_sample],
+ * if the number of samples is 2, then the path tracer will render samples [start_sample,
+ * start_sample + 1] and so on. */
+ int align_samples(int start_sample, int num_samples) const;
+
+ /* Check whether the adaptive sampling filter should happen at this sample.
+ * Returns false if adaptive sampling is not used.
+ *
+ * `sample` is the 0-based index of the sample. */
+ bool need_filter(int sample) const;
+
+ bool use = false;
+ int adaptive_step = 0;
+ int min_samples = 0;
+ float threshold = 0.0f;
+};
+
+CCL_NAMESPACE_END
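As documented above, callers are expected to interleave rendering and filtering. A sketch of such a driver loop (illustration only, not part of this patch), assuming hypothetical helpers render_samples() and run_adaptive_filter():

  AdaptiveSampling adaptive;
  adaptive.use = true;
  adaptive.adaptive_step = 16;
  adaptive.min_samples = 32;

  int start_sample = 0;
  int samples_left = 256;
  while (samples_left > 0) {
    /* Never render past the next filtering point. */
    const int num_samples = adaptive.align_samples(start_sample, samples_left);
    render_samples(start_sample, num_samples); /* Hypothetical helper. */

    const int last_sample = start_sample + num_samples - 1;
    if (adaptive.need_filter(last_sample)) {
      run_adaptive_filter(); /* Hypothetical helper. */
    }

    start_sample += num_samples;
    samples_left -= num_samples;
  }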
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
new file mode 100644
index 00000000000..598bbd497a5
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser.h"
+
+#include "device/device.h"
+#include "integrator/denoiser_oidn.h"
+#include "integrator/denoiser_optix.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params)
+{
+ DCHECK(params.use);
+
+ switch (params.type) {
+ case DENOISER_OPTIX:
+ return make_unique<OptiXDenoiser>(path_trace_device, params);
+
+ case DENOISER_OPENIMAGEDENOISE:
+ return make_unique<OIDNDenoiser>(path_trace_device, params);
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ /* pass */
+ break;
+ }
+
+ LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen.";
+
+ return nullptr;
+}
+
+Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
+ : path_trace_device_(path_trace_device), params_(params)
+{
+ DCHECK(params.use);
+}
+
+void Denoiser::set_params(const DenoiseParams &params)
+{
+ DCHECK_EQ(params.type, params_.type);
+
+ if (params.type == params_.type) {
+ params_ = params;
+ }
+ else {
+ LOG(ERROR) << "Attempt to change denoiser type.";
+ }
+}
+
+const DenoiseParams &Denoiser::get_params() const
+{
+ return params_;
+}
+
+bool Denoiser::load_kernels(Progress *progress)
+{
+ const Device *denoiser_device = ensure_denoiser_device(progress);
+
+ if (!denoiser_device) {
+ path_trace_device_->set_error("No device available to denoise on");
+ return false;
+ }
+
+ VLOG(3) << "Will denoise on " << denoiser_device->info.description << " ("
+ << denoiser_device->info.id << ")";
+
+ return true;
+}
+
+Device *Denoiser::get_denoiser_device() const
+{
+ return denoiser_device_;
+}
+
+/* Check whether the given device is a single device (not a MultiDevice) and supports the requested denoiser. */
+static bool is_single_supported_device(Device *device, DenoiserType type)
+{
+ if (device->info.type == DEVICE_MULTI) {
+ /* Assume multi-device is never created with a single sub-device.
+ * If one requests such configuration it should be checked on the session level. */
+ return false;
+ }
+
+ if (!device->info.multi_devices.empty()) {
+ /* Some configurations will use multi_devices, but keep the type of an individual device.
+ * This does simplify checks for homogeneous setups, but here we really need a single device. */
+ return false;
+ }
+
+ /* Check the denoiser type is supported. */
+ return (device->info.denoisers & type);
+}
+
+/* Find the best suitable device to perform denoising on. Will iterate over possible sub-devices
+ * of a multi-device.
+ *
+ * If there is no device available which supports the given denoiser type, nullptr is returned. */
+static Device *find_best_device(Device *device, DenoiserType type)
+{
+ Device *best_device = nullptr;
+
+ device->foreach_device([&](Device *sub_device) {
+ if ((sub_device->info.denoisers & type) == 0) {
+ return;
+ }
+ if (!best_device) {
+ best_device = sub_device;
+ }
+ else {
+ /* TODO(sergey): Choose fastest device from available ones. Taking into account performance
+ * of the device and data transfer cost. */
+ }
+ });
+
+ return best_device;
+}
+
+static unique_ptr<Device> create_denoiser_device(Device *path_trace_device,
+ const uint device_type_mask)
+{
+ const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask);
+ if (device_infos.empty()) {
+ return nullptr;
+ }
+
+ /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on
+ * a physical CUDA device which is already used for rendering. */
+
+ /* TODO(sergey): Choose fastest device for denoising. */
+
+ const DeviceInfo denoiser_device_info = device_infos.front();
+
+ unique_ptr<Device> denoiser_device(
+ Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler));
+
+ if (!denoiser_device) {
+ return nullptr;
+ }
+
+ if (denoiser_device->have_error()) {
+ return nullptr;
+ }
+
+ /* Only need denoising feature, everything else is unused. */
+ if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) {
+ return nullptr;
+ }
+
+ return denoiser_device;
+}
+
+Device *Denoiser::ensure_denoiser_device(Progress *progress)
+{
+ /* The best device has been found already, avoid sequential lookups.
+ * Additionally, avoid device re-creation if it has failed once. */
+ if (denoiser_device_ || device_creation_attempted_) {
+ return denoiser_device_;
+ }
+
+ /* Simple case: rendering happens on a single device which also supports denoiser. */
+ if (is_single_supported_device(path_trace_device_, params_.type)) {
+ denoiser_device_ = path_trace_device_;
+ return denoiser_device_;
+ }
+
+ /* Find best device from the ones which are already used for rendering. */
+ denoiser_device_ = find_best_device(path_trace_device_, params_.type);
+ if (denoiser_device_) {
+ return denoiser_device_;
+ }
+
+ if (progress) {
+ progress->set_status("Loading denoising kernels (may take a few minutes the first time)");
+ }
+
+ device_creation_attempted_ = true;
+
+ const uint device_type_mask = get_device_type_mask();
+ local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask);
+ denoiser_device_ = local_denoiser_device_.get();
+
+ return denoiser_device_;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h
new file mode 100644
index 00000000000..3101b45e31b
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files
+ * around if a better place is figured out. */
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "util/util_function.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class RenderBuffers;
+class Progress;
+
+/* Implementation of a specific denoising algorithm.
+ *
+ * This class takes care of breaking the denoising algorithm down into a series of device calls
+ * or calls to an external API to denoise the given input.
+ *
+ * TODO(sergey): Are we better with device or a queue here? */
+class Denoiser {
+ public:
+ /* Create denoiser for the given path trace device.
+ *
+ * Notes:
+ * - The denoiser must be configured. This means that `params.use` must be true.
+ * This is checked in debug builds.
+ * - The device might be MultiDevice. */
+ static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual ~Denoiser() = default;
+
+ void set_params(const DenoiseParams &params);
+ const DenoiseParams &get_params() const;
+
+ /* Create devices and load kernels needed for denoising.
+ * The progress is used to communicate state when kernels actually need to be loaded.
+ *
+ * NOTE: The `progress` is an optional argument, can be nullptr. */
+ virtual bool load_kernels(Progress *progress);
+
+ /* Denoise the entire buffer.
+ *
+ * The buffer parameters denote the effective parameters used during rendering. They could
+ * describe a lower-resolution render into a bigger allocated buffer, which is used in the
+ * viewport during navigation and with a non-unit pixel size. Use those instead of
+ * render_buffers->params.
+ *
+ * The buffer might be coming from a "foreign" device, different from the one this denoiser is
+ * created for. This means that in the general case the denoiser will make sure the input data
+ * is available on the denoiser device, perform denoising, and put the data back to the device
+ * where the buffer came from.
+ *
+ * The `num_samples` corresponds to the number of samples in the render buffers. It is used
+ * to scale buffers down to the "final" value in algorithms which don't do automatic exposure,
+ * or which need the "final" value for data passes.
+ *
+ * The `allow_inplace_modification` means that the denoiser is allowed to do in-place
+ * modification of the input passes (e.g. scaling them down). This will lower the memory
+ * footprint of the denoiser but will make the input passes "invalid" from the path tracer's
+ * point of view.
+ *
+ * Returns true when all passes are denoised. Will return false if there is a denoiser error
+ * (for example, caused by a misconfigured denoiser) or when the user requested to cancel
+ * rendering. */
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) = 0;
+
+ /* Get a device which is used to perform actual denoising.
+ *
+ * Notes:
+ *
+ * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then,
+ *
+ * - The device can be different from the path tracing device. This happens, for example, when
+ * using OptiX denoiser and rendering on CPU.
+ *
+ * - No thread safety is ensured by this call. This means that it is up to the caller to ensure
+ * that there is no threading conflict between the denoising task lazily initializing the
+ * device and other access to this device. */
+ Device *get_denoiser_device() const;
+
+ function<bool(void)> is_cancelled_cb;
+
+ bool is_cancelled() const
+ {
+ if (!is_cancelled_cb) {
+ return false;
+ }
+ return is_cancelled_cb();
+ }
+
+ protected:
+ Denoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ /* Make sure denoising device is initialized. */
+ virtual Device *ensure_denoiser_device(Progress *progress);
+
+ /* Get device type mask which is used to filter available devices when new device needs to be
+ * created. */
+ virtual uint get_device_type_mask() const = 0;
+
+ Device *path_trace_device_;
+ DenoiseParams params_;
+
+ /* Cached pointer to the device on which denoising will happen.
+ * Used to avoid lookup of a device for every denoising request. */
+ Device *denoiser_device_ = nullptr;
+
+ /* Denoiser device which was created to perform denoising in the case none of the rendering
+ * devices are capable of denoising. */
+ unique_ptr<Device> local_denoiser_device_;
+ bool device_creation_attempted_ = false;
+};
+
+CCL_NAMESPACE_END
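An illustrative call sequence for the API documented above (a sketch, not part of this patch), assuming `device`, `progress`, `buffer_params`, `render_buffers` and `num_samples` already exist on the caller's side:

  DenoiseParams params;
  params.use = true;
  params.type = DENOISER_OPENIMAGEDENOISE;

  unique_ptr<Denoiser> denoiser = Denoiser::create(device, params);
  denoiser->is_cancelled_cb = [&]() { return progress.get_cancel(); };

  /* Lazily creates the denoising device and loads its kernels. */
  if (denoiser->load_kernels(&progress)) {
    denoiser->denoise_buffer(buffer_params, render_buffers, num_samples, false);
  }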
diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp
new file mode 100644
index 00000000000..8088cfd7800
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_device.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+}
+
+DeviceDenoiser::~DeviceDenoiser()
+{
+ /* Explicit implementation, to allow forward declaration of Device in the header. */
+}
+
+bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ Device *denoiser_device = get_denoiser_device();
+ if (!denoiser_device) {
+ return false;
+ }
+
+ DeviceDenoiseTask task;
+ task.params = params_;
+ task.num_samples = num_samples;
+ task.buffer_params = buffer_params;
+ task.allow_inplace_modification = allow_inplace_modification;
+
+ RenderBuffers local_render_buffers(denoiser_device);
+ bool local_buffer_used = false;
+
+ if (denoiser_device == render_buffers->buffer.device) {
+ /* The device can access an existing buffer pointer. */
+ local_buffer_used = false;
+ task.render_buffers = render_buffers;
+ }
+ else {
+ VLOG(3) << "Creating temporary buffer on denoiser device.";
+
+ DeviceQueue *queue = denoiser_device->get_denoise_queue();
+
+ /* Create a buffer which is accessible by the device used by the denoiser. */
+
+ /* TODO(sergey): Optimize data transfers. For example, only copy denoising-related passes,
+ * ignoring other light and data passes. */
+
+ local_buffer_used = true;
+
+ render_buffers->copy_from_device();
+
+ local_render_buffers.reset(buffer_params);
+
+ /* NOTE: The local buffer is allocated for the exact size of the effective render, while
+ * the input render buffer is allocated for the lowest resolution divider possible. So it is
+ * important to only copy the actually needed part of the input buffer. */
+ memcpy(local_render_buffers.buffer.data(),
+ render_buffers->buffer.data(),
+ sizeof(float) * local_render_buffers.buffer.size());
+
+ queue->copy_to_device(local_render_buffers.buffer);
+
+ task.render_buffers = &local_render_buffers;
+ task.allow_inplace_modification = true;
+ }
+
+ const bool denoise_result = denoiser_device->denoise_buffer(task);
+
+ if (local_buffer_used) {
+ local_render_buffers.copy_from_device();
+
+ render_buffers_host_copy_denoised(
+ render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
+
+ render_buffers->copy_to_device();
+ }
+
+ return denoise_result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h
new file mode 100644
index 00000000000..0fd934dba79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser,
+ * which is implemented as part of the driver of a specific device.
+ *
+ * This implementation makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes the denoising kernel via the device API. */
+class DeviceDenoiser : public Denoiser {
+ public:
+ DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
+ ~DeviceDenoiser();
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
new file mode 100644
index 00000000000..1b5a012ec87
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_oidn.h"
+
+#include <array>
+
+#include "device/device.h"
+#include "device/device_queue.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "render/buffers.h"
+#include "util/util_array.h"
+#include "util/util_logging.h"
+#include "util/util_openimagedenoise.h"
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+thread_mutex OIDNDenoiser::mutex_;
+
+OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+ DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE);
+
+ DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform.";
+}
+
+#ifdef WITH_OPENIMAGEDENOISE
+static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/)
+{
+ OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr);
+ return !oidn_denoiser->is_cancelled();
+}
+#endif
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+class OIDNPass {
+ public:
+ OIDNPass() = default;
+
+ OIDNPass(const BufferParams &buffer_params,
+ const char *name,
+ PassType type,
+ PassMode mode = PassMode::NOISY)
+ : name(name), type(type), mode(mode)
+ {
+ offset = buffer_params.get_pass_offset(type, mode);
+ need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ inline operator bool() const
+ {
+ return name[0] != '\0';
+ }
+
+ /* Name of an image which will be passed to the OIDN library.
+ * Should be one of the following: color, albedo, normal, output.
+ * The albedo and normal images are optional. */
+ const char *name = "";
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ int num_components = -1;
+ bool use_compositing = false;
+ bool use_denoising_albedo = true;
+
+ /* Offset of beginning of this pass in the render buffers. */
+ int offset = -1;
+
+ /* Denotes whether the data is to be scaled down with the number of samples.
+ * This is required for the albedo and normal passes. For the color pass OIDN will perform
+ * auto-exposure, so scaling is not needed unless adaptive sampling is used.
+ *
+ * NOTE: Do not scale the output pass, as that is required to be a pointer in the original
+ * buffer. All the scaling on the output needed for integration with adaptive sampling will
+ * happen outside of generic pass handling. */
+ bool need_scale = false;
+
+ /* The content of the pass has been pre-filtered. */
+ bool is_filtered = false;
+
+ /* For the scaled passes, the data which holds values of scaled pixels. */
+ array<float> scaled_buffer;
+};
+
+class OIDNDenoiseContext {
+ public:
+ OIDNDenoiseContext(OIDNDenoiser *denoiser,
+ const DenoiseParams &denoise_params,
+ const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ const bool allow_inplace_modification)
+ : denoiser_(denoiser),
+ denoise_params_(denoise_params),
+ buffer_params_(buffer_params),
+ render_buffers_(render_buffers),
+ num_samples_(num_samples),
+ allow_inplace_modification_(allow_inplace_modification),
+ pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT))
+ {
+ if (denoise_params_.use_pass_albedo) {
+ oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO);
+ }
+
+ if (denoise_params_.use_pass_normal) {
+ oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL);
+ }
+ }
+
+ bool need_denoising() const
+ {
+ if (buffer_params_.width == 0 && buffer_params_.height == 0) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /* Make the guiding passes available for the sequential denoising of the various passes. */
+ void read_guiding_passes()
+ {
+ read_guiding_pass(oidn_albedo_pass_);
+ read_guiding_pass(oidn_normal_pass_);
+ }
+
+ void denoise_pass(const PassType pass_type)
+ {
+ OIDNPass oidn_color_pass(buffer_params_, "color", pass_type);
+ if (oidn_color_pass.offset == PASS_UNUSED) {
+ return;
+ }
+
+ if (oidn_color_pass.use_denoising_albedo) {
+ if (albedo_replaced_with_fake_) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+
+ OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED);
+ if (oidn_output_pass.offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass);
+
+ oidn::DeviceRef oidn_device = oidn::newDevice();
+ oidn_device.commit();
+
+ /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too.
+ */
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_input_pass(oidn_filter, oidn_color_access_pass);
+ set_guiding_passes(oidn_filter, oidn_color_pass);
+ set_output_pass(oidn_filter, oidn_output_pass);
+ oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_);
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE ||
+ denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) {
+ oidn_filter.set("cleanAux", true);
+ }
+ oidn_filter.commit();
+
+ filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_);
+ filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_);
+
+ /* Filter the beauty image. */
+ oidn_filter.execute();
+
+ /* Check for errors. */
+ const char *error_message;
+ const oidn::Error error = oidn_device.getError(error_message);
+ if (error != oidn::Error::None && error != oidn::Error::Cancelled) {
+ LOG(ERROR) << "OpenImageDenoise error: " << error_message;
+ }
+
+ postprocess_output(oidn_color_pass, oidn_output_pass);
+ }
+
+ protected:
+ void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass)
+ {
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass ||
+ oidn_pass.is_filtered) {
+ return;
+ }
+
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_pass(oidn_filter, oidn_pass);
+ set_output_pass(oidn_filter, oidn_pass);
+ oidn_filter.commit();
+ oidn_filter.execute();
+
+ oidn_pass.is_filtered = true;
+ }
+
+ /* Make pixels of a guiding pass available to the denoiser. */
+ void read_guiding_pass(OIDNPass &oidn_pass)
+ {
+ if (!oidn_pass) {
+ return;
+ }
+
+ DCHECK(!oidn_pass.use_compositing);
+
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE &&
+ !is_pass_scale_needed(oidn_pass)) {
+ /* Pass data is available as-is from the render buffers. */
+ return;
+ }
+
+ if (allow_inplace_modification_) {
+ scale_pass_in_render_buffers(oidn_pass);
+ return;
+ }
+
+ read_pass_pixels_into_buffer(oidn_pass);
+ }
+
+ /* Special reader of the input pass.
+ * To save memory it will read pixels into the output, and let the denoiser perform an
+ * in-place operation. */
+ OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ const bool use_compositing = oidn_input_pass.use_compositing;
+
+ /* Simple case: no compositing is involved, no scaling is needed.
+ * The pass pixels will be referenced as-is, without extra processing. */
+ if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) {
+ return oidn_input_pass;
+ }
+
+ float *buffer_data = render_buffers_->buffer.data();
+ float *pass_data = buffer_data + oidn_output_pass.offset;
+
+ PassAccessor::Destination destination(pass_data, 3);
+ destination.pixel_stride = buffer_params_.pass_stride;
+
+ read_pass_pixels(oidn_input_pass, destination);
+
+ OIDNPass oidn_input_pass_at_output = oidn_input_pass;
+ oidn_input_pass_at_output.offset = oidn_output_pass.offset;
+
+ return oidn_input_pass_at_output;
+ }
+
+ /* Read pass pixels using PassAccessor into the given destination. */
+ void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination)
+ {
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = oidn_pass.type;
+ pass_access_info.mode = oidn_pass.mode;
+ pass_access_info.offset = oidn_pass.offset;
+
+ /* Denoiser operates on passes which are used to calculate the approximation, and is never used
+ * on the approximation. The latter is not even possible because OIDN does not support
+ * denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* OIDN will perform auto-exposure, so it is not required to know the exact exposure configured
+ * by the user. What is important is to use the same exposure for read and write access of the
+ * pass pixels. */
+ const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_);
+
+ pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination);
+ }
+
+ /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. */
+ void read_pass_pixels_into_buffer(OIDNPass &oidn_pass)
+ {
+ VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " ("
+ << pass_type_as_string(oidn_pass.type) << ")";
+
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ array<float> &scaled_buffer = oidn_pass.scaled_buffer;
+ scaled_buffer.resize(width * height * 3);
+
+ const PassAccessor::Destination destination(scaled_buffer.data(), 3);
+
+ read_pass_pixels(oidn_pass, destination);
+ }
+
+ /* Set OIDN image to reference pixels from the given render buffer pass.
+ * No transform to the pixels is done, no additional memory is used. */
+ void set_pass_referenced(oidn::FilterRef &oidn_filter,
+ const char *name,
+ const OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+
+ const int64_t pixel_index = offset + x + y * stride;
+ const int64_t buffer_offset = pixel_index * pass_stride;
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ oidn_filter.setImage(name,
+ buffer_data + buffer_offset + oidn_pass.offset,
+ oidn::Format::Float3,
+ width,
+ height,
+ 0,
+ pass_stride * sizeof(float),
+ stride * pass_stride * sizeof(float));
+ }
+
+ void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ oidn_filter.setImage(
+ name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0);
+ }
+
+ void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+ void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ if (oidn_pass.scaled_buffer.empty()) {
+ set_pass_referenced(oidn_filter, name, oidn_pass);
+ }
+ else {
+ set_pass_from_buffer(oidn_filter, name, oidn_pass);
+ }
+ }
+
+ void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+
+ void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ if (oidn_albedo_pass_) {
+ if (oidn_pass.use_denoising_albedo) {
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+ else {
+ /* NOTE: The OpenImageDenoise library implicitly expects an albedo pass when a normal pass
+ * has been provided. */
+ set_fake_albedo_pass(oidn_filter);
+ }
+ }
+
+ if (oidn_normal_pass_) {
+ set_pass(oidn_filter, oidn_normal_pass_);
+ }
+ }
+
+ void set_fake_albedo_pass(oidn::FilterRef &oidn_filter)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ if (!albedo_replaced_with_fake_) {
+ const int64_t num_pixel_components = width * height * 3;
+ oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components);
+
+ for (int i = 0; i < num_pixel_components; ++i) {
+ oidn_albedo_pass_.scaled_buffer[i] = 0.5f;
+ }
+
+ albedo_replaced_with_fake_ = true;
+ }
+
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+
+ void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, "output", oidn_pass);
+ }
+
+ /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel
+ * back. */
+ void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components);
+
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+ const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing;
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *denoised_pixel = buffer_pixel + oidn_output_pass.offset;
+
+ if (need_scale) {
+ const float pixel_scale = has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_;
+
+ denoised_pixel[0] = denoised_pixel[0] * pixel_scale;
+ denoised_pixel[1] = denoised_pixel[1] * pixel_scale;
+ denoised_pixel[2] = denoised_pixel[2] * pixel_scale;
+ }
+
+ if (oidn_output_pass.num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!oidn_input_pass.use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). Implicitly rely on this, as it
+ * simplifies the logic and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assigning to zero since this is a default alpha value for 3-component passes, and it
+ * is an opaque pixel for 4 component passes. */
+ denoised_pixel[3] = 0;
+ }
+ }
+ }
+ }
+
+ bool is_pass_scale_needed(OIDNPass &oidn_pass) const
+ {
+ if (pass_sample_count_ != PASS_UNUSED) {
+ /* With adaptive sampling, pixels will have a different number of samples in them, so we need
+ * to always scale the pass to make the pixels uniformly sampled. */
+ return true;
+ }
+
+ if (!oidn_pass.need_scale) {
+ return false;
+ }
+
+ if (num_samples_ == 1) {
+ /* Avoid scaling if there is only one sample, to save time (so we don't divide the
+ * buffer by 1). */
+ return false;
+ }
+
+ return true;
+ }
+
+ void scale_pass_in_render_buffers(OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *pass_pixel = buffer_pixel + oidn_pass.offset;
+
+ const float pixel_scale = 1.0f / (has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_);
+
+ pass_pixel[0] = pass_pixel[0] * pixel_scale;
+ pass_pixel[1] = pass_pixel[1] * pixel_scale;
+ pass_pixel[2] = pass_pixel[2] * pixel_scale;
+ }
+ }
+ }
+
+ OIDNDenoiser *denoiser_ = nullptr;
+
+ const DenoiseParams &denoise_params_;
+ const BufferParams &buffer_params_;
+ RenderBuffers *render_buffers_ = nullptr;
+ int num_samples_ = 0;
+ bool allow_inplace_modification_ = false;
+ int pass_sample_count_ = PASS_UNUSED;
+
+ /* Optional albedo and normal passes, reused by denoising of different pass types. */
+ OIDNPass oidn_albedo_pass_;
+ OIDNPass oidn_normal_pass_;
+
+ /* For passes which don't need albedo channel for denoising we replace the actual albedo with
+ * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+ * the fake values and denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake_ = false;
+};
+#endif
+
+static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers)
+{
+ Device *device = render_buffers->buffer.device;
+ if (device->info.has_gpu_queue) {
+ return device->gpu_queue_create();
+ }
+ return nullptr;
+}
+
+static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_from_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_from_device();
+ }
+}
+
+static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_to_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_to_device();
+ }
+}
+
+bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ thread_scoped_lock lock(mutex_);
+
+ /* Make sure the host-side data is available for denoising. */
+ unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers);
+ copy_render_buffers_from_device(queue, render_buffers);
+
+#ifdef WITH_OPENIMAGEDENOISE
+ OIDNDenoiseContext context(
+ this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification);
+
+ if (context.need_denoising()) {
+ context.read_guiding_passes();
+
+ const std::array<PassType, 3> passes = {
+ {/* Passes which will use real albedo when it is available. */
+ PASS_COMBINED,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ /* Passes which do not need albedo and hence if real is present it needs to become fake.
+ */
+ PASS_SHADOW_CATCHER}};
+
+ for (const PassType pass_type : passes) {
+ context.denoise_pass(pass_type);
+ if (is_cancelled()) {
+ return false;
+ }
+ }
+
+ /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code
+ * copies data from the device it doesn't overwrite the denoiser buffers. */
+ copy_render_buffers_to_device(queue, render_buffers);
+ }
+#endif
+
+ /* This code is not supposed to run when compiled without OIDN support, so we can assume that
+ * if we made it here, all passes are properly denoised. */
+ return true;
+}
+
+uint OIDNDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_CPU;
+}
+
+CCL_NAMESPACE_END
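For reference, a standalone sketch of the bare OpenImageDenoise calls that OIDNDenoiseContext wraps (not part of this patch), assuming `color`, `albedo`, `normal` and `output` are caller-owned float3 buffers of width * height pixels:

  #include <OpenImageDenoise/oidn.hpp>

  oidn::DeviceRef oidn_device = oidn::newDevice();
  oidn_device.commit();

  oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
  oidn_filter.setImage("color", color, oidn::Format::Float3, width, height);
  oidn_filter.setImage("albedo", albedo, oidn::Format::Float3, width, height);
  oidn_filter.setImage("normal", normal, oidn::Format::Float3, width, height);
  oidn_filter.setImage("output", output, oidn::Format::Float3, width, height);
  oidn_filter.set("hdr", true);
  oidn_filter.set("srgb", false);
  oidn_filter.commit();
  oidn_filter.execute();

  const char *error_message;
  if (oidn_device.getError(error_message) != oidn::Error::None) {
    /* Report error_message. */
  }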
diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h
new file mode 100644
index 00000000000..566e761ae79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of denoising API which uses OpenImageDenoise library. */
+class OIDNDenoiser : public Denoiser {
+ public:
+ /* Forward-declared state which might be using compile-flag-specific fields, such as
+ * OpenImageDenoise device and filter handles. */
+ class State;
+
+ OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+
+ /* We only perform one denoising at a time, since OpenImageDenoise itself is multithreaded.
+ * Use this mutex whenever images are passed to OIDN and need to be denoised. */
+ static thread_mutex mutex_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_optix.cpp b/intern/cycles/integrator/denoiser_optix.cpp
new file mode 100644
index 00000000000..5f9de23bfe6
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_optix.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : DeviceDenoiser(path_trace_device, params)
+{
+}
+
+uint OptiXDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_OPTIX;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_optix.h b/intern/cycles/integrator/denoiser_optix.h
new file mode 100644
index 00000000000..a8df770ecf7
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser_device.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDenoiser : public DeviceDenoiser {
+ public:
+ OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
new file mode 100644
index 00000000000..87c048b1fa5
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/kernel_types.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Pass input information.
+ */
+
+PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass)
+ : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass destination.
+ */
+
+PassAccessor::Destination::Destination(float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
+ : Destination(pass_type)
+{
+ pixels_half_rgba = pixels;
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type)
+{
+ const PassInfo pass_info = Pass::get_info(pass_type);
+ num_components = pass_info.num_components;
+}
+
+/* --------------------------------------------------------------------
+ * Pass source.
+ */
+
+PassAccessor::Source::Source(const float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessor.
+ */
+
+PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples)
+ : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples)
+{
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ return get_render_tile_pixels(render_buffers, render_buffers->params, destination);
+}
+
+static void pad_pixels(const BufferParams &buffer_params,
+ const PassAccessor::Destination &destination,
+ const int src_num_components)
+{
+ /* When requesting a single channel pass as RGBA, or RGB pass as RGBA,
+ * fill in the additional components for convenience. */
+ const int dest_num_components = destination.num_components;
+
+ if (src_num_components >= dest_num_components) {
+ return;
+ }
+
+ const size_t size = buffer_params.width * buffer_params.height;
+ if (destination.pixels) {
+ float *pixel = destination.pixels;
+
+ for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[1] = pixel[0];
+ pixel[2] = pixel[0];
+ }
+ if (dest_num_components >= 4) {
+ pixel[3] = 1.0f;
+ }
+ }
+ }
+
+ if (destination.pixels_half_rgba) {
+ const half one = float_to_half(1.0f);
+ half4 *pixel = destination.pixels_half_rgba;
+
+ for (size_t i = 0; i < size; i++, pixel++) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[0].y = pixel[0].x;
+ pixel[0].z = pixel[0].x;
+ }
+ if (dest_num_components >= 4) {
+ pixel[0].w = one;
+ }
+ }
+ }
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ if (pass_access_info_.offset == PASS_UNUSED) {
+ return false;
+ }
+
+ const PassType type = pass_access_info_.type;
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
+
+ if (pass_info.num_components == 1) {
+ /* Single channel passes. */
+ if (mode == PassMode::DENOISED) {
+ /* Denoised passes store their final pixels, no need for a special calculation. */
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_RENDER_TIME) {
+ /* TODO(sergey): Needs implementation. */
+ }
+ else if (type == PASS_DEPTH) {
+ get_pass_depth(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_MIST) {
+ get_pass_mist(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SAMPLE_COUNT) {
+ get_pass_sample_count(render_buffers, buffer_params, destination);
+ }
+ else {
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ }
+ else if (type == PASS_MOTION) {
+ /* Motion pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components";
+ get_pass_motion(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_CRYPTOMATTE) {
+ /* Cryptomatte pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components";
+ get_pass_cryptomatte(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* RGB, RGBA and vector passes. */
+ DCHECK(destination.num_components == 3 || destination.num_components == 4)
+ << pass_type_as_string(type) << " pass must have 3 or 4 components";
+
+ if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) {
+      /* Denoised matte with shadow needs extra calculation (it will use the denoised shadow
+       * catcher pass to approximate the shadow). */
+ get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) {
+ /* Shadow catcher pass. */
+ get_pass_shadow_catcher(render_buffers, buffer_params, destination);
+ }
+ else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE ||
+ pass_info.indirect_type != PASS_NONE) &&
+ mode != PassMode::DENOISED) {
+ /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */
+ get_pass_light_path(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes that need no special computation, or denoised passes that already
+ * had the computation done. */
+ if (pass_info.num_components == 3) {
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (pass_info.num_components == 4) {
+ if (destination.num_components == 3) {
+ /* Special case for denoiser access of RGBA passes ignoring alpha channel. */
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER ||
+ type == PASS_SHADOW_CATCHER_MATTE) {
+ /* Passes with transparency as 4th component. */
+ get_pass_combined(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes with alpha as 4th component. */
+ get_pass_float4(render_buffers, buffer_params, destination);
+ }
+ }
+ }
+ }
+
+ pad_pixels(buffer_params, destination, pass_info.num_components);
+
+ return true;
+}
+
+void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ kfilm_convert->pass_offset = pass_access_info_.offset;
+ kfilm_convert->pass_stride = buffer_params.pass_stride;
+
+ kfilm_convert->pass_use_exposure = pass_info.use_exposure;
+ kfilm_convert->pass_use_filter = pass_info.use_filter;
+
+  /* TODO(sergey): Some of the passes need to become denoised when the denoised pass is accessed. */
+ if (pass_info.direct_type != PASS_NONE) {
+ kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type);
+ }
+ kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type);
+ kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type);
+
+ kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED);
+ kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset(
+ PASS_ADAPTIVE_AUX_BUFFER);
+ kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT);
+ kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode);
+ kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_MATTE, mode);
+
+ /* Background is not denoised, so always use noisy pass. */
+ kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);
+
+ if (pass_info.use_filter) {
+ kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+ }
+ else {
+ kfilm_convert->scale = 1.0f;
+ }
+
+ if (pass_info.use_exposure) {
+ kfilm_convert->exposure = exposure_;
+ }
+ else {
+ kfilm_convert->exposure = 1.0f;
+ }
+
+ kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure;
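+  /* Illustrative example: with 16 samples accumulated in the buffer and `use_filter` enabled,
+   * scale = 1/16 and scale_exposure = exposure/16, mapping accumulated pass values to the
+   * final per-pixel result. */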
+
+ kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher;
+ kfilm_convert->use_approximate_shadow_catcher_background =
+ pass_access_info_.use_approximate_shadow_catcher_background;
+ kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels;
+
+ kfilm_convert->num_components = destination.num_components;
+ kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ kfilm_convert->is_denoised = (mode == PassMode::DENOISED);
+}
+
+bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source)
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ const BufferParams &buffer_params = render_buffers->params;
+
+ float *buffer_data = render_buffers->buffer.data();
+ const int size = buffer_params.width * buffer_params.height;
+
+ const int out_stride = buffer_params.pass_stride;
+ const int in_stride = source.num_components;
+ const int num_components_to_copy = min(source.num_components, pass_info.num_components);
+
+ float *out = buffer_data + pass_access_info_.offset;
+ const float *in = source.pixels + source.offset * in_stride;
+
+ for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
+ memcpy(out, in, sizeof(float) * num_components_to_copy);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
new file mode 100644
index 00000000000..624bf7d0b2c
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which provides access to pass data.
+ * It is designed to be created once, when the pass data is known, after which pixels are
+ * progressively updated from the various render buffers. */
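+/* Minimal usage sketch (illustrative only; `buffer_pass`, `render_buffers` and `pixels` are
+ * assumed to come from the caller):
+ *
+ *   const PassAccessor::PassAccessInfo access_info(buffer_pass);
+ *   PassAccessorCPU accessor(access_info, exposure, num_samples);
+ *
+ *   PassAccessor::Destination destination(pixels, 4);
+ *   accessor.get_render_tile_pixels(render_buffers, destination);
+ */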
+class PassAccessor {
+ public:
+ class PassAccessInfo {
+ public:
+ PassAccessInfo() = default;
+ explicit PassAccessInfo(const BufferPass &pass);
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ bool include_albedo = false;
+ int offset = -1;
+
+    /* For the shadow catcher matte pass: whether to fold an approximation of the shadow catcher
+     * pass into its matte pass, so that both artificial objects and shadows can be alpha-overed
+     * onto a backdrop. */
+ bool use_approximate_shadow_catcher = false;
+
+    /* When the approximate shadow catcher matte is used, alpha-over the result on top of the
+     * background. */
+ bool use_approximate_shadow_catcher_background = false;
+
+ bool show_active_pixels = false;
+ };
+
+ class Destination {
+ public:
+ Destination() = default;
+ Destination(float *pixels, int num_components);
+ Destination(const PassType pass_type, half4 *pixels);
+
+ /* Destination will be initialized with the number of components which is native for the given
+ * pass type. */
+ explicit Destination(const PassType pass_type);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ float *pixels = nullptr;
+ half4 *pixels_half_rgba = nullptr;
+
+ /* Device-side pointers. */
+ device_ptr d_pixels = 0;
+ device_ptr d_pixels_half_rgba = 0;
+
+ /* Number of components per pixel in the floating-point destination.
+ * Is ignored for half4 destination (where number of components is implied to be 4). */
+ int num_components = 0;
+
+    /* Offset in pixels from the beginning of the pixels storage.
+     * Allows writing pixels of the render buffer into a partial slice of the destination. */
+ int offset = 0;
+
+    /* Number of floats per pixel. When zero, it is the same as `num_components`.
+ *
+ * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component
+ * half-floats. */
+ int pixel_stride = 0;
+
+ /* Row stride in pixel elements:
+ * - For the float destination stride is a number of floats per row.
+ * - For the half4 destination stride is a number of half4 per row. */
+ int stride = 0;
+ };
+
+ class Source {
+ public:
+ Source() = default;
+ Source(const float *pixels, int num_components);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ const float *pixels = nullptr;
+ int num_components = 0;
+
+    /* Offset in pixels from the beginning of the pixels storage.
+     * Allows reading pixels of the pass from a partial slice of the source storage. */
+ int offset = 0;
+ };
+
+ PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples);
+
+ virtual ~PassAccessor() = default;
+
+ /* Get pass data from the given render buffers, perform needed filtering, and store result into
+ * the pixels.
+ * The result is stored sequentially starting from the very beginning of the pixels memory. */
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const;
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+ /* Set pass data for the given render buffers. Used for baking to read from passes. */
+ bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source);
+
+ protected:
+ virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const = 0;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+
+ PassAccessInfo pass_access_info_;
+
+ float exposure_ = 0.0f;
+ int num_samples_ = 0;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
new file mode 100644
index 00000000000..3c6691f6d43
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_film.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Kernel processing.
+ */
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ if (destination.pixels) {
+ /* NOTE: No overlays are applied since they are not used for final renders.
+ * Can be supported via some sort of specialization to avoid code duplication. */
+
+ run_get_pass_kernel_processor_float(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+
+ if (destination.pixels_half_rgba) {
+ /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
+
+ if (destination.num_components == 1) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float pixel;
+ processor(kfilm_convert, buffer, &pixel);
+
+ pixel_rgba[0] = pixel;
+ pixel_rgba[1] = pixel;
+ pixel_rgba[2] = pixel;
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 3) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 4) {
+ run_get_pass_kernel_processor_half_rgba(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+ }
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ const float *buffer_data = render_buffers->buffer.data();
+ const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
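+  /* Each source pixel occupies `pass_stride` floats in the render buffer, while the destination
+   * is densely packed with `pixel_stride` floats per pixel, starting `destination.offset`
+   * pixels into the destination storage. */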
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+ float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+ }
+ });
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ const float *buffer_data = render_buffers->buffer.data();
+
+ half4 *dst_start = destination.pixels_half_rgba + destination.offset;
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ half4 *dst_row_start = dst_start + y * destination_stride;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ half4 *pixel_half_rgba = dst_row_start + x;
+ float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+ }
+ });
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass) \
+ void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_get_pass_kernel_processor( \
+ render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth)
+DEFINE_PASS_ACCESSOR(mist)
+DEFINE_PASS_ACCESSOR(sample_count)
+DEFINE_PASS_ACCESSOR(float)
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path)
+DEFINE_PASS_ACCESSOR(shadow_catcher)
+DEFINE_PASS_ACCESSOR(float3)
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion)
+DEFINE_PASS_ACCESSOR(cryptomatte)
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+DEFINE_PASS_ACCESSOR(combined)
+DEFINE_PASS_ACCESSOR(float4)
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
new file mode 100644
index 00000000000..0313dc5bb0d
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelFilmConvert;
+
+/* Pass accessor implementation for CPU side. */
+class PassAccessorCPU : public PassAccessor {
+ public:
+ using PassAccessor::PassAccessor;
+
+ protected:
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
new file mode 100644
index 00000000000..eb80ba99655
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_gpu.h"
+
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples)
+ : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue)
+
+{
+}
+
+/* --------------------------------------------------------------------
+ * Kernel execution.
+ */
+
+void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
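+  /* Kernel arguments are passed as an array of pointers to the individual values, in the order
+   * the film convert kernels expect them. */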
+ if (destination.d_pixels) {
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+ if (destination.d_pixels_half_rgba) {
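+    /* The half-float variant of each film convert kernel is assumed to directly follow its
+     * float counterpart in the DeviceKernel enumeration, hence `kernel + 1`. */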
+ const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel_half_float, work_size, args);
+ }
+
+ queue_->synchronize();
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \
+ void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_film_convert_kernels( \
+ DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth, DEPTH);
+DEFINE_PASS_ACCESSOR(mist, MIST);
+DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT);
+DEFINE_PASS_ACCESSOR(float, FLOAT);
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
+DEFINE_PASS_ACCESSOR(float3, FLOAT3);
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion, MOTION);
+DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE);
+DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER);
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW);
+DEFINE_PASS_ACCESSOR(combined, COMBINED);
+DEFINE_PASS_ACCESSOR(float4, FLOAT4);
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h
new file mode 100644
index 00000000000..bc37e4387f3
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+/* Pass accessor implementation for GPU side. */
+class PassAccessorGPU : public PassAccessor {
+ public:
+ PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples);
+
+ protected:
+ void run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth);
+ DECLARE_PASS_ACCESSOR(mist);
+ DECLARE_PASS_ACCESSOR(sample_count);
+ DECLARE_PASS_ACCESSOR(float);
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path);
+ DECLARE_PASS_ACCESSOR(float3);
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion);
+ DECLARE_PASS_ACCESSOR(cryptomatte);
+ DECLARE_PASS_ACCESSOR(shadow_catcher);
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow);
+ DECLARE_PASS_ACCESSOR(combined);
+ DECLARE_PASS_ACCESSOR(float4);
+
+#undef DECLARE_PASS_ACCESSOR
+
+ DeviceQueue *queue_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
new file mode 100644
index 00000000000..6c02316ac2b
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -0,0 +1,1147 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace.h"
+
+#include "device/cpu/device.h"
+#include "device/device.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/render_scheduler.h"
+#include "render/gpu_display.h"
+#include "render/pass.h"
+#include "render/scene.h"
+#include "render/tile.h"
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTrace::PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager)
+ : device_(device),
+ device_scene_(device_scene),
+ render_scheduler_(render_scheduler),
+ tile_manager_(tile_manager)
+{
+ DCHECK_NE(device_, nullptr);
+
+ {
+ vector<DeviceInfo> cpu_devices;
+ device_cpu_info(cpu_devices);
+
+ cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler));
+ }
+
+ /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
+ * as possible. */
+ device_->foreach_device([&](Device *path_trace_device) {
+ path_trace_works_.emplace_back(PathTraceWork::create(
+ path_trace_device, film, device_scene, &render_cancel_.is_requested));
+ });
+
+ work_balance_infos_.resize(path_trace_works_.size());
+ work_balance_do_initial(work_balance_infos_);
+
+ render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+ /* Destroy any GPU resource which was used for graphics interop.
+ * Need to have access to the GPUDisplay as it is the only source of drawing context which is
+ * used for interop. */
+ if (gpu_display_) {
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->destroy_gpu_resources(gpu_display_.get());
+ }
+ }
+}
+
+void PathTrace::load_kernels()
+{
+ if (denoiser_) {
+ denoiser_->load_kernels(progress_);
+ }
+}
+
+void PathTrace::alloc_work_memory()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->alloc_work_memory();
+ }
+}
+
+bool PathTrace::ready_to_reset()
+{
+  /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+   * display. If there is no such display, the logic here will break. */
+ DCHECK(gpu_display_);
+
+  /* The logic here tries to provide behavior which feels the most interactive to artists.
+   * The general idea is to be able to reset as quickly as possible, while still providing an
+   * interactive feel.
+   *
+   * If the render result was ever drawn after the previous reset, consider that a reset is now
+   * possible. This way camera navigation gives the quickest feedback of rendered pixels,
+   * regardless of whether the CPU or GPU drawing pipeline is used.
+   *
+   * A reset which happens after a redraw is considered "slow" enough to not clog anything. This
+   * is a bit arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+ if (did_draw_after_reset_) {
+ return true;
+ }
+
+ return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+ if (big_tile_params_.modified(big_tile_params)) {
+ big_tile_params_ = big_tile_params;
+ render_state_.need_reset_params = true;
+ }
+
+ full_params_ = full_params;
+
+  /* NOTE: The GPU display checks for buffer modification and avoids unnecessary re-allocation.
+   * It is still required to inform it about the reset whenever it happens, so that the redraw
+   * state tracking is properly updated. */
+ if (gpu_display_) {
+ gpu_display_->reset(full_params);
+ }
+
+ render_state_.has_denoised_result = false;
+ render_state_.tile_written = false;
+
+ did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+ /* Free render buffers used by the path trace work to reduce memory peak. */
+ BufferParams empty_params;
+ empty_params.pass_stride = 0;
+ empty_params.update_offset_stride();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->get_render_buffers()->reset(empty_params);
+ }
+ render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+ progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+ /* Indicate that rendering has started and that it can be requested to cancel. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+ render_cancel_.is_rendering = true;
+ }
+
+ render_pipeline(render_work);
+
+  /* Indicate that rendering has finished, so that the thread which requested `cancel()` can
+   * carry on. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ render_cancel_.is_rendering = false;
+ render_cancel_.condition.notify_one();
+ }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+  /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+   * checked in Session, and in the event of such a cancel the work is still finished here. */
+
+ render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+ 0);
+
+ render_init_kernel_execution();
+
+ render_scheduler_.report_work_begin(render_work);
+
+ init_render_buffers(render_work);
+
+ rebalance(render_work);
+
+ path_trace(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ adaptive_sample(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ cryptomatte_postprocess(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ denoise(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ write_tile_buffer(render_work);
+ update_display(render_work);
+
+ progress_update_if_needed(render_work);
+
+ finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->init_execution();
+ }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. It should not have a
+ * measurable performance impact at runtime, but will make compilation faster and the binary
+ * somewhat smaller. */
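+/* Slice the big-tile buffer into per-device horizontal bands proportional to the balance
+ * weights, and invoke the callback with the resulting buffer parameters for every work.
+ * Illustrative example: with a height of 100 and weights {0.75, 0.25} the first work covers
+ * scanlines [0, 75) and the second the remaining [75, 100). */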
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+ const vector<WorkBalanceInfo> &work_balance_infos,
+ const BufferParams &buffer_params,
+ const Callback &callback)
+{
+ const int num_works = path_trace_works.size();
+ const int height = buffer_params.height;
+
+ int current_y = 0;
+ for (int i = 0; i < num_works; ++i) {
+ const double weight = work_balance_infos[i].weight;
+ const int slice_height = max(lround(height * weight), 1);
+
+ /* Disallow negative values to deal with situations when there are more compute devices than
+ * scanlines. */
+ const int remaining_height = max(0, height - current_y);
+
+ BufferParams slide_params = buffer_params;
+ slide_params.full_y = buffer_params.full_y + current_y;
+ if (i < num_works - 1) {
+ slide_params.height = min(slice_height, remaining_height);
+ }
+ else {
+ slide_params.height = remaining_height;
+ }
+
+ slide_params.update_offset_stride();
+
+ callback(path_trace_works[i].get(), slide_params);
+
+ current_y += slide_params.height;
+ }
+}
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ big_tile_params_,
+ [](PathTraceWork *path_trace_work, const BufferParams &params) {
+ RenderBuffers *buffers = path_trace_work->get_render_buffers();
+ buffers->reset(params);
+ });
+}
+
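+/* Scale the buffer parameters by the given resolution divider.
+ * Illustrative example: a 1920x1080 buffer with a resolution divider of 4 becomes 480x270. */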
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+ BufferParams scaled_params = params;
+
+ scaled_params.width = max(1, params.width / resolution_divider);
+ scaled_params.height = max(1, params.height / resolution_divider);
+ scaled_params.full_x = params.full_x / resolution_divider;
+ scaled_params.full_y = params.full_y / resolution_divider;
+ scaled_params.full_width = params.full_width / resolution_divider;
+ scaled_params.full_height = params.full_height / resolution_divider;
+
+ scaled_params.update_offset_stride();
+
+ return scaled_params;
+}
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+ const int resolution_divider = render_work.resolution_divider;
+
+ const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+ const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+ resolution_divider);
+
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ scaled_big_tile_params,
+ [&](PathTraceWork *path_trace_work, const BufferParams params) {
+ path_trace_work->set_effective_buffer_params(
+ scaled_full_params, scaled_big_tile_params, params);
+ });
+
+ render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+ if (render_state_.need_reset_params) {
+ update_allocated_work_buffer_params();
+ }
+
+ if (render_state_.need_reset_params ||
+ render_state_.resolution_divider != render_work.resolution_divider) {
+ update_effective_work_buffer_params(render_work);
+ }
+
+ render_state_.resolution_divider = render_work.resolution_divider;
+ render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+ update_work_buffer_params_if_needed(render_work);
+
+ /* Handle initialization scheduled by the render scheduler. */
+ if (render_work.init_render_buffers) {
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->zero_render_buffers();
+ });
+
+ tile_buffer_read();
+ }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+ if (!render_work.path_trace.num_samples) {
+ return;
+ }
+
+ VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+ << " samples at the resolution divider " << render_work.resolution_divider;
+
+ const double start_time = time_dt();
+
+ const int num_works = path_trace_works_.size();
+
+ tbb::parallel_for(0, num_works, [&](int i) {
+ const double work_start_time = time_dt();
+ const int num_samples = render_work.path_trace.num_samples;
+
+ PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+ PathTraceWork::RenderStatistics statistics;
+ path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+ const double work_time = time_dt() - work_start_time;
+ work_balance_infos_[i].time_spent += work_time;
+ work_balance_infos_[i].occupancy = statistics.occupancy;
+
+ VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ << work_time / num_samples
+ << " seconds per sample), occupancy: " << statistics.occupancy;
+ });
+
+ float occupancy_accum = 0.0f;
+ for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+ occupancy_accum += balance_info.occupancy;
+ }
+ const float occupancy = occupancy_accum / num_works;
+ render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+ render_scheduler_.report_path_trace_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+ if (!render_work.adaptive_sampling.filter) {
+ return;
+ }
+
+ bool did_reschedule_on_idle = false;
+
+ while (true) {
+ VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+ << render_work.adaptive_sampling.threshold;
+ if (render_work.adaptive_sampling.reset) {
+ VLOG(3) << "Will re-calculate convergency flag for currently converged pixels.";
+ }
+
+ const double start_time = time_dt();
+
+ uint num_active_pixels = 0;
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ const uint num_active_pixels_in_work =
+ path_trace_work->adaptive_sampling_converge_filter_count_active(
+ render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+ if (num_active_pixels_in_work) {
+ atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+ }
+ });
+
+ render_scheduler_.report_adaptive_filter_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+
+ if (num_active_pixels == 0) {
+ VLOG(3) << "All pixels converged.";
+ if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+ break;
+ }
+ VLOG(3) << "Continuing with lower threshold.";
+ }
+ else if (did_reschedule_on_idle) {
+ break;
+ }
+ else if (num_active_pixels < 128 * 128) {
+ /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep GPU busy so that
+ * there is no performance loss from the progressive noise floor feature.
+ *
+ * A better heuristic is possible here: for example, use maximum of 128^2 and percentage of
+ * the final resolution. */
+ if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+ VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+ break;
+ }
+ VLOG(3) << "Rescheduling lower threshold.";
+ did_reschedule_on_idle = true;
+ }
+ else {
+ break;
+ }
+ }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+ render_scheduler_.set_denoiser_params(params);
+
+ if (!params.use) {
+ denoiser_.reset();
+ return;
+ }
+
+ if (denoiser_) {
+ const DenoiseParams old_denoiser_params = denoiser_->get_params();
+ if (old_denoiser_params.type == params.type) {
+ denoiser_->set_params(params);
+ return;
+ }
+ }
+
+ denoiser_ = Denoiser::create(device_, params);
+ denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+ if (!render_work.cryptomatte.postprocess) {
+ return;
+ }
+ VLOG(3) << "Perform cryptomatte work.";
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->cryptomatte_postproces();
+ });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+ if (!render_work.tile.denoise) {
+ return;
+ }
+
+ if (!denoiser_) {
+ /* Denoiser was not configured, so nothing to do here. */
+ return;
+ }
+
+ VLOG(3) << "Perform denoising work.";
+
+ const double start_time = time_dt();
+
+ RenderBuffers *buffer_to_denoise = nullptr;
+
+ unique_ptr<RenderBuffers> multi_device_buffers;
+ bool allow_inplace_modification = false;
+
+ if (path_trace_works_.size() == 1) {
+ buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ }
+ else {
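+    /* Multi-device render: gather the big tile into a single buffer on the denoiser device,
+     * denoise it there, and later scatter the result back to the per-device buffers. */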
+ Device *denoiser_device = denoiser_->get_denoiser_device();
+ if (!denoiser_device) {
+ return;
+ }
+
+ multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+ multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+ buffer_to_denoise = multi_device_buffers.get();
+
+ copy_to_render_buffers(multi_device_buffers.get());
+
+ allow_inplace_modification = true;
+ }
+
+ if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+ buffer_to_denoise,
+ get_num_samples_in_buffer(),
+ allow_inplace_modification)) {
+ render_state_.has_denoised_result = true;
+ }
+
+ if (multi_device_buffers) {
+ multi_device_buffers->copy_from_device();
+ tbb::parallel_for_each(
+ path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+ });
+ }
+
+ render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+ gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+ if (gpu_display_) {
+ gpu_display_->clear();
+ }
+}
+
+void PathTrace::draw()
+{
+ if (!gpu_display_) {
+ return;
+ }
+
+ did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+ if (!render_work.display.update) {
+ return;
+ }
+
+ if (!gpu_display_ && !tile_buffer_update_cb) {
+ VLOG(3) << "Ignore display update.";
+ return;
+ }
+
+ if (full_params_.width == 0 || full_params_.height == 0) {
+ VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (tile_buffer_update_cb) {
+ VLOG(3) << "Invoke buffer update callback.";
+
+ tile_buffer_update_cb();
+ }
+
+ if (gpu_display_) {
+ VLOG(3) << "Perform copy to GPUDisplay work.";
+
+ const int resolution_divider = render_work.resolution_divider;
+ const int texture_width = max(1, full_params_.width / resolution_divider);
+ const int texture_height = max(1, full_params_.height / resolution_divider);
+ if (!gpu_display_->update_begin(texture_width, texture_height)) {
+ LOG(ERROR) << "Error beginning GPUDisplay update.";
+ return;
+ }
+
+ const PassMode pass_mode = render_work.display.use_denoised_result &&
+ render_state_.has_denoised_result ?
+ PassMode::DENOISED :
+ PassMode::NOISY;
+
+ /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+ * all works in parallel. */
+ const int num_samples = get_num_samples_in_buffer();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+ }
+
+ gpu_display_->update_end();
+ }
+
+ render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+ static const int kLogLevel = 3;
+
+ if (!render_work.rebalance) {
+ return;
+ }
+
+ const int num_works = path_trace_works_.size();
+
+ if (num_works == 1) {
+ VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Perform rebalance work.";
+ VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].time_spent;
+ }
+ }
+
+ const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Calculated per-device weights for works:";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].weight;
+ }
+ }
+
+ if (!did_rebalance) {
+ VLOG(kLogLevel) << "Balance in path trace works did not change.";
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+ return;
+ }
+
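+  /* Rebalancing changes the slice assignment: stash the current big tile in a CPU-side buffer,
+   * re-slice the per-device work buffers, then copy the rendered pixels back. */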
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ render_state_.need_reset_params = true;
+ update_work_buffer_params_if_needed(render_work);
+
+ copy_from_render_buffers(&big_tile_cpu_buffers);
+
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+ if (!render_work.tile.write) {
+ return;
+ }
+
+ VLOG(3) << "Write tile result.";
+
+ render_state_.tile_written = true;
+
+ const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+  /* Write render tile result, but only if not using tiled rendering.
+   *
+   * Tiles are written to a file during rendering, and written to the software at the end
+   * of rendering (either when all tiles are finished, or when rendering was requested to be
+   * cancelled).
+   *
+   * The important thing is that a tile should be written to the software via the callback only
+   * once. */
+ if (!has_multiple_tiles) {
+ VLOG(3) << "Write tile result via buffer write callback.";
+ tile_buffer_write();
+ }
+
+ /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+ */
+ if (has_multiple_tiles) {
+ VLOG(3) << "Write tile result into .";
+ tile_buffer_write_to_disk();
+ }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+ if (!render_work.full.write) {
+ return;
+ }
+
+ VLOG(3) << "Handle full-frame render buffer work.";
+
+ if (!tile_manager_.has_written_tiles()) {
+ VLOG(3) << "No tiles on disk.";
+ return;
+ }
+
+  /* Make sure writing to the file is fully finished.
+   * This will include writing all possible missing tiles, ensuring the validity of the file. */
+ tile_manager_.finish_write_tiles();
+
+  /* NOTE: The rest of the full-frame post-processing (such as full-frame denoising) will be done
+   * after all scenes and layers are rendered by the Session (which happens after freeing Session
+   * memory, so that the scene and the full-frame buffer are never held in memory at the same
+   * time). */
+}
+
+void PathTrace::cancel()
+{
+ thread_scoped_lock lock(render_cancel_.mutex);
+
+ render_cancel_.is_requested = true;
+
+ while (render_cancel_.is_rendering) {
+ render_cancel_.condition.wait(lock);
+ }
+
+ render_cancel_.is_requested = false;
+}
+
+int PathTrace::get_num_samples_in_buffer()
+{
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+ if (render_cancel_.is_requested) {
+ return true;
+ }
+
+ if (progress_ != nullptr) {
+ if (progress_->get_cancel()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+ if (!tile_buffer_write_cb) {
+ return;
+ }
+
+ tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+ if (!tile_buffer_read_cb) {
+ return;
+ }
+
+ if (tile_buffer_read_cb()) {
+ tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_render_buffers_to_device();
+ });
+ }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+ /* Sample count pass is required to support per-tile partial results stored in the file. */
+ DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+ const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+ if (num_rendered_samples == 0) {
+ /* The tile has zero samples, no need to write it. */
+ return;
+ }
+
+ /* Get access to the CPU-side render buffers of the current big tile. */
+ RenderBuffers *buffers;
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+
+ if (path_trace_works_.size() == 1) {
+ path_trace_works_[0]->copy_render_buffers_from_device();
+ buffers = path_trace_works_[0]->get_render_buffers();
+ }
+ else {
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ buffers = &big_tile_cpu_buffers;
+ }
+
+ if (!tile_manager_.write_tile(*buffers)) {
+ LOG(ERROR) << "Error writing tile to file.";
+ }
+}
+
+void PathTrace::progress_update_if_needed(const RenderWork &render_work)
+{
+ if (progress_ != nullptr) {
+ const int2 tile_size = get_render_tile_size();
+ const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
+ const int current_sample = render_work.path_trace.start_sample +
+ render_work.path_trace.num_samples;
+ progress_->add_samples(num_samples_added, current_sample);
+ }
+
+ if (progress_update_cb) {
+ progress_update_cb();
+ }
+}
+
+void PathTrace::progress_set_status(const string &status, const string &substatus)
+{
+ if (progress_ != nullptr) {
+ progress_->set_status(status, substatus);
+ }
+}
+
+void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_to_render_buffers(render_buffers);
+ });
+ render_buffers->copy_to_device();
+}
+
+void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers)
+{
+ render_buffers->copy_from_device();
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_render_buffers(render_buffers);
+ });
+}
+
+bool PathTrace::copy_render_tile_from_device()
+{
+ if (full_frame_state_.render_buffers) {
+ /* Full-frame buffer is always allocated on CPU. */
+ return true;
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->copy_render_buffers_from_device()) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+static string get_layer_view_name(const RenderBuffers &buffers)
+{
+ string result;
+
+ if (buffers.params.layer.size()) {
+ result += string(buffers.params.layer);
+ }
+
+ if (buffers.params.view.size()) {
+ if (!result.empty()) {
+ result += ", ";
+ }
+ result += string(buffers.params.view);
+ }
+
+ return result;
+}
+
+void PathTrace::process_full_buffer_from_disk(string_view filename)
+{
+ VLOG(3) << "Processing full frame buffer file " << filename;
+
+ progress_set_status("Reading full buffer from disk");
+
+ RenderBuffers full_frame_buffers(cpu_device_.get());
+
+ DenoiseParams denoise_params;
+ if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
+ LOG(ERROR) << "Error reading tiles from file.";
+ return;
+ }
+
+ const string layer_view_name = get_layer_view_name(full_frame_buffers);
+
+ render_state_.has_denoised_result = false;
+
+ if (denoise_params.use) {
+ progress_set_status(layer_view_name, "Denoising");
+
+ /* Re-use the denoiser as much as possible, avoiding possible device re-initialization.
+ *
+ * It will not conflict with the regular rendering as:
+ * - Rendering is supposed to be finished here.
+ * - The next rendering will go via Session's `run_update_for_next_iteration` which will
+     *   ensure the proper denoiser is used. */
+ set_denoiser_params(denoise_params);
+
+    /* The number of samples doesn't matter too much, since the sample count pass will be used. */
+ denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+ render_state_.has_denoised_result = true;
+ }
+
+ full_frame_state_.render_buffers = &full_frame_buffers;
+
+ progress_set_status(layer_view_name, "Finishing");
+
+ /* Write the full result pretending that there is a single tile.
+   * Requires some state change, but allows using the same communication API with the software. */
+ tile_buffer_write();
+
+ full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+ if (full_frame_state_.render_buffers) {
+    /* If the full-frame buffer is read from disk, the number of samples is not used, as there is
+     * a sample count pass for that in the buffer. Just avoid accessing the poorly defined path
+     * state. */
+ return 0;
+ }
+
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ if (full_frame_state_.render_buffers) {
+ return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(full_frame_state_.render_buffers->params.width,
+ full_frame_state_.render_buffers->params.height);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(0, 0);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params;
+ }
+
+ return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+ return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+ switch (type) {
+ case DEVICE_NONE:
+ return "None";
+
+ case DEVICE_CPU:
+ return "CPU";
+ case DEVICE_CUDA:
+ return "CUDA";
+ case DEVICE_OPTIX:
+ return "OptiX";
+ case DEVICE_DUMMY:
+ return "Dummy";
+ case DEVICE_MULTI:
+ return "Multi";
+ }
+
+ return "UNKNOWN";
+}
+
+/* Construct description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making it a more reusable utility. */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+ string full_description = device_info.description;
+
+ full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+ if (device_info.display_device) {
+ full_description += " (display)";
+ }
+
+ if (device_info.type == DEVICE_CPU) {
+ full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+ }
+
+ full_description += " [" + device_info.id + "]";
+
+ return full_description;
+}
+
+/* Construct a string which contains information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ * Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ * Message: Full First Device Description
+ * Full Second Device Description
+ *
+ * Note that the newlines are placed in a way so that the result can be easily concatenated to the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+ string result = "\n" + message + ": ";
+ const string pad(message.length() + 2, ' ');
+
+ if (device_info.multi_devices.empty()) {
+ result += full_device_info_description(device_info) + "\n";
+ return result;
+ }
+
+ bool is_first = true;
+ for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+ if (!is_first) {
+ result += pad;
+ }
+
+ result += full_device_info_description(sub_device_info) + "\n";
+
+ is_first = false;
+ }
+
+ return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+ DeviceInfo device_info;
+ device_info.type = DEVICE_MULTI;
+
+ for (auto &&path_trace_work : path_trace_works) {
+ device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+ }
+
+ return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+ if (!denoiser) {
+ return "";
+ }
+
+ if (!denoiser->get_params().use) {
+ return "";
+ }
+
+ const Device *denoiser_device = denoiser->get_denoiser_device();
+ if (!denoiser_device) {
+ return "";
+ }
+
+ return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+ string result = "\nFull path tracing report\n";
+
+ result += path_trace_devices_report(path_trace_works_);
+ result += denoiser_device_report(denoiser_.get());
+
+ /* Report from the render scheduler, which includes:
+ * - Render mode (interactive, offline, headless)
+ * - Adaptive sampling and denoiser parameters
+ * - Breakdown of timing. */
+ result += render_scheduler_.full_report();
+
+ return result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..78ca68c1198
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* PathTrace class takes care of kernel graph and scheduling on a (multi)device. It takes care of
+ * all the common steps of path tracing which are not device-specific. The list of tasks includes
+ * but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queues management.
+ * - Adaptive stopping. */
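+/* A rough usage sketch, for illustration only (the exact orchestration, including the callbacks
+ * below and the denoiser/adaptive-sampling configuration, is driven by the calling code which
+ * owns this object):
+ *
+ *   PathTrace path_trace(device, film, device_scene, render_scheduler, tile_manager);
+ *   path_trace.set_progress(&progress);
+ *   path_trace.load_kernels();
+ *   path_trace.alloc_work_memory();
+ *   path_trace.reset(full_params, big_tile_params);
+ *   path_trace.render(render_work);  // Blocking call.
+ *   const string report = path_trace.full_report();
+ */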
+class PathTrace {
+ public:
+ /* Render scheduler is used to report timing information and access things like start/finish
+ * sample. */
+ PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager);
+ ~PathTrace();
+
+ /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+ * The progress is reported to the currently configured progress object (via `set_progress`). */
+ void load_kernels();
+
+ /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+ * more accurately which scene device memory may need to be allocated on the host. */
+ void alloc_work_memory();
+
+ /* Check whether now is a good time to reset rendering.
+ * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+ * intermediate render result. */
+ bool ready_to_reset();
+
+ void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+ void device_free();
+
+ /* Set progress tracker.
+ * Used to communicate details about the progress to the outer world, check whether rendering is
+ * to be canceled.
+ *
+ * The path tracer writes to this object, and then at a convenient moment runs
+ * progress_update_cb() callback. */
+ void set_progress(Progress *progress);
+
+ /* NOTE: This is a blocking call. Meaning, it will not return until the given number of samples
+ * are rendered (or until rendering is requested to be canceled). */
+ void render(const RenderWork &render_work);
+
+ /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it is
+ * convenient to have it here because it makes the render buffer easy to access. The downside is
+ * that it adds entities which could live separately behind a clear API. */
+
+ /* Set denoiser parameters.
+ * Use this to configure the denoiser before rendering any samples. */
+ void set_denoiser_params(const DenoiseParams &params);
+
+ /* Set parameters used for adaptive sampling.
+ * Use this to configure the adaptive sampler before rendering any samples. */
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ /* Set GPU display which takes care of drawing the render result. */
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+ /* Clear the GPU display by filling it in with all zeroes. */
+ void clear_gpu_display();
+
+ /* Perform drawing of the current state of the GPUDisplay. */
+ void draw();
+
+ /* Cancel rendering process as soon as possible, without waiting for full tile to be sampled.
+ * Used in cases like reset of render session.
+ *
+ * This is a blocking call, which returns as soon as there is no running `render_samples()` call.
+ */
+ void cancel();
+
+ /* Copy an entire render buffer to/from the path trace. */
+
+ /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and
+ * the data will be copied to the device of the given render buffers. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy happens via CPU side buffer: data will be copied from the device of the given render
+ * buffers and will be copied to all devices of the path trace. */
+ void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy render buffers of the big tile from the device to host.
+ * Return true if all copies are successful. */
+ bool copy_render_tile_from_device();
+
+ /* Read given full-frame file from disk, perform needed processing and write it to the software
+ * via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
+
+ /* Get number of samples in the current big tile render buffers. */
+ int get_num_render_tile_samples() const;
+
+ /* Get pass data of the entire big tile.
+ * This call puts pass render result from all devices into the final pixels storage.
+ *
+ * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+ *
+ * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Check whether denoiser was run and denoised passes are available. */
+ bool has_denoised_result() const;
+
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+ * In the case of tiled rendering this will return the full frame after all tiles have been rendered.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
+
+ /* Get buffer parameters of the current tile.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ const BufferParams &get_render_tile_params() const;
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ /* Callback which communicates an updated state of the render buffer of the current big tile.
+ * Is called during path tracing to communicate the work-in-progress state of the final buffer. */
+ function<void(void)> tile_buffer_update_cb;
+
+ /* Callback which communicates the final rendered buffer. Is called after path tracing is done. */
+ function<void(void)> tile_buffer_write_cb;
+
+ /* Callback which initializes the rendered buffer. Is called before path tracing starts.
+ *
+ * This is used for baking. */
+ function<bool(void)> tile_buffer_read_cb;
+
+ /* Callback which is called to report current rendering progress.
+ *
+ * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+ * Additionally, it might be called from the middle of a wavefront (meaning, it is not guaranteed
+ * that the buffer is "uniformly" sampled at the moment of this callback). */
+ function<void(void)> progress_update_cb;
+
+ protected:
+ /* Actual implementation of the rendering pipeline.
+ * Calls steps in order, checking whether cancel is requested in between.
+ *
+ * Is separate from `render()` to simplify dealing with early returns and keeping
+ * `render_cancel_` in a consistent state. */
+ void render_pipeline(RenderWork render_work);
+
+ /* Initialize kernel execution on all integrator queues. */
+ void render_init_kernel_execution();
+
+ /* Make sure both allocated and effective buffer parameters of path tracer works are up to date
+ * with the current big tile parameters, performance-dependent slicing, and resolution divider.
+ */
+ void update_work_buffer_params_if_needed(const RenderWork &render_work);
+ void update_allocated_work_buffer_params();
+ void update_effective_work_buffer_params(const RenderWork &render_work);
+
+ /* Perform various steps of the render work.
+ *
+ * Note that some steps might modify the work, forcing some steps to happen within this iteration
+ * of rendering. */
+ void init_render_buffers(const RenderWork &render_work);
+ void path_trace(RenderWork &render_work);
+ void adaptive_sample(RenderWork &render_work);
+ void denoise(const RenderWork &render_work);
+ void cryptomatte_postprocess(const RenderWork &render_work);
+ void update_display(const RenderWork &render_work);
+ void rebalance(const RenderWork &render_work);
+ void write_tile_buffer(const RenderWork &render_work);
+ void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+ /* Get number of samples in the current state of the render buffers. */
+ int get_num_samples_in_buffer();
+
+ /* Check whether user requested to cancel rendering, so that path tracing is to be finished as
+ * soon as possible. */
+ bool is_cancel_requested();
+
+ /* Write the big tile render buffer via the write callback. */
+ void tile_buffer_write();
+
+ /* Read the big tile render buffer via the read callback. */
+ void tile_buffer_read();
+
+ /* Write current tile into the file on disk. */
+ void tile_buffer_write_to_disk();
+
+ /* Run the progress_update_cb callback if it is needed. */
+ void progress_update_if_needed(const RenderWork &render_work);
+
+ void progress_set_status(const string &status, const string &substatus = "");
+
+ /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+ * are configured this is a `MultiDevice`. */
+ Device *device_ = nullptr;
+
+ /* CPU device for creating temporary render buffers on the CPU side. */
+ unique_ptr<Device> cpu_device_;
+
+ DeviceScene *device_scene_;
+
+ RenderScheduler &render_scheduler_;
+ TileManager &tile_manager_;
+
+ unique_ptr<GPUDisplay> gpu_display_;
+
+ /* Per-compute device descriptors of work which is responsible for path tracing on its configured
+ * device. */
+ vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+ /* Per-path trace work information needed for multi-device balancing. */
+ vector<WorkBalanceInfo> work_balance_infos_;
+
+ /* Render buffer parameters of the full frame and current big tile. */
+ BufferParams full_params_;
+ BufferParams big_tile_params_;
+
+ /* Denoiser which takes care of denoising the big tile. */
+ unique_ptr<Denoiser> denoiser_;
+
+ /* State which is common for all the steps of the render work.
+ * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+ * rendering the work. */
+ struct {
+ /* Denotes whether the render buffer parameters of the path trace works are to be reset for the
+ * new value of the big tile parameters. */
+ bool need_reset_params = false;
+
+ /* Divider of the resolution for faster previews.
+ *
+ * Allows re-using the same render buffer, but with fewer pixels rendered into it. The way to
+ * think of the render buffer in this case is as an over-allocated array: the resolution divider
+ * affects both the resolution and the stride as seen by the integrator kernels. */
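+ /* For example (illustrative numbers): a divider of 2 halves both the width and the height, so
+ * a 1920x1080 big tile is rendered as 960x540 pixels written into the same allocation. */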
+ int resolution_divider = 0;
+
+ /* Parameters of the big tile with the current resolution divider applied. */
+ BufferParams effective_big_tile_params;
+
+ /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
+ bool has_denoised_result = false;
+
+ /* Current tile has been written (to either disk or a callback).
+ * Indicates that no more work will be done on this tile. */
+ bool tile_written = false;
+ } render_state_;
+
+ /* Progress object which is used to communicate sample progress. */
+ Progress *progress_;
+
+ /* Fields required for canceling render on demand, as quickly as possible. */
+ struct {
+ /* Indicates whether there is an on-going `render_samples()` call. */
+ bool is_rendering = false;
+
+ /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+ bool is_requested = false;
+
+ /* Synchronization between thread which does `render_samples()` and thread which does
+ * `cancel()`. */
+ thread_mutex mutex;
+ thread_condition_variable condition;
+ } render_cancel_;
+
+ /* Indicates whether a render result was drawn after the latest session reset.
+ * Used by `ready_to_reset()` to implement logic which feels the most interactive. */
+ bool did_draw_after_reset_ = true;
+
+ /* State of the full frame processing and writing to the software. */
+ struct {
+ RenderBuffers *render_buffers = nullptr;
+ } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/path_trace_work_cpu.h"
+#include "integrator/path_trace_work_gpu.h"
+#include "render/buffers.h"
+#include "render/film.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+{
+ if (device->info.type == DEVICE_CPU) {
+ return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
+ }
+
+ return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
+}
+
+PathTraceWork::PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : device_(device),
+ film_(film),
+ device_scene_(device_scene),
+ buffers_(make_unique<RenderBuffers>(device)),
+ effective_buffer_params_(buffers_->params),
+ cancel_requested_flag_(cancel_requested_flag)
+{
+}
+
+PathTraceWork::~PathTraceWork()
+{
+}
+
+RenderBuffers *PathTraceWork::get_render_buffers()
+{
+ return buffers_.get();
+}
+
+void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params)
+{
+ effective_full_params_ = effective_full_params;
+ effective_big_tile_params_ = effective_big_tile_params;
+ effective_buffer_params_ = effective_buffer_params;
+}
+
+bool PathTraceWork::has_multiple_works() const
+{
+ /* Assume that if there are multiple works working on the same big tile, none of them gets the
+ * entire big tile to work on. */
+ return !(effective_big_tile_params_.width == effective_buffer_params_.width &&
+ effective_big_tile_params_.height == effective_buffer_params_.height &&
+ effective_big_tile_params_.full_x == effective_buffer_params_.full_x &&
+ effective_big_tile_params_.full_y == effective_buffer_params_.full_y);
+}
+
+void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ copy_render_buffers_from_device();
+
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
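+ /* This work's buffer covers a horizontal slice of the big tile; offset the destination by the
+ * number of rows that lie above this slice in the big tile. */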
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = buffers_->buffer.data();
+ float *dst = render_buffers->buffer.data() + offset_in_floats;
+
+ memcpy(dst, src, data_size);
+}
+
+void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = render_buffers->buffer.data() + offset_in_floats;
+ float *dst = buffers_->buffer.data();
+
+ memcpy(dst, src, data_size);
+
+ copy_render_buffers_to_device();
+}
+
+void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset = offset_y * width;
+
+ render_buffers_host_copy_denoised(
+ buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+ copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Destination slice_destination = destination;
+ slice_destination.offset += offset_y * width;
+
+ return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Source slice_source = source;
+ slice_source.offset += offset_y * width;
+
+ return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+ const KernelBackground &kbackground = device_scene_->data.background;
+
+ const BufferParams &params = buffers_->params;
+
+ const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = display_pass->type;
+ pass_access_info.offset = PASS_UNUSED;
+
+ if (pass_mode == PassMode::DENOISED) {
+ pass_access_info.mode = PassMode::DENOISED;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+ }
+
+ if (pass_access_info.offset == PASS_UNUSED) {
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+ }
+
+ pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+ return pass_access_info;
+}
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const
+{
+ PassAccessor::Destination destination(film_->get_display_pass());
+
+ const int2 display_texture_size = gpu_display->get_texture_size();
+ const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+ const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
+ destination.offset = texture_y * display_texture_size.x + texture_x;
+ destination.stride = display_texture_size.x;
+
+ return destination;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..97b97f3d888
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+ struct RenderStatistics {
+ float occupancy = 1.0f;
+ };
+
+ /* Create path trace work which fits best the device.
+ *
+ * The cancel request flag is used for a cheap check whether cancel is to be performed as soon as
+ * possible. This could be, for example, a request to cancel rendering on camera navigation in
+ * the viewport. */
+ static unique_ptr<PathTraceWork> create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual ~PathTraceWork();
+
+ /* Access the render buffers.
+ *
+ * Is only supposed to be used by the PathTrace to update buffer allocation and slicing to
+ * correspond to the big tile size and relative device performance. */
+ RenderBuffers *get_render_buffers();
+
+ /* Set effective parameters of the big tile and the work itself. */
+ void set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params);
+
+ /* Check whether the big tile is being worked on by multiple path trace works. */
+ bool has_multiple_works() const;
+
+ /* Allocate working memory for execution. Must be called before init_execution(). */
+ virtual void alloc_work_memory(){};
+
+ /* Initialize execution of kernels.
+ * Will ensure that all device queues are initialized for execution.
+ *
+ * This method is to be called after any change in the scene. It does not need to be called prior
+ * to every call of `render_samples()`. */
+ virtual void init_execution() = 0;
+
+ /* Render given number of samples as a synchronous blocking call.
+ * The samples are added to the render buffer associated with this work. */
+ virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+ /* Copy render result from this work to the corresponding place of the GPU display.
+ *
+ * The `pass_mode` indicates whether to access the denoised or noisy version of the display pass.
+ * The noisy pass mode will be passed here when it is known that the buffer does not have denoised
+ * passes yet (because the denoiser did not run). If the denoised pass is requested and the
+ * denoiser is not used then this function will fall back to the noisy pass instead. */
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) = 0;
+
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+ /* Copy data from/to given render buffers.
+ * Will copy pixels from a corresponding place (from multi-device point of view) of the render
+ * buffers, and copy work's render buffers to the corresponding place of the destination. */
+
+ /* Notes:
+ * - Copies the work's render buffer from its device.
+ * - Copies into the CPU-side buffer of the given render buffers.
+ * - Does not copy the given buffer to its device. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Notes:
+ * - Does not copy given render buffers from the device.
+ * - Copies work's render buffer to its device. */
+ void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Special version of `copy_from_render_buffers()` which only copies the denoised passes from the
+ * given render buffers, leaving the rest of the passes untouched.
+ *
+ * The same notes about device copying apply to this call as well. */
+ void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Copy render buffers to/from device using an appropriate device queue when needed so that
+ * things are executed in order with the `render_samples()`. */
+ virtual bool copy_render_buffers_from_device() = 0;
+ virtual bool copy_render_buffers_to_device() = 0;
+
+ /* Zero the render buffers on the device, using an appropriate device queue when needed so that
+ * it is executed in order with `render_samples()`. */
+ virtual bool zero_render_buffers() = 0;
+
+ /* Access pixels rendered by this work and copy them to the corresponding location in the
+ * destination.
+ *
+ * NOTE: Does not perform copy of buffers from the device. Use `copy_render_tile_from_device()`
+ * to update host-side data. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Perform convergence test on the render buffer, and filter the convergence mask.
+ * Returns number of active pixels (the ones which did not converge yet). */
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+ /* Run cryptomatte pass post-processing kernels. */
+ virtual void cryptomatte_postproces() = 0;
+
+ /* Cheap-ish check to see whether a cancel is requested and rendering is to be stopped as soon
+ * as possible, without waiting for any samples to be finished. */
+ inline bool is_cancel_requested() const
+ {
+ /* NOTE: Rely on the fact that on x86 CPU reading scalar can happen without atomic even in
+ * threaded environment. */
+ return *cancel_requested_flag_;
+ }
+
+ /* Access to the device which this work uses for path tracing. */
+ Device *get_device() const
+ {
+ return device_;
+ }
+
+ protected:
+ PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+ /* Get a destination whose offset and stride are configured so that writing to it will write to
+ * the proper location of the GPU display texture, taking the current tile and device slice into
+ * account. */
+ PassAccessor::Destination get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const;
+
+ /* Device which will be used for path tracing.
+ * Note that it is an actual render device (and never a multi-device). */
+ Device *device_;
+
+ /* Film is used to access display pass configuration for GPU display update.
+ * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+ Film *film_;
+
+ /* Device side scene storage, that may be used for integrator logic. */
+ DeviceScene *device_scene_;
+
+ /* Render buffers into which samples are accumulated, allocated for the fraction of the big
+ * tile which is being rendered by this work.
+ * This also defines the possible subset of the big tile in the case of multi-device rendering. */
+ unique_ptr<RenderBuffers> buffers_;
+
+ /* Effective parameters of the full frame, the big tile, and the current work's render buffer.
+ * The latter might be different from buffers_->params when there is a resolution divider
+ * involved. */
+ BufferParams effective_full_params_;
+ BufferParams effective_big_tile_params_;
+ BufferParams effective_buffer_params_;
+
+ bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+ /* TODO: limit this to number of threads of CPU device, it may be smaller than
+ * the system number of threads when we reduce the number of CPU threads in
+ * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
+ return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ DCHECK_GE(thread_index, 0);
+ DCHECK_LE(thread_index, kernel_thread_globals.size());
+
+ return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ kernels_(*(device->get_cpu_kernels()))
+{
+ DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+ /* Cache per-thread kernel globals. */
+ device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
+}
+
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ const int64_t image_width = effective_buffer_params_.width;
+ const int64_t image_height = effective_buffer_params_.height;
+ const int64_t total_pixels_num = image_width * image_height;
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.start_profiling();
+ }
+
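+ /* Each parallel work item below covers a single pixel; all requested samples for that pixel
+ * are rendered sequentially by render_samples_full_pipeline(). */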
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
+ if (is_cancel_requested()) {
+ return;
+ }
+
+ const int y = work_index / image_width;
+ const int x = work_index - y * image_width;
+
+ KernelWorkTile work_tile;
+ work_tile.x = effective_buffer_params_.full_x + x;
+ work_tile.y = effective_buffer_params_.full_y + y;
+ work_tile.w = 1;
+ work_tile.h = 1;
+ work_tile.start_sample = start_sample;
+ work_tile.num_samples = 1;
+ work_tile.offset = effective_buffer_params_.offset;
+ work_tile.stride = effective_buffer_params_.stride;
+
+ CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+
+ render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
+ });
+ });
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.stop_profiling();
+ }
+
+ statistics.occupancy = 1.0f;
+}
+
+void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num)
+{
+ const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
+ const bool has_bake = device_scene_->data.bake.use;
+
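+ /* Two integrator states are used: the second one holds the shadow catcher path when the scene
+ * has a shadow catcher, and is processed by the second megakernel call below. */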
+ IntegratorStateCPU integrator_states[2] = {};
+
+ IntegratorStateCPU *state = &integrator_states[0];
+ IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+
+ KernelWorkTile sample_work_tile = work_tile;
+ float *render_buffer = buffers_->buffer.data();
+
+ for (int sample = 0; sample < samples_num; ++sample) {
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ if (has_bake) {
+ if (!kernels_.integrator_init_from_bake(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+ else {
+ if (!kernels_.integrator_init_from_camera(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+
+ kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
+
+ if (has_shadow_catcher) {
+ kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
+ }
+
+ ++sample_work_tile.start_sample;
+ }
+}
+
+void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ half4 *rgba_half = gpu_display->map_texture_buffer();
+ if (!rgba_half) {
+ /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
+ * some implementations of GPUDisplay which can not map memory? */
+ return;
+ }
+
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+ const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.pixels_half_rgba = rgba_half;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+ });
+
+ gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+ return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+ buffers_->buffer.copy_to_device();
+ return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+ buffers_->zero();
+ return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int offset = effective_buffer_params_.offset;
+ const int stride = effective_buffer_params_.stride;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ uint num_active_pixels = 0;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_y, full_y + height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+ bool row_converged = true;
+ uint num_row_pixels_active = 0;
+ for (int x = 0; x < width; ++x) {
+ if (!kernels_.adaptive_sampling_convergence_check(
+ kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+ ++num_row_pixels_active;
+ row_converged = false;
+ }
+ }
+
+ atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+ if (!row_converged) {
+ kernels_.adaptive_sampling_filter_x(
+ kernel_globals, render_buffer, y, full_x, width, offset, stride);
+ }
+ });
+ });
+
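+ /* The y-filter runs as a separate pass over columns, only after the combined convergence check
+ * and x-filter above has finished for all rows, so that it operates on the x-filtered results. */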
+ if (num_active_pixels) {
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_x, full_x + width, [&](int x) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ kernels_.adaptive_sampling_filter_y(
+ kernel_globals, render_buffer, x, full_y, height, offset, stride);
+ });
+ });
+ }
+
+ return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Run the cryptomatte post-processing kernel for every pixel, with rows processed in parallel. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(0, height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ int pixel_index = y * width;
+
+ for (int x = 0; x < width; ++x, ++pixel_index) {
+ kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+ }
+ });
+ });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+ PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ /* Core path tracing routine. Renders the given work tile using the given kernel globals. */
+ void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num);
+
+ /* CPU kernels. */
+ const CPUKernels &kernels_;
+
+ /* Copy of kernel globals which is suitable for concurrent access from multiple threads.
+ *
+ * More specifically, each thread has its own kernel globals which nobody else accesses, but some
+ * "localization" is required to decouple them from the kernel globals stored on the device
+ * level. */
+ vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..10baf869aa6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
+ integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+ integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+ integrator_shader_raytrace_sort_counter_(
+ device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_next_shadow_catcher_path_index_(
+ device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+ queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+ num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+ work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+ gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+ min_num_active_paths_(queue_->num_concurrent_busy_states()),
+ max_active_path_index_(0)
+{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+ /* Limit the number of active paths to half of the overall state. This is due to the logic in
+ * the path compaction which relies on the fact that regeneration does not happen until half of
+ * the states are available again. */
+ min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+ /* IntegratorState is allocated as a structure of arrays. */
+
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+ return;
+ }
+ integrator_state_soa_kernel_features_ = kernel_features;
+
+ /* Allocate a device-only memory buffer for each struct member, and then
+ * write the pointers into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
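+ /* Each KERNEL_STRUCT_MEMBER(parent, type, name, feature) entry in the included template expands
+ * to: allocate a device-only array of `type` with one element per path state, keep the array
+ * alive in `integrator_state_soa_`, and store its device pointer into
+ * `integrator_state_gpu_.parent.name` so the kernels can index it by path state. */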
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_END(name) \
+ break; \
+ }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ if (array_index == array_size - 1) { \
+ break; \
+ } \
+ }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+ if (integrator_queue_counter_.size() == 0) {
+ integrator_queue_counter_.alloc(1);
+ integrator_queue_counter_.zero_to_device();
+ integrator_queue_counter_.copy_from_device();
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
+ }
+
+ /* Allocate data for active path index arrays. */
+ if (num_queued_paths_.size() == 0) {
+ num_queued_paths_.alloc(1);
+ num_queued_paths_.zero_to_device();
+ }
+
+ if (queued_paths_.size() == 0) {
+ queued_paths_.alloc(max_num_paths_);
+ /* TODO: this could be skipped if we had a function to just allocate on device. */
+ queued_paths_.zero_to_device();
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+ /* Allocate arrays for shader sorting. */
+ const int max_shaders = device_scene_->data.max_shaders;
+ if (integrator_shader_sort_counter_.size() < max_shaders) {
+ integrator_shader_sort_counter_.alloc(max_shaders);
+ integrator_shader_sort_counter_.zero_to_device();
+
+ integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_shader_sort_counter_.device_pointer;
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+ (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+ if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+ return;
+ }
+
+ integrator_next_shadow_catcher_path_index_.alloc(1);
+ /* TODO(sergey): Use queue? */
+ integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+ integrator_state_gpu_.next_shadow_catcher_path_index =
+ (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+ alloc_integrator_soa();
+ alloc_integrator_queue();
+ alloc_integrator_sorting();
+ alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+ queue_->init_execution();
+
+ /* Copy to device side struct in constant memory. */
+ device_->const_copy_to(
+ "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ /* Limit the number of states for the tile and rely on a greedy scheduling of tiles. This allows
+ * adding more work (because tiles are smaller, so there is a higher chance that more paths will
+ * become busy after adding new tiles). This is especially important for the shadow catcher,
+ * which schedules work in halves of the available number of paths. */
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
+
+ work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+ enqueue_reset();
+
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
+ /* TODO: set a hard limit in case of undetected kernel failures? */
+ while (true) {
+ /* Enqueue work from the scheduler, on start or when there are not enough
+ * paths to keep the device occupied. */
+ bool finished;
+ if (enqueue_work_tiles(finished)) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ /* Stop if no more work remaining. */
+ if (finished) {
+ break;
+ }
+
+ /* Enqueue one of the path iteration kernels. */
+ if (enqueue_path_iteration()) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
+ }
+
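+ /* Occupancy: average number of busy path states per iteration, relative to the maximum number
+ * of path states. */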
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int max_num_queued = 0;
+ DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ if (queue_counter->num_queued[i] > max_num_queued) {
+ kernel = (DeviceKernel)i;
+ max_num_queued = queue_counter->num_queued[i];
+ }
+ }
+
+ return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+ queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+ /* Tile enqueueing needs to know the number of active paths, which is based on this counter.
+ * Zero the counter on the host side because `zero_to_device()` does not do that. */
+ if (integrator_queue_counter_.host_pointer) {
+ memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+ }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+ /* Find kernel to execute, with max number of queued paths. */
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_active_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ num_active_paths += queue_counter->num_queued[i];
+ }
+
+ if (num_active_paths == 0) {
+ return false;
+ }
+
+ /* Find kernel to execute, with max number of queued paths. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel == DEVICE_KERNEL_NUM) {
+ return false;
+ }
+
+ /* Finish shadows before potentially adding more shadow rays. We can only
+ * store one shadow ray in the integrator state. */
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
+ if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return true;
+ }
+ else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return true;
+ }
+ }
+
+ /* Schedule kernel with maximum number of queued items. */
+ enqueue_path_iteration(kernel);
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+{
+ void *d_path_index = (void *)NULL;
+
+ /* Create array of path indices for which this kernel is queued to be executed. */
+ int work_size = max_active_path_index_;
+
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ int num_queued = queue_counter->num_queued[kernel];
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ /* Compute array of active paths, sorted by shader. */
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ }
+ else if (num_queued < work_size) {
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Compute array of active shadow paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
+ }
+ else {
+ /* Compute array of active paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
+ }
+ }
+
+ DCHECK_LE(work_size, max_num_paths_);
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ /* Ray intersection kernels with integrator state. */
+ void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ /* Shading kernels with integrator state and render buffer. */
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+ void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+
+ default:
+ LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
+ << " used for path iteration, should never happen.";
+ break;
+ }
+}
+
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
+
+ /* Compute prefix sum of number of active paths with each shader. */
+ {
+ const int work_size = 1;
+ int max_shaders = device_scene_->data.max_shaders;
+ void *args[] = {&d_counter, &max_shaders};
+ queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
+ }
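+  /* Illustrative sketch (hypothetical numbers, not from the actual kernels): if the per-shader
+   * queued counts are {3, 1, 2}, the prefix sum above turns them into start offsets {0, 3, 4}.
+   * The fill kernel below is then expected to scatter every queued path index into queued_paths_
+   * at its shader's slot, so the resulting path array is grouped by shader for coherent shading. */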
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ /* Launch kernel to fill the active paths arrays. */
+ {
+ /* TODO: this could be smaller for terminated paths based on amount of work we want
+ * to schedule. */
+ const int work_size = max_active_path_index_;
+
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size),
+ &d_queued_paths,
+ &d_num_queued_paths,
+ &d_counter,
+ &d_queued_kernel};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ }
+ else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+ }
+ else {
+ assert(0);
+ }
+}
+
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+
+ /* Launch kernel to fill the active paths arrays. */
+ const int work_size = max_active_path_index_;
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(kernel, work_size, args);
+}
+
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+ if (num_active_paths == 0) {
+ max_active_path_index_ = 0;
+ }
+
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
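+  /* Illustrative example (hypothetical numbers): with max_active_path_index_ == 10 and
+   * num_active_paths == 6, terminated slots are gathered from indices [0, 6), still-active states
+   * with indices in [6, 10) are gathered separately, and each of the latter is moved into one of
+   * the former, so that afterwards all active states fit into [0, 6). */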
+ if (max_active_path_index_ == num_active_paths) {
+ return;
+ }
+
+ void *d_compact_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+ /* Create array with terminated paths that we can write to. */
+ {
+ /* TODO: can the work size be reduced here? */
+ int offset = num_active_paths;
+ int work_size = num_active_paths;
+ void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ }
+
+ /* Create array of paths that we need to compact, where the path index is bigger
+ * than the number of active paths. */
+ {
+ int work_size = max_active_path_index_;
+ void *args[] = {
+ &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ }
+
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ int num_compact_paths = num_queued_paths_.data()[0];
+
+ /* Move paths into gaps. */
+ if (num_compact_paths > 0) {
+ int work_size = num_compact_paths;
+ int active_states_offset = 0;
+ int terminated_states_offset = num_active_paths;
+ void *args[] = {
+ &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ }
+
+ queue_->synchronize();
+
+  /* Adjust the max active path index now that we know which part of the array is actually used. */
+ max_active_path_index_ = num_active_paths;
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+  /* If there are existing paths, wait for them to reach the intersect closest kernel, which will
+   * align the wavefront of the existing and newly added paths. */
+  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+   * performance. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+ return false;
+ }
+
+ int num_active_paths = get_num_active_paths();
+
+ /* Don't schedule more work if cancelling. */
+ if (is_cancel_requested()) {
+ if (num_active_paths == 0) {
+ finished = true;
+ }
+ return false;
+ }
+
+ finished = false;
+
+ vector<KernelWorkTile> work_tiles;
+
+ int max_num_camera_paths = max_num_paths_;
+ int num_predicted_splits = 0;
+
+ if (has_shadow_catcher()) {
+    /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
+     * make sure there is enough space in the path states array to fit the split states.
+     *
+     * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+     * that all the new paths can be split.
+     *
+     * Note that it is possible that some of the current states can still split, so we need to
+     * make sure there is enough space for them as well. */
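+    /* Illustrative example (hypothetical numbers): with max_num_paths_ == 1024, 512 currently
+     * active paths of which 64 can still split, there are 512 free states. Half of them (256) are
+     * reserved for new camera paths and 256 + 64 = 320 for potential splits, so
+     * max_num_camera_paths becomes 512 + 256 - 64 = 704. */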
+
+ /* Number of currently in-flight states which can still split. */
+ const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+ const int num_available_paths = max_num_paths_ - num_active_paths;
+ const int num_new_paths = num_available_paths / 2;
+ max_num_camera_paths = max(num_active_paths,
+ num_active_paths + num_new_paths - num_scheduled_possible_split);
+ num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+ }
+
+ /* Schedule when we're out of paths or there are too few paths to keep the
+ * device occupied. */
+ int num_paths = num_active_paths;
+ if (num_paths == 0 || num_paths < min_num_active_paths_) {
+    /* Get work tiles until the maximum number of paths is reached. */
+ while (num_paths < max_num_camera_paths) {
+ KernelWorkTile work_tile;
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+ work_tiles.push_back(work_tile);
+ num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we couldn't get any more tiles, we're done. */
+ if (work_tiles.size() == 0 && num_paths == 0) {
+ finished = true;
+ return false;
+ }
+ }
+
+ /* Initialize paths from work tiles. */
+ if (work_tiles.size() == 0) {
+ return false;
+ }
+
+ /* Compact state array when number of paths becomes small relative to the
+ * known maximum path index, which makes computing active index arrays slow. */
+ compact_states(num_active_paths);
+
+ if (has_shadow_catcher()) {
+ integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+ queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+ }
+
+ enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+ work_tiles.data(),
+ work_tiles.size(),
+ num_active_paths,
+ num_predicted_splits);
+
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits)
+{
+ /* Copy work tiles to device. */
+ if (work_tiles_.size() < num_work_tiles) {
+ work_tiles_.alloc(num_work_tiles);
+ }
+
+ int path_index_offset = num_active_paths;
+ int max_tile_work_size = 0;
+ for (int i = 0; i < num_work_tiles; i++) {
+ KernelWorkTile &work_tile = work_tiles_.data()[i];
+ work_tile = work_tiles[i];
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ work_tile.path_index_offset = path_index_offset;
+ work_tile.work_size = tile_work_size;
+
+ path_index_offset += tile_work_size;
+
+ max_tile_work_size = max(max_tile_work_size, tile_work_size);
+ }
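+  /* Illustrative example (hypothetical numbers): two tiles with work sizes 4096 and 1024 get
+   * path_index_offset 0 and 4096, max_tile_work_size becomes 4096, and the kernel below is
+   * launched with 2 * 4096 = 8192 work items; items beyond a tile's own work_size are expected to
+   * simply early-out. */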
+
+ queue_->copy_to_device(work_tiles_);
+
+ void *d_work_tiles = (void *)work_tiles_.device_pointer;
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ const_cast<int *>(&num_work_tiles),
+ &d_render_buffer,
+ const_cast<int *>(&max_tile_work_size)};
+
+ queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+ max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+ /* TODO: this is wrong, does not account for duplicates with shadow! */
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ DCHECK_GE(queue_counter->num_queued[i], 0)
+ << "Invalid number of queued states for kernel "
+ << device_kernel_as_string(static_cast<DeviceKernel>(i));
+ num_paths += queue_counter->num_queued[i];
+ }
+
+ return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+  /* There are a few complications with graphics interop when using multiple devices, caused by
+   * the fact that the GPUDisplay has a single texture:
+   *
+   * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+   * attempting to register an OpenGL PBO which has already been mapped. Which makes sense, because
+   * otherwise one would run into a conflict about where the source of truth is. */
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+    /* Don't attempt to update the GPU display if the device has errors: the error state will lead
+     * to wrong decisions about interop, causing more chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+    /* If an error happens when trying to use graphics interop, fall back to the native
+     * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+   * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, shrink the
+ * allocated memory as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+     * transferring zeroes to the device. */
+ queue_->zero_to_device(gpu_display_rgba_half_);
+ }
+
+ PassAccessor::Destination destination(film_->get_display_pass());
+ destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ gpu_display_rgba_half_.copy_from_device();
+
+ gpu_display->copy_pixels_to_texture(
+ gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+}
+
+bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (!device_graphics_interop_) {
+ device_graphics_interop_ = queue_->graphics_interop_create();
+ }
+
+ const DeviceGraphicsInteropDestination graphics_interop_dst =
+ gpu_display->graphics_interop_get();
+ device_graphics_interop_->set_destination(graphics_interop_dst);
+
+ const device_ptr d_rgba_half = device_graphics_interop_->map();
+ if (!d_rgba_half) {
+ return false;
+ }
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.d_pixels_half_rgba = d_rgba_half;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ device_graphics_interop_->unmap();
+
+ return true;
+}
+
+void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+{
+ if (!device_graphics_interop_) {
+ return;
+ }
+ gpu_display->graphics_interop_activate();
+ device_graphics_interop_ = nullptr;
+ gpu_display->graphics_interop_deactivate();
+}
+
+void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+ const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
+
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+}
+
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
+
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
+
+ return num_active_pixels;
+}
+
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
+{
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
+
+ queue_->zero_to_device(num_active_pixels);
+
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&effective_buffer_params_.full_x),
+ const_cast<int *>(&effective_buffer_params_.full_y),
+ const_cast<int *>(&effective_buffer_params_.width),
+ const_cast<int *>(&effective_buffer_params_.height),
+ &threshold,
+ &reset,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride,
+ &num_active_pixels.device_pointer};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
+
+ queue_->copy_from_device(num_active_pixels);
+ queue_->synchronize();
+
+ return num_active_pixels.data()[0];
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
+{
+ const int work_size = effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: The direct device access to the buffers only happens within this path trace work. The
+   * rest of the communication happens via API calls which involve
+   * `copy_render_buffers_from_device()`, which will perform synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized
+ * to match the device queue's number of path states.
+ * This implementation best suits devices which have a lot of integrator states, such as GPUs. */
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+ /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Implementation of `copy_to_gpu_display()` which uses driver's OpenGL/GPU interoperability
+ * functionality, avoiding copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run film conversion kernel and store display result in the given destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split. */
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+  /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+  /* Keep track of the number of paths queued for each kernel. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+  /* Temporary buffer to get an array of queued paths for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+  /* Temporary buffer used by copy_to_gpu_display() whenever graphics interoperability is not
+   * available. Allocated on demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+  /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+   * below this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+  /* Maximum path index: the effective number of paths used may be smaller than the size of the
+   * integrator_state_ buffer, so this avoids iterating over the full buffer. */
+ int max_active_path_index_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..4eb1dd941f9
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+ /* In background mode never do lower resolution render preview, as it is not really supported
+ * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+    /* NOTE: Multiply by 2 because of how scheduling works: it advances (halves) the resolution
+     * divider first and then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet. */
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+  /* Only relevant once rendering has reached the final resolution divider; while still at a
+   * coarser (navigation) resolution the scheduler will simply move on to the next divider. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+  /* Reset all fields of the previous work, cancelling things like adaptive sampling filtering and
+   * denoising.
+   * However, we need to preserve write requests, since those will not be possible to recover and
+   * writes are only supposed to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+  /* Do not write the tile if it has zero samples in it; treat it similarly to all other tiles
+   * which got cancelled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+  /* Update the current tile, but only if any sample was rendered.
+   * This allows the latest state of the tile to be visible while the full buffer is being
+   * processed.
+   *
+   * Note that if there are no samples in the current tile, its render buffer might contain pixels
+   * remaining from a previous state.
+   *
+   * If the full result was written, then no updates were made to the render buffers, and the
+   * buffers might have been freed from the device, so a display update is not possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+ /* NOTE: Rebalance scheduler requires current number of samples to not be advanced forward. */
+ render_work.rebalance = work_need_rebalance();
+
+ /* NOTE: Advance number of samples now, so that filter and denoising check can see that all the
+ * samples are rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+  /* A fallback display update time, for the case when there is an error during the display
+   * update, or when there is no display at all. */
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+    /* There is only a single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Knowing the time which it took to complete a task at the current resolution divider,
+ * approximate how long it would have taken to complete it at the final resolution. */
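+/* Illustrative example (hypothetical numbers): with resolution_divider == 4 the work covers
+ * roughly 1/16 of the pixels, so the measured time is multiplied by 4 * 4 = 16 to approximate the
+ * full-resolution time. */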
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
+ return time * resolution_divider_sq;
+}
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+   * NOTE: The work might have an all-zero path trace part: this happens when post-processing
+   * work is scheduled after the path tracing. Checking just the start sample doesn't work here
+   * because it might wrongly be 0. Instead, check whether path tracing is actually happening, as
+   * it is expected to happen in the first work. */
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+  /* Move the display update moment further in time, so that the logic which checks when the last
+   * update happened has a more reliable point in time (without the path tracing and denoising
+   * parts of the render work). */
+ state_.last_display_update_time = time_dt();
+}
+
+void RenderScheduler::report_rebalance_time(const RenderWork &render_work,
+ double time,
+ bool balance_changed)
+{
+ rebalance_time_.add_wall(time);
+
+ if (work_report_reset_average(render_work)) {
+ rebalance_time_.reset_average();
+ }
+
+ rebalance_time_.add_average(time);
+
+ if (balance_changed) {
+ ++state_.num_rebalance_changes;
+ }
+
+ state_.last_rebalance_changed = balance_changed;
+
+ VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds.";
+}
+
+string RenderScheduler::full_report() const
+{
+ const double render_wall_time = state_.end_render_time - state_.start_render_time;
+ const int num_rendered_samples = get_num_rendered_samples();
+
+ string result = "\nRender Scheduler Summary\n\n";
+
+ {
+ string mode;
+ if (headless_) {
+ mode = "Headless";
+ }
+ else if (background_) {
+ mode = "Background";
+ }
+ else {
+ mode = "Interactive";
+ }
+ result += "Mode: " + mode + "\n";
+ }
+
+ result += "Resolution: " + to_string(buffer_params_.width) + "x" +
+ to_string(buffer_params_.height) + "\n";
+
+ result += "\nAdaptive sampling:\n";
+ result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n";
+ if (adaptive_sampling_.use) {
+ result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n";
+ result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n";
+ result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n";
+ }
+
+ result += "\nDenoiser:\n";
+ result += " Use: " + string_from_bool(denoiser_params_.use) + "\n";
+ if (denoiser_params_.use) {
+ result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n";
+ result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n";
+
+ string passes = "Color";
+ if (denoiser_params_.use_pass_albedo) {
+ passes += ", Albedo";
+ }
+ if (denoiser_params_.use_pass_normal) {
+ passes += ", Normal";
+ }
+
+ result += " Passes: " + passes + "\n";
+ }
+
+ if (state_.num_rebalance_requested) {
+ result += "\nRebalancer:\n";
+ result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) +
+ "\n";
+ result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) +
+ "\n";
+ }
+
+ result += "\nTime (in seconds):\n";
+ result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average");
+ result += string_printf(" %20s %20f %20f\n",
+ "Path Tracing",
+ path_trace_time_.get_wall(),
+ path_trace_time_.get_average());
+
+ if (adaptive_sampling_.use) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Adaptive Filter",
+ adaptive_filter_time_.get_wall(),
+ adaptive_filter_time_.get_average());
+ }
+
+ if (denoiser_params_.use) {
+ result += string_printf(
+ " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average());
+ }
+
+ result += string_printf(" %20s %20f %20f\n",
+ "Display Update",
+ display_update_time_.get_wall(),
+ display_update_time_.get_average());
+
+ if (state_.num_rebalance_requested) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Rebalance",
+ rebalance_time_.get_wall(),
+ rebalance_time_.get_average());
+ }
+
+ const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() +
+ denoise_time_.get_wall() + display_update_time_.get_wall();
+ result += "\n Total: " + to_string(total_time) + "\n";
+
+ result += string_printf(
+ "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time);
+
+ /* When adaptive sampling is used the average time becomes meaningless, because different samples
+   * will likely render a different number of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation; exact values might need to be tweaked based
+ * on more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+ /* TODO(sergey): Need a decision on whether this should be using number of samples rendered
+ * within the current render session, or use absolute number of samples with the start sample
+ * taken into account. It will depend on whether the start sample offset clears the render
+ * buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and number of samples rendered are used to figure out the display update interval.
+ * Render time is used to allow for fast display updates in the first few seconds of rendering
+ * on fast devices. Number of samples rendered is used to allow for potentially quicker display
+ * updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
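+  /* Illustrative example (hypothetical numbers): with pixel_size_ == 1 and an average path
+   * tracing time of 0.05 seconds per sample, about 20 samples fit into one second; with a 0.5
+   * second update interval this schedules max(int(20 * 0.5), 1) = 10 samples per update. */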
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding might go to the higher or lower value depending on which one is closer. This keeps the
+ * number of samples a power of two without diverging too much from the planned number of
+ * samples. */
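+/* Illustrative example: 11 rounds down to 8 (distance 3, versus 5 up to 16), while 6 rounds up to
+ * 8 (4 and 8 are both a distance of 2 away, and ties round up). */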
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+  /* Always start the full resolution render with a single sample. This gives more instant
+   * feedback to artists, and allows gathering information for subsequent path tracing work. Do it
+   * in headless mode as well, to give some estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+  /* Round the number of samples to a power of two, so that the division of path states into tiles
+   * works out to integer values.
+   * This might make updates happen more rarely due to rounding up. In the test scenes this is not
+   * a huge deal because it is not seen that more than 8 samples can be rendered between updates.
+   * If that becomes a problem we can add some extra rules, like never allowing rounding up by
+   * more than N samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+  /* When enough statistics are available and an offline render is being done, prefer to keep the
+   * device occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+ * with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
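+    /* Illustrative example (hypothetical numbers): if the previous work of 16 samples only
+     * reached an occupancy of 0.35, lround(16 * 0.7 / 0.35) == 32 samples are requested, still
+     * clamped by max_num_samples_to_render below. */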
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
+
+  /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+   * device fully occupied, without much display update overhead. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+  /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missed. This
+   * is to ensure that the final render is pixel-matched regardless of how many samples per second
+   * the compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+ /* Special trick for fast navigation: schedule multiple samples during fast navigation
+ * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
+ * usable visual feedback for artists. There are a couple of tricks though. */
+
+ if (is_denoise_active_during_update()) {
+    /* When denoising is used during navigation prefer using a higher resolution with fewer
+     * samples (scheduling fewer samples here will make the resolution_divider calculation use a
+     * lower value for the divider). This is because both OpenImageDenoiser and the OptiX denoiser
+     * give visually better results on a higher resolution image with fewer samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+    /* When the resolution divider is at or below the pixel size, schedule one sample. This
+     * doesn't affect the sample count at this resolution division, but instead assists in the
+     * calculation of the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+    /* When the resolution divider is one step before the final resolution, schedule two samples.
+     * This is so that rendering at the lower resolution does not exceed the time that it takes to
+     * render the first sample at the full resolution. */
+ return 2;
+ }
+
+  /* Always render 4 samples, even if the scene is configured for less.
+   * The idea here is to have enough information on the screen. A resolution divider of 2 gives us
+   * 4 times the samples for the same cost, so the overall worst case timing is the same as the
+   * final resolution at one sample. */
+ return 4;
+}
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+    /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+    /* TODO(sergey): Follow similar logic to the viewport, giving an overview of what the final
+     * denoised image looks like even for background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as
+ * final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in viewport after reaching a certain sample count and render time.
+ */
+ /* TODO(sergey): Consider making time interval and sample configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+    /* Force disable display update in headless mode. There is nothing to display the in-progress
+     * result on. */
+ return false;
+ }
+
+ if (denoiser_delayed) {
+    /* If the denoiser has been delayed, the display can not be updated as it would not contain
+     * an up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+    /* When adaptive sampling is not used, the work is scheduled in a way that keeps the render
+     * device busy for long enough, so that the display update can happen right after the
+     * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+    /* Make sure the initial and final results of adaptive sampling are communicated to the
+     * display. */
+ return true;
+ }
+
+  /* For development purposes of adaptive sampling it might be very useful to see all updates of
+   * active pixels after the convergence check. However, that would cause a slowdown for regular
+   * users. Possibly make it a debug panel option, to allow rapid updates and ease development
+   * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+  /* When adaptive sampling is used, it is possible that only a handful of samples of a very
+   * simple scene will be scheduled to a powerful device (in order to not "miss" any of the
+   * filtering points). We take care of skipping updates here based on when the previous display
+   * update happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum time, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+     * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+    /* The resolution divider has never been calculated before: use the default resolution, so
+     * that we have reasonably good initial behavior, giving a chance to collect real numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate better resolution, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+  /* Allow some percentage of tolerance, so that if the render time at the higher resolution is
+   * close enough to the desired one we prefer to use it, instead of dropping to a much lower
+   * resolution with a time way below the desired one. */
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
+ * render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase resolution to higher than the pixel size (which is possible if the scene is
+ * simple and compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
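
For illustration, here is a minimal standalone sketch (not Cycles code; all numbers are assumed) of how the 1.4 tolerance and the quarter-per-step prediction interact when picking a divider. It ignores the per-sample correction and the clamping to default_start_resolution_divider_ that the real calculate_resolution_divider_for_time() performs.

    /* Standalone sketch with assumed numbers. */
    #include <cstdio>

    int main()
    {
      const double desired_update_interval = 1.0 / 30.0; /* Non-denoised viewport target. */
      const double measured_time_per_update = 0.25;      /* Assumed time at full resolution. */

      /* The 1.4 tolerance widens the target, so a divider one step lower is kept when the
       * measured time is only slightly above the desired interval. */
      const double target = desired_update_interval * 1.4;

      /* Halving the resolution (doubling the divider) quarters the pixel count, so the
       * predicted time drops roughly 4x per step. */
      int divider = 1;
      double predicted = measured_time_per_update;
      while (predicted > target) {
        divider *= 2;
        predicted /= 4.0;
      }

      printf("chosen divider: %d, predicted time per update: %.4f s\n", divider, predicted);
      return 0;
    }
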
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+    /* Use a lower value than in the non-denoised case, so that more pixels are available to
+     * reconstruct the image from. With the faster updates and the extra compute required, the
+     * resolution otherwise becomes too low to give usable feedback. */
+ /* NOTE: Based on performance of OpenImageDenoiser on CPU. For OptiX denoiser or other denoiser
+ * on GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+  /* For the best match with Blender's viewport the refresh rate should be 60fps, which avoids
+   * "jelly" effects. However, on non-trivial scenes this can only be achieved with high values
+   * of the resolution divider, which does not give very pleasant updates during navigation.
+   * Choose less frequent updates to allow less noisy and higher resolution updates. */
+
+  /* TODO(sergey): Could look into a heuristic which allows 60fps when the resolution divider is
+   * not too high. Alternatively, synchronize Blender's overlay updates with Cycles updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+  /* When rendering at a non-final resolution divider the time average is not very useful: it
+   * will either bias the average down (due to lower render times on the smaller images) or give
+   * an incorrect result when trying to estimate the time which would have been spent on the
+   * final resolution.
+   *
+   * So we only accumulate the average for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* Time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+  /* TODO(sergey): There should be a non-iterative analytical formula here. */
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions as
+ * the sample count can also change between resolution divisions. */
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
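
A worked example of the sample-count correction in the loop above, as a standalone sketch with assumed numbers (not Cycles code): if navigation renders 2 samples at divider 1 but 4 samples at divider 2, halving the resolution quarters the per-sample cost but doubles the sample count, so the predicted time only halves.

    #include <cstdio>

    int main()
    {
      double actual_time = 0.4;                       /* Assumed time at divider 1. */
      const int pre_resolution_division_samples = 2;  /* Assumed samples at divider 1. */
      const int post_resolution_division_samples = 4; /* Assumed samples at divider 2. */

      /* Mirrors the update inside the loop above: 4x fewer pixels, 2x more samples -> 2x faster. */
      actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;

      printf("predicted time at divider 2: %.2f s\n", actual_time); /* Prints 0.20. */
      return 0;
    }
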
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
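
A quick worked check of the two helpers above, as a standalone sketch with assumed numbers (not Cycles code): a 1920x1080 image with a resolution value of 512 needs two halvings, giving a divider of 4; going the other way, sqrt(1920 * 1080) rounds to 1440, so divider 4 maps to an effective resolution of 360.

    #include <cassert>
    #include <cmath>

    int main()
    {
      /* calculate_resolution_divider_for_resolution(1920, 1080, 512): 1920*1080 and 960*540 both
       * exceed 512*512, while 480*270 does not, so the loop halves twice and returns 4. */
      assert(960 * 540 > 512 * 512 && 480 * 270 <= 512 * 512);

      /* calculate_resolution_for_divider(1920, 1080, 4): lround(sqrt(1920 * 1080)) == 1440. */
      assert(std::lround(std::sqrt(1920.0 * 1080.0)) / 4 == 360);
      return 0;
    }
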
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..9c2d107e46d
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+  /* Initialize render buffers.
+   * Includes steps like zeroing the buffer on the device, and optional reading of pixels from
+   * the baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+    /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+    /* Reset the convergence flag when filtering, forcing a re-check of whether pixels converged. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+  /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via user-provided
+ * callback into the rendering software, or via tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+  /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+   * Note that the scheduler does not know anything about devices, so if only a single device is
+   * used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
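
A hypothetical usage sketch (not part of this patch) of how a path tracing loop might consume RenderWork from the scheduler class declared below; render_samples() and update_display() stand in for the real PathTrace functionality.

    void example_render_loop(RenderScheduler &scheduler)
    {
      while (true) {
        RenderWork render_work = scheduler.get_render_work();
        if (!render_work) {
          /* operator bool: nothing left to path trace, filter, denoise, display or write. */
          break;
        }

        scheduler.report_work_begin(render_work);

        if (render_work.path_trace.num_samples) {
          /* const double time = render_samples(render_work);
           * scheduler.report_path_trace_time(render_work, time, false); */
        }
        if (render_work.display.update) {
          /* update_display(render_work); */
        }
      }
    }
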
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related works are to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+  /* Allows disabling the scheduling of re-balance works, so that as much work as possible is
+   * scheduled to a single device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+  /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get number of samples rendered within the current scheduling session.
+ *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+   * Note that this is based on the scheduling information. In practice this means that once work
+   * has been handed out for rendering, the scheduler considers that work done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset scheduler, indicating that rendering will happen from scratch.
+ * Resets current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+ /* Reset scheduler upon switching to a next tile.
+   * Will keep the same number of samples and full-frame render parameters, but will reset
+   * progress and allow scheduling render works from the beginning of the new tile. */
+ void reset_for_next_tile();
+
+ /* Reschedule adaptive sampling work when all pixels did converge.
+ * If there is nothing else to be done for the adaptive sampling (pixels did converge to the
+ * final threshold) then false is returned and the render scheduler will stop scheduling path
+   * tracing works. Otherwise the work's adaptive sampling settings are modified to continue with
+   * a lower threshold. */
+ bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+  /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+   * converged yet.
+ * If re-scheduling is not possible (adaptive sampling is happening with the final threshold, and
+ * the path tracer is to finish the current pixels) then false is returned. */
+ bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+ /* Reschedule work when rendering has been requested to cancel.
+ *
+ * Will skip all work which is not needed anymore because no more samples will be added (for
+ * example, adaptive sampling filtering and convergence check will be skipped).
+ * Will enable all work needed to make sure all passes are communicated to the software.
+ *
+ * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+ void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+ RenderWork get_render_work();
+
+ /* Report that the path tracer started to work, after scene update and loading kernels. */
+ void report_work_begin(const RenderWork &render_work);
+
+  /* Report the time (in seconds) which the corresponding part of the work took. */
+ void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+ void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_denoise_time(const RenderWork &render_work, double time);
+ void report_display_update_time(const RenderWork &render_work, double time);
+ void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ protected:
+ /* Check whether all work has been scheduled and time limit was not exceeded.
+ *
+   * NOTE: Tricky bit: if the time limit was reached, done() is considered to be true, but some
+   * extra work still needs to be scheduled to denoise and write the final result. */
+ bool done() const;
+
+  /* Update scheduling state for newly scheduled work.
+   * Takes care of things like checking whether the work was ever denoised, whether the tile was
+   * written, and similar state. */
+ void update_state_for_render_work(const RenderWork &render_work);
+
+ /* Returns true if any work was scheduled. */
+ bool set_postprocess_render_work(RenderWork *render_work);
+
+  /* Set work which is to be performed after all tiles have been rendered. */
+ void set_full_frame_render_work(RenderWork *render_work);
+
+  /* Update the start resolution divider based on the accumulated timing information, so that
+   * navigation keeps a nice feel. */
+ void update_start_resolution_divider();
+
+ /* Calculate desired update interval in seconds based on the current timings and settings.
+ * Will give an interval which provides good feeling updates during viewport navigation. */
+ double guess_viewport_navigation_update_interval_in_seconds() const;
+
+ /* Check whether denoising is active during interactive update while resolution divider is not
+ * unit. */
+ bool is_denoise_active_during_update() const;
+
+  /* Heuristic which aims to give a perceptually pleasant display update interval: at lower
+   * sample counts, near the beginning of rendering, updates happen more often, while at higher
+   * sample counts, later in the render, updates happen less often and device occupancy goes
+   * higher. */
+ double guess_display_update_interval_in_seconds() const;
+ double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+ double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const;
+
+  /* Calculate the number of samples which can be rendered within the current desired update
+   * interval, which is calculated by `guess_display_update_interval_in_seconds()`. */
+ int calculate_num_samples_per_update() const;
+
+  /* Get start sample and the number of samples which are to be path traced in the current work. */
+ int get_start_sample_to_path_trace() const;
+ int get_num_samples_to_path_trace() const;
+
+ /* Calculate how many samples there are to be rendered for the very first path trace after reset.
+ */
+  int get_num_samples_during_navigation(int resolution_divider) const;
+
+ /* Whether adaptive sampling convergence check and filter is to happen. */
+ bool work_need_adaptive_filter() const;
+
+  /* Calculate the threshold for adaptive sampling. */
+ float work_adaptive_threshold() const;
+
+ /* Check whether current work needs denoising.
+   * Denoising is not needed if the denoiser is not configured, or when denoising is happening
+   * too often.
+   *
+   * The `delayed` argument will be set to true when the denoiser is configured for use, but it
+   * was delayed for a later sample, to reduce overhead.
+ *
+ * ready_to_display will be false if we may have a denoised result that is outdated due to
+ * increased samples. */
+ bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+  /* Check whether the current work needs to update the display.
+ *
+ * The `denoiser_delayed` is what `work_need_denoise()` returned as delayed denoiser flag. */
+ bool work_need_update_display(const bool denoiser_delayed);
+
+  /* Check whether it is time to perform rebalancing for the render work. */
+ bool work_need_rebalance();
+
+  /* Check whether the timing of the given work is usable to store in the `first_render_time_`
+   * for the resolution divider calculation. */
+ bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+  /* Check whether the timing report about the given work needs to reset the accumulated average
+   * time. */
+ bool work_report_reset_average(const RenderWork &render_work);
+
+  /* Check whether the render time limit has been reached (or exceeded), and if so store related
+   * information in the state, so that rendering is considered finished and it is possible to
+   * report average render time information. */
+ void check_time_limit_reached();
+
+ /* Helper class to keep track of task timing.
+ *
+   * Contains two parts: wall time and average. The wall time is the actual wall time of how long
+   * it took to complete all tasks of a type; it is always advanced when the PathTracer reports a
+   * time update.
+   *
+   * The average time is used for scheduling purposes. It is an estimate of how long it takes to
+   * perform the task at the final resolution. */
+ class TimeWithAverage {
+ public:
+ inline void reset()
+ {
+ total_wall_time_ = 0.0;
+
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ inline void add_wall(double time)
+ {
+ total_wall_time_ += time;
+ }
+
+ inline void add_average(double time, int num_measurements = 1)
+ {
+ average_time_accumulator_ += time;
+ num_average_times_ += num_measurements;
+ }
+
+ inline double get_wall() const
+ {
+ return total_wall_time_;
+ }
+
+ inline double get_average() const
+ {
+ if (num_average_times_ == 0) {
+ return 0;
+ }
+ return average_time_accumulator_ / num_average_times_;
+ }
+
+ inline void reset_average()
+ {
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ protected:
+ double total_wall_time_ = 0.0;
+
+ double average_time_accumulator_ = 0.0;
+ int num_average_times_ = 0;
+ };
+
+ struct {
+ int resolution_divider = 1;
+
+ /* Number of rendered samples on top of the start sample. */
+ int num_rendered_samples = 0;
+
+ /* Point in time the latest GPUDisplay work has been scheduled. */
+ double last_display_update_time = 0.0;
+ /* Value of -1 means display was never updated. */
+ int last_display_update_sample = -1;
+
+ /* Point in time at which last rebalance has been performed. */
+ double last_rebalance_time = 0.0;
+
+    /* Number of rebalance works which have been requested to be performed.
+ * The path tracer might ignore the work if there is a single device rendering. */
+ int num_rebalance_requested = 0;
+
+ /* Number of rebalance works handled which did change balance across devices. */
+ int num_rebalance_changes = 0;
+
+ bool need_rebalance_at_next_work = false;
+
+    /* Denotes whether the latest performed rebalance work caused an actual rebalance of work
+     * across devices. */
+ bool last_rebalance_changed = false;
+
+ /* Threshold for adaptive sampling which will be scheduled to work when not using progressive
+ * noise floor. */
+ float adaptive_sampling_threshold = 0.0f;
+
+ bool last_work_tile_was_denoised = false;
+ bool tile_result_was_written = false;
+ bool postprocess_work_scheduled = false;
+ bool full_frame_work_scheduled = false;
+ bool full_frame_was_written = false;
+
+ bool path_trace_finished = false;
+ bool time_limit_reached = false;
+
+ /* Time at which rendering started and finished. */
+ double start_render_time = 0.0;
+ double end_render_time = 0.0;
+
+    /* Measured occupancy of the render devices, normalized to the number of samples.
+ *
+ * In a way it is "trailing": when scheduling new work this occupancy is measured when the
+ * previous work was rendered. */
+ int occupancy_num_samples = 0;
+ float occupancy = 1.0f;
+ } state_;
+
+  /* Timing of tasks which were performed at the very first render work at 100% of the
+   * resolution. This timing information is used to estimate the resolution divider for fast
+   * navigation. */
+ struct {
+ double path_trace_per_sample;
+ double denoise_time;
+ double display_update_time;
+ } first_render_time_;
+
+ TimeWithAverage path_trace_time_;
+ TimeWithAverage adaptive_filter_time_;
+ TimeWithAverage denoise_time_;
+ TimeWithAverage display_update_time_;
+ TimeWithAverage rebalance_time_;
+
+ /* Whether cryptomatte-related work will be scheduled. */
+ bool need_schedule_cryptomatte_ = false;
+
+ /* Whether to schedule device load rebalance works.
+   * Rebalancing requires some special treatment for update intervals and such, so if it's known
+   * that the rebalance will be ignored (e.g. due to single-device rendering) it is better to
+   * fully ignore the rebalancing logic. */
+ bool need_schedule_rebalance_works_ = false;
+
+ /* Path tracing work will be scheduled for samples from within
+ * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusively. */
+ int start_sample_ = 0;
+ int num_samples_ = 0;
+
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit_ = 0.0;
+
+ /* Headless rendering without interface. */
+ bool headless_;
+
+ /* Background (offline) rendering. */
+ bool background_;
+
+ /* Pixel size is used to force lower resolution render for final pass. Useful for retina or other
+ * types of hi-dpi displays. */
+ int pixel_size_ = 1;
+
+ TileManager &tile_manager_;
+
+ BufferParams buffer_params_;
+ DenoiseParams denoiser_params_;
+
+ AdaptiveSampling adaptive_sampling_;
+
+ /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise
+ * level. */
+ bool use_progressive_noise_floor_ = false;
+
+ /* Default value for the resolution divider which will be used when there is no render time
+ * information available yet.
+ * It is also what defines the upper limit of the automatically calculated resolution divider. */
+ int default_start_resolution_divider_ = 1;
+
+ /* Initial resolution divider which will be used on render scheduler reset. */
+ int start_resolution_divider_ = 0;
+
+  /* Calculate the smallest resolution divider which will bring the actual rendering time below
+   * the desired one. This call assumes a linear dependency of render time on the number of
+   * pixels (quadratic dependency on the resolution divider): a resolution divider of 2 brings
+   * render time down by a factor of 4. */
+ int calculate_resolution_divider_for_time(double desired_time, double actual_time);
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
new file mode 100644
index 00000000000..465b4a8d4da
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/shader_eval.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
+{
+ DCHECK_NE(device_, nullptr);
+}
+
+bool ShaderEval::eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output)
+{
+ bool first_device = true;
+ bool success = true;
+
+ device_->foreach_device([&](Device *device) {
+ if (!first_device) {
+ LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
+ "single device.";
+ return;
+ }
+ first_device = false;
+
+ device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
+ device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+
+ /* Allocate and copy device buffers. */
+ DCHECK_EQ(input.device, device);
+ DCHECK_EQ(output.device, device);
+ DCHECK_LE(output.size(), input.size());
+
+ input.alloc(max_num_points);
+ int num_points = fill_input(input);
+ if (num_points == 0) {
+ return;
+ }
+
+ input.copy_to_device();
+ output.alloc(num_points);
+ output.zero_to_device();
+
+ /* Evaluate on CPU or GPU. */
+ success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
+ eval_gpu(device, type, input, output);
+
+ /* Copy data back from device if not cancelled. */
+ if (success) {
+ output.copy_from_device(0, 1, output.size());
+ read_output(output);
+ }
+
+ input.free();
+ output.free();
+ });
+
+ return success;
+}
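
A hypothetical usage sketch (not part of this patch) of the callback-based eval() above, for displacement evaluation; the lambda bodies are schematic, and real callers fill KernelShaderEvalInput from scene geometry.

    void example_displacement_eval(Device *device, Progress &progress)
    {
      ShaderEval shader_eval(device, progress);

      const int max_num_points = 1024; /* Assumed upper bound on the number of points to displace. */

      shader_eval.eval(
          SHADER_EVAL_DISPLACE,
          max_num_points,
          [&](device_vector<KernelShaderEvalInput> &input) {
            /* Fill input.data() with object/primitive/uv information for each point here. */
            (void)input;
            return max_num_points; /* Number of points actually written. */
          },
          [&](device_vector<float4> &output) {
            /* Read the evaluated displacement vectors back from output.data() here. */
            (void)output;
          });
    }
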
+
+bool ShaderEval::eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ vector<CPUKernelThreadGlobals> kernel_thread_globals;
+ device->get_cpu_kernel_thread_globals(kernel_thread_globals);
+
+ /* Find required kernel function. */
+ const CPUKernels &kernels = *(device->get_cpu_kernels());
+
+ /* Simple parallel_for over all work items. */
+ const int64_t work_size = output.size();
+ KernelShaderEvalInput *input_data = input.data();
+ float4 *output_data = output.data();
+ bool success = true;
+
+ tbb::task_arena local_arena(device->info.cpu_threads);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
+ /* TODO: is this fast enough? */
+ if (progress_.get_cancel()) {
+ success = false;
+ return;
+ }
+
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ KernelGlobals *kg = &kernel_thread_globals[thread_index];
+
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernels.shader_eval_displace(kg, input_data, output_data, work_index);
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernels.shader_eval_background(kg, input_data, output_data, work_index);
+ break;
+ }
+ });
+ });
+
+ return success;
+}
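
The per-thread indexing above relies on a common TBB pattern: an arena bounds the worker count, and current_thread_index() selects a per-thread slot, mirroring how kernel_thread_globals is indexed. A standalone sketch of that pattern (not Cycles code, thread count is assumed):

    #include <tbb/parallel_for.h>
    #include <tbb/task_arena.h>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int num_threads = 4; /* Assumed; Cycles uses device->info.cpu_threads. */
      std::vector<int> per_thread_counters(num_threads, 0);

      tbb::task_arena arena(num_threads);
      arena.execute([&]() {
        tbb::parallel_for(int64_t(0), int64_t(1000), [&](int64_t /*work_index*/) {
          /* Each slot is only ever touched by the thread owning that index, so no locking. */
          const int thread_index = tbb::this_task_arena::current_thread_index();
          per_thread_counters[thread_index] += 1;
        });
      });

      int total = 0;
      for (const int count : per_thread_counters) {
        total += count;
      }
      printf("total work items processed: %d\n", total); /* Prints 1000. */
      return 0;
    }
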
+
+bool ShaderEval::eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ /* Find required kernel function. */
+ DeviceKernel kernel;
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+ break;
+  }
+
+ /* Create device queue. */
+ unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+ queue->init_execution();
+
+  /* Execute work on the GPU in chunks, so we can cancel.
+   * TODO: Query appropriate size from the device. */
+ const int chunk_size = 65536;
+
+ const int work_size = output.size();
+ void *d_input = (void *)input.device_pointer;
+ void *d_output = (void *)output.device_pointer;
+
+ for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+ int d_work_size = min(chunk_size, work_size - d_offset);
+ void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+ queue->enqueue(kernel, d_work_size, args);
+ queue->synchronize();
+
+ if (progress_.get_cancel()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+ SHADER_EVAL_DISPLACE,
+ SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+ ShaderEval(Device *device, Progress &progress);
+
+ /* Evaluate shader at points specified by KernelShaderEvalInput and write out
+ * RGBA colors to output. */
+ bool eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output);
+
+ protected:
+ bool eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+ bool eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+
+ Device *device_;
+ Progress &progress_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/tile.h"
+
+#include "util/util_logging.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size)
+{
+ os << "size: (" << tile_size.width << ", " << tile_size.height << ")";
+ os << ", num_samples: " << tile_size.num_samples;
+ return os;
+}
+
+ccl_device_inline uint round_down_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return prev_power_of_two(x);
+}
+
+ccl_device_inline uint round_up_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return next_power_of_two(x);
+}
+
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states)
+{
+ if (max_num_path_states == 1) {
+ /* Simple case: avoid any calculation, which could cause rounding issues. */
+ return TileSize(1, 1, 1);
+ }
+
+ const int64_t num_pixels = image_size.x * image_size.y;
+ const int64_t num_pixel_samples = num_pixels * num_samples;
+
+ if (max_num_path_states >= num_pixel_samples) {
+ /* Image fully fits into the state (could be border render, for example). */
+ return TileSize(image_size.x, image_size.y, num_samples);
+ }
+
+  /* The idea here is to keep the number of samples per tile as high as possible to improve
+   * coherency across threads.
+   *
+   * Some general ideas:
+   * - Prefer smaller tiles with more samples, which improves spatial coherency of paths.
+   * - Keep values a power of two, so that an integer number of tiles fits into the maximum
+   *   number of paths. */
+
+ TileSize tile_size;
+
+  /* Calculate the biggest tile size which fits an entire range of samples into the path states.
+   * The idea here is to keep tiles as small as possible, and keep the device occupied by
+   * scheduling multiple tiles with the same coordinates rendering different samples. */
+ const int num_path_states_per_sample = max_num_path_states / num_samples;
+ if (num_path_states_per_sample != 0) {
+ tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample)));
+ tile_size.height = tile_size.width;
+ }
+ else {
+ tile_size.width = tile_size.height = 1;
+ }
+
+ if (num_samples == 1) {
+ tile_size.num_samples = 1;
+ }
+ else {
+    /* The heuristic here is to have a more uniform division of the sample range: for example,
+     * prefer [32 <38 times>, 8] over [1024, 200]. This allows us to greedily add more tiles
+     * early on. */
+ tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))),
+ static_cast<uint>(num_samples));
+
+    const int tile_area = tile_size.width * tile_size.height;
+ tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area);
+ }
+
+ DCHECK_GE(tile_size.width, 1);
+ DCHECK_GE(tile_size.height, 1);
+ DCHECK_GE(tile_size.num_samples, 1);
+ DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states);
+
+ return tile_size;
+}
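
A worked example of the heuristic above with assumed numbers (standalone, not Cycles code): a 1920x1080 image, 32 samples and roughly one million path states yield 128x128 tiles carrying 4 samples each.

    #include <cassert>

    int main()
    {
      const int max_num_path_states = 1 << 20; /* Assumed device capacity: 1,048,576 states. */

      /* 1048576 / 32 samples = 32768 states per sample; sqrt(32768) ~ 181, rounded down to a
       * power of two -> 128, so tiles are 128x128 pixels. */
      const int tile_width = 128;

      /* sqrt(32 / 2) = 4 is already a power of two, so 4 samples are grouped per tile. */
      const int tile_num_samples = 4;

      /* 128 * 128 * 4 = 65536 path states per tile, so many such tiles fit on the device. */
      assert(tile_width * tile_width * tile_num_samples <= max_num_path_states);
      return 0;
    }
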
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+ TileSize() = default;
+
+ inline TileSize(int width, int height, int num_samples)
+ : width(width), height(height), num_samples(num_samples)
+ {
+ }
+
+ inline bool operator==(const TileSize &other) const
+ {
+ return width == other.width && height == other.height && num_samples == other.num_samples;
+ }
+ inline bool operator!=(const TileSize &other) const
+ {
+ return !(*this == other);
+ }
+
+ int width = 0, height = 0;
+ int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+/* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Will attempt to provide the best guess to keep path tracing threads of a device as localized
+ * as possible, and to have as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ if (num_infos == 1) {
+ work_balance_infos[0].weight = 1.0;
+ return;
+ }
+
+  /* There are no statistics available, so start with an equal distribution. */
+ const double weight = 1.0 / num_infos;
+ for (WorkBalanceInfo &balance_info : work_balance_infos) {
+ balance_info.weight = weight;
+ }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+ double total_time = 0;
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ total_time += info.time_spent;
+ }
+ return total_time;
+}
+
+/* The balance is based on equalizing the time which devices spent performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how big a difference in the work scheduling is needed. */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ const double total_time = calculate_total_time(work_balance_infos);
+ const double time_average = total_time / num_infos;
+
+ double total_weight = 0;
+ vector<double> new_weights;
+ new_weights.reserve(num_infos);
+
+  /* Equalize the overall average time. This means that we don't make every device perform an
+   * amount of work based on the current average, but that after the weight changes the times
+   * will equalize.
+   * Think of it this way: if one of the devices is 10% faster than another, then one device
+   * needs to do 5% less of the current work, and the other needs to do 5% more. */
+ const double lerp_weight = 1.0 / num_infos;
+
+ bool has_big_difference = false;
+
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+ const double new_weight = info.weight * time_target / info.time_spent;
+ new_weights.push_back(new_weight);
+ total_weight += new_weight;
+
+ if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+ has_big_difference = true;
+ }
+ }
+
+ if (!has_big_difference) {
+ return false;
+ }
+
+ const double total_weight_inv = 1.0 / total_weight;
+ for (int i = 0; i < num_infos; ++i) {
+ WorkBalanceInfo &info = work_balance_infos[i];
+ info.weight = new_weights[i] * total_weight_inv;
+ info.time_spent = 0;
+ }
+
+ return true;
+}
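
A worked example of the rebalance math above with assumed numbers (standalone, not Cycles code): two devices that previously had equal weights spent 8s and 12s, so their targets move half-way towards the 10s average, and the faster device ends up with roughly 55% of the work.

    #include <cassert>
    #include <cmath>

    static double lerp_double(double a, double b, double t)
    {
      return a + t * (b - a);
    }

    int main()
    {
      const double time_a = 8.0, time_b = 12.0, old_weight = 0.5;
      const double time_average = (time_a + time_b) / 2.0; /* 10s. */
      const double lerp_weight = 1.0 / 2.0;                /* 1 / num_infos. */

      /* Targets move half-way towards the average: 9s and 11s. */
      const double target_a = lerp_double(time_a, time_average, lerp_weight);
      const double target_b = lerp_double(time_b, time_average, lerp_weight);

      /* Unnormalized new weights: 0.5 * 9 / 8 = 0.5625 and 0.5 * 11 / 12 ~ 0.4583. */
      const double weight_a = old_weight * target_a / time_a;
      const double weight_b = old_weight * target_b / time_b;

      /* After normalization: ~55.1% for the faster device, ~44.9% for the slower one. */
      const double total = weight_a + weight_b;
      assert(std::fabs(weight_a / total - 0.551) < 0.001);
      assert(std::fabs(weight_b / total - 0.449) < 0.001);
      return 0;
    }
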
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..94e20ecf054
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+ /* Time spent performing corresponding work. */
+ double time_spent = 0;
+
+ /* Average occupancy of the device while performing the work. */
+ float occupancy = 1.0f;
+
+ /* Normalized weight, which is ready to be used for work balancing (like calculating fraction of
+ * the big tile which is to be rendered on the device). */
+ double weight = 1.0;
+};
+
+/* Balance work for an initial render iteration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+/* Rebalance work after statistics have been accumulated.
+ * Returns true if the balancing did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..3fc99d5b74d
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+ max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+ /* Image buffer parameters. */
+ image_full_offset_px_.x = buffer_params.full_x;
+ image_full_offset_px_.y = buffer_params.full_y;
+
+ image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+ offset_ = buffer_params.offset;
+ stride_ = buffer_params.stride;
+
+ /* Samples parameters. */
+ sample_start_ = sample_start;
+ samples_num_ = samples_num;
+
+ /* Initialize new scheduling. */
+ reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+ tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+ VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+ if (VLOG_IS_ON(3)) {
+ /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling
+ * and purely focusing on the number of used path states. */
+ const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+ tile_size_.num_samples;
+ const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+ VLOG(3) << "Number of unused path states: "
+ << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+ }
+
+ num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+ num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+ total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+ num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+ next_work_index_ = 0;
+ total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
+}
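
A worked example of the counters computed above with assumed numbers (standalone, not Cycles code): a 1920x1080 image split into 128x128 tiles, with 32 samples grouped 4 per tile, yields 1080 schedulable work units.

    #include <cassert>

    static int divide_up_int(int x, int y)
    {
      return (x + y - 1) / y;
    }

    int main()
    {
      const int num_tiles_x = divide_up_int(1920, 128);            /* 15 */
      const int num_tiles_y = divide_up_int(1080, 128);            /* 9 */
      const int num_tiles_per_sample_range = divide_up_int(32, 4); /* 8 */

      /* 15 * 9 * 8 = 1080 work units will be handed out by get_work(). */
      assert(num_tiles_x * num_tiles_y * num_tiles_per_sample_range == 1080);
      return 0;
    }
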
+
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+ /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+   * the path trace work can decide to use smaller tile sizes and greedily schedule multiple tiles,
+ * improving overall device occupancy.
+ * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling
+ * limit". */
+
+ DCHECK_NE(max_num_path_states_, 0);
+
+ const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+ if (work_index >= total_work_size_) {
+ return false;
+ }
+
+ const int sample_range_index = work_index % num_tiles_per_sample_range_;
+ const int start_sample = sample_range_index * tile_size_.num_samples;
+ const int tile_index = work_index / num_tiles_per_sample_range_;
+ const int tile_y = tile_index / num_tiles_x_;
+ const int tile_x = tile_index - tile_y * num_tiles_x_;
+
+ KernelWorkTile work_tile;
+ work_tile.x = tile_x * tile_size_.width;
+ work_tile.y = tile_y * tile_size_.height;
+ work_tile.w = tile_size_.width;
+ work_tile.h = tile_size_.height;
+ work_tile.start_sample = sample_start_ + start_sample;
+ work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+ work_tile.offset = offset_;
+ work_tile.stride = stride_;
+
+ work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+ work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+ work_tile.x += image_full_offset_px_.x;
+ work_tile.y += image_full_offset_px_.y;
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ DCHECK_GT(tile_work_size, 0);
+
+ if (max_work_size && tile_work_size > max_work_size) {
+ /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+     * allowing others (or ourselves later on) to pick it up.
+ *
+ * TODO: Such temporary decrement is not ideal, since it might lead to situation when another
+ * device sees there is nothing to be done, finishing its work and leaving all work to be
+ * done by us. */
+ atomic_fetch_and_add_int32(&next_work_index_, -1);
+ return false;
+ }
+
+ *work_tile_ = work_tile;
+
+ return true;
+}
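
A worked example of the work-index decomposition above with assumed numbers (standalone, not Cycles code): with 15 tiles per row and 8 sample ranges of 4 samples each, work index 37 maps to tile (4, 0) starting at sample 20.

    #include <cassert>

    int main()
    {
      const int num_tiles_x = 15;
      const int num_tiles_per_sample_range = 8;
      const int tile_num_samples = 4;

      const int work_index = 37;

      const int sample_range_index = work_index % num_tiles_per_sample_range; /* 5 */
      const int start_sample = sample_range_index * tile_num_samples;         /* 20 */
      const int tile_index = work_index / num_tiles_per_sample_range;         /* 4 */
      const int tile_y = tile_index / num_tiles_x;                            /* 0 */
      const int tile_x = tile_index - tile_y * num_tiles_x;                   /* 4 */

      assert(start_sample == 20 && tile_x == 4 && tile_y == 0);
      return 0;
    }
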
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..e4c8f701259
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+/* Scheduler of device work tiles.
+ * Takes care of feeding work which needs to be done to multiple devices running in parallel. */
+class WorkTileScheduler {
+ public:
+ WorkTileScheduler();
+
+  /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+ *
+ * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
+ * this number of states. */
+ void set_max_num_path_states(int max_num_path_states);
+
+  /* Scheduling will happen for pixels within a big tile denoted by the given parameters. */
+ void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+ /* Get work for a device.
+   * Returns true if there is still work to be done and initializes the work tile with all
+   * parameters of this work. If there is nothing remaining to be done, returns false and the
+   * work tile is kept unchanged.
+ *
+ * Optionally pass max_work_size to do nothing if there is no tile small enough. */
+ bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
+
+ protected:
+ void reset_scheduler_state();
+
+ /* Maximum allowed path states to be used.
+ *
+ * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+ * number of path states is kind of a detail. Is there a more generic term from the scheduler
+ * point of view? */
+ int max_num_path_states_ = 0;
+
+ /* Offset in pixels within a global buffer. */
+ int2 image_full_offset_px_ = make_int2(0, 0);
+
+  /* Dimensions of the currently rendered image in pixels. */
+ int2 image_size_px_ = make_int2(0, 0);
+
+  /* Offset and stride of the buffer within which scheduling is happening.
+ * Will be passed over to the KernelWorkTile. */
+ int offset_, stride_;
+
+  /* Start sample index and number of samples which are to be rendered.
+   * The scheduler will cover the sample range of [start, start + num - 1] over the entire image
+   * (splitting it into smaller work tiles). */
+ int sample_start_ = 0;
+ int samples_num_ = 0;
+
+  /* Tile size which will be scheduled for rendering. */
+ TileSize tile_size_;
+
+ /* Number of tiles in X and Y axis of the image. */
+ int num_tiles_x_, num_tiles_y_;
+
+ /* Total number of tiles on the image.
+ * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in the `get_work()`.
+ *
+   * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating this value in
+   * `get_work()` is immeasurable anyway? */
+ int total_tiles_num_ = 0;
+
+  /* In the case when the number of samples in the `tile_size_` is lower than `samples_num_`, this
+   * denotes how many tiles are to be "stacked" to cover the entire requested range of samples. */
+ int num_tiles_per_sample_range_ = 0;
+
+ int next_work_index_ = 0;
+ int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END