Diffstat (limited to 'intern/cycles/integrator')
35 files changed, 8108 insertions, 0 deletions
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt new file mode 100644 index 00000000000..bfabd35d7c3 --- /dev/null +++ b/intern/cycles/integrator/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright 2011-2021 Blender Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(INC + .. +) + +set(SRC + adaptive_sampling.cpp + denoiser.cpp + denoiser_device.cpp + denoiser_oidn.cpp + denoiser_optix.cpp + path_trace.cpp + tile.cpp + pass_accessor.cpp + pass_accessor_cpu.cpp + pass_accessor_gpu.cpp + path_trace_work.cpp + path_trace_work_cpu.cpp + path_trace_work_gpu.cpp + render_scheduler.cpp + shader_eval.cpp + work_balancer.cpp + work_tile_scheduler.cpp +) + +set(SRC_HEADERS + adaptive_sampling.h + denoiser.h + denoiser_device.h + denoiser_oidn.h + denoiser_optix.h + path_trace.h + tile.h + pass_accessor.h + pass_accessor_cpu.h + pass_accessor_gpu.h + path_trace_work.h + path_trace_work_cpu.h + path_trace_work_gpu.h + render_scheduler.h + shader_eval.h + work_balancer.h + work_tile_scheduler.h +) + +set(LIB + # NOTE: This is required for RenderBuffers access. Might consider moving files around a bit to + # avoid such a cyclic dependency. + cycles_render + + cycles_util +) + +if(WITH_OPENIMAGEDENOISE) + list(APPEND LIB + ${OPENIMAGEDENOISE_LIBRARIES} + ) +endif() + +include_directories(${INC}) + +include_directories(SYSTEM ${INC_SYS}) + +cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp new file mode 100644 index 00000000000..23fbcfea5c2 --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/adaptive_sampling.h" + +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +AdaptiveSampling::AdaptiveSampling() +{ +} + +int AdaptiveSampling::align_samples(int start_sample, int num_samples) const +{ + if (!use) { + return num_samples; + } + + /* + * The naive implementation goes as follows: + * + * int count = 1; + * while (!need_filter(start_sample + count - 1) && count < num_samples) { + * ++count; + * } + * return count; + */ + + /* 0-based sample index at which first filtering will happen. */ + const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1); + + /* Allow as many samples as possible until the first filter sample. 
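+ * Example (illustrative): with min_samples = 3 and adaptive_step = 4 (the bit masks here assume + * adaptive_step is a power of two), first_filter_sample = (3 + 1) | 3 = 7, which agrees with + * need_filter(): sample 7 is the first sample past min_samples for which + * (sample & (adaptive_step - 1)) == adaptive_step - 1. 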
*/ + if (start_sample + num_samples <= first_filter_sample) { + return num_samples; + } + + const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1)); + + const int num_samples_until_filter = next_filter_sample - start_sample + 1; + + return min(num_samples_until_filter, num_samples); +} + +bool AdaptiveSampling::need_filter(int sample) const +{ + if (!use) { + return false; + } + + if (sample <= min_samples) { + return false; + } + + return (sample & (adaptive_step - 1)) == (adaptive_step - 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h new file mode 100644 index 00000000000..d98edd9894c --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +class AdaptiveSampling { + public: + AdaptiveSampling(); + + /* Align the number of samples so that they align with the adaptive filtering. + * + * Returns the new value for `num_samples` so that after rendering that many samples on top + * of `start_sample`, filtering is required. + * + * The alignment happens in a way that allows rendering as many samples as possible without + * missing any filtering point. This means that the result is "clamped" by the nearest sample + * at which filtering is needed. This is part of the mechanism which ensures that all devices + * will perform the exact same filtering and adaptive sampling, regardless of their performance. + * + * `start_sample` is the 0-based index of the sample. + * + * NOTE: The start sample is included into the number of samples to render. This means that + * if the number of samples is 1, then the path tracer will render samples [start_sample], + * if the number of samples is 2, then the path tracer will render samples [start_sample, + * start_sample + 1] and so on. */ + int align_samples(int start_sample, int num_samples) const; + + /* Check whether the adaptive sampling filter should happen at this sample. + * Returns false if adaptive sampling is not used. + * + * `sample` is the 0-based index of the sample. */ + bool need_filter(int sample) const; + + bool use = false; + int adaptive_step = 0; + int min_samples = 0; + float threshold = 0.0f; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp new file mode 100644 index 00000000000..598bbd497a5 --- /dev/null +++ b/intern/cycles/integrator/denoiser.cpp @@ -0,0 +1,204 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser.h" + +#include "device/device.h" +#include "integrator/denoiser_oidn.h" +#include "integrator/denoiser_optix.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params) +{ + DCHECK(params.use); + + switch (params.type) { + case DENOISER_OPTIX: + return make_unique<OptiXDenoiser>(path_trace_device, params); + + case DENOISER_OPENIMAGEDENOISE: + return make_unique<OIDNDenoiser>(path_trace_device, params); + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + /* pass */ + break; + } + + LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen."; + + return nullptr; +} + +Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params) + : path_trace_device_(path_trace_device), params_(params) +{ + DCHECK(params.use); +} + +void Denoiser::set_params(const DenoiseParams &params) +{ + DCHECK_EQ(params.type, params_.type); + + if (params.type == params_.type) { + params_ = params; + } + else { + LOG(ERROR) << "Attempt to change denoiser type."; + } +} + +const DenoiseParams &Denoiser::get_params() const +{ + return params_; +} + +bool Denoiser::load_kernels(Progress *progress) +{ + const Device *denoiser_device = ensure_denoiser_device(progress); + + if (!denoiser_device) { + path_trace_device_->set_error("No device available to denoise on"); + return false; + } + + VLOG(3) << "Will denoise on " << denoiser_device->info.description << " (" + << denoiser_device->info.id << ")"; + + return true; +} + +Device *Denoiser::get_denoiser_device() const +{ + return denoiser_device_; +} + +/* Check whether the given device is a single device (not a MultiDevice) and supports the requested denoiser. */ +static bool is_single_supported_device(Device *device, DenoiserType type) +{ + if (device->info.type == DEVICE_MULTI) { + /* Assume a multi-device is never created with a single sub-device. + * If one requests such a configuration it should be checked on the session level. */ + return false; + } + + if (!device->info.multi_devices.empty()) { + /* Some configurations will use multi_devices, but keep the type of an individual device. + * This does simplify checks for homogeneous setups, but here we really need a single device. */ + return false; + } + + /* Check the denoiser type is supported. */ + return (device->info.denoisers & type); +} + +/* Find the best suitable device to perform denoising on. Will iterate over the possible + * sub-devices of a multi-device. + * + * If there is no device available which supports the given denoiser type, nullptr is returned. */ +static Device *find_best_device(Device *device, DenoiserType type) +{ + Device *best_device = nullptr; + + device->foreach_device([&](Device *sub_device) { + if ((sub_device->info.denoisers & type) == 0) { + return; + } + if (!best_device) { + best_device = sub_device; + } + else { + /* TODO(sergey): Choose the fastest device from the available ones, taking into account + * device performance and data transfer cost. 
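+ * For now the first capable sub-device encountered is used, and the rest are ignored. 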
*/ + } + }); + + return best_device; +} + +static unique_ptr<Device> create_denoiser_device(Device *path_trace_device, + const uint device_type_mask) +{ + const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask); + if (device_infos.empty()) { + return nullptr; + } + + /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on + * a physical CUDA device which is already used for rendering. */ + + /* TODO(sergey): Choose the fastest device for denoising. */ + + const DeviceInfo denoiser_device_info = device_infos.front(); + + unique_ptr<Device> denoiser_device( + Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler)); + + if (!denoiser_device) { + return nullptr; + } + + if (denoiser_device->have_error()) { + return nullptr; + } + + /* Only the denoising feature is needed, everything else is unused. */ + if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) { + return nullptr; + } + + return denoiser_device; +} + +Device *Denoiser::ensure_denoiser_device(Progress *progress) +{ + /* The best device has been found already, avoid sequential lookups. + * Additionally, avoid device re-creation if it has failed once. */ + if (denoiser_device_ || device_creation_attempted_) { + return denoiser_device_; + } + + /* Simple case: rendering happens on a single device which also supports the denoiser. */ + if (is_single_supported_device(path_trace_device_, params_.type)) { + denoiser_device_ = path_trace_device_; + return denoiser_device_; + } + + /* Find the best device among the ones which are already used for rendering. */ + denoiser_device_ = find_best_device(path_trace_device_, params_.type); + if (denoiser_device_) { + return denoiser_device_; + } + + if (progress) { + progress->set_status("Loading denoising kernels (may take a few minutes the first time)"); + } + + device_creation_attempted_ = true; + + const uint device_type_mask = get_device_type_mask(); + local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask); + denoiser_device_ = local_denoiser_device_.get(); + + return denoiser_device_; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h new file mode 100644 index 00000000000..3101b45e31b --- /dev/null +++ b/intern/cycles/integrator/denoiser.h @@ -0,0 +1,135 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files + * around if a better place is figured out. */ + +#include "device/device.h" +#include "device/device_denoise.h" +#include "util/util_function.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; +class Device; +class RenderBuffers; +class Progress; + +/* Implementation of a specific denoising algorithm. 
+ * + * This class takes care of breaking down the denoising algorithm into a series of device calls or + * calls to an external API to denoise the given input. + * + * TODO(sergey): Are we better off with a device or a queue here? */ +class Denoiser { + public: + /* Create a denoiser for the given path trace device. + * + * Notes: + * - The denoiser must be configured. This means that `params.use` must be true. + * This is checked in debug builds. + * - The device might be MultiDevice. */ + static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params); + + virtual ~Denoiser() = default; + + void set_params(const DenoiseParams &params); + const DenoiseParams &get_params() const; + + /* Create devices and load kernels needed for denoising. + * The progress is used to communicate state when kernels actually need to be loaded. + * + * NOTE: The `progress` is an optional argument, can be nullptr. */ + virtual bool load_kernels(Progress *progress); + + /* Denoise the entire buffer. + * + * Buffer parameters denote the effective parameters used during rendering. It could be + * a lower resolution render into a bigger allocated buffer, which is used in the viewport + * during navigation with a non-unit pixel size. Use that instead of render_buffers->params. + * + * The buffer might be coming from a "foreign" device, different from the one this denoiser was + * created for. This means that in the general case the denoiser will make sure the input data + * is available on the denoiser device, perform denoising, and put the data back on the device + * where the buffer came from. + * + * The `num_samples` corresponds to the number of samples in the render buffers. It is used + * to scale buffers down to the "final" value in algorithms which don't do automatic exposure, + * or which need the "final" value for data passes. + * + * The `allow_inplace_modification` means that the denoiser is allowed to do in-place + * modification of the input passes (e.g. scaling them down). This lowers the memory + * footprint of the denoiser but makes the input passes "invalid" from the path tracer's point + * of view. + * + * Returns true when all passes are denoised. Will return false if there is a denoiser error + * (for example, caused by a misconfigured denoiser) or when the user requested to cancel + * rendering. */ + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) = 0; + + /* Get the device which is used to perform the actual denoising. + * + * Notes: + * + * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then. + * + * - The device can be different from the path tracing device. This happens, for example, when + * using the OptiX denoiser and rendering on the CPU. + * + * - No thread safety is ensured in this call. This means that it is up to the caller to ensure + * that there is no threading conflict between the denoising task lazily initializing the + * device and other access to this device. */ + Device *get_denoiser_device() const; + + function<bool(void)> is_cancelled_cb; + + bool is_cancelled() const + { + if (!is_cancelled_cb) { + return false; + } + return is_cancelled_cb(); + } + + protected: + Denoiser(Device *path_trace_device, const DenoiseParams &params); + + /* Make sure the denoising device is initialized. */ + virtual Device *ensure_denoiser_device(Progress *progress); + + /* Get the device type mask which is used to filter available devices when a new device needs + * to be created. 
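+ * For example, in this patch the OpenImageDenoise denoiser returns DEVICE_MASK_CPU and the + * OptiX denoiser returns DEVICE_MASK_OPTIX. 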
*/ + virtual uint get_device_type_mask() const = 0; + + Device *path_trace_device_; + DenoiseParams params_; + + /* Cached pointer to the device on which denoising will happen. + * Used to avoid lookup of a device for every denoising request. */ + Device *denoiser_device_ = nullptr; + + /* Denoiser device which was created to perform denoising in case none of the rendering + * devices is capable of denoising. */ + unique_ptr<Device> local_denoiser_device_; + bool device_creation_attempted_ = false; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp new file mode 100644 index 00000000000..8088cfd7800 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.cpp @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser_device.h" + +#include "device/device.h" +#include "device/device_denoise.h" +#include "device/device_memory.h" +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params) + : Denoiser(path_trace_device, params) +{ +} + +DeviceDenoiser::~DeviceDenoiser() +{ + /* Explicit implementation, to allow forward declaration of Device in the header. */ +} + +bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + Device *denoiser_device = get_denoiser_device(); + if (!denoiser_device) { + return false; + } + + DeviceDenoiseTask task; + task.params = params_; + task.num_samples = num_samples; + task.buffer_params = buffer_params; + task.allow_inplace_modification = allow_inplace_modification; + + RenderBuffers local_render_buffers(denoiser_device); + bool local_buffer_used = false; + + if (denoiser_device == render_buffers->buffer.device) { + /* The device can access an existing buffer pointer. */ + local_buffer_used = false; + task.render_buffers = render_buffers; + } + else { + VLOG(3) << "Creating temporary buffer on denoiser device."; + + DeviceQueue *queue = denoiser_device->get_denoise_queue(); + + /* Create a buffer which is accessible by the device used by the denoiser. */ + + /* TODO(sergey): Optimize data transfers. For example, only copy denoising related passes, + * ignoring other light and data passes. */ + + local_buffer_used = true; + + render_buffers->copy_from_device(); + + local_render_buffers.reset(buffer_params); + + /* NOTE: The local buffer is allocated for the exact size of the effective render size, while + * the input render buffer is allocated for the lowest resolution divider possible. So it is + * important to only copy the actually needed part of the input buffer. 
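+ * This is why the memcpy below copies local_render_buffers.buffer.size() floats, and not the + * size of the input render buffer. 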
*/ + memcpy(local_render_buffers.buffer.data(), + render_buffers->buffer.data(), + sizeof(float) * local_render_buffers.buffer.size()); + + queue->copy_to_device(local_render_buffers.buffer); + + task.render_buffers = &local_render_buffers; + task.allow_inplace_modification = true; + } + + const bool denoise_result = denoiser_device->denoise_buffer(task); + + if (local_buffer_used) { + local_render_buffers.copy_from_device(); + + render_buffers_host_copy_denoised( + render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params); + + render_buffers->copy_to_device(); + } + + return denoise_result; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h new file mode 100644 index 00000000000..0fd934dba79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser, + * which is implemented as part of the driver of a specific device. + * + * This implementation makes sure the to-be-denoised buffer is available on the denoising device + * and invokes the denoising kernel via the device API. */ +class DeviceDenoiser : public Denoiser { + public: + DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params); + ~DeviceDenoiser(); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp new file mode 100644 index 00000000000..1b5a012ec87 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.cpp @@ -0,0 +1,628 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/denoiser_oidn.h" + +#include <array> + +#include "device/device.h" +#include "device/device_queue.h" +#include "integrator/pass_accessor_cpu.h" +#include "render/buffers.h" +#include "util/util_array.h" +#include "util/util_logging.h" +#include "util/util_openimagedenoise.h" + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +thread_mutex OIDNDenoiser::mutex_; + +OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : Denoiser(path_trace_device, params) +{ + DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE); + + DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform."; +} + +#ifdef WITH_OPENIMAGEDENOISE +static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/) +{ + OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr); + return !oidn_denoiser->is_cancelled(); +} +#endif + +#ifdef WITH_OPENIMAGEDENOISE + +class OIDNPass { + public: + OIDNPass() = default; + + OIDNPass(const BufferParams &buffer_params, + const char *name, + PassType type, + PassMode mode = PassMode::NOISY) + : name(name), type(type), mode(mode) + { + offset = buffer_params.get_pass_offset(type, mode); + need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + inline operator bool() const + { + return name[0] != '\0'; + } + + /* Name of an image which will be passed to the OIDN library. + * Should be one of the following: color, albedo, normal, output. + * The albedo and normal images are optional. */ + const char *name = ""; + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + int num_components = -1; + bool use_compositing = false; + bool use_denoising_albedo = true; + + /* Offset of beginning of this pass in the render buffers. */ + int offset = -1; + + /* Denotes whether the data is to be scaled down with the number of passes. + * Is required for albedo and normal passes. The color pass OIDN will perform auto-exposure, so + * scaling is not needed for the color pass unless adaptive sampling is used. + * + * NOTE: Do not scale the outout pass, as that requires to be a pointer in the original buffer. + * All the scaling on the output needed for integration with adaptive sampling will happen + * outside of generic pass handling. */ + bool need_scale = false; + + /* The content of the pass has been pre-filtered. */ + bool is_filtered = false; + + /* For the scaled passes, the data which holds values of scaled pixels. 
*/ + array<float> scaled_buffer; +}; + +class OIDNDenoiseContext { + public: + OIDNDenoiseContext(OIDNDenoiser *denoiser, + const DenoiseParams &denoise_params, + const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + const bool allow_inplace_modification) + : denoiser_(denoiser), + denoise_params_(denoise_params), + buffer_params_(buffer_params), + render_buffers_(render_buffers), + num_samples_(num_samples), + allow_inplace_modification_(allow_inplace_modification), + pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT)) + { + if (denoise_params_.use_pass_albedo) { + oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO); + } + + if (denoise_params_.use_pass_normal) { + oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL); + } + } + + bool need_denoising() const + { + if (buffer_params_.width == 0 && buffer_params_.height == 0) { + return false; + } + + return true; + } + + /* Make the guiding passes available by a sequential denoising of various passes. */ + void read_guiding_passes() + { + read_guiding_pass(oidn_albedo_pass_); + read_guiding_pass(oidn_normal_pass_); + } + + void denoise_pass(const PassType pass_type) + { + OIDNPass oidn_color_pass(buffer_params_, "color", pass_type); + if (oidn_color_pass.offset == PASS_UNUSED) { + return; + } + + if (oidn_color_pass.use_denoising_albedo) { + if (albedo_replaced_with_fake_) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + + OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED); + if (oidn_output_pass.offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass); + + oidn::DeviceRef oidn_device = oidn::newDevice(); + oidn_device.commit(); + + /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too. + */ + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_input_pass(oidn_filter, oidn_color_access_pass); + set_guiding_passes(oidn_filter, oidn_color_pass); + set_output_pass(oidn_filter, oidn_output_pass); + oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE || + denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) { + oidn_filter.set("cleanAux", true); + } + oidn_filter.commit(); + + filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_); + filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_); + + /* Filter the beauty image. */ + oidn_filter.execute(); + + /* Check for errors. 
*/ + const char *error_message; + const oidn::Error error = oidn_device.getError(error_message); + if (error != oidn::Error::None && error != oidn::Error::Cancelled) { + LOG(ERROR) << "OpenImageDenoise error: " << error_message; + } + + postprocess_output(oidn_color_pass, oidn_output_pass); + } + + protected: + void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass) + { + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass || + oidn_pass.is_filtered) { + return; + } + + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_pass(oidn_filter, oidn_pass); + set_output_pass(oidn_filter, oidn_pass); + oidn_filter.commit(); + oidn_filter.execute(); + + oidn_pass.is_filtered = true; + } + + /* Make pixels of a guiding pass available to the denoiser. */ + void read_guiding_pass(OIDNPass &oidn_pass) + { + if (!oidn_pass) { + return; + } + + DCHECK(!oidn_pass.use_compositing); + + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE && + !is_pass_scale_needed(oidn_pass)) { + /* Pass data is available as-is from the render buffers. */ + return; + } + + if (allow_inplace_modification_) { + scale_pass_in_render_buffers(oidn_pass); + return; + } + + read_pass_pixels_into_buffer(oidn_pass); + } + + /* Special reader of the input pass. + * To save memory it will read pixels into the output, and let the denoiser perform an + * in-place operation. */ + OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + const bool use_compositing = oidn_input_pass.use_compositing; + + /* Simple case: no compositing is involved, no scaling is needed. + * The pass pixels will be referenced as-is, without extra processing. */ + if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) { + return oidn_input_pass; + } + + float *buffer_data = render_buffers_->buffer.data(); + float *pass_data = buffer_data + oidn_output_pass.offset; + + PassAccessor::Destination destination(pass_data, 3); + destination.pixel_stride = buffer_params_.pass_stride; + + read_pass_pixels(oidn_input_pass, destination); + + OIDNPass oidn_input_pass_at_output = oidn_input_pass; + oidn_input_pass_at_output.offset = oidn_output_pass.offset; + + return oidn_input_pass_at_output; + } + + /* Read pass pixels using PassAccessor into the given destination. */ + void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination) + { + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = oidn_pass.type; + pass_access_info.mode = oidn_pass.mode; + pass_access_info.offset = oidn_pass.offset; + + /* The denoiser operates on the passes which are used to calculate the approximation, and is + * never used on the approximation itself. The latter is not even possible because OIDN does + * not support denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* OIDN will perform an auto-exposure, so it is not required to know the exact exposure + * configured by the user. What is important is to use the same exposure for read and write + * access of the pass pixels. */ + const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_); + + pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination); + } + + /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. 
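+ * The temporary buffer is RGB-only (3 floats per pixel), matching the oidn::Format::Float3 + * layout used when binding images to the filter. 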
*/ + void read_pass_pixels_into_buffer(OIDNPass &oidn_pass) + { + VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " (" + << pass_type_as_string(oidn_pass.type) << ")"; + + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + array<float> &scaled_buffer = oidn_pass.scaled_buffer; + scaled_buffer.resize(width * height * 3); + + const PassAccessor::Destination destination(scaled_buffer.data(), 3); + + read_pass_pixels(oidn_pass, destination); + } + + /* Set OIDN image to reference pixels from the given render buffer pass. + * No transform to the pixels is done, no additional memory is used. */ + void set_pass_referenced(oidn::FilterRef &oidn_filter, + const char *name, + const OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + + const int64_t pixel_index = offset + x + y * stride; + const int64_t buffer_offset = pixel_index * pass_stride; + + float *buffer_data = render_buffers_->buffer.data(); + + oidn_filter.setImage(name, + buffer_data + buffer_offset + oidn_pass.offset, + oidn::Format::Float3, + width, + height, + 0, + pass_stride * sizeof(float), + stride * pass_stride * sizeof(float)); + } + + void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + oidn_filter.setImage( + name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0); + } + + void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, oidn_pass.name, oidn_pass); + } + void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass) + { + if (oidn_pass.scaled_buffer.empty()) { + set_pass_referenced(oidn_filter, name, oidn_pass); + } + else { + set_pass_from_buffer(oidn_filter, name, oidn_pass); + } + } + + void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass); + } + + void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + if (oidn_albedo_pass_) { + if (oidn_pass.use_denoising_albedo) { + set_pass(oidn_filter, oidn_albedo_pass_); + } + else { + /* NOTE: OpenImageDenoise library implicitly expects albedo pass when normal pass has been + * provided. */ + set_fake_albedo_pass(oidn_filter); + } + } + + if (oidn_normal_pass_) { + set_pass(oidn_filter, oidn_normal_pass_); + } + } + + void set_fake_albedo_pass(oidn::FilterRef &oidn_filter) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + if (!albedo_replaced_with_fake_) { + const int64_t num_pixel_components = width * height * 3; + oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components); + + for (int i = 0; i < num_pixel_components; ++i) { + oidn_albedo_pass_.scaled_buffer[i] = 0.5f; + } + + albedo_replaced_with_fake_ = true; + } + + set_pass(oidn_filter, oidn_albedo_pass_); + } + + void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, "output", oidn_pass); + } + + /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel + * back. 
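+ * With adaptive sampling the per-pixel scale is the per-pixel sample count (read via + * __float_as_uint() from the sample count pass); otherwise the constant num_samples_ is used. 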
*/ + void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components); + + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing; + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *denoised_pixel = buffer_pixel + oidn_output_pass.offset; + + if (need_scale) { + const float pixel_scale = has_pass_sample_count ? + __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_; + + denoised_pixel[0] = denoised_pixel[0] * pixel_scale; + denoised_pixel[1] = denoised_pixel[1] * pixel_scale; + denoised_pixel[2] = denoised_pixel[2] * pixel_scale; + } + + if (oidn_output_pass.num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!oidn_input_pass.use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). We implicitly rely on this, as it + * simplifies the logic and avoids extra memory allocation. */ + const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning zero since this is the default alpha value for 3-component passes, and it + * is an opaque pixel for 4-component passes. */ + denoised_pixel[3] = 0; + } + } + } + } + + bool is_pass_scale_needed(OIDNPass &oidn_pass) const + { + if (pass_sample_count_ != PASS_UNUSED) { + /* With adaptive sampling pixels will have a different number of samples in them, so the + * pass always needs to be scaled to make pixels uniformly sampled. */ + return true; + } + + if (!oidn_pass.need_scale) { + return false; + } + + if (num_samples_ == 1) { + /* Avoid scaling if there is only one sample, to save time (so we don't divide the + * buffer by 1). 
*/ + return false; + } + + return true; + } + + void scale_pass_in_render_buffers(OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *pass_pixel = buffer_pixel + oidn_pass.offset; + + const float pixel_scale = 1.0f / (has_pass_sample_count ? + __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_); + + pass_pixel[0] = pass_pixel[0] * pixel_scale; + pass_pixel[1] = pass_pixel[1] * pixel_scale; + pass_pixel[2] = pass_pixel[2] * pixel_scale; + } + } + } + + OIDNDenoiser *denoiser_ = nullptr; + + const DenoiseParams &denoise_params_; + const BufferParams &buffer_params_; + RenderBuffers *render_buffers_ = nullptr; + int num_samples_ = 0; + bool allow_inplace_modification_ = false; + int pass_sample_count_ = PASS_UNUSED; + + /* Optional albedo and normal passes, reused by denoising of different pass types. */ + OIDNPass oidn_albedo_pass_; + OIDNPass oidn_normal_pass_; + + /* For passes which don't need the albedo channel for denoising, we replace the actual albedo + * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values, and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake_ = false; +}; +#endif + +static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers) +{ + Device *device = render_buffers->buffer.device; + if (device->info.has_gpu_queue) { + return device->gpu_queue_create(); + } + return nullptr; +} + +static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_from_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_from_device(); + } +} + +static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_to_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_to_device(); + } +} + +bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + thread_scoped_lock lock(mutex_); + + /* Make sure the host-side data is available for denoising. */ + unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers); + copy_render_buffers_from_device(queue, render_buffers); + +#ifdef WITH_OPENIMAGEDENOISE + OIDNDenoiseContext context( + this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification); + + if (context.need_denoising()) { + context.read_guiding_passes(); + + const std::array<PassType, 3> passes = { + {/* Passes which will use the real albedo when it is available. 
*/ + PASS_COMBINED, + PASS_SHADOW_CATCHER_MATTE, + + /* Passes which do not need albedo, and hence, if the real one is present, it needs to + * become fake. */ + PASS_SHADOW_CATCHER}}; + + for (const PassType pass_type : passes) { + context.denoise_pass(pass_type); + if (is_cancelled()) { + return false; + } + } + + /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code + * copies data from the device it doesn't overwrite the denoiser buffers. */ + copy_render_buffers_to_device(queue, render_buffers); + } +#endif + + /* This code is not supposed to run when compiled without OIDN support, so we can assume that + * if we made it here, all passes are properly denoised. */ + return true; +} + +uint OIDNDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_CPU; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h new file mode 100644 index 00000000000..566e761ae79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_thread.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Implementation of the denoising API which uses the OpenImageDenoise library. */ +class OIDNDenoiser : public Denoiser { + public: + /* Forward-declared state which might use compile-flag-specific fields, such as + * OpenImageDenoise device and filter handles. */ + class State; + + OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; + + protected: + virtual uint get_device_type_mask() const override; + + /* We only perform one denoising operation at a time, since OpenImageDenoise itself is + * multithreaded. Use this mutex whenever images are passed to OIDN and need to be denoised. */ + static thread_mutex mutex_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_optix.cpp b/intern/cycles/integrator/denoiser_optix.cpp new file mode 100644 index 00000000000..5f9de23bfe6 --- /dev/null +++ b/intern/cycles/integrator/denoiser_optix.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/denoiser_optix.h" + +#include "device/device.h" +#include "device/device_denoise.h" + +CCL_NAMESPACE_BEGIN + +OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : DeviceDenoiser(path_trace_device, params) +{ +} + +uint OptiXDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_OPTIX; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_optix.h b/intern/cycles/integrator/denoiser_optix.h new file mode 100644 index 00000000000..a8df770ecf7 --- /dev/null +++ b/intern/cycles/integrator/denoiser_optix.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser_device.h" + +CCL_NAMESPACE_BEGIN + +class OptiXDenoiser : public DeviceDenoiser { + public: + OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + + protected: + virtual uint get_device_type_mask() const override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp new file mode 100644 index 00000000000..87c048b1fa5 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.cpp @@ -0,0 +1,318 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor.h" + +#include "render/buffers.h" +#include "util/util_logging.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/kernel_types.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Pass input information. + */ + +PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass) + : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset) +{ +} + +/* -------------------------------------------------------------------- + * Pass destination. + */ + +PassAccessor::Destination::Destination(float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels) + : Destination(pass_type) +{ + pixels_half_rgba = pixels; +} + +PassAccessor::Destination::Destination(const PassType pass_type) +{ + const PassInfo pass_info = Pass::get_info(pass_type); + num_components = pass_info.num_components; +} + +/* -------------------------------------------------------------------- + * Pass source. 
+ */ + +PassAccessor::Source::Source(const float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +/* -------------------------------------------------------------------- + * Pass accessor. + */ + +PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples) + : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples) +{ +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + return get_render_tile_pixels(render_buffers, render_buffers->params, destination); +} + +static void pad_pixels(const BufferParams &buffer_params, + const PassAccessor::Destination &destination, + const int src_num_components) +{ + /* When requesting a single channel pass as RGBA, or RGB pass as RGBA, + * fill in the additional components for convenience. */ + const int dest_num_components = destination.num_components; + + if (src_num_components >= dest_num_components) { + return; + } + + const size_t size = buffer_params.width * buffer_params.height; + if (destination.pixels) { + float *pixel = destination.pixels; + + for (size_t i = 0; i < size; i++, pixel += dest_num_components) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[1] = pixel[0]; + pixel[2] = pixel[0]; + } + if (dest_num_components >= 4) { + pixel[3] = 1.0f; + } + } + } + + if (destination.pixels_half_rgba) { + const half one = float_to_half(1.0f); + half4 *pixel = destination.pixels_half_rgba; + + for (size_t i = 0; i < size; i++, pixel++) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[0].y = pixel[0].x; + pixel[0].z = pixel[0].x; + } + if (dest_num_components >= 4) { + pixel[0].w = one; + } + } + } +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + if (pass_access_info_.offset == PASS_UNUSED) { + return false; + } + + const PassType type = pass_access_info_.type; + const PassMode mode = pass_access_info_.mode; + const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo); + + if (pass_info.num_components == 1) { + /* Single channel passes. */ + if (mode == PassMode::DENOISED) { + /* Denoised passes store their final pixels, no need in special calculation. */ + get_pass_float(render_buffers, buffer_params, destination); + } + else if (type == PASS_RENDER_TIME) { + /* TODO(sergey): Needs implementation. */ + } + else if (type == PASS_DEPTH) { + get_pass_depth(render_buffers, buffer_params, destination); + } + else if (type == PASS_MIST) { + get_pass_mist(render_buffers, buffer_params, destination); + } + else if (type == PASS_SAMPLE_COUNT) { + get_pass_sample_count(render_buffers, buffer_params, destination); + } + else { + get_pass_float(render_buffers, buffer_params, destination); + } + } + else if (type == PASS_MOTION) { + /* Motion pass. */ + DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components"; + get_pass_motion(render_buffers, buffer_params, destination); + } + else if (type == PASS_CRYPTOMATTE) { + /* Cryptomatte pass. 
*/ + DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components"; + get_pass_cryptomatte(render_buffers, buffer_params, destination); + } + else { + /* RGB, RGBA and vector passes. */ + DCHECK(destination.num_components == 3 || destination.num_components == 4) + << pass_type_as_string(type) << " pass must have 3 or 4 components"; + + if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) { + /* Denoised matte with shadow needs to do calculation (will use denoised shadow catcher pass + * to approximate shadow with). */ + get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination); + } + else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) { + /* Shadow catcher pass. */ + get_pass_shadow_catcher(render_buffers, buffer_params, destination); + } + else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE || + pass_info.indirect_type != PASS_NONE) && + mode != PassMode::DENOISED) { + /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */ + get_pass_light_path(render_buffers, buffer_params, destination); + } + else { + /* Passes that need no special computation, or denoised passes that already + * had the computation done. */ + if (pass_info.num_components == 3) { + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (pass_info.num_components == 4) { + if (destination.num_components == 3) { + /* Special case for denoiser access of RGBA passes ignoring alpha channel. */ + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER || + type == PASS_SHADOW_CATCHER_MATTE) { + /* Passes with transparency as 4th component. */ + get_pass_combined(render_buffers, buffer_params, destination); + } + else { + /* Passes with alpha as 4th component. */ + get_pass_float4(render_buffers, buffer_params, destination); + } + } + } + } + + pad_pixels(buffer_params, destination, pass_info.num_components); + + return true; +} + +void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const +{ + const PassMode mode = pass_access_info_.mode; + const PassInfo &pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + kfilm_convert->pass_offset = pass_access_info_.offset; + kfilm_convert->pass_stride = buffer_params.pass_stride; + + kfilm_convert->pass_use_exposure = pass_info.use_exposure; + kfilm_convert->pass_use_filter = pass_info.use_filter; + + /* TODO(sergey): Some of the passes needs to become denoised when denoised pass is accessed. 
*/ + if (pass_info.direct_type != PASS_NONE) { + kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type); + } + kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type); + kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type); + + kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED); + kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset( + PASS_ADAPTIVE_AUX_BUFFER); + kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT); + kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode); + kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_SAMPLE_COUNT); + kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_MATTE, mode); + + /* Background is not denoised, so always use noisy pass. */ + kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND); + + if (pass_info.use_filter) { + kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f; + } + else { + kfilm_convert->scale = 1.0f; + } + + if (pass_info.use_exposure) { + kfilm_convert->exposure = exposure_; + } + else { + kfilm_convert->exposure = 1.0f; + } + + kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure; + + kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher; + kfilm_convert->use_approximate_shadow_catcher_background = + pass_access_info_.use_approximate_shadow_catcher_background; + kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels; + + kfilm_convert->num_components = destination.num_components; + kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride : + destination.num_components; + + kfilm_convert->is_denoised = (mode == PassMode::DENOISED); +} + +bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source) +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + const PassInfo pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + const BufferParams &buffer_params = render_buffers->params; + + float *buffer_data = render_buffers->buffer.data(); + const int size = buffer_params.width * buffer_params.height; + + const int out_stride = buffer_params.pass_stride; + const int in_stride = source.num_components; + const int num_components_to_copy = min(source.num_components, pass_info.num_components); + + float *out = buffer_data + pass_access_info_.offset; + const float *in = source.pixels + source.offset * in_stride; + + for (int i = 0; i < size; i++, out += out_stride, in += in_stride) { + memcpy(out, in, sizeof(float) * num_components_to_copy); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h new file mode 100644 index 00000000000..624bf7d0b2c --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.h @@ -0,0 +1,160 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which provides access to pass data.
+ * It is designed to be created once when the pass data is known; pixels then get progressively
+ * updated from various render buffers. */
+class PassAccessor {
+ public:
+  class PassAccessInfo {
+   public:
+    PassAccessInfo() = default;
+    explicit PassAccessInfo(const BufferPass &pass);
+
+    PassType type = PASS_NONE;
+    PassMode mode = PassMode::NOISY;
+    bool include_albedo = false;
+    int offset = -1;
+
+    /* For the shadow catcher matte pass: whether to approximate shadow catcher pass into its
+     * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop.
+     */
+    bool use_approximate_shadow_catcher = false;
+
+    /* When the approximate shadow catcher matte is used, alpha-over the result on top of the
+     * background. */
+    bool use_approximate_shadow_catcher_background = false;
+
+    bool show_active_pixels = false;
+  };
+
+  class Destination {
+   public:
+    Destination() = default;
+    Destination(float *pixels, int num_components);
+    Destination(const PassType pass_type, half4 *pixels);
+
+    /* Destination will be initialized with the number of components which is native for the given
+     * pass type. */
+    explicit Destination(const PassType pass_type);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+    float *pixels = nullptr;
+    half4 *pixels_half_rgba = nullptr;
+
+    /* Device-side pointers. */
+    device_ptr d_pixels = 0;
+    device_ptr d_pixels_half_rgba = 0;
+
+    /* Number of components per pixel in the floating-point destination.
+     * It is ignored for the half4 destination (where the number of components is implied to be
+     * 4). */
+    int num_components = 0;
+
+    /* Offset in pixels from the beginning of pixels storage.
+     * Allows writing pixels of the render buffer into a partial slice of the destination. */
+    int offset = 0;
+
+    /* Number of floats per pixel. When zero, it is the same as `num_components`.
+     *
+     * NOTE: Is ignored for the half4 destination, as the half4 pixels are always 4-component
+     * half-floats. */
+    int pixel_stride = 0;
+
+    /* Row stride in pixel elements:
+     * - For the float destination the stride is the number of floats per row.
+     * - For the half4 destination the stride is the number of half4 elements per row. */
+    int stride = 0;
+  };
+
+  class Source {
+   public:
+    Source() = default;
+    Source(const float *pixels, int num_components);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+    const float *pixels = nullptr;
+    int num_components = 0;
+
+    /* Offset in pixels from the beginning of pixels storage.
+     * Allows reading pixels from a partial slice of the source storage.
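+     *
+     * For illustration, a sketch (using the names from this header) of how
+     * `set_render_tile_pixels()` resolves the read pointer for the first copied pixel:
+     *
+     *   const float *in = pixels + offset * num_components;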
*/ + int offset = 0; + }; + + PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples); + + virtual ~PassAccessor() = default; + + /* Get pass data from the given render buffers, perform needed filtering, and store result into + * the pixels. + * The result is stored sequentially starting from the very beginning of the pixels memory. */ + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const; + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + /* Set pass data for the given render buffers. Used for baking to read from passes. */ + bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source); + + protected: + virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const = 0; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR + + PassAccessInfo pass_access_info_; + + float exposure_ = 0.0f; + int num_samples_ = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp new file mode 100644 index 00000000000..3c6691f6d43 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -0,0 +1,183 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor_cpu.h" + +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_film.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Kernel processing. 
+ */ + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + if (destination.pixels) { + /* NOTE: No overlays are applied since they are not used for final renders. + * Can be supported via some sort of specialization to avoid code duplication. */ + + run_get_pass_kernel_processor_float( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + + if (destination.pixels_half_rgba) { + /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */ + + if (destination.num_components == 1) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float pixel; + processor(kfilm_convert, buffer, &pixel); + + pixel_rgba[0] = pixel; + pixel_rgba[1] = pixel; + pixel_rgba[2] = pixel; + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 3) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 4) { + run_get_pass_kernel_processor_half_rgba( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + } +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_float( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + const float *buffer_data = render_buffers->buffer.data(); + const int pixel_stride = destination.pixel_stride ? destination.pixel_stride : + destination.num_components; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride; + + processor(kfilm_convert, buffer, pixel); + } + }); +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + const float *buffer_data = render_buffers->buffer.data(); + + half4 *dst_start = destination.pixels_half_rgba + destination.offset; + const int destination_stride = destination.stride != 0 ? 
destination.stride : + buffer_params.width; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + half4 *dst_row_start = dst_start + y * destination_stride; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + half4 *pixel_half_rgba = dst_row_start + x; + float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + } + }); +} + +/* -------------------------------------------------------------------- + * Pass accessors. + */ + +#define DEFINE_PASS_ACCESSOR(pass) \ + void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_get_pass_kernel_processor( \ + render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth) +DEFINE_PASS_ACCESSOR(mist) +DEFINE_PASS_ACCESSOR(sample_count) +DEFINE_PASS_ACCESSOR(float) + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path) +DEFINE_PASS_ACCESSOR(shadow_catcher) +DEFINE_PASS_ACCESSOR(float3) + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion) +DEFINE_PASS_ACCESSOR(cryptomatte) +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) +DEFINE_PASS_ACCESSOR(combined) +DEFINE_PASS_ACCESSOR(float4) + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h new file mode 100644 index 00000000000..0313dc5bb0d --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/pass_accessor.h" + +CCL_NAMESPACE_BEGIN + +struct KernelFilmConvert; + +/* Pass accessor implementation for CPU side. 
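+ *
+ * A hypothetical usage sketch (names and values are illustrative, using the declarations from
+ * pass_accessor.h):
+ *
+ *   PassAccessor::PassAccessInfo access_info;
+ *   access_info.type = PASS_COMBINED;
+ *
+ *   PassAccessorCPU pass_accessor(access_info, 1.0f, 128);
+ *
+ *   PassAccessor::Destination destination(pixels, 4);
+ *   pass_accessor.get_render_tile_pixels(render_buffers, destination);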
*/ +class PassAccessorCPU : public PassAccessor { + public: + using PassAccessor::PassAccessor; + + protected: + template<typename Processor> + inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp new file mode 100644 index 00000000000..eb80ba99655 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.cpp @@ -0,0 +1,118 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor_gpu.h" + +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples) + : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue) + +{ +} + +/* -------------------------------------------------------------------- + * Kernel execution. + */ + +void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + const int work_size = buffer_params.width * buffer_params.height; + + const int destination_stride = destination.stride != 0 ? 
destination.stride : + buffer_params.width; + + if (destination.d_pixels) { + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel, work_size, args); + } + if (destination.d_pixels_half_rgba) { + const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1); + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels_half_rgba), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel_half_float, work_size, args); + } + + queue_->synchronize(); +} + +/* -------------------------------------------------------------------- + * Pass accessors. + */ + +#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \ + void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_film_convert_kernels( \ + DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth, DEPTH); +DEFINE_PASS_ACCESSOR(mist, MIST); +DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT); +DEFINE_PASS_ACCESSOR(float, FLOAT); + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH); +DEFINE_PASS_ACCESSOR(float3, FLOAT3); + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion, MOTION); +DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE); +DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER); +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW); +DEFINE_PASS_ACCESSOR(combined, COMBINED); +DEFINE_PASS_ACCESSOR(float4, FLOAT4); + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h new file mode 100644 index 00000000000..bc37e4387f3 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/pass_accessor.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +/* Pass accessor implementation for GPU side. 
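+ *
+ * A sketch of the difference from the CPU accessor (illustrative only): the constructor
+ * additionally takes the device queue which the film convert kernels are enqueued to:
+ *
+ *   PassAccessorGPU pass_accessor(queue, access_info, 1.0f, 128);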
*/ +class PassAccessorGPU : public PassAccessor { + public: + PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples); + + protected: + void run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth); + DECLARE_PASS_ACCESSOR(mist); + DECLARE_PASS_ACCESSOR(sample_count); + DECLARE_PASS_ACCESSOR(float); + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path); + DECLARE_PASS_ACCESSOR(float3); + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion); + DECLARE_PASS_ACCESSOR(cryptomatte); + DECLARE_PASS_ACCESSOR(shadow_catcher); + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow); + DECLARE_PASS_ACCESSOR(combined); + DECLARE_PASS_ACCESSOR(float4); + +#undef DECLARE_PASS_ACCESSOR + + DeviceQueue *queue_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp new file mode 100644 index 00000000000..6c02316ac2b --- /dev/null +++ b/intern/cycles/integrator/path_trace.cpp @@ -0,0 +1,1147 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace.h" + +#include "device/cpu/device.h" +#include "device/device.h" +#include "integrator/pass_accessor.h" +#include "integrator/render_scheduler.h" +#include "render/gpu_display.h" +#include "render/pass.h" +#include "render/scene.h" +#include "render/tile.h" +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +PathTrace::PathTrace(Device *device, + Film *film, + DeviceScene *device_scene, + RenderScheduler &render_scheduler, + TileManager &tile_manager) + : device_(device), + device_scene_(device_scene), + render_scheduler_(render_scheduler), + tile_manager_(tile_manager) +{ + DCHECK_NE(device_, nullptr); + + { + vector<DeviceInfo> cpu_devices; + device_cpu_info(cpu_devices); + + cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler)); + } + + /* Create path tracing work in advance, so that it can be reused by incremental sampling as much + * as possible. 
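+   *
+   * For example (an illustration, not an exhaustive list): rendering on an OptiX device plus the
+   * CPU results in one PathTraceWork per device, each responsible for its own slice of the big
+   * tile.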
+   */
+  device_->foreach_device([&](Device *path_trace_device) {
+    path_trace_works_.emplace_back(PathTraceWork::create(
+        path_trace_device, film, device_scene, &render_cancel_.is_requested));
+  });
+
+  work_balance_infos_.resize(path_trace_works_.size());
+  work_balance_do_initial(work_balance_infos_);
+
+  render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+  /* Destroy any GPU resource which was used for graphics interop.
+   * Need to have access to the GPUDisplay as it is the only source of drawing context which is
+   * used for interop. */
+  if (gpu_display_) {
+    for (auto &&path_trace_work : path_trace_works_) {
+      path_trace_work->destroy_gpu_resources(gpu_display_.get());
+    }
+  }
+}
+
+void PathTrace::load_kernels()
+{
+  if (denoiser_) {
+    denoiser_->load_kernels(progress_);
+  }
+}
+
+void PathTrace::alloc_work_memory()
+{
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->alloc_work_memory();
+  }
+}
+
+bool PathTrace::ready_to_reset()
+{
+  /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+   * display. If there is no such display, the logic here will break. */
+  DCHECK(gpu_display_);
+
+  /* The logic here tries to provide the behavior which feels most interactive to artists.
+   * The general idea is to reset as quickly as possible, while still providing an interactive
+   * feel.
+   *
+   * If the render result was ever drawn after the previous reset, consider that a reset is now
+   * possible. This way camera navigation gives the quickest feedback of rendered pixels,
+   * regardless of whether the CPU or GPU drawing pipeline is used.
+   *
+   * Consider a reset happening after a redraw "slow" enough to not clog anything. This is a bit
+   * arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+  if (did_draw_after_reset_) {
+    return true;
+  }
+
+  return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+  if (big_tile_params_.modified(big_tile_params)) {
+    big_tile_params_ = big_tile_params;
+    render_state_.need_reset_params = true;
+  }
+
+  full_params_ = full_params;
+
+  /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
+   * It is still required to be informed about a reset whenever it happens, so that the redraw
+   * state tracking is properly updated. */
+  if (gpu_display_) {
+    gpu_display_->reset(full_params);
+  }
+
+  render_state_.has_denoised_result = false;
+  render_state_.tile_written = false;
+
+  did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+  /* Free render buffers used by the path trace work to reduce memory peak. */
+  BufferParams empty_params;
+  empty_params.pass_stride = 0;
+  empty_params.update_offset_stride();
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->get_render_buffers()->reset(empty_params);
+  }
+  render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+  progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+  /* Indicate that rendering has started and that it can be requested to cancel. */
+  {
+    thread_scoped_lock lock(render_cancel_.mutex);
+    if (render_cancel_.is_requested) {
+      return;
+    }
+    render_cancel_.is_rendering = true;
+  }
+
+  render_pipeline(render_work);
+
+  /* Indicate that rendering has finished, making it so the thread which requested `cancel()` can
+   * carry on.
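+   *
+   * A sketch of the handshake with `cancel()` (see below), for illustration:
+   *
+   *   render() thread                cancel() thread
+   *     is_rendering = true            is_requested = true
+   *     render_pipeline(...)           while (is_rendering)
+   *     is_rendering = false             condition.wait(lock)
+   *     condition.notify_one()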
+   */
+  {
+    thread_scoped_lock lock(render_cancel_.mutex);
+    render_cancel_.is_rendering = false;
+    render_cancel_.condition.notify_one();
+  }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+  /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+   * checked in Session, and in that case the current work is still to be finished here. */
+
+  render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+                                                  0);
+
+  render_init_kernel_execution();
+
+  render_scheduler_.report_work_begin(render_work);
+
+  init_render_buffers(render_work);
+
+  rebalance(render_work);
+
+  path_trace(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  adaptive_sample(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  cryptomatte_postprocess(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  denoise(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  write_tile_buffer(render_work);
+  update_display(render_work);
+
+  progress_update_if_needed(render_work);
+
+  finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->init_execution();
+  }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. Should not be a
+ * measurable performance impact at runtime, but will make compilation faster and the binary
+ * somewhat smaller. */
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+                                         const vector<WorkBalanceInfo> &work_balance_infos,
+                                         const BufferParams &buffer_params,
+                                         const Callback &callback)
+{
+  const int num_works = path_trace_works.size();
+  const int height = buffer_params.height;
+
+  int current_y = 0;
+  for (int i = 0; i < num_works; ++i) {
+    const double weight = work_balance_infos[i].weight;
+    const int slice_height = max(lround(height * weight), 1);
+
+    /* Disallow negative values to deal with situations when there are more compute devices than
+     * scanlines.
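+     *
+     * As a worked example (illustrative numbers): with height = 100 and two works weighted
+     * 0.75 and 0.25, the first slice gets max(lround(100 * 0.75), 1) = 75 scanlines and the last
+     * slice gets the remaining 25.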
+     */
+    const int remaining_height = max(0, height - current_y);
+
+    BufferParams slice_params = buffer_params;
+    slice_params.full_y = buffer_params.full_y + current_y;
+    if (i < num_works - 1) {
+      slice_params.height = min(slice_height, remaining_height);
+    }
+    else {
+      slice_params.height = remaining_height;
+    }
+
+    slice_params.update_offset_stride();
+
+    callback(path_trace_works[i].get(), slice_params);
+
+    current_y += slice_params.height;
+  }
+}
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+  foreach_sliced_buffer_params(path_trace_works_,
+                               work_balance_infos_,
+                               big_tile_params_,
+                               [](PathTraceWork *path_trace_work, const BufferParams &params) {
+                                 RenderBuffers *buffers = path_trace_work->get_render_buffers();
+                                 buffers->reset(params);
+                               });
+}
+
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+  BufferParams scaled_params = params;
+
+  scaled_params.width = max(1, params.width / resolution_divider);
+  scaled_params.height = max(1, params.height / resolution_divider);
+  scaled_params.full_x = params.full_x / resolution_divider;
+  scaled_params.full_y = params.full_y / resolution_divider;
+  scaled_params.full_width = params.full_width / resolution_divider;
+  scaled_params.full_height = params.full_height / resolution_divider;
+
+  scaled_params.update_offset_stride();
+
+  return scaled_params;
+}
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+  const int resolution_divider = render_work.resolution_divider;
+
+  const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+  const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+                                                                  resolution_divider);
+
+  foreach_sliced_buffer_params(path_trace_works_,
+                               work_balance_infos_,
+                               scaled_big_tile_params,
+                               [&](PathTraceWork *path_trace_work, const BufferParams params) {
+                                 path_trace_work->set_effective_buffer_params(
+                                     scaled_full_params, scaled_big_tile_params, params);
+                               });
+
+  render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+  if (render_state_.need_reset_params) {
+    update_allocated_work_buffer_params();
+  }
+
+  if (render_state_.need_reset_params ||
+      render_state_.resolution_divider != render_work.resolution_divider) {
+    update_effective_work_buffer_params(render_work);
+  }
+
+  render_state_.resolution_divider = render_work.resolution_divider;
+  render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+  update_work_buffer_params_if_needed(render_work);
+
+  /* Handle initialization scheduled by the render scheduler. */
+  if (render_work.init_render_buffers) {
+    tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+      path_trace_work->zero_render_buffers();
+    });
+
+    tile_buffer_read();
+  }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+  if (!render_work.path_trace.num_samples) {
+    return;
+  }
+
+  VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+          << " samples at the resolution divider " << render_work.resolution_divider;
+
+  const double start_time = time_dt();
+
+  const int num_works = path_trace_works_.size();
+
+  tbb::parallel_for(0, num_works, [&](int i) {
+    const double work_start_time = time_dt();
+    const int num_samples = render_work.path_trace.num_samples;
+
+    PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+    PathTraceWork::RenderStatistics statistics;
+    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+    const double work_time = time_dt() - work_start_time;
+    work_balance_infos_[i].time_spent += work_time;
+    work_balance_infos_[i].occupancy = statistics.occupancy;
+
+    VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+            << work_time / num_samples
+            << " seconds per sample), occupancy: " << statistics.occupancy;
+  });
+
+  float occupancy_accum = 0.0f;
+  for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+    occupancy_accum += balance_info.occupancy;
+  }
+  const float occupancy = occupancy_accum / num_works;
+  render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+  render_scheduler_.report_path_trace_time(
+      render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+  if (!render_work.adaptive_sampling.filter) {
+    return;
+  }
+
+  bool did_reschedule_on_idle = false;
+
+  while (true) {
+    VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+            << render_work.adaptive_sampling.threshold;
+    if (render_work.adaptive_sampling.reset) {
+      VLOG(3) << "Will re-calculate convergence flag for currently converged pixels.";
+    }
+
+    const double start_time = time_dt();
+
+    uint num_active_pixels = 0;
+    tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+      const uint num_active_pixels_in_work =
+          path_trace_work->adaptive_sampling_converge_filter_count_active(
+              render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+      if (num_active_pixels_in_work) {
+        atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+      }
+    });
+
+    render_scheduler_.report_adaptive_filter_time(
+        render_work, time_dt() - start_time, is_cancel_requested());
+
+    if (num_active_pixels == 0) {
+      VLOG(3) << "All pixels converged.";
+      if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+        break;
+      }
+      VLOG(3) << "Continuing with lower threshold.";
+    }
+    else if (did_reschedule_on_idle) {
+      break;
+    }
+    else if (num_active_pixels < 128 * 128) {
+      /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep the GPU busy so
+       * that there is no performance loss from the progressive noise floor feature.
+       *
+       * A better heuristic is possible here: for example, use the maximum of 128^2 and a
+       * percentage of the final resolution.
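+       *
+       * Such a heuristic could look like this (a sketch, not what is implemented here):
+       *
+       *   const int min_active_pixels = max(128 * 128, width * height / 100);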
+       */
+      if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+        VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+        break;
+      }
+      VLOG(3) << "Rescheduling with a lower threshold.";
+      did_reschedule_on_idle = true;
+    }
+    else {
+      break;
+    }
+  }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+  render_scheduler_.set_denoiser_params(params);
+
+  if (!params.use) {
+    denoiser_.reset();
+    return;
+  }
+
+  if (denoiser_) {
+    const DenoiseParams old_denoiser_params = denoiser_->get_params();
+    if (old_denoiser_params.type == params.type) {
+      denoiser_->set_params(params);
+      return;
+    }
+  }
+
+  denoiser_ = Denoiser::create(device_, params);
+  denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+  render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+  if (!render_work.cryptomatte.postprocess) {
+    return;
+  }
+  VLOG(3) << "Perform cryptomatte work.";
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    path_trace_work->cryptomatte_postproces();
+  });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+  if (!render_work.tile.denoise) {
+    return;
+  }
+
+  if (!denoiser_) {
+    /* Denoiser was not configured, so nothing to do here. */
+    return;
+  }
+
+  VLOG(3) << "Perform denoising work.";
+
+  const double start_time = time_dt();
+
+  RenderBuffers *buffer_to_denoise = nullptr;
+
+  unique_ptr<RenderBuffers> multi_device_buffers;
+  bool allow_inplace_modification = false;
+
+  if (path_trace_works_.size() == 1) {
+    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  }
+  else {
+    Device *denoiser_device = denoiser_->get_denoiser_device();
+    if (!denoiser_device) {
+      return;
+    }
+
+    multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+    multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+    buffer_to_denoise = multi_device_buffers.get();
+
+    copy_to_render_buffers(multi_device_buffers.get());
+
+    allow_inplace_modification = true;
+  }
+
+  if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+                                buffer_to_denoise,
+                                get_num_samples_in_buffer(),
+                                allow_inplace_modification)) {
+    render_state_.has_denoised_result = true;
+  }
+
+  if (multi_device_buffers) {
+    multi_device_buffers->copy_from_device();
+    tbb::parallel_for_each(
+        path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+          path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+        });
+  }
+
+  render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+  gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+  if (gpu_display_) {
+    gpu_display_->clear();
+  }
+}
+
+void PathTrace::draw()
+{
+  if (!gpu_display_) {
+    return;
+  }
+
+  did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+  if (!render_work.display.update) {
+    return;
+  }
+
+  if (!gpu_display_ && !tile_buffer_update_cb) {
+    VLOG(3) << "Ignore display update.";
+    return;
+  }
+
+  if (full_params_.width == 0 || full_params_.height == 0) {
+    VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+    return;
+  }
+
+  const double start_time = time_dt();
+
+  if (tile_buffer_update_cb) {
+    VLOG(3) << "Invoke buffer update callback.";
+
+    tile_buffer_update_cb();
+  }
+
+  if (gpu_display_) {
+    VLOG(3) << "Perform copy to GPUDisplay work.";
+
+    const int resolution_divider = render_work.resolution_divider;
+    const int texture_width = max(1, full_params_.width / resolution_divider);
+    const int texture_height = max(1, full_params_.height / resolution_divider);
+    if (!gpu_display_->update_begin(texture_width, texture_height)) {
+      LOG(ERROR) << "Error beginning GPUDisplay update.";
+      return;
+    }
+
+    const PassMode pass_mode = render_work.display.use_denoised_result &&
+                                       render_state_.has_denoised_result ?
+                                   PassMode::DENOISED :
+                                   PassMode::NOISY;
+
+    /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+     * all works in parallel. */
+    const int num_samples = get_num_samples_in_buffer();
+    for (auto &&path_trace_work : path_trace_works_) {
+      path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+    }
+
+    gpu_display_->update_end();
+  }
+
+  render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+  static const int kLogLevel = 3;
+
+  if (!render_work.rebalance) {
+    return;
+  }
+
+  const int num_works = path_trace_works_.size();
+
+  if (num_works == 1) {
+    VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+    return;
+  }
+
+  const double start_time = time_dt();
+
+  if (VLOG_IS_ON(kLogLevel)) {
+    VLOG(kLogLevel) << "Perform rebalance work.";
+    VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+    for (int i = 0; i < num_works; ++i) {
+      VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+                      << work_balance_infos_[i].time_spent;
+    }
+  }
+
+  const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+  if (VLOG_IS_ON(kLogLevel)) {
+    VLOG(kLogLevel) << "Calculated per-device weights for works:";
+    for (int i = 0; i < num_works; ++i) {
+      VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+                      << work_balance_infos_[i].weight;
+    }
+  }
+
+  if (!did_rebalance) {
+    VLOG(kLogLevel) << "Balance in path trace works did not change.";
+    render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+    return;
+  }
+
+  RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+  big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+  copy_to_render_buffers(&big_tile_cpu_buffers);
+
+  render_state_.need_reset_params = true;
+  update_work_buffer_params_if_needed(render_work);
+
+  copy_from_render_buffers(&big_tile_cpu_buffers);
+
+  render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+  if (!render_work.tile.write) {
+    return;
+  }
+
+  VLOG(3) << "Write tile result.";
+
+  render_state_.tile_written = true;
+
+  const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+  /* Write render tile result, but only if not using tiled rendering.
+   *
+   * Tiles are written to a file during rendering, and written to the software at the end
+   * of rendering (either when all tiles are finished, or when rendering was requested to be
+   * cancelled).
+   *
+   * The important thing is: the tile should be written to the software via the callback only
+   * once. */
+  if (!has_multiple_tiles) {
+    VLOG(3) << "Write tile result via buffer write callback.";
+    tile_buffer_write();
+  }
+
+  /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+   */
+  if (has_multiple_tiles) {
+    VLOG(3) << "Write tile result into file on disk.";
+    tile_buffer_write_to_disk();
+  }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+  if (!render_work.full.write) {
+    return;
+  }
+
+  VLOG(3) << "Handle full-frame render buffer work.";
+
+  if (!tile_manager_.has_written_tiles()) {
+    VLOG(3) << "No tiles on disk.";
+    return;
+  }
+
+  /* Make sure writing to the file is fully finished.
+   * This will include writing all possible missing tiles, ensuring validity of the file. */
+  tile_manager_.finish_write_tiles();
+
+  /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after
+   * all scenes and layers are rendered by the Session (which happens after freeing Session memory,
+   * so that we never hold scene and full-frame buffer in memory at the same time). */
+}
+
+void PathTrace::cancel()
+{
+  thread_scoped_lock lock(render_cancel_.mutex);
+
+  render_cancel_.is_requested = true;
+
+  while (render_cancel_.is_rendering) {
+    render_cancel_.condition.wait(lock);
+  }
+
+  render_cancel_.is_requested = false;
+}
+
+int PathTrace::get_num_samples_in_buffer()
+{
+  return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+  if (render_cancel_.is_requested) {
+    return true;
+  }
+
+  if (progress_ != nullptr) {
+    if (progress_->get_cancel()) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+  if (!tile_buffer_write_cb) {
+    return;
+  }
+
+  tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+  if (!tile_buffer_read_cb) {
+    return;
+  }
+
+  if (tile_buffer_read_cb()) {
+    tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+      path_trace_work->copy_render_buffers_to_device();
+    });
+  }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+  /* Sample count pass is required to support per-tile partial results stored in the file. */
+  DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+  const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+  if (num_rendered_samples == 0) {
+    /* The tile has zero samples, no need to write it. */
+    return;
+  }
+
+  /* Get access to the CPU-side render buffers of the current big tile.
*/ + RenderBuffers *buffers; + RenderBuffers big_tile_cpu_buffers(cpu_device_.get()); + + if (path_trace_works_.size() == 1) { + path_trace_works_[0]->copy_render_buffers_from_device(); + buffers = path_trace_works_[0]->get_render_buffers(); + } + else { + big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params); + copy_to_render_buffers(&big_tile_cpu_buffers); + + buffers = &big_tile_cpu_buffers; + } + + if (!tile_manager_.write_tile(*buffers)) { + LOG(ERROR) << "Error writing tile to file."; + } +} + +void PathTrace::progress_update_if_needed(const RenderWork &render_work) +{ + if (progress_ != nullptr) { + const int2 tile_size = get_render_tile_size(); + const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples; + const int current_sample = render_work.path_trace.start_sample + + render_work.path_trace.num_samples; + progress_->add_samples(num_samples_added, current_sample); + } + + if (progress_update_cb) { + progress_update_cb(); + } +} + +void PathTrace::progress_set_status(const string &status, const string &substatus) +{ + if (progress_ != nullptr) { + progress_->set_status(status, substatus); + } +} + +void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + tbb::parallel_for_each(path_trace_works_, + [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_to_render_buffers(render_buffers); + }); + render_buffers->copy_to_device(); +} + +void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers) +{ + render_buffers->copy_from_device(); + tbb::parallel_for_each(path_trace_works_, + [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_from_render_buffers(render_buffers); + }); +} + +bool PathTrace::copy_render_tile_from_device() +{ + if (full_frame_state_.render_buffers) { + /* Full-frame buffer is always allocated on CPU. */ + return true; + } + + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->copy_render_buffers_from_device()) { + success = false; + } + }); + + return success; +} + +static string get_layer_view_name(const RenderBuffers &buffers) +{ + string result; + + if (buffers.params.layer.size()) { + result += string(buffers.params.layer); + } + + if (buffers.params.view.size()) { + if (!result.empty()) { + result += ", "; + } + result += string(buffers.params.view); + } + + return result; +} + +void PathTrace::process_full_buffer_from_disk(string_view filename) +{ + VLOG(3) << "Processing full frame buffer file " << filename; + + progress_set_status("Reading full buffer from disk"); + + RenderBuffers full_frame_buffers(cpu_device_.get()); + + DenoiseParams denoise_params; + if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) { + LOG(ERROR) << "Error reading tiles from file."; + return; + } + + const string layer_view_name = get_layer_view_name(full_frame_buffers); + + render_state_.has_denoised_result = false; + + if (denoise_params.use) { + progress_set_status(layer_view_name, "Denoising"); + + /* Re-use the denoiser as much as possible, avoiding possible device re-initialization. + * + * It will not conflict with the regular rendering as: + * - Rendering is supposed to be finished here. + * - The next rendering will go via Session's `run_update_for_next_iteration` which will + * ensure proper denoiser is used. 
 */
+    set_denoiser_params(denoise_params);
+
+    /* The number of samples doesn't matter too much, since the sample count pass will be used. */
+    denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+    render_state_.has_denoised_result = true;
+  }
+
+  full_frame_state_.render_buffers = &full_frame_buffers;
+
+  progress_set_status(layer_view_name, "Finishing");
+
+  /* Write the full result pretending that there is a single tile.
+   * Requires some state change, but allows using the same communication API with the software. */
+  tile_buffer_write();
+
+  full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+  if (full_frame_state_.render_buffers) {
+    /* If the full-frame buffer is read from disk the number of samples is not used, as there is a
+     * sample count pass for that in the buffer. Just avoid accessing the badly defined path
+     * state. */
+    return 0;
+  }
+
+  return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+                                       const PassAccessor::Destination &destination)
+{
+  if (full_frame_state_.render_buffers) {
+    return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+  }
+
+  bool success = true;
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    if (!success) {
+      return;
+    }
+    if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+      success = false;
+    }
+  });
+
+  return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+                                       const PassAccessor::Source &source)
+{
+  bool success = true;
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    if (!success) {
+      return;
+    }
+    if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+      success = false;
+    }
+  });
+
+  return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+  if (full_frame_state_.render_buffers) {
+    return make_int2(full_frame_state_.render_buffers->params.width,
+                     full_frame_state_.render_buffers->params.height);
+  }
+
+  const Tile &tile = tile_manager_.get_current_tile();
+  return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+  if (full_frame_state_.render_buffers) {
+    return make_int2(0, 0);
+  }
+
+  const Tile &tile = tile_manager_.get_current_tile();
+  return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+  if (full_frame_state_.render_buffers) {
+    return full_frame_state_.render_buffers->params;
+  }
+
+  return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+  return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+  switch (type) {
+    case DEVICE_NONE:
+      return "None";
+
+    case DEVICE_CPU:
+      return "CPU";
+    case DEVICE_CUDA:
+      return "CUDA";
+    case DEVICE_OPTIX:
+      return "OptiX";
+    case DEVICE_DUMMY:
+      return "Dummy";
+    case DEVICE_MULTI:
+      return "Multi";
+  }
+
+  return "UNKNOWN";
+}
+
+/* Construct description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making it a more reusable utility.
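+ *
+ * For illustration, with the current format the description of a hypothetical CPU device would
+ * look like:
+ *
+ *   AMD Ryzen 9 5950X (CPU) (32 threads) [CPU]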
+ */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+  string full_description = device_info.description;
+
+  full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+  if (device_info.display_device) {
+    full_description += " (display)";
+  }
+
+  if (device_info.type == DEVICE_CPU) {
+    full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+  }
+
+  full_description += " [" + device_info.id + "]";
+
+  return full_description;
+}
+
+/* Construct a string which will contain information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ *   Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ *   Message: Full First Device Description
+ *            Full Second Device Description
+ *
+ * Note that the newlines are placed so that the result can be easily concatenated into the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+  string result = "\n" + message + ": ";
+  const string pad(message.length() + 2, ' ');
+
+  if (device_info.multi_devices.empty()) {
+    result += full_device_info_description(device_info) + "\n";
+    return result;
+  }
+
+  bool is_first = true;
+  for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+    if (!is_first) {
+      result += pad;
+    }
+
+    result += full_device_info_description(sub_device_info) + "\n";
+
+    is_first = false;
+  }
+
+  return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+  DeviceInfo device_info;
+  device_info.type = DEVICE_MULTI;
+
+  for (auto &&path_trace_work : path_trace_works) {
+    device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+  }
+
+  return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+  if (!denoiser) {
+    return "";
+  }
+
+  if (!denoiser->get_params().use) {
+    return "";
+  }
+
+  const Device *denoiser_device = denoiser->get_denoiser_device();
+  if (!denoiser_device) {
+    return "";
+  }
+
+  return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+  string result = "\nFull path tracing report\n";
+
+  result += path_trace_devices_report(path_trace_works_);
+  result += denoiser_device_report(denoiser_.get());
+
+  /* Report from the render scheduler, which includes:
+   * - Render mode (interactive, offline, headless)
+   * - Adaptive sampling and denoiser parameters
+   * - Breakdown of timing. */
+  result += render_scheduler_.full_report();
+
+  return result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..78ca68c1198
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* The PathTrace class takes care of the kernel graph and scheduling on a (multi)device. It takes
+ * care of all the common steps of path tracing which are not device-specific. The list of tasks
+ * includes but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queues management.
+ * - Adaptive stopping. */
+class PathTrace {
+ public:
+  /* Render scheduler is used to report timing information and access things like start/finish
+   * sample. */
+  PathTrace(Device *device,
+            Film *film,
+            DeviceScene *device_scene,
+            RenderScheduler &render_scheduler,
+            TileManager &tile_manager);
+  ~PathTrace();
+
+  /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+   * The progress is reported to the currently configured progress object (via `set_progress`). */
+  void load_kernels();
+
+  /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+   * more accurately which scene device memory may need to be allocated on the host. */
+  void alloc_work_memory();
+
+  /* Check whether now is a good time to reset rendering.
+   * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+   * intermediate render result. */
+  bool ready_to_reset();
+
+  void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+  void device_free();
+
+  /* Set progress tracker.
+   * Used to communicate details about the progress to the outer world, and to check whether
+   * rendering is to be canceled.
+   *
+   * The path tracer writes to this object, and then at a convenient moment runs the
+   * progress_update_cb() callback. */
+  void set_progress(Progress *progress);
+
+  /* NOTE: This is a blocking call. Meaning, it will not return until the given number of samples
+   * is rendered (or until rendering is requested to be cancelled). */
+  void render(const RenderWork &render_work);
+
+  /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it
+   * is convenient to have it here because then it's easy to access the render buffer. But the
+   * downside is that this adds too many entities which could live separately behind some clear
+   * API. */
+
+  /* Set denoiser parameters.
+   * Use this to configure the denoiser before rendering any samples. */
+  void set_denoiser_params(const DenoiseParams &params);
+
+  /* Set parameters used for adaptive sampling.
+   * Use this to configure the adaptive sampler before rendering any samples. */
+  void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+  /* Set GPU display which takes care of drawing the render result. */
+  void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+  /* Clear the GPU display by filling it in with all zeroes. */
+  void clear_gpu_display();
+
+  /* Perform drawing of the current state of the GPUDisplay. */
+   */
+  void draw();
+
+  /* Cancel the rendering process as soon as possible, without waiting for the full tile to be
+   * sampled. Used in cases like reset of the render session.
+   *
+   * This is a blocking call, which returns as soon as there is no running `render_samples()`
+   * call. */
+  void cancel();
+
+  /* Copy an entire render buffer to/from the path trace. */
+
+  /* Copy happens via a CPU-side buffer: data will be copied from every device of the path trace,
+   * and the data will be copied to the device of the given render buffers. */
+  void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+  /* Copy happens via a CPU-side buffer: data will be copied from the device of the given render
+   * buffers and will be copied to all devices of the path trace. */
+  void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+  /* Copy render buffers of the big tile from the device to the host.
+   * Return true if all copies are successful. */
+  bool copy_render_tile_from_device();
+
+  /* Read the given full-frame file from disk, perform the needed processing and write it to the
+   * software via the write callback. */
+  void process_full_buffer_from_disk(string_view filename);
+
+  /* Get number of samples in the current big tile render buffers. */
+  int get_num_render_tile_samples() const;
+
+  /* Get pass data of the entire big tile.
+   * This call puts pass render result from all devices into the final pixels storage.
+   *
+   * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+   *
+   * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+  bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+                              const PassAccessor::Destination &destination);
+
+  /* Set pass data for baking. */
+  bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+  /* Check whether the denoiser was run and denoised passes are available. */
+  bool has_denoised_result() const;
+
+  /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+   * In the case of tiled rendering this will return the full frame after all tiles have been
+   * rendered.
+   *
+   * NOTE: If the full-frame buffer processing is in progress, returns parameters of the
+   * full-frame instead. */
+  int2 get_render_tile_size() const;
+  int2 get_render_tile_offset() const;
+
+  /* Get buffer parameters of the current tile.
+   *
+   * NOTE: If the full-frame buffer processing is in progress, returns parameters of the
+   * full-frame instead. */
+  const BufferParams &get_render_tile_params() const;
+
+  /* Generate a full multi-line report of the rendering process, including rendering parameters,
+   * times, and so on. */
+  string full_report() const;
+
+  /* Callback which communicates an updated state of the render buffer of the current big tile.
+   * Is called during path tracing to communicate the work-in-progress state of the final
+   * buffer. */
+  function<void(void)> tile_buffer_update_cb;
+
+  /* Callback which communicates the final rendered buffer. Is called after path tracing is
+   * done. */
+  function<void(void)> tile_buffer_write_cb;
+
+  /* Callback which initializes the rendered buffer. Is called before path tracing starts.
+   *
+   * This is used for baking. */
+  function<bool(void)> tile_buffer_read_cb;
+
+  /* Callback which is called to report current rendering progress.
+   *
+   * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+   * Additionally, it might be called from the middle of a wavefront (meaning, it is not
+   * guaranteed that the buffer is "uniformly" sampled at the moment of this callback). */
+  function<void(void)> progress_update_cb;
+
+ protected:
+  /* Actual implementation of the rendering pipeline.
+   * Calls steps in order, checking in between whether cancellation has been requested.
+   *
+   * Kept separate from `render()` to simplify dealing with early exits and keeping
+   * `render_cancel_` in a consistent state. */
+  void render_pipeline(RenderWork render_work);
+
+  /* Initialize kernel execution on all integrator queues. */
+  void render_init_kernel_execution();
+
+  /* Make sure both allocated and effective buffer parameters of the path tracer works are up to
+   * date with the current big tile parameters, performance-dependent slicing, and resolution
+   * divider. */
+  void update_work_buffer_params_if_needed(const RenderWork &render_work);
+  void update_allocated_work_buffer_params();
+  void update_effective_work_buffer_params(const RenderWork &render_work);
+
+  /* Perform various steps of the render work.
+   *
+   * Note that some steps might modify the work, forcing some steps to happen within this
+   * iteration of rendering. */
+  void init_render_buffers(const RenderWork &render_work);
+  void path_trace(RenderWork &render_work);
+  void adaptive_sample(RenderWork &render_work);
+  void denoise(const RenderWork &render_work);
+  void cryptomatte_postprocess(const RenderWork &render_work);
+  void update_display(const RenderWork &render_work);
+  void rebalance(const RenderWork &render_work);
+  void write_tile_buffer(const RenderWork &render_work);
+  void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+  /* Get number of samples in the current state of the render buffers. */
+  int get_num_samples_in_buffer();
+
+  /* Check whether the user requested to cancel rendering, in which case path tracing is to be
+   * finished as soon as possible. */
+  bool is_cancel_requested();
+
+  /* Write the big tile render buffer via the write callback. */
+  void tile_buffer_write();
+
+  /* Read the big tile render buffer via the read callback. */
+  void tile_buffer_read();
+
+  /* Write the current tile into the file on disk. */
+  void tile_buffer_write_to_disk();
+
+  /* Run the progress_update_cb callback if it is needed. */
+  void progress_update_if_needed(const RenderWork &render_work);
+
+  void progress_set_status(const string &status, const string &substatus = "");
+
+  /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+   * are configured this is a `MultiDevice`. */
+  Device *device_ = nullptr;
+
+  /* CPU device for creating temporary render buffers on the CPU side. */
+  unique_ptr<Device> cpu_device_;
+
+  DeviceScene *device_scene_;
+
+  RenderScheduler &render_scheduler_;
+  TileManager &tile_manager_;
+
+  unique_ptr<GPUDisplay> gpu_display_;
+
+  /* Per-compute-device descriptors of the work which is responsible for path tracing on its
+   * configured device. */
+  vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+  /* Per-path trace work information needed for multi-device balancing. */
+  vector<WorkBalanceInfo> work_balance_infos_;
+
+  /* Render buffer parameters of the full frame and current big tile. */
+  BufferParams full_params_;
+  BufferParams big_tile_params_;
+
+  /* Denoiser which takes care of denoising the big tile. */
+  unique_ptr<Denoiser> denoiser_;
+
+  /* State which is common for all the steps of the render work.
+   * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+   * rendering the work. */
+  struct {
+    /* Denotes whether render buffer parameters of the path trace works are to be reset for the
+     * new value of the big tile parameters. */
+    bool need_reset_params = false;
+
+    /* Divider of the resolution for faster previews.
+     *
+     * Allows reusing the same render buffer while rendering fewer pixels into it. The render
+     * buffer in this case can be thought of as an over-allocated array: the resolution divider
+     * affects both resolution and stride as visible by the integrator kernels. */
+    int resolution_divider = 0;
+
+    /* Parameters of the big tile with the current resolution divider applied. */
+    BufferParams effective_big_tile_params;
+
+    /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
+    bool has_denoised_result = false;
+
+    /* Current tile has been written (to either disk or a callback).
+     * Indicates that no more work will be done on this tile. */
+    bool tile_written = false;
+  } render_state_;
+
+  /* Progress object which is used to communicate sample progress. */
+  Progress *progress_;
+
+  /* Fields required for canceling the render on demand, as quickly as possible. */
+  struct {
+    /* Indicates whether there is an on-going `render_samples()` call. */
+    bool is_rendering = false;
+
+    /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+    bool is_requested = false;
+
+    /* Synchronization between the thread which does `render_samples()` and the thread which does
+     * `cancel()`. */
+    thread_mutex mutex;
+    thread_condition_variable condition;
+  } render_cancel_;
+
+  /* Indicates whether a render result was drawn after the latest session reset.
+   * Used by `ready_to_reset()` to implement logic which feels the most interactive. */
+  bool did_draw_after_reset_ = true;
+
+  /* State of the full-frame processing and writing to the software. */
+  struct {
+    RenderBuffers *render_buffers = nullptr;
+  } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "device/device.h" + +#include "integrator/path_trace_work.h" +#include "integrator/path_trace_work_cpu.h" +#include "integrator/path_trace_work_gpu.h" +#include "render/buffers.h" +#include "render/film.h" +#include "render/gpu_display.h" +#include "render/scene.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<PathTraceWork> PathTraceWork::create(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) +{ + if (device->info.type == DEVICE_CPU) { + return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag); + } + + return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag); +} + +PathTraceWork::PathTraceWork(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : device_(device), + film_(film), + device_scene_(device_scene), + buffers_(make_unique<RenderBuffers>(device)), + effective_buffer_params_(buffers_->params), + cancel_requested_flag_(cancel_requested_flag) +{ +} + +PathTraceWork::~PathTraceWork() +{ +} + +RenderBuffers *PathTraceWork::get_render_buffers() +{ + return buffers_.get(); +} + +void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params, + const BufferParams &effective_big_tile_params, + const BufferParams &effective_buffer_params) +{ + effective_full_params_ = effective_full_params; + effective_big_tile_params_ = effective_big_tile_params; + effective_buffer_params_ = effective_buffer_params; +} + +bool PathTraceWork::has_multiple_works() const +{ + /* Assume if there are multiple works working on the same big tile none of the works gets the + * entire big tile to work on. */ + return !(effective_big_tile_params_.width == effective_buffer_params_.width && + effective_big_tile_params_.height == effective_buffer_params_.height && + effective_big_tile_params_.full_x == effective_buffer_params_.full_x && + effective_big_tile_params_.full_y == effective_buffer_params_.full_y); +} + +void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + copy_render_buffers_from_device(); + + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = buffers_->buffer.data(); + float *dst = render_buffers->buffer.data() + offset_in_floats; + + memcpy(dst, src, data_size); +} + +void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers) +{ + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = render_buffers->buffer.data() + offset_in_floats; + float *dst = buffers_->buffer.data(); + + memcpy(dst, src, data_size); + + copy_render_buffers_to_device(); +} + +void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers 
*render_buffers)
+{
+  const int64_t width = effective_buffer_params_.width;
+  const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+  const int64_t offset = offset_y * width;
+
+  render_buffers_host_copy_denoised(
+      buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+  copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+                                           const PassAccessor::Destination &destination)
+{
+  const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+  const int width = effective_buffer_params_.width;
+
+  PassAccessor::Destination slice_destination = destination;
+  slice_destination.offset += offset_y * width;
+
+  return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+                                           const PassAccessor::Source &source)
+{
+  const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+  const int width = effective_buffer_params_.width;
+
+  PassAccessor::Source slice_source = source;
+  slice_source.offset += offset_y * width;
+
+  return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+  const KernelFilm &kfilm = device_scene_->data.film;
+  const KernelBackground &kbackground = device_scene_->data.background;
+
+  const BufferParams &params = buffers_->params;
+
+  const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+  PassAccessor::PassAccessInfo pass_access_info;
+  pass_access_info.type = display_pass->type;
+  pass_access_info.offset = PASS_UNUSED;
+
+  if (pass_mode == PassMode::DENOISED) {
+    pass_access_info.mode = PassMode::DENOISED;
+    pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+  }
+
+  if (pass_access_info.offset == PASS_UNUSED) {
+    pass_access_info.mode = PassMode::NOISY;
+    pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+  }
+
+  pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+  pass_access_info.use_approximate_shadow_catcher_background =
+      kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+  return pass_access_info;
+}
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+    const GPUDisplay *gpu_display) const
+{
+  PassAccessor::Destination destination(film_->get_display_pass());
+
+  const int2 display_texture_size = gpu_display->get_texture_size();
+  const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+  const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
+  destination.offset = texture_y * display_texture_size.x + texture_x;
+  destination.stride = display_texture_size.x;
+
+  return destination;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..97b97f3d888
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+  struct RenderStatistics {
+    float occupancy = 1.0f;
+  };
+
+  /* Create the path trace work which best fits the device.
+   *
+   * The cancel request flag is used for a cheap check whether cancel is to be performed as soon
+   * as possible. This could be, for example, a request to cancel rendering on camera navigation
+   * in the viewport. */
+  static unique_ptr<PathTraceWork> create(Device *device,
+                                          Film *film,
+                                          DeviceScene *device_scene,
+                                          bool *cancel_requested_flag);
+
+  virtual ~PathTraceWork();
+
+  /* Access the render buffers.
+   *
+   * Is only supposed to be used by the PathTrace to update buffer allocation and slicing to
+   * correspond to the big tile size and relative device performance. */
+  RenderBuffers *get_render_buffers();
+
+  /* Set effective parameters of the big tile and the work itself. */
+  void set_effective_buffer_params(const BufferParams &effective_full_params,
+                                   const BufferParams &effective_big_tile_params,
+                                   const BufferParams &effective_buffer_params);
+
+  /* Check whether the big tile is being worked on by multiple path trace works. */
+  bool has_multiple_works() const;
+
+  /* Allocate working memory for execution. Must be called before init_execution(). */
+  virtual void alloc_work_memory(){};
+
+  /* Initialize execution of kernels.
+   * Will ensure that all device queues are initialized for execution.
+   *
+   * This method is to be called after any change in the scene. It is not needed prior to every
+   * call of `render_samples()`. */
+  virtual void init_execution() = 0;
+
+  /* Render the given number of samples as a synchronous blocking call.
+   * The samples are added to the render buffer associated with this work. */
+  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+  /* Copy render result from this work to the corresponding place of the GPU display.
+   *
+   * The `pass_mode` indicates whether to access the denoised or noisy version of the display
+   * pass. The noisy pass mode will be passed here when it is known that the buffer does not have
+   * denoised passes yet (because the denoiser did not run). If the denoised pass is requested
+   * and the denoiser is not used then this function will fall back to the noisy pass instead. */
+  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+                                   PassMode pass_mode,
+                                   int num_samples) = 0;
+
+  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+  /* Copy data from/to the given render buffers.
+   * Will copy pixels from a corresponding place (from the multi-device point of view) of the
+   * render buffers, and copy the work's render buffers to the corresponding place of the
+   * destination. */
+
+  /* Notes:
+   * - Copies the work's render buffer from the device.
+   * - Copies into the CPU-side buffer of the given render buffers.
+   * - Does not copy the given render buffers to their device. */
+  void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+  /* Notes:
+   * - Does not copy the given render buffers from their device.
+   * - Copies the work's render buffer to its device. */
+  void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+  /* Special version of `copy_from_render_buffers()` which only copies denoised passes from the
+   * given render buffers, leaving the rest of the passes untouched.
+   *
+   * The same notes about device copying apply to this call as well. */
+  void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+  /* Copy render buffers to/from the device, using an appropriate device queue when needed so
+   * that things are executed in order with `render_samples()`. */
+  virtual bool copy_render_buffers_from_device() = 0;
+  virtual bool copy_render_buffers_to_device() = 0;
+
+  /* Zero the render buffers on the device, using an appropriate device queue when needed so
+   * that things are executed in order with `render_samples()`. */
+  virtual bool zero_render_buffers() = 0;
+
+  /* Access pixels rendered by this work and copy them to the corresponding location in the
+   * destination.
+   *
+   * NOTE: Does not copy buffers from the device. Use `copy_render_tile_from_device()` to update
+   * the host-side data. */
+  bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+                              const PassAccessor::Destination &destination);
+
+  /* Set pass data for baking. */
+  bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+  /* Perform a convergence test on the render buffer, and filter the convergence mask.
+   * Returns the number of active pixels (the ones which did not converge yet). */
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+  /* Run cryptomatte pass post-processing kernels. */
+  virtual void cryptomatte_postproces() = 0;
+
+  /* Cheap-ish check whether rendering has been requested to stop as soon as possible, without
+   * waiting for any samples to be finished. */
+  inline bool is_cancel_requested() const
+  {
+    /* NOTE: Rely on the fact that on an x86 CPU reading a scalar can happen without atomics,
+     * even in a threaded environment. */
+    return *cancel_requested_flag_;
+  }
+
+  /* Access the device which is used to path trace this work. */
+  Device *get_device() const
+  {
+    return device_;
+  }
+
+ protected:
+  PathTraceWork(Device *device,
+                Film *film,
+                DeviceScene *device_scene,
+                bool *cancel_requested_flag);
+
+  PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+  /* Get a destination whose offset and stride are configured so that writing to it will write to
+   * the proper location of the GPU display texture, taking the current tile and device slice
+   * into account. */
+  PassAccessor::Destination get_gpu_display_destination_template(
+      const GPUDisplay *gpu_display) const;
+
+  /* Device which will be used for path tracing.
+   * Note that it is an actual render device (and never a multi-device). */
+  Device *device_;
+
+  /* Film is used to access display pass configuration for the GPU display update.
+   * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+  Film *film_;
+
+  /* Device-side scene storage, that may be used for integrator logic. */
+  DeviceScene *device_scene_;
+
+  /* Render buffers where sampling is being accumulated into, allocated for a fraction of the big
+   * tile which is being rendered by this work.
+   * It also defines the possible subset of the big tile in the case of multi-device rendering. */
+  unique_ptr<RenderBuffers> buffers_;
+
+  /* Effective parameters of the full, big tile, and current work render buffer.
+   * The latter might be different from buffers_->params when there is a resolution divider
+   * involved. */
+  BufferParams effective_full_params_;
+  BufferParams effective_big_tile_params_;
+  BufferParams effective_buffer_params_;
+
+  bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create a TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+  /* TODO: limit this to the number of threads of the CPU device, which may be smaller than the
+   * system number of threads when we reduce the number of CPU threads in CPU + GPU rendering to
+   * dedicate some cores to handling the GPU device. */
+  return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+  const int thread_index = tbb::this_task_arena::current_thread_index();
+  DCHECK_GE(thread_index, 0);
+  DCHECK_LT(thread_index, kernel_thread_globals.size());
+
+  return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+                                   Film *film,
+                                   DeviceScene *device_scene,
+                                   bool *cancel_requested_flag)
+    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+      kernels_(*(device->get_cpu_kernels()))
+{
+  DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+  /* Cache per-thread kernel globals.
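+   *
+   * Each worker thread of the TBB arena then picks up its own copy, indexed by
+   * `tbb::this_task_arena::current_thread_index()` (see `kernel_thread_globals_get()` above).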
*/ + device_->get_cpu_kernel_thread_globals(kernel_thread_globals_); +} + +void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + const int64_t image_width = effective_buffer_params_.width; + const int64_t image_height = effective_buffer_params_.height; + const int64_t total_pixels_num = image_width * image_height; + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.start_profiling(); + } + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) { + if (is_cancel_requested()) { + return; + } + + const int y = work_index / image_width; + const int x = work_index - y * image_width; + + KernelWorkTile work_tile; + work_tile.x = effective_buffer_params_.full_x + x; + work_tile.y = effective_buffer_params_.full_y + y; + work_tile.w = 1; + work_tile.h = 1; + work_tile.start_sample = start_sample; + work_tile.num_samples = 1; + work_tile.offset = effective_buffer_params_.offset; + work_tile.stride = effective_buffer_params_.stride; + + CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_); + + render_samples_full_pipeline(kernel_globals, work_tile, samples_num); + }); + }); + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.stop_profiling(); + } + + statistics.occupancy = 1.0f; +} + +void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals, + const KernelWorkTile &work_tile, + const int samples_num) +{ + const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher; + const bool has_bake = device_scene_->data.bake.use; + + IntegratorStateCPU integrator_states[2] = {}; + + IntegratorStateCPU *state = &integrator_states[0]; + IntegratorStateCPU *shadow_catcher_state = &integrator_states[1]; + + KernelWorkTile sample_work_tile = work_tile; + float *render_buffer = buffers_->buffer.data(); + + for (int sample = 0; sample < samples_num; ++sample) { + if (is_cancel_requested()) { + break; + } + + if (has_bake) { + if (!kernels_.integrator_init_from_bake( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + else { + if (!kernels_.integrator_init_from_camera( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + + kernels_.integrator_megakernel(kernel_globals, state, render_buffer); + + if (has_shadow_catcher) { + kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer); + } + + ++sample_work_tile.start_sample; + } +} + +void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + half4 *rgba_half = gpu_display->map_texture_buffer(); + if (!rgba_half) { + /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for + * some implementations of GPUDisplay which can not map memory? 
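+     * (For reference, a fallback of this kind already exists on the GPU side: see
+     * `copy_to_gpu_display_naive()` in PathTraceWorkGPU, which stages pixels in a host-side
+     * buffer and pushes them with `gpu_display->copy_pixels_to_texture()`.)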
+     */
+    return;
+  }
+
+  const KernelFilm &kfilm = device_scene_->data.film;
+
+  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+  PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+  destination.pixels_half_rgba = rgba_half;
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+  local_arena.execute([&]() {
+    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+  });
+
+  gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+  return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+  buffers_->buffer.copy_to_device();
+  return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+  buffers_->zero();
+  return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+  const int full_x = effective_buffer_params_.full_x;
+  const int full_y = effective_buffer_params_.full_y;
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+  const int offset = effective_buffer_params_.offset;
+  const int stride = effective_buffer_params_.stride;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  uint num_active_pixels = 0;
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading
+   * overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(full_y, full_y + height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+      bool row_converged = true;
+      uint num_row_pixels_active = 0;
+      for (int x = 0; x < width; ++x) {
+        if (!kernels_.adaptive_sampling_convergence_check(
+                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+          ++num_row_pixels_active;
+          row_converged = false;
+        }
+      }
+
+      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+      if (!row_converged) {
+        kernels_.adaptive_sampling_filter_x(
+            kernel_globals, render_buffer, y, full_x, width, offset, stride);
+      }
+    });
+  });
+
+  if (num_active_pixels) {
+    local_arena.execute([&]() {
+      tbb::parallel_for(full_x, full_x + width, [&](int x) {
+        CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+        kernels_.adaptive_sampling_filter_y(
+            kernel_globals, render_buffer, x, full_y, height, offset, stride);
+      });
+    });
+  }
+
+  return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Process the pixels row by row in a single `parallel_for`, to reduce threading overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(0, height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+      int pixel_index = y * width;
+
+      for (int x = 0; x < width; ++x, ++pixel_index) {
+        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+      }
+    });
+  });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+  PathTraceWorkCPU(Device *device,
+                   Film *film,
+                   DeviceScene *device_scene,
+                   bool *cancel_requested_flag);
+
+  virtual void init_execution() override;
+
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num) override;
+
+  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+                                   PassMode pass_mode,
+                                   int num_samples) override;
+  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+  virtual bool copy_render_buffers_from_device() override;
+  virtual bool copy_render_buffers_to_device() override;
+  virtual bool zero_render_buffers() override;
+
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+  virtual void cryptomatte_postproces() override;
+
+ protected:
+  /* Core path tracing routine. Renders the given work tile on the given queue. */
+  void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+                                    const KernelWorkTile &work_tile,
+                                    const int samples_num);
+
+  /* CPU kernels. */
+  const CPUKernels &kernels_;
+
+  /* Copies of kernel globals which are suitable for concurrent access from multiple threads.
+   *
+   * More specifically, each entry of `kernel_thread_globals_` is local to one thread and nobody
+   * else accesses it, but some "localization" is required to decouple from the kernel globals
+   * stored on the device level.
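+   *
+   * A per-thread copy also gives every thread its own profiling state: `render_samples()`
+   * brackets the actual work with `start_profiling()` / `stop_profiling()` on each entry.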
+   */
+  vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..10baf869aa6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+                                   Film *film,
+                                   DeviceScene *device_scene,
+                                   bool *cancel_requested_flag)
+    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+      queue_(device->gpu_queue_create()),
+      integrator_state_soa_kernel_features_(0),
+      integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+      integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+      integrator_shader_raytrace_sort_counter_(
+          device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_next_shadow_catcher_path_index_(
+          device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+      queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+      num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+      work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+      gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+      max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+      min_num_active_paths_(queue_->num_concurrent_busy_states()),
+      max_active_path_index_(0)
+{
+  memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+  /* Limit the number of active paths to half of the overall state. This is due to the logic in
+   * the path compaction which relies on the fact that regeneration does not happen until at
+   * least half of the states are available again. */
+  min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+  /* IntegratorState allocated as a structure of arrays. */
+
+  /* Check if we already allocated memory for the required features. */
+  const uint kernel_features = device_scene_->data.kernel_features;
+  if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+    return;
+  }
+  integrator_state_soa_kernel_features_ = kernel_features;
+
+  /* Allocate a device-only memory buffer for each struct member, and then write the pointers
+   * into a struct that resides in constant memory.
+   *
+   * TODO: store float3 in separate XYZ arrays.
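+   *
+   * As a rough illustration (with a hypothetical member, not taken from the actual state
+   * template): an entry such as
+   *
+   *   KERNEL_STRUCT_BEGIN(path)
+   *     KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+   *   KERNEL_STRUCT_END(path)
+   *
+   * expands via the macros below into the allocation of a `device_only_memory<uint32_t>` array
+   * of `max_num_paths_` elements whose device pointer is stored in
+   * `integrator_state_gpu_.path.flag`.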
+   */
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+    device_only_memory<type> *array = new device_only_memory<type>(device_, \
+                                                                   "integrator_state_" #name); \
+    array->alloc_to_device(max_num_paths_); \
+    integrator_state_soa_.emplace_back(array); \
+    integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+  }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  if ((kernel_features & feature) && \
+      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+    device_only_memory<type> *array = new device_only_memory<type>(device_, \
+                                                                   "integrator_state_" #name); \
+    array->alloc_to_device(max_num_paths_); \
+    integrator_state_soa_.emplace_back(array); \
+    integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+  }
+#define KERNEL_STRUCT_END(name) \
+  break; \
+  }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+  if (array_index == array_size - 1) { \
+    break; \
+  } \
+  }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+  if (integrator_queue_counter_.size() == 0) {
+    integrator_queue_counter_.alloc(1);
+    integrator_queue_counter_.zero_to_device();
+    integrator_queue_counter_.copy_from_device();
+    integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+                                              integrator_queue_counter_.device_pointer;
+  }
+
+  /* Allocate data for active path index arrays. */
+  if (num_queued_paths_.size() == 0) {
+    num_queued_paths_.alloc(1);
+    num_queued_paths_.zero_to_device();
+  }
+
+  if (queued_paths_.size() == 0) {
+    queued_paths_.alloc(max_num_paths_);
+    /* TODO: this could be skipped if we had a function to just allocate on the device. */
+    queued_paths_.zero_to_device();
+  }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+  /* Allocate arrays for shader sorting. */
+  const int max_shaders = device_scene_->data.max_shaders;
+  if (integrator_shader_sort_counter_.size() < max_shaders) {
+    integrator_shader_sort_counter_.alloc(max_shaders);
+    integrator_shader_sort_counter_.zero_to_device();
+
+    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+    integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+        (int *)integrator_shader_sort_counter_.device_pointer;
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+        (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+  }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+  if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+    return;
+  }
+
+  integrator_next_shadow_catcher_path_index_.alloc(1);
+  /* TODO(sergey): Use queue? */
+  integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+  integrator_state_gpu_.next_shadow_catcher_path_index =
+      (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+  alloc_integrator_soa();
+  alloc_integrator_queue();
+  alloc_integrator_sorting();
+  alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+  queue_->init_execution();
+
+  /* Copy to the device-side struct in constant memory. */
+  device_->const_copy_to(
+      "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
+{
+  /* Limit the number of states for the tile and rely on a greedy scheduling of tiles. This
+   * allows adding more work (because tiles are smaller, so there is a higher chance that more
+   * paths will become busy after adding new tiles). This is especially important for the shadow
+   * catcher, which schedules work in halves of the available number of paths. */
+  work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
+
+  work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+  enqueue_reset();
+
+  int num_iterations = 0;
+  uint64_t num_busy_accum = 0;
+
+  /* TODO: set a hard limit in case of undetected kernel failures? */
+  while (true) {
+    /* Enqueue work from the scheduler, on start or when there are not enough
+     * paths to keep the device occupied. */
+    bool finished;
+    if (enqueue_work_tiles(finished)) {
+      /* Copy stats from the device. */
+      queue_->copy_from_device(integrator_queue_counter_);
+
+      if (!queue_->synchronize()) {
+        break; /* Stop on error. */
+      }
+    }
+
+    if (is_cancel_requested()) {
+      break;
+    }
+
+    /* Stop if no more work remains. */
+    if (finished) {
+      break;
+    }
+
+    /* Enqueue one of the path iteration kernels. */
+    if (enqueue_path_iteration()) {
+      /* Copy stats from the device. */
+      queue_->copy_from_device(integrator_queue_counter_);
+
+      if (!queue_->synchronize()) {
+        break; /* Stop on error. */
+      }
+    }
+
+    if (is_cancel_requested()) {
+      break;
+    }
+
+    num_busy_accum += get_num_active_paths();
+    ++num_iterations;
+  }
+
+  statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+  int max_num_queued = 0;
+  DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+    if (queue_counter->num_queued[i] > max_num_queued) {
+      kernel = (DeviceKernel)i;
+      max_num_queued = queue_counter->num_queued[i];
+    }
+  }
+
+  return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+  void *args[] = {&max_num_paths_};
+  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+  queue_->zero_to_device(integrator_queue_counter_);
+  queue_->zero_to_device(integrator_shader_sort_counter_);
+  queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+  /* Tile enqueuing needs to know the number of active paths, which is based on this counter.
+   * Zero the counter on the host side because `zero_to_device()` does not do it. */
+  if (integrator_queue_counter_.host_pointer) {
+    memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+  }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+  /* Count the number of paths queued across all kernels; if none are queued there is nothing to
+   * iterate on.
*/ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_active_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + num_active_paths += queue_counter->num_queued[i]; + } + + if (num_active_paths == 0) { + return false; + } + + /* Find kernel to execute, with max number of queued paths. */ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel == DEVICE_KERNEL_NUM) { + return false; + } + + /* Finish shadows before potentially adding more shadow rays. We can only + * store one shadow ray in the integrator state. */ + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) { + if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return true; + } + else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return true; + } + } + + /* Schedule kernel with maximum number of queued items. */ + enqueue_path_iteration(kernel); + return true; +} + +void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel) +{ + void *d_path_index = (void *)NULL; + + /* Create array of path indices for which this kernel is queued to be executed. */ + int work_size = max_active_path_index_; + + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + int num_queued = queue_counter->num_queued[kernel]; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + /* Compute array of active paths, sorted by shader. */ + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel); + } + else if (num_queued < work_size) { + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Compute array of active shadow paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel); + } + else { + /* Compute array of active paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel); + } + } + + DCHECK_LE(work_size, max_num_paths_); + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + /* Ray intersection kernels with integrator state. */ + void *args[] = {&d_path_index, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + /* Shading kernels with integrator state and render buffer. 
*/ + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + + default: + LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel) + << " used for path iteration, should never happen."; + break; + } +} + +void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel]; + assert(d_counter != nullptr); + + /* Compute prefix sum of number of active paths with each shader. */ + { + const int work_size = 1; + int max_shaders = device_scene_->data.max_shaders; + void *args[] = {&d_counter, &max_shaders}; + queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); + } + + queue_->zero_to_device(num_queued_paths_); + + /* Launch kernel to fill the active paths arrays. */ + { + /* TODO: this could be smaller for terminated paths based on amount of work we want + * to schedule. */ + const int work_size = max_active_path_index_; + + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), + &d_queued_paths, + &d_num_queued_paths, + &d_counter, + &d_queued_kernel}; + + queue_->enqueue(kernel, work_size, args); + } + + if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) { + queue_->zero_to_device(integrator_shader_sort_counter_); + } + else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + else { + assert(0); + } +} + +void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + + /* Launch kernel to fill the active paths arrays. */ + const int work_size = max_active_path_index_; + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = { + const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel}; + + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(kernel, work_size, args); +} + +void PathTraceWorkGPU::compact_states(const int num_active_paths) +{ + if (num_active_paths == 0) { + max_active_path_index_ = 0; + } + + /* Compact fragmented path states into the start of the array, moving any paths + * with index higher than the number of active paths into the gaps. */ + if (max_active_path_index_ == num_active_paths) { + return; + } + + void *d_compact_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + + /* Create array with terminated paths that we can write to. */ + { + /* TODO: can the work size be reduced here? */ + int offset = num_active_paths; + int work_size = num_active_paths; + void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args); + } + + /* Create array of paths that we need to compact, where the path index is bigger + * than the number of active paths. 
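+   *
+   * Worked example with hypothetical numbers: if `max_active_path_index_` is 8 and
+   * `num_active_paths` is 5, any still-active states at indices 5..7 are gathered here and then
+   * moved by the compaction kernel into the terminated gaps below index 5.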
+   */
+  {
+    int work_size = max_active_path_index_;
+    void *args[] = {
+        &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+    queue_->zero_to_device(num_queued_paths_);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+  }
+
+  queue_->copy_from_device(num_queued_paths_);
+  queue_->synchronize();
+
+  int num_compact_paths = num_queued_paths_.data()[0];
+
+  /* Move paths into gaps. */
+  if (num_compact_paths > 0) {
+    int work_size = num_compact_paths;
+    int active_states_offset = 0;
+    int terminated_states_offset = num_active_paths;
+    void *args[] = {
+        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+  }
+
+  queue_->synchronize();
+
+  /* Adjust the max active path index now that we know which part of the array is actually
+   * used. */
+  max_active_path_index_ = num_active_paths;
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+  /* If there are existing paths, wait for them to reach the intersect-closest kernel, which will
+   * align the wavefront of the existing and newly added paths. */
+  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+   * performance. */
+  const DeviceKernel kernel = get_most_queued_kernel();
+  if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+    return false;
+  }
+
+  int num_active_paths = get_num_active_paths();
+
+  /* Don't schedule more work if cancelling. */
+  if (is_cancel_requested()) {
+    if (num_active_paths == 0) {
+      finished = true;
+    }
+    return false;
+  }
+
+  finished = false;
+
+  vector<KernelWorkTile> work_tiles;
+
+  int max_num_camera_paths = max_num_paths_;
+  int num_predicted_splits = 0;
+
+  if (has_shadow_catcher()) {
+    /* When there are shadow catchers in the scene, a bounce from them will split the state. So
+     * we make sure there is enough space in the path states array to fit the split states.
+     *
+     * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+     * that all the new paths can be split.
+     *
+     * Note that it is possible that some of the current states can still split, so we need to
+     * make sure there is enough space for them as well. */
+
+    /* Number of currently in-flight states which can still split. */
+    const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+    const int num_available_paths = max_num_paths_ - num_active_paths;
+    const int num_new_paths = num_available_paths / 2;
+    max_num_camera_paths = max(num_active_paths,
+                               num_active_paths + num_new_paths - num_scheduled_possible_split);
+    num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+  }
+
+  /* Schedule when we're out of paths or there are too few paths to keep the
+   * device occupied. */
+  int num_paths = num_active_paths;
+  if (num_paths == 0 || num_paths < min_num_active_paths_) {
+    /* Get work tiles until the maximum number of paths is reached. */
+    while (num_paths < max_num_camera_paths) {
+      KernelWorkTile work_tile;
+      if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+        work_tiles.push_back(work_tile);
+        num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+      }
+      else {
+        break;
+      }
+    }
+
+    /* If we couldn't get any more tiles, we're done. */
+    if (work_tiles.size() == 0 && num_paths == 0) {
+      finished = true;
+      return false;
+    }
+  }
+
+  /* Initialize paths from work tiles.
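+   *
+   * For example (hypothetical numbers): with 100 paths currently active, two new tiles with
+   * work sizes 2048 and 1024 get `path_index_offset` values of 100 and 2148 respectively, so
+   * the new states are laid out contiguously after the existing ones (see the
+   * `enqueue_work_tiles()` overload below).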
+   */
+  if (work_tiles.size() == 0) {
+    return false;
+  }
+
+  /* Compact the state array when the number of paths becomes small relative to the
+   * known maximum path index, which makes computing active index arrays slow. */
+  compact_states(num_active_paths);
+
+  if (has_shadow_catcher()) {
+    integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+    queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+  }
+
+  enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+                                                      DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+                     work_tiles.data(),
+                     work_tiles.size(),
+                     num_active_paths,
+                     num_predicted_splits);
+
+  return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+                                          const KernelWorkTile work_tiles[],
+                                          const int num_work_tiles,
+                                          const int num_active_paths,
+                                          const int num_predicted_splits)
+{
+  /* Copy work tiles to the device. */
+  if (work_tiles_.size() < num_work_tiles) {
+    work_tiles_.alloc(num_work_tiles);
+  }
+
+  int path_index_offset = num_active_paths;
+  int max_tile_work_size = 0;
+  for (int i = 0; i < num_work_tiles; i++) {
+    KernelWorkTile &work_tile = work_tiles_.data()[i];
+    work_tile = work_tiles[i];
+
+    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+    work_tile.path_index_offset = path_index_offset;
+    work_tile.work_size = tile_work_size;
+
+    path_index_offset += tile_work_size;
+
+    max_tile_work_size = max(max_tile_work_size, tile_work_size);
+  }
+
+  queue_->copy_to_device(work_tiles_);
+
+  void *d_work_tiles = (void *)work_tiles_.device_pointer;
+  void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+  /* Launch kernel. */
+  void *args[] = {&d_work_tiles,
+                  const_cast<int *>(&num_work_tiles),
+                  &d_render_buffer,
+                  const_cast<int *>(&max_tile_work_size)};
+
+  queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+  max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+  /* TODO: this is wrong, does not account for duplicates with shadow! */
+  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+  int num_paths = 0;
+  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+    DCHECK_GE(queue_counter->num_queued[i], 0)
+        << "Invalid number of queued states for kernel "
+        << device_kernel_as_string(static_cast<DeviceKernel>(i));
+    num_paths += queue_counter->num_queued[i];
+  }
+
+  return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+  /* There are a few issues with graphics interop when using multiple devices, caused by the fact
+   * that the GPUDisplay has a single texture:
+   *
+   * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+   * attempting to register an OpenGL PBO which has been mapped. This makes sense, because
+   * otherwise one would run into a conflict over where the source of truth is.
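+   *
+   * With a single device there is exactly one writer for the display texture, so the mapping is
+   * unambiguous; hence interop is simply not used whenever this work shares the big tile with
+   * other works.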
*/
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+ /* Don't attempt to update the GPU display if the device has errors: the error state will
+ * lead to wrong decisions about interop, causing more chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+ /* If an error happens when trying to use graphics interop, fall back to the native
+ * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+ * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, shrink the
+ * allocated memory as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+ * transferring zeroes to the device.
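+ *
+ * (The zeroing below is also what currently guarantees the device-side
+ * allocation exists before the first partial update: with, for example, a
+ * resolution divider of 2 only a quarter of the final-size buffer is written
+ * by the film conversion, and the remaining texels would otherwise be read
+ * back uninitialized.)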
*/ + queue_->zero_to_device(gpu_display_rgba_half_); + } + + PassAccessor::Destination destination(film_->get_display_pass()); + destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + gpu_display_rgba_half_.copy_from_device(); + + gpu_display->copy_pixels_to_texture( + gpu_display_rgba_half_.data(), texture_x, texture_y, width, height); +} + +bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (!device_graphics_interop_) { + device_graphics_interop_ = queue_->graphics_interop_create(); + } + + const DeviceGraphicsInteropDestination graphics_interop_dst = + gpu_display->graphics_interop_get(); + device_graphics_interop_->set_destination(graphics_interop_dst); + + const device_ptr d_rgba_half = device_graphics_interop_->map(); + if (!d_rgba_half) { + return false; + } + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.d_pixels_half_rgba = d_rgba_half; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + device_graphics_interop_->unmap(); + + return true; +} + +void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display) +{ + if (!device_graphics_interop_) { + return; + } + gpu_display->graphics_interop_activate(); + device_graphics_interop_ = nullptr; + gpu_display->graphics_interop_deactivate(); +} + +void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples) +{ + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples); + + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); +} + +int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset); + + if (num_active_pixels) { + enqueue_adaptive_sampling_filter_x(); + enqueue_adaptive_sampling_filter_y(); + queue_->synchronize(); + } + + return num_active_pixels; +} + +int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset) +{ + device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE); + num_active_pixels.alloc(1); + + queue_->zero_to_device(num_active_pixels); + + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&effective_buffer_params_.full_x), + const_cast<int *>(&effective_buffer_params_.full_y), + const_cast<int *>(&effective_buffer_params_.width), + const_cast<int *>(&effective_buffer_params_.height), + &threshold, + &reset, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride, + &num_active_pixels.device_pointer}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args); + + queue_->copy_from_device(num_active_pixels); + queue_->synchronize(); + + return num_active_pixels.data()[0]; +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x() +{ + const int work_size = effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + 
&effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: The direct device access to the buffers only happens within this path trace work. The
+ * rest of the communication happens via API calls which involve `copy_render_buffers_from_device()`,
+ * which will perform synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized
+ * to match the device queue's number of path states.
+ * This implementation best suits devices which have a lot of integrator states, such as GPUs. */
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+ /* Naive implementation of `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Implementation of `copy_to_gpu_display()` which uses the driver's OpenGL/GPU interoperability
+ * functionality, avoiding a copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run the film conversion kernel and store the display result in the given
+ * destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split.
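+ * (On the device side this is served by the
+ * DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS kernel; see the
+ * enqueue in path_trace_work_gpu.cpp.)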
*/
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+ /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+ /* Keep track of number of queued kernels. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+ /* Temporary buffer to get an array of queued paths for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to the kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+ /* Temporary buffer used by copy_to_gpu_display() whenever graphics interoperability is not
+ * available. Allocated on demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+ /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+ * below this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+ /* Maximum path index; the effective number of paths used may be smaller than
+ * the size of the integrator_state_ buffer, so we can avoid iterating over the
+ * full buffer. */
+ int max_active_path_index_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..4eb1dd941f9
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+ /* In background mode never do a lower-resolution render preview, as it is not really supported
+ * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+ /* NOTE: Divide by 2 because of the way scheduling works: it advances the resolution divider
+ * first and then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose a better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet.
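+ *
+ * (This value acts as the starting point of the progressive noise floor: it is
+ * halved on every idle reschedule, e.g. 0.4 -> 0.2 -> 0.1 -> ... until it
+ * reaches adaptive_sampling_.threshold; see render_work_reschedule_on_idle().)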
*/
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+ /* Reset all fields of the previous work, cancelling things like adaptive sampling filtering
+ * and denoising.
+ * However, we need to preserve write requests, since those will not be possible to recover and
+ * writes are only to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+ /* Do not write the tile if it has zero samples in it, treat it similarly to all other tiles
+ * which got cancelled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+ /* Update the current tile, but only if any sample was rendered.
+ * Allows the latest state of the tile to stay visible while the full buffer is being processed.
+ *
+ * Note that if there are no samples in the current tile its render buffer might have pixels
+ * remaining from a previous state.
+ *
+ * If the full result was written, then there is no way any updates were made to the render
+ * buffers. And the buffers might have been freed from the device, so display update is not
+ * possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+ /* NOTE: The rebalance scheduler requires the current number of samples to not be advanced
+ * forward. */
+ render_work.rebalance = work_need_rebalance();
+
+ /* NOTE: Advance the number of samples now, so that the filter and denoising checks can see
+ * that all the samples are rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+ /* A fallback display update time, for the case when there is an error during the display
+ * update, or when there is no display at all.
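+ *
+ * (If the update succeeds, report_display_update_time() will overwrite this
+ * timestamp with the moment the update actually finished; the value recorded
+ * here only matters when that report never arrives.)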
*/
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+ /* There is only a single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Knowing the time it took to complete a task at the current resolution divider, approximate how
+ * long it would have taken to complete it at the final resolution. */
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
+ return time * resolution_divider_sq;
+}
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+ * NOTE: The work might have its path tracing part all zero: this happens when a post-processing
+ * work is scheduled after the path tracing. Checking just the start sample doesn't work here
+ * because it might be wrongly 0. Check for whether path tracing is actually happening as it is
+ * expected to happen in the first work.
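+ *
+ * (For example, a post-processing work scheduled by set_postprocess_render_work()
+ * has path_trace.num_samples == 0 while its start_sample may still read 0, which
+ * would otherwise be mistaken for the first sample of the render.)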
*/
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+ /* Move the display update moment further in time, so that the logic which checks when the last
+ * update happened has a more reliable point in time (without the path tracing and denoising
+ * parts of the render work).
*/ + state_.last_display_update_time = time_dt(); +} + +void RenderScheduler::report_rebalance_time(const RenderWork &render_work, + double time, + bool balance_changed) +{ + rebalance_time_.add_wall(time); + + if (work_report_reset_average(render_work)) { + rebalance_time_.reset_average(); + } + + rebalance_time_.add_average(time); + + if (balance_changed) { + ++state_.num_rebalance_changes; + } + + state_.last_rebalance_changed = balance_changed; + + VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds."; +} + +string RenderScheduler::full_report() const +{ + const double render_wall_time = state_.end_render_time - state_.start_render_time; + const int num_rendered_samples = get_num_rendered_samples(); + + string result = "\nRender Scheduler Summary\n\n"; + + { + string mode; + if (headless_) { + mode = "Headless"; + } + else if (background_) { + mode = "Background"; + } + else { + mode = "Interactive"; + } + result += "Mode: " + mode + "\n"; + } + + result += "Resolution: " + to_string(buffer_params_.width) + "x" + + to_string(buffer_params_.height) + "\n"; + + result += "\nAdaptive sampling:\n"; + result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n"; + if (adaptive_sampling_.use) { + result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n"; + result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n"; + result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n"; + } + + result += "\nDenoiser:\n"; + result += " Use: " + string_from_bool(denoiser_params_.use) + "\n"; + if (denoiser_params_.use) { + result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n"; + result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n"; + + string passes = "Color"; + if (denoiser_params_.use_pass_albedo) { + passes += ", Albedo"; + } + if (denoiser_params_.use_pass_normal) { + passes += ", Normal"; + } + + result += " Passes: " + passes + "\n"; + } + + if (state_.num_rebalance_requested) { + result += "\nRebalancer:\n"; + result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) + + "\n"; + result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) + + "\n"; + } + + result += "\nTime (in seconds):\n"; + result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average"); + result += string_printf(" %20s %20f %20f\n", + "Path Tracing", + path_trace_time_.get_wall(), + path_trace_time_.get_average()); + + if (adaptive_sampling_.use) { + result += string_printf(" %20s %20f %20f\n", + "Adaptive Filter", + adaptive_filter_time_.get_wall(), + adaptive_filter_time_.get_average()); + } + + if (denoiser_params_.use) { + result += string_printf( + " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average()); + } + + result += string_printf(" %20s %20f %20f\n", + "Display Update", + display_update_time_.get_wall(), + display_update_time_.get_average()); + + if (state_.num_rebalance_requested) { + result += string_printf(" %20s %20f %20f\n", + "Rebalance", + rebalance_time_.get_wall(), + rebalance_time_.get_average()); + } + + const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() + + denoise_time_.get_wall() + display_update_time_.get_wall(); + result += "\n Total: " + to_string(total_time) + "\n"; + + result += string_printf( + "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time); + + /* When adaptive sampling is used the 
average time becomes meaningless, because different samples
+ * will likely render a different number of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based
+ * on more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+ /* TODO(sergey): Need a decision on whether this should be using the number of samples rendered
+ * within the current render session, or use the absolute number of samples with the start
+ * sample taken into account. It will depend on whether the start sample offset clears the
+ * render buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and the number of samples rendered are used to figure out the display update
+ * interval. Render time is used to allow for fast display updates in the first few seconds of
+ * rendering on fast devices. The number of samples rendered is used to allow for potentially
+ * quicker display updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding might happen to a higher or lower value depending on which one is closer. Such
+ * behavior allows the number of samples to be a power of two without diverging from the planned
+ * number of samples too much.
*/
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+ /* Always start a full resolution render with a single sample. Gives more instant feedback to
+ * artists, and allows gathering information for subsequent path tracing works. Do it in
+ * headless mode as well, to give some estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+ /* Round the number of samples to a power of two, so that the division of path states into
+ * tiles works out to integer counts.
+ * This might make updates happen more rarely due to rounding up. In the test scenes this is
+ * not a huge deal, because it was not observed that more than 8 samples can be rendered
+ * between updates. If that becomes a problem we can add some extra rules, like never allowing
+ * to round up by more than N samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+ /* When enough statistics are available and doing an offline render, prefer to keep the device
+ * occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match
+ * scenes with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
+
+ /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+ * device fully occupied, without much overhead of display updates. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+ /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missed. This
+ * is to ensure that the final render is pixel-matched regardless of how many samples per second
+ * the compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+ /* Special trick for fast navigation: schedule multiple samples during fast navigation
+ * (which will prefer to use a lower resolution to keep up with the refresh rate). This gives
+ * more usable visual feedback for artists. There are a couple of tricks though.
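+ *
+ * Summarizing the branches below: denoising during navigation -> 1 sample,
+ * divider at or below the pixel size -> 1 sample, divider at twice the pixel
+ * size -> 2 samples, anything coarser -> 4 samples. Since each doubling of the
+ * divider quarters the pixel count, the total cost stays bounded by a single
+ * full-resolution sample.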
*/
+
+ if (is_denoise_active_during_update()) {
+ /* When denoising is used during navigation prefer using a higher resolution with fewer
+ * samples (scheduling fewer samples here will make it so the resolution_divider calculation
+ * will use a lower value for the divider). This is because both OpenImageDenoiser and the
+ * OptiX denoiser give visually better results on a higher resolution image with fewer
+ * samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+ /* When the resolution divider is at or below the pixel size, schedule one sample. This
+ * doesn't affect the sample count at this resolution division, but instead assists in the
+ * calculation of the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+ /* When the resolution divider is the previous step to the final resolution, schedule two
+ * samples. This is so that rendering on the lower resolution does not exceed the time that
+ * it takes to render the first sample at the full resolution. */
+ return 2;
+ }
+
+ /* Always render 4 samples, even if the scene is configured for less.
+ * The idea here is to have enough information on the screen. A resolution divider of 2 allows
+ * us to have 4 times extra samples, so the overall worst case timing is the same as the final
+ * resolution at one sample. */
+ return 4;
+}
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+ /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+ /* TODO(sergey): Follow similar logic to the viewport, giving an overview of how the final
+ * denoised image looks even for background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted
+ * as final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or the last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in the viewport after reaching a certain sample count and render
+ * time. */
+ /* TODO(sergey): Consider making the time interval and the sample count configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+ /* Force disable display update in headless mode. There will be nowhere to display the
+ * in-progress result.
*/
+ return false;
+ }
+
+ if (denoiser_delayed) {
+ /* If the denoiser has been delayed the display can not be updated, as it would not contain
+ * an up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+ /* When adaptive sampling is not used the work is scheduled in a way that keeps the render
+ * device busy for long enough, so that the display update can happen right after the
+ * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+ /* Make sure the initial and final results of adaptive sampling are communicated to the
+ * display. */
+ return true;
+ }
+
+ /* For development purposes of adaptive sampling it might be very useful to see all updates of
+ * active pixels after the convergence check. However, it would cause a slowdown for regular
+ * users. Possibly, make it a debug panel option to allow rapid updates, easing development
+ * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+ /* When adaptive sampling is used, it's possible that only a handful of samples of a very
+ * simple scene will be scheduled to a powerful device (in order to not "miss" any of the
+ * filtering points). We take care of skipping updates here based on when the previous display
+ * update happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum time, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+ * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+ /* The resolution divider has never been calculated before: use the default resolution
+ * divider, so that we have somewhat good initial behavior, giving a chance to collect real
+ * numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate a better resolution divider, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+ /* Allow some percentage of tolerance, so that if the render time is close enough to the higher
+ * resolution we prefer to use it, instead of going to a much lower resolution with a time far
+ * below the desired one.
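+ *
+ * A worked example with illustrative numbers: with a desired interval of
+ * 1/30 s the search below is allowed to target 1.4 / 30 ~= 0.047 s, so a
+ * configuration that is up to ~40% slower than desired keeps its finer
+ * resolution instead of dropping a whole divider step (a 4x change in pixel
+ * count).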
*/
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid the resolution divider bouncing around when
+ * the actual render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase the resolution to higher than the pixel size (which is possible if the scene
+ * is simple and the compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+ /* Use a lower value than in the non-denoised case to allow having more pixels to reconstruct
+ * the image from. With the faster updates and the extra compute required, the resolution
+ * otherwise becomes too low to give usable feedback. */
+ /* NOTE: Based on the performance of OpenImageDenoiser on CPU. For the OptiX denoiser or
+ * other denoisers on GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+ /* For the best match with Blender's viewport the refresh rate should be 60fps. This will
+ * avoid "jelly" effects. However, on non-trivial scenes this can only be achieved with high
+ * values of the resolution divider, which does not give very pleasant updates during
+ * navigation. Choose less frequent updates to allow more noise-free and higher resolution
+ * updates. */
+
+ /* TODO(sergey): Could look into a heuristic which would allow 60fps if the resolution divider
+ * is not too high. Alternatively, synchronize Blender's overlay updates to Cycles updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+ /* When rendering at a non-final resolution divider the time average is not very useful,
+ * because it will either bias the average down (due to lower render times on the smaller
+ * images) or will give an incorrect result when trying to estimate the time which would have
+ * been spent on the final resolution.
+ *
+ * So we only accumulate the average for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* The time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+ /* TODO(sergey): There should be a non-iterative analytical formula here.
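+ *
+ * A rough closed-form candidate, assuming render time scales with the pixel
+ * count (so with 1 / divider^2):
+ *
+ *   divider ~= next_power_of_two((int)ceil(sqrt(actual_time / desired_time)))
+ *
+ * clamped to default_start_resolution_divider_. The iterative loop below is
+ * kept because the per-divider sample counts make the scaling non-uniform.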
*/
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions, as
+ * the sample count can also change between resolution divisions. */
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..9c2d107e46d
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+ /* Initialize render buffers.
+ * Includes steps like zeroing the buffer on the device, and optional reading of pixels from
+ * the baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+ /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+ /* Reset the convergence flag when filtering, forcing a re-check of whether the pixel
+ * converged. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+ /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via the user-provided
+ * callback into the rendering software, or via the tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+ /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write the full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize the render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use the denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+ * Note that the scheduler does not know anything about devices, so if there is only a single
+ * device used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related works are to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+ /* Allows disabling of the rebalancing works, so that as much as possible is scheduled to a
+ * single device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from the start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+ /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get the sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get the number of samples rendered within the current scheduling session.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+ * Note that this is based on the scheduling information. In practice this means that if work
+ * was requested to render, the scheduler considers it done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset the scheduler, indicating that rendering will happen from scratch.
+ * Resets the current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+ /* Reset the scheduler upon switching to the next tile.
+class RenderScheduler {
+ public:
+  RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+  /* Specify whether cryptomatte-related works are to be scheduled. */
+  void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+  /* Allows disabling of the rebalancing works, so that as much work as possible is scheduled to
+   * a single device. */
+  void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+  bool is_background() const;
+
+  void set_denoiser_params(const DenoiseParams &params);
+  void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+  bool is_adaptive_sampling_used() const;
+
+  /* Start sample for path tracing.
+   * The scheduler will schedule work using this sample as the first one. */
+  void set_start_sample(int start_sample);
+  int get_start_sample() const;
+
+  /* Number of samples to render, starting from the start sample.
+   * The scheduler will schedule work in the range of
+   * [start_sample, start_sample + num_samples - 1], inclusive. */
+  void set_num_samples(int num_samples);
+  int get_num_samples() const;
+
+  /* Time limit for the path tracing tasks, in seconds.
+   * Zero disables the limit. */
+  void set_time_limit(double time_limit);
+  double get_time_limit() const;
+
+  /* Get the sample up to which rendering has been done.
+   * This is an absolute 0-based value.
+   *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+   * return 14.
+   *
+   * If no samples were rendered, then the behavior is undefined. */
+  int get_rendered_sample() const;
+
+  /* Get the number of samples rendered within the current scheduling session.
+   *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+   * return 5.
+   *
+   * Note that this is based on the scheduling information. In practice this means that once the
+   * scheduler has handed out work for rendering, it considers that work done. */
+  int get_num_rendered_samples() const;
+
+  /* Reset the scheduler, indicating that rendering will happen from scratch.
+   * Resets the current rendered state, as well as scheduling information. */
+  void reset(const BufferParams &buffer_params, int num_samples);
+
+  /* Reset the scheduler upon switching to the next tile.
+   * Will keep the same number of samples and full-frame render parameters, but will reset
+   * progress and allow scheduling render works from the beginning of the new tile. */
+  void reset_for_next_tile();
+
+  /* Reschedule adaptive sampling work when all pixels have converged.
+   * If there is nothing else to be done for the adaptive sampling (pixels have converged at the
+   * final threshold) then false is returned and the render scheduler will stop scheduling path
+   * tracing works. Otherwise the work's adaptive sampling settings are modified to continue with
+   * a lower threshold. */
+  bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+  /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+   * converged yet.
+   * If re-scheduling is not possible (adaptive sampling is happening with the final threshold,
+   * and the path tracer is to finish the current pixels) then false is returned. */
+  bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+  /* Reschedule work when rendering has been requested to cancel.
+   *
+   * Will skip all work which is not needed anymore because no more samples will be added (for
+   * example, adaptive sampling filtering and convergence check will be skipped).
+   * Will enable all work needed to make sure all passes are communicated to the software.
+   *
+   * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+  void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+  RenderWork get_render_work();
+
+  /* Report that the path tracer started to work, after scene update and loading kernels. */
+  void report_work_begin(const RenderWork &render_work);
+
+  /* Report the time (in seconds) which the corresponding part of the work took. */
+  void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+  void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+  void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+  void report_denoise_time(const RenderWork &render_work, double time);
+  void report_display_update_time(const RenderWork &render_work, double time);
+  void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+  /* Generate a full multi-line report of the rendering process, including rendering parameters,
+   * times, and so on. */
+  string full_report() const;
+ protected:
+  /* Check whether all work has been scheduled and the time limit was not exceeded.
+   *
+   * NOTE: Tricky bit: if the time limit was reached, done() is considered to be true, but some
+   * extra work needs to be scheduled to denoise and write the final result. */
+  bool done() const;
+
+  /* Update scheduling state for newly scheduled work.
+   * Takes care of things like checking whether the work was ever denoised, whether the tile was
+   * written, and so on. */
+  void update_state_for_render_work(const RenderWork &render_work);
+
+  /* Returns true if any work was scheduled. */
+  bool set_postprocess_render_work(RenderWork *render_work);
+
+  /* Set work which is to be performed after all tiles have been rendered. */
+  void set_full_frame_render_work(RenderWork *render_work);
+
+  /* Update the start resolution divider based on the accumulated timing information, preserving
+   * a nice navigation feel. */
+  void update_start_resolution_divider();
+
+  /* Calculate the desired update interval in seconds based on the current timings and settings.
+   * Will give an interval which provides good-feeling updates during viewport navigation. */
+  double guess_viewport_navigation_update_interval_in_seconds() const;
+
+  /* Check whether denoising is active during interactive update while the resolution divider is
+   * not 1. */
+  bool is_denoise_active_during_update() const;
+
+  /* Heuristic which aims to give a perceptually pleasant display update interval, in a way that
+   * at lower samples and near the beginning of rendering updates happen more often, while with a
+   * higher number of samples and later in the render updates happen less often but device
+   * occupancy goes higher. */
+  double guess_display_update_interval_in_seconds() const;
+  double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+  double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+      int num_rendered_samples) const;
+
+  /* Calculate the number of samples which can be rendered within the current desired update
+   * interval, which is calculated by `guess_display_update_interval_in_seconds()`. */
+  int calculate_num_samples_per_update() const;
+
+  /* Get the start sample and the number of samples which are to be path traced in the current
+   * work. */
+  int get_start_sample_to_path_trace() const;
+  int get_num_samples_to_path_trace() const;
+
+  /* Calculate how many samples there are to be rendered for the very first path trace after
+   * reset. */
+  int get_num_samples_during_navigation(int resolution_divider) const;
+
+  /* Whether the adaptive sampling convergence check and filter is to happen. */
+  bool work_need_adaptive_filter() const;
+
+  /* Calculate the threshold for adaptive sampling. */
+  float work_adaptive_threshold() const;
+
+  /* Check whether the current work needs denoising.
+   * Denoising is not needed if the denoiser is not configured, or when denoising would happen
+   * too often.
+   *
+   * The `delayed` flag will be true when the denoiser is configured for use, but denoising was
+   * delayed to a later sample, to reduce overhead.
+   *
+   * `ready_to_display` will be false if we may have a denoised result that is outdated due to
+   * increased samples. */
+  bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+  /* Check whether the current work needs to update the display.
+   *
+   * The `denoiser_delayed` is what `work_need_denoise()` returned as the delayed denoiser
+   * flag. */
+  bool work_need_update_display(const bool denoiser_delayed);
+
+  /* Check whether it is time to perform rebalancing for the render work. */
+  bool work_need_rebalance();
+
+  /* Check whether the timing of the given work is usable for storing timings in
+   * `first_render_time_` for the resolution divider calculation. */
+  bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+  /* Check whether the timing report about the given work needs to reset the accumulated average
+   * time. */
+  bool work_report_reset_average(const RenderWork &render_work);
+
+  /* Check whether the render time limit has been reached (or exceeded), and if so store related
+   * information in the state so that rendering is considered finished, and it is possible to
+   * report average render time information. */
+  void check_time_limit_reached();
+
+  /* Helper class to keep track of task timing.
+   *
+   * Contains two parts: wall time and average. The wall time is the actual wall time of how long
+   * it took to complete all tasks of a type. It is always advanced when the PathTracer reports a
+   * time update.
+   *
+   * The average time is used for scheduling purposes.
It is an estimate of how long it
+   * takes to perform the task at the final resolution. */
+  class TimeWithAverage {
+   public:
+    inline void reset()
+    {
+      total_wall_time_ = 0.0;
+
+      average_time_accumulator_ = 0.0;
+      num_average_times_ = 0;
+    }
+
+    inline void add_wall(double time)
+    {
+      total_wall_time_ += time;
+    }
+
+    inline void add_average(double time, int num_measurements = 1)
+    {
+      average_time_accumulator_ += time;
+      num_average_times_ += num_measurements;
+    }
+
+    inline double get_wall() const
+    {
+      return total_wall_time_;
+    }
+
+    inline double get_average() const
+    {
+      if (num_average_times_ == 0) {
+        return 0;
+      }
+      return average_time_accumulator_ / num_average_times_;
+    }
+
+    inline void reset_average()
+    {
+      average_time_accumulator_ = 0.0;
+      num_average_times_ = 0;
+    }
+
+   protected:
+    double total_wall_time_ = 0.0;
+
+    double average_time_accumulator_ = 0.0;
+    int num_average_times_ = 0;
+  };
+
+  struct {
+    int resolution_divider = 1;
+
+    /* Number of rendered samples on top of the start sample. */
+    int num_rendered_samples = 0;
+
+    /* Point in time at which the latest GPUDisplay work was scheduled. */
+    double last_display_update_time = 0.0;
+    /* A value of -1 means the display was never updated. */
+    int last_display_update_sample = -1;
+
+    /* Point in time at which the last rebalance was performed. */
+    double last_rebalance_time = 0.0;
+
+    /* Number of rebalance works which have been requested to be performed.
+     * The path tracer might ignore the work if there is a single device rendering. */
+    int num_rebalance_requested = 0;
+
+    /* Number of handled rebalance works which did change the balance across devices. */
+    int num_rebalance_changes = 0;
+
+    bool need_rebalance_at_next_work = false;
+
+    /* Denotes whether the latest performed rebalance work caused an actual rebalance of work
+     * across devices. */
+    bool last_rebalance_changed = false;
+
+    /* Threshold for adaptive sampling which will be scheduled to work when not using the
+     * progressive noise floor. */
+    float adaptive_sampling_threshold = 0.0f;
+
+    bool last_work_tile_was_denoised = false;
+    bool tile_result_was_written = false;
+    bool postprocess_work_scheduled = false;
+    bool full_frame_work_scheduled = false;
+    bool full_frame_was_written = false;
+
+    bool path_trace_finished = false;
+    bool time_limit_reached = false;
+
+    /* Time at which rendering started and finished. */
+    double start_render_time = 0.0;
+    double end_render_time = 0.0;
+
+    /* Occupancy of the render devices, normalized to the number of samples.
+     *
+     * In a way it is "trailing": when scheduling new work, this is the occupancy measured while
+     * the previous work was rendered. */
+    int occupancy_num_samples = 0;
+    float occupancy = 1.0f;
+  } state_;
+
+  /* Timing of tasks which were performed at the very first render work at 100% of the
+   * resolution. This timing information is used to estimate the resolution divider for fast
+   * navigation. */
+  struct {
+    double path_trace_per_sample;
+    double denoise_time;
+    double display_update_time;
+  } first_render_time_;
+
+  TimeWithAverage path_trace_time_;
+  TimeWithAverage adaptive_filter_time_;
+  TimeWithAverage denoise_time_;
+  TimeWithAverage display_update_time_;
+  TimeWithAverage rebalance_time_;
+
+  /* Whether cryptomatte-related work will be scheduled. */
+  bool need_schedule_cryptomatte_ = false;
+
+  /* Whether to schedule device load rebalance works.
+   * Rebalancing requires some special treatment for update intervals and such, so if it is known
+   * that the rebalance will be ignored (e.g. due to single-device rendering) it is better to
+   * fully ignore the rebalancing logic. */
+  bool need_schedule_rebalance_works_ = false;
+
+  /* Path tracing work will be scheduled for samples from within the
+   * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusive. */
+  int start_sample_ = 0;
+  int num_samples_ = 0;
+
+  /* Limit in seconds for how long path tracing is allowed to happen.
+   * Zero means no limit is applied. */
+  double time_limit_ = 0.0;
+
+  /* Headless rendering without an interface. */
+  bool headless_;
+
+  /* Background (offline) rendering. */
+  bool background_;
+
+  /* Pixel size is used to force a lower resolution render for the final pass. Useful for retina
+   * or other types of hi-dpi displays. */
+  int pixel_size_ = 1;
+
+  TileManager &tile_manager_;
+
+  BufferParams buffer_params_;
+  DenoiseParams denoiser_params_;
+
+  AdaptiveSampling adaptive_sampling_;
+
+  /* Progressively lower the adaptive sampling threshold level, keeping the image at a uniform
+   * noise level. */
+  bool use_progressive_noise_floor_ = false;
+
+  /* Default value for the resolution divider which will be used when there is no render time
+   * information available yet.
+   * It also defines the upper limit of the automatically calculated resolution divider. */
+  int default_start_resolution_divider_ = 1;
+
+  /* Initial resolution divider which will be used on render scheduler reset. */
+  int start_resolution_divider_ = 0;
+
+  /* Calculate the smallest resolution divider which will bring the actual rendering time below
+   * the desired one. This call assumes a linear dependence of render time on the number of
+   * pixels (quadratic dependence on the resolution divider): a resolution divider of 2 brings
+   * render time down by a factor of 4. */
+  int calculate_resolution_divider_for_time(double desired_time, double actual_time);
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
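The `TimeWithAverage` helper above keeps two independent accumulators: wall time, which only ever grows, and an average that can be reset when old measurements stop being predictive (for example after the resolution divider changes). A small sketch of the intended semantics; note the class is a protected member type of RenderScheduler, so this is for illustration only:

/* Semantics sketch (TimeWithAverage is protected inside RenderScheduler,
 * so this would not compile as-is; shown only to illustrate the two parts). */
TimeWithAverage path_trace_time;

path_trace_time.add_wall(0.5);
path_trace_time.add_average(0.5); /* One measurement of 0.5 seconds. */
path_trace_time.add_wall(0.7);
path_trace_time.add_average(0.7);

/* get_wall() == 1.2, get_average() == 0.6. */

path_trace_time.reset_average();
/* get_wall() is still 1.2, but get_average() is back to 0. */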
+ */ + +#include "integrator/shader_eval.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" + +CCL_NAMESPACE_BEGIN + +ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress) +{ + DCHECK_NE(device_, nullptr); +} + +bool ShaderEval::eval(const ShaderEvalType type, + const int max_num_points, + const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input, + const function<void(device_vector<float4> &)> &read_output) +{ + bool first_device = true; + bool success = true; + + device_->foreach_device([&](Device *device) { + if (!first_device) { + LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a " + "single device."; + return; + } + first_device = false; + + device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY); + device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE); + + /* Allocate and copy device buffers. */ + DCHECK_EQ(input.device, device); + DCHECK_EQ(output.device, device); + DCHECK_LE(output.size(), input.size()); + + input.alloc(max_num_points); + int num_points = fill_input(input); + if (num_points == 0) { + return; + } + + input.copy_to_device(); + output.alloc(num_points); + output.zero_to_device(); + + /* Evaluate on CPU or GPU. */ + success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) : + eval_gpu(device, type, input, output); + + /* Copy data back from device if not cancelled. */ + if (success) { + output.copy_from_device(0, 1, output.size()); + read_output(output); + } + + input.free(); + output.free(); + }); + + return success; +} + +bool ShaderEval::eval_cpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + vector<CPUKernelThreadGlobals> kernel_thread_globals; + device->get_cpu_kernel_thread_globals(kernel_thread_globals); + + /* Find required kernel function. */ + const CPUKernels &kernels = *(device->get_cpu_kernels()); + + /* Simple parallel_for over all work items. */ + const int64_t work_size = output.size(); + KernelShaderEvalInput *input_data = input.data(); + float4 *output_data = output.data(); + bool success = true; + + tbb::task_arena local_arena(device->info.cpu_threads); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) { + /* TODO: is this fast enough? */ + if (progress_.get_cancel()) { + success = false; + return; + } + + const int thread_index = tbb::this_task_arena::current_thread_index(); + KernelGlobals *kg = &kernel_thread_globals[thread_index]; + + switch (type) { + case SHADER_EVAL_DISPLACE: + kernels.shader_eval_displace(kg, input_data, output_data, work_index); + break; + case SHADER_EVAL_BACKGROUND: + kernels.shader_eval_background(kg, input_data, output_data, work_index); + break; + } + }); + }); + + return success; +} + +bool ShaderEval::eval_gpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + /* Find required kernel function. */ + DeviceKernel kernel; + switch (type) { + case SHADER_EVAL_DISPLACE: + kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE; + break; + case SHADER_EVAL_BACKGROUND: + kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND; + break; + }; + + /* Create device queue. 
+bool ShaderEval::eval_gpu(Device *device,
+                          const ShaderEvalType type,
+                          device_vector<KernelShaderEvalInput> &input,
+                          device_vector<float4> &output)
+{
+  /* Find required kernel function. */
+  DeviceKernel kernel;
+  switch (type) {
+    case SHADER_EVAL_DISPLACE:
+      kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+      break;
+    case SHADER_EVAL_BACKGROUND:
+      kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+      break;
+  }
+
+  /* Create device queue. */
+  unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+  queue->init_execution();
+
+  /* Execute work on the GPU in chunks, so we can cancel.
+   * TODO: query the appropriate size from the device. */
+  const int chunk_size = 65536;
+
+  const int work_size = output.size();
+  void *d_input = (void *)input.device_pointer;
+  void *d_output = (void *)output.device_pointer;
+
+  for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+    int d_work_size = min(chunk_size, work_size - d_offset);
+    void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+    queue->enqueue(kernel, d_work_size, args);
+    queue->synchronize();
+
+    if (progress_.get_cancel()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+  SHADER_EVAL_DISPLACE,
+  SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+  ShaderEval(Device *device, Progress &progress);
+
+  /* Evaluate the shader at points specified by KernelShaderEvalInput and write out
+   * RGBA colors to the output. */
+  bool eval(const ShaderEvalType type,
+            const int max_num_points,
+            const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+            const function<void(device_vector<float4> &)> &read_output);
+
+ protected:
+  bool eval_cpu(Device *device,
+                const ShaderEvalType type,
+                device_vector<KernelShaderEvalInput> &input,
+                device_vector<float4> &output);
+  bool eval_gpu(Device *device,
+                const ShaderEvalType type,
+                device_vector<KernelShaderEvalInput> &input,
+                device_vector<float4> &output);
+
+  Device *device_;
+  Progress &progress_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "integrator/tile.h" + +#include "util/util_logging.h" +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +std::ostream &operator<<(std::ostream &os, const TileSize &tile_size) +{ + os << "size: (" << tile_size.width << ", " << tile_size.height << ")"; + os << ", num_samples: " << tile_size.num_samples; + return os; +} + +ccl_device_inline uint round_down_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return prev_power_of_two(x); +} + +ccl_device_inline uint round_up_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return next_power_of_two(x); +} + +TileSize tile_calculate_best_size(const int2 &image_size, + const int num_samples, + const int max_num_path_states) +{ + if (max_num_path_states == 1) { + /* Simple case: avoid any calculation, which could cause rounding issues. */ + return TileSize(1, 1, 1); + } + + const int64_t num_pixels = image_size.x * image_size.y; + const int64_t num_pixel_samples = num_pixels * num_samples; + + if (max_num_path_states >= num_pixel_samples) { + /* Image fully fits into the state (could be border render, for example). */ + return TileSize(image_size.x, image_size.y, num_samples); + } + + /* The idea here is to keep number of samples per tile as much as possible to improve coherency + * across threads. + * + * Some general ideas: + * - Prefer smaller tiles with more samples, which improves spatial coherency of paths. + * - Keep values a power of two, for more integer fit into the maximum number of paths. */ + + TileSize tile_size; + + /* Calculate tile size as if it is the most possible one to fit an entire range of samples. + * The idea here is to keep tiles as small as possible, and keep device occupied by scheduling + * multiple tiles with the same coordinates rendering different samples. */ + const int num_path_states_per_sample = max_num_path_states / num_samples; + if (num_path_states_per_sample != 0) { + tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample))); + tile_size.height = tile_size.width; + } + else { + tile_size.width = tile_size.height = 1; + } + + if (num_samples == 1) { + tile_size.num_samples = 1; + } + else { + /* Heuristic here is to have more uniform division of the sample range: for example prefer + * [32 <38 times>, 8] over [1024, 200]. This allows to greedily add more tiles early on. */ + tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))), + static_cast<uint>(num_samples)); + + const int tile_area = tile_size.width / tile_size.height; + tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area); + } + + DCHECK_GE(tile_size.width, 1); + DCHECK_GE(tile_size.height, 1); + DCHECK_GE(tile_size.num_samples, 1); + DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states); + + return tile_size; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h new file mode 100644 index 00000000000..d0824843ddb --- /dev/null +++ b/intern/cycles/integrator/tile.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+  TileSize() = default;
+
+  inline TileSize(int width, int height, int num_samples)
+      : width(width), height(height), num_samples(num_samples)
+  {
+  }
+
+  inline bool operator==(const TileSize &other) const
+  {
+    return width == other.width && height == other.height && num_samples == other.num_samples;
+  }
+  inline bool operator!=(const TileSize &other) const
+  {
+    return !(*this == other);
+  }
+
+  int width = 0, height = 0;
+  int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+/* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Will attempt to provide the best guess to keep path tracing threads of a device as localized
+ * as possible, and to have as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+                                  const int num_samples,
+                                  const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+  const int num_infos = work_balance_infos.size();
+
+  if (num_infos == 1) {
+    work_balance_infos[0].weight = 1.0;
+    return;
+  }
+
+  /* There are no statistics available, so start with an equal distribution. */
+  const double weight = 1.0 / num_infos;
+  for (WorkBalanceInfo &balance_info : work_balance_infos) {
+    balance_info.weight = weight;
+  }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+  double total_time = 0;
+  for (const WorkBalanceInfo &info : work_balance_infos) {
+    total_time += info.time_spent;
+  }
+  return total_time;
+}
+
+/* The balance is based on equalizing the time which devices spent performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how much of a difference in the work scheduling is needed.
+ */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+  const int num_infos = work_balance_infos.size();
+
+  const double total_time = calculate_total_time(work_balance_infos);
+  const double time_average = total_time / num_infos;
+
+  double total_weight = 0;
+  vector<double> new_weights;
+  new_weights.reserve(num_infos);
+
+  /* Equalize the overall average time. This means we do not size every work directly from the
+   * current average, but pick weights such that after the change the times will equalize.
+   * Think of it this way: if one of the devices is 10% faster than another, then one device
+   * needs to do 5% less of the current work, and the other needs to do 5% more. */
+  const double lerp_weight = 1.0 / num_infos;
+
+  bool has_big_difference = false;
+
+  for (const WorkBalanceInfo &info : work_balance_infos) {
+    const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+    const double new_weight = info.weight * time_target / info.time_spent;
+    new_weights.push_back(new_weight);
+    total_weight += new_weight;
+
+    if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+      has_big_difference = true;
+    }
+  }
+
+  if (!has_big_difference) {
+    return false;
+  }
+
+  const double total_weight_inv = 1.0 / total_weight;
+  for (int i = 0; i < num_infos; ++i) {
+    WorkBalanceInfo &info = work_balance_infos[i];
+    info.weight = new_weights[i] * total_weight_inv;
+    info.time_spent = 0;
+  }
+
+  return true;
+}
+
+CCL_NAMESPACE_END
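A worked example of the math above, with two devices that started at equal weights but finished their shares in 6 and 10 seconds:

vector<WorkBalanceInfo> infos(2);
infos[0].weight = 0.5;
infos[0].time_spent = 6.0;
infos[1].weight = 0.5;
infos[1].time_spent = 10.0;

work_balance_do_rebalance(infos); /* Returns true: the difference exceeds 2%. */

/* time_average = 8 and lerp_weight = 0.5, so the per-device targets are 7s
 * and 9s. Raw weights: 0.5 * 7/6 = 0.583 and 0.5 * 9/10 = 0.45; after
 * normalization the faster device gets ~0.565 of the work and the slower
 * one ~0.435, and both time_spent fields are reset to 0. */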
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..94e20ecf054
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+  /* Time spent performing the corresponding work. */
+  double time_spent = 0;
+
+  /* Average occupancy of the device while performing the work. */
+  float occupancy = 1.0f;
+
+  /* Normalized weight, which is ready to be used for work balancing (like calculating the
+   * fraction of the big tile which is to be rendered on the device). */
+  double weight = 1.0;
+};
+
+/* Balance work for an initial render iteration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+/* Rebalance work after statistics have been accumulated.
+ * Returns true if the balance did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..3fc99d5b74d
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+  max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+  /* Image buffer parameters. */
+  image_full_offset_px_.x = buffer_params.full_x;
+  image_full_offset_px_.y = buffer_params.full_y;
+
+  image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+  offset_ = buffer_params.offset;
+  stride_ = buffer_params.stride;
+
+  /* Samples parameters. */
+  sample_start_ = sample_start;
+  samples_num_ = samples_num;
+
+  /* Initialize new scheduling. */
+  reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+  tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+  VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+  if (VLOG_IS_ON(3)) {
+    /* The logging is based on multiple tiles scheduled, ignoring the overhead of multi-tile
+     * scheduling and purely focusing on the number of used path states. */
+    const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+                                        tile_size_.num_samples;
+    const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+    VLOG(3) << "Number of unused path states: "
+            << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+  }
+
+  num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+  num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+  total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+  num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+  next_work_index_ = 0;
+  total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
+}
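The `get_work()` function below turns a single atomic counter into a (tile, sample range) pair. Continuing the earlier 1920x1080 example with 128x128x8 tiles: num_tiles_x_ = 15, num_tiles_y_ = 9, num_tiles_per_sample_range_ = 64 / 8 = 8, so total_work_size_ = 135 * 8 = 1080, and for instance:

/* Decomposition of work_index = 100 by the arithmetic in get_work():
 *   sample_range_index = 100 % 8 = 4   ->  start_sample = 4 * 8 = 32
 *   tile_index         = 100 / 8 = 12  ->  tile_y = 12 / 15 = 0, tile_x = 12
 * So this work item renders the tile at pixel (1536, 0) with samples 32..39
 * (offset by sample_start_). */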
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+  /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+   * the path trace work can decide to use smaller tile sizes and greedily schedule multiple
+   * tiles, improving overall device occupancy.
+   * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a
+   * "scheduling limit". */
+
+  DCHECK_NE(max_num_path_states_, 0);
+
+  const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+  if (work_index >= total_work_size_) {
+    return false;
+  }
+
+  const int sample_range_index = work_index % num_tiles_per_sample_range_;
+  const int start_sample = sample_range_index * tile_size_.num_samples;
+  const int tile_index = work_index / num_tiles_per_sample_range_;
+  const int tile_y = tile_index / num_tiles_x_;
+  const int tile_x = tile_index - tile_y * num_tiles_x_;
+
+  KernelWorkTile work_tile;
+  work_tile.x = tile_x * tile_size_.width;
+  work_tile.y = tile_y * tile_size_.height;
+  work_tile.w = tile_size_.width;
+  work_tile.h = tile_size_.height;
+  work_tile.start_sample = sample_start_ + start_sample;
+  work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+  work_tile.offset = offset_;
+  work_tile.stride = stride_;
+
+  work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+  work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+  work_tile.x += image_full_offset_px_.x;
+  work_tile.y += image_full_offset_px_.y;
+
+  const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+  DCHECK_GT(tile_work_size, 0);
+
+  if (max_work_size && tile_work_size > max_work_size) {
+    /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+     * allowing others (or ourselves later on) to pick it up.
+     *
+     * TODO: Such a temporary decrement is not ideal, since it might lead to a situation when
+     * another device sees there is nothing to be done, finishing its work and leaving all the
+     * remaining work to be done by us. */
+    atomic_fetch_and_add_int32(&next_work_index_, -1);
+    return false;
+  }
+
+  *work_tile_ = work_tile;
+
+  return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..e4c8f701259
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+/* Scheduler of device work tiles.
+ * Takes care of feeding work which needs to be done to multiple devices running in parallel. */
+class WorkTileScheduler {
+ public:
+  WorkTileScheduler();
+
+  /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+   *
+   * Affects the scheduled work size: the work size will be as big as possible, but will not
+   * exceed this number of states. */
+  void set_max_num_path_states(int max_num_path_states);
+
+  /* Scheduling will happen for pixels within a big tile denoted by its parameters. */
+  void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+  /* Get work for a device.
+   * Returns true if there is still work to be done, and initializes the work tile with all
+   * parameters of this work. If there is nothing remaining to be done, returns false and the
+   * work tile is kept unchanged.
+   *
+   * Optionally pass max_work_size to do nothing if there is no tile small enough. */
+  bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
+
+ protected:
+  void reset_scheduler_state();
+
+  /* Maximum allowed path states to be used.
+   *
+   * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+   * number of path states is kind of a detail. Is there a more generic term from the scheduler
+   * point of view? */
+  int max_num_path_states_ = 0;
+
+  /* Offset in pixels within a global buffer. */
+  int2 image_full_offset_px_ = make_int2(0, 0);
+
+  /* Dimensions of the currently rendered image in pixels. */
+  int2 image_size_px_ = make_int2(0, 0);
+
+  /* Offset and stride of the buffer within which scheduling is happening.
+   * Will be passed over to the KernelWorkTile. */
+  int offset_, stride_;
+
+  /* Start sample index and number of samples which are to be rendered.
+   * The scheduler will cover the sample range of [start, start + num] over the entire image
+   * (splitting it into smaller work tiles). */
+  int sample_start_ = 0;
+  int samples_num_ = 0;
+
+  /* Tile size which will be scheduled for rendering. */
+  TileSize tile_size_;
+
+  /* Number of tiles in the X and Y axes of the image. */
+  int num_tiles_x_, num_tiles_y_;
+
+  /* Total number of tiles on the image.
+   * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in `get_work()`.
+   *
+   * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating the value in
+   * `get_work()` is unmeasurable? */
+  int total_tiles_num_ = 0;
+
+  /* In the case when the number of samples in `tile_size_` is lower than `samples_num_`, denotes
+   * how many tiles are to be "stacked" to cover the entire requested range of samples. */
+  int num_tiles_per_sample_range_ = 0;
+
+  int next_work_index_ = 0;
+  int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END
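Taken together, a device worker can simply drain this scheduler in a loop. A hedged sketch; only `get_work()` comes from this header, and the rendering call is a hypothetical stand-in:

/* get_work() claims indices with an atomic add, so several device threads
 * can safely pull from the same scheduler after a single reset(). */
void render_thread(WorkTileScheduler &scheduler)
{
  KernelWorkTile work_tile;
  while (scheduler.get_work(&work_tile)) {
    /* Render work_tile.w x work_tile.h pixels starting at (work_tile.x, work_tile.y),
     * samples [work_tile.start_sample, work_tile.start_sample + work_tile.num_samples). */
    render_tile_on_device(work_tile); /* Hypothetical. */
  }
}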