git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r--  intern/cycles/integrator/CMakeLists.txt  76
-rw-r--r--  intern/cycles/integrator/adaptive_sampling.cpp  71
-rw-r--r--  intern/cycles/integrator/adaptive_sampling.h  55
-rw-r--r--  intern/cycles/integrator/denoiser.cpp  204
-rw-r--r--  intern/cycles/integrator/denoiser.h  135
-rw-r--r--  intern/cycles/integrator/denoiser_device.cpp  106
-rw-r--r--  intern/cycles/integrator/denoiser_device.h  40
-rw-r--r--  intern/cycles/integrator/denoiser_oidn.cpp  628
-rw-r--r--  intern/cycles/integrator/denoiser_oidn.h  47
-rw-r--r--  intern/cycles/integrator/denoiser_optix.cpp  34
-rw-r--r--  intern/cycles/integrator/denoiser_optix.h  31
-rw-r--r--  intern/cycles/integrator/pass_accessor.cpp  318
-rw-r--r--  intern/cycles/integrator/pass_accessor.h  160
-rw-r--r--  intern/cycles/integrator/pass_accessor_cpu.cpp  183
-rw-r--r--  intern/cycles/integrator/pass_accessor_cpu.h  77
-rw-r--r--  intern/cycles/integrator/pass_accessor_gpu.cpp  118
-rw-r--r--  intern/cycles/integrator/pass_accessor_gpu.h  68
-rw-r--r--  intern/cycles/integrator/path_trace.cpp  1147
-rw-r--r--  intern/cycles/integrator/path_trace.h  324
-rw-r--r--  intern/cycles/integrator/path_trace_work.cpp  203
-rw-r--r--  intern/cycles/integrator/path_trace_work.h  194
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.cpp  281
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.h  82
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  933
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h  165
-rw-r--r--  intern/cycles/integrator/render_scheduler.cpp  1187
-rw-r--r--  intern/cycles/integrator/render_scheduler.h  466
-rw-r--r--  intern/cycles/integrator/shader_eval.cpp  173
-rw-r--r--  intern/cycles/integrator/shader_eval.h  61
-rw-r--r--  intern/cycles/integrator/tile.cpp  108
-rw-r--r--  intern/cycles/integrator/tile.h  56
-rw-r--r--  intern/cycles/integrator/work_balancer.cpp  99
-rw-r--r--  intern/cycles/integrator/work_balancer.h  42
-rw-r--r--  intern/cycles/integrator/work_tile_scheduler.cpp  138
-rw-r--r--  intern/cycles/integrator/work_tile_scheduler.h  98
35 files changed, 8108 insertions, 0 deletions
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
new file mode 100644
index 00000000000..bfabd35d7c3
--- /dev/null
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright 2011-2021 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(INC
+ ..
+)
+
+set(SRC
+ adaptive_sampling.cpp
+ denoiser.cpp
+ denoiser_device.cpp
+ denoiser_oidn.cpp
+ denoiser_optix.cpp
+ path_trace.cpp
+ tile.cpp
+ pass_accessor.cpp
+ pass_accessor_cpu.cpp
+ pass_accessor_gpu.cpp
+ path_trace_work.cpp
+ path_trace_work_cpu.cpp
+ path_trace_work_gpu.cpp
+ render_scheduler.cpp
+ shader_eval.cpp
+ work_balancer.cpp
+ work_tile_scheduler.cpp
+)
+
+set(SRC_HEADERS
+ adaptive_sampling.h
+ denoiser.h
+ denoiser_device.h
+ denoiser_oidn.h
+ denoiser_optix.h
+ path_trace.h
+ tile.h
+ pass_accessor.h
+ pass_accessor_cpu.h
+ pass_accessor_gpu.h
+ path_trace_work.h
+ path_trace_work_cpu.h
+ path_trace_work_gpu.h
+ render_scheduler.h
+ shader_eval.h
+ work_balancer.h
+ work_tile_scheduler.h
+)
+
+set(LIB
+ # NOTE: This is required for RenderBuffers access. Might consider moving files around a bit
+ # to avoid such a cyclic dependency.
+ cycles_render
+
+ cycles_util
+)
+
+if(WITH_OPENIMAGEDENOISE)
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ )
+endif()
+
+include_directories(${INC})
+include_directories(SYSTEM ${INC_SYS})
+
+cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp
new file mode 100644
index 00000000000..23fbcfea5c2
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/adaptive_sampling.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+AdaptiveSampling::AdaptiveSampling()
+{
+}
+
+int AdaptiveSampling::align_samples(int start_sample, int num_samples) const
+{
+ if (!use) {
+ return num_samples;
+ }
+
+ /*
+ * The naive implementation goes as follows:
+ *
+ * int count = 1;
+ * while (!need_filter(start_sample + count - 1) && count < num_samples) {
+ * ++count;
+ * }
+ * return count;
+ */
+
+ /* 0-based sample index at which first filtering will happen. */
+ const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);
+
+ /* Allow as many samples as possible until the first filter sample. */
+ if (start_sample + num_samples <= first_filter_sample) {
+ return num_samples;
+ }
+
+ const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1));
+
+ const int num_samples_until_filter = next_filter_sample - start_sample + 1;
+
+ return min(num_samples_until_filter, num_samples);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (!use) {
+ return false;
+ }
+
+ if (sample <= min_samples) {
+ return false;
+ }
+
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h
new file mode 100644
index 00000000000..d98edd9894c
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ /* Align number of samples so that they align with the adaptive filtering.
+ *
+ * Returns the new value for the `num_samples` so that after rendering so many samples on top
+ * of `start_sample` filtering is required.
+ *
+ * The alignment happens in a way that allows rendering as many samples as possible without
+ * missing any filtering point. This means that the result is "clamped" by the nearest sample
+ * at which filtering is needed. This is part of the mechanism which ensures that all devices
+ * will perform the exact same filtering and adaptive sampling, regardless of their performance.
+ *
+ * `start_sample` is the 0-based index of the sample.
+ *
+ * NOTE: The start sample is included in the number of samples to render. This means that
+ * if the number of samples is 1, then the path tracer will render samples [start_sample],
+ * if the number of samples is 2, then the path tracer will render samples [start_sample,
+ * start_sample + 1] and so on. */
+ int align_samples(int start_sample, int num_samples) const;
+
+ /* Check whether the adaptive sampling filter should happen at this sample.
+ * Returns false if adaptive sampling is not used.
+ *
+ * `sample` is the 0-based index of the sample. */
+ bool need_filter(int sample) const;
+
+ bool use = false;
+ int adaptive_step = 0;
+ int min_samples = 0;
+ float threshold = 0.0f;
+};
+
+CCL_NAMESPACE_END
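
The sample alignment above relies on `adaptive_step` being a power of two: OR-ing an index with `adaptive_step - 1` rounds it up to the next sample whose low bits are all ones, which is exactly where `need_filter()` returns true. Below is a minimal standalone sketch of the same arithmetic, with hypothetical parameter values (not Cycles code):

#include <algorithm>
#include <cstdio>

// Standalone illustration of the sample-alignment arithmetic used above.
// Assumes adaptive_step is a power of two, as the comments in the code imply.
struct AdaptiveParams {
  int adaptive_step = 4;
  int min_samples = 2;
};

static bool need_filter(const AdaptiveParams &p, int sample)
{
  return sample > p.min_samples && (sample & (p.adaptive_step - 1)) == (p.adaptive_step - 1);
}

static int align_samples(const AdaptiveParams &p, int start_sample, int num_samples)
{
  /* 0-based sample index at which the first filtering happens. */
  const int first_filter_sample = (p.min_samples + 1) | (p.adaptive_step - 1);
  if (start_sample + num_samples <= first_filter_sample) {
    return num_samples;
  }
  const int next_filter_sample = std::max(first_filter_sample,
                                          start_sample | (p.adaptive_step - 1));
  return std::min(next_filter_sample - start_sample + 1, num_samples);
}

int main()
{
  const AdaptiveParams p;
  /* With min_samples = 2 and adaptive_step = 4, filtering happens at samples 3, 7, 11, ...
   * Scheduling 16 samples starting at sample 0 is clamped to 4, so sample 3 is not skipped. */
  std::printf("align_samples(0, 16) = %d\n", align_samples(p, 0, 16)); /* prints 4 */
  std::printf("need_filter(3) = %d\n", need_filter(p, 3));             /* prints 1 */
  return 0;
}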
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
new file mode 100644
index 00000000000..598bbd497a5
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser.h"
+
+#include "device/device.h"
+#include "integrator/denoiser_oidn.h"
+#include "integrator/denoiser_optix.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params)
+{
+ DCHECK(params.use);
+
+ switch (params.type) {
+ case DENOISER_OPTIX:
+ return make_unique<OptiXDenoiser>(path_trace_device, params);
+
+ case DENOISER_OPENIMAGEDENOISE:
+ return make_unique<OIDNDenoiser>(path_trace_device, params);
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ /* pass */
+ break;
+ }
+
+ LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen.";
+
+ return nullptr;
+}
+
+Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
+ : path_trace_device_(path_trace_device), params_(params)
+{
+ DCHECK(params.use);
+}
+
+void Denoiser::set_params(const DenoiseParams &params)
+{
+ DCHECK_EQ(params.type, params_.type);
+
+ if (params.type == params_.type) {
+ params_ = params;
+ }
+ else {
+ LOG(ERROR) << "Attempt to change denoiser type.";
+ }
+}
+
+const DenoiseParams &Denoiser::get_params() const
+{
+ return params_;
+}
+
+bool Denoiser::load_kernels(Progress *progress)
+{
+ const Device *denoiser_device = ensure_denoiser_device(progress);
+
+ if (!denoiser_device) {
+ path_trace_device_->set_error("No device available to denoise on");
+ return false;
+ }
+
+ VLOG(3) << "Will denoise on " << denoiser_device->info.description << " ("
+ << denoiser_device->info.id << ")";
+
+ return true;
+}
+
+Device *Denoiser::get_denoiser_device() const
+{
+ return denoiser_device_;
+}
+
+/* Check whether the given device is single (not a MultiDevice) and supports the requested denoiser. */
+static bool is_single_supported_device(Device *device, DenoiserType type)
+{
+ if (device->info.type == DEVICE_MULTI) {
+ /* Assume multi-device is never created with a single sub-device.
+ * If one requests such configuration it should be checked on the session level. */
+ return false;
+ }
+
+ if (!device->info.multi_devices.empty()) {
+ /* Some configurations will use multi_devices, but keep the type of an individual device.
+ * This does simplify checks for homogeneous setups, but here we really need a single device. */
+ return false;
+ }
+
+ /* Check the denoiser type is supported. */
+ return (device->info.denoisers & type);
+}
+
+/* Find the best suitable device to perform denoising on. Will iterate over possible sub-devices
+ * of a multi-device.
+ *
+ * If there is no device available which supports the given denoiser type, nullptr is returned. */
+static Device *find_best_device(Device *device, DenoiserType type)
+{
+ Device *best_device = nullptr;
+
+ device->foreach_device([&](Device *sub_device) {
+ if ((sub_device->info.denoisers & type) == 0) {
+ return;
+ }
+ if (!best_device) {
+ best_device = sub_device;
+ }
+ else {
+ /* TODO(sergey): Choose fastest device from available ones. Taking into account performance
+ * of the device and data transfer cost. */
+ }
+ });
+
+ return best_device;
+}
+
+static unique_ptr<Device> create_denoiser_device(Device *path_trace_device,
+ const uint device_type_mask)
+{
+ const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask);
+ if (device_infos.empty()) {
+ return nullptr;
+ }
+
+ /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on
+ * a physical CUDA device which is already used for rendering. */
+
+ /* TODO(sergey): Choose fastest device for denoising. */
+
+ const DeviceInfo denoiser_device_info = device_infos.front();
+
+ unique_ptr<Device> denoiser_device(
+ Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler));
+
+ if (!denoiser_device) {
+ return nullptr;
+ }
+
+ if (denoiser_device->have_error()) {
+ return nullptr;
+ }
+
+ /* Only need denoising feature, everything else is unused. */
+ if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) {
+ return nullptr;
+ }
+
+ return denoiser_device;
+}
+
+Device *Denoiser::ensure_denoiser_device(Progress *progress)
+{
+ /* The best device has been found already, avoid sequential lookups.
+ * Additionally, avoid device re-creation if it has failed once. */
+ if (denoiser_device_ || device_creation_attempted_) {
+ return denoiser_device_;
+ }
+
+ /* Simple case: rendering happens on a single device which also supports denoiser. */
+ if (is_single_supported_device(path_trace_device_, params_.type)) {
+ denoiser_device_ = path_trace_device_;
+ return denoiser_device_;
+ }
+
+ /* Find best device from the ones which are already used for rendering. */
+ denoiser_device_ = find_best_device(path_trace_device_, params_.type);
+ if (denoiser_device_) {
+ return denoiser_device_;
+ }
+
+ if (progress) {
+ progress->set_status("Loading denoising kernels (may take a few minutes the first time)");
+ }
+
+ device_creation_attempted_ = true;
+
+ const uint device_type_mask = get_device_type_mask();
+ local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask);
+ denoiser_device_ = local_denoiser_device_.get();
+
+ return denoiser_device_;
+}
+
+CCL_NAMESPACE_END
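
The device resolution in `ensure_denoiser_device()` is lazy and memoizes failure: the device is looked up or created at most once, and a failed attempt is remembered so later denoise requests do not retry expensive device creation. A minimal standalone sketch of that pattern follows, using hypothetical `Device`/`create_device()` stand-ins rather than the Cycles device API:

#include <memory>

/* Hypothetical stand-ins for illustration only; not the Cycles Device API. */
struct Device {
  bool usable = true;
};

static std::unique_ptr<Device> create_device()
{
  /* In the real code this is where available devices are enumerated and kernels are loaded. */
  return std::make_unique<Device>();
}

class LazyDeviceHolder {
 public:
  Device *ensure_device()
  {
    /* Return the cached result, including a cached failure, to avoid repeated lookups. */
    if (device_ || creation_attempted_) {
      return device_;
    }

    creation_attempted_ = true;

    owned_device_ = create_device();
    if (owned_device_ && owned_device_->usable) {
      device_ = owned_device_.get();
    }
    return device_;
  }

 private:
  Device *device_ = nullptr;             /* Cached pointer (may alias owned_device_). */
  std::unique_ptr<Device> owned_device_; /* Device created locally, if any. */
  bool creation_attempted_ = false;      /* Remember failures so creation is not retried. */
};

int main()
{
  LazyDeviceHolder holder;
  Device *d1 = holder.ensure_device();
  Device *d2 = holder.ensure_device(); /* Second call reuses the cached result. */
  return (d1 == d2) ? 0 : 1;
}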
diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h
new file mode 100644
index 00000000000..3101b45e31b
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files
+ * around if a better place is figured out. */
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "util/util_function.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class RenderBuffers;
+class Progress;
+
+/* Implementation of a specific denoising algorithm.
+ *
+ * This class takes care of breaking down the denoising algorithm into a series of device calls
+ * or calls to an external API to denoise the given input.
+ *
+ * TODO(sergey): Are we better with device or a queue here? */
+class Denoiser {
+ public:
+ /* Create denoiser for the given path trace device.
+ *
+ * Notes:
+ * - The denoiser must be configured. This means that `params.use` must be true.
+ * This is checked in debug builds.
+ * - The device might be MultiDevice. */
+ static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual ~Denoiser() = default;
+
+ void set_params(const DenoiseParams &params);
+ const DenoiseParams &get_params() const;
+
+ /* Create devices and load kernels needed for denoising.
+ * The progress is used to communicate state when kernels actually need to be loaded.
+ *
+ * NOTE: The `progress` is an optional argument, can be nullptr. */
+ virtual bool load_kernels(Progress *progress);
+
+ /* Denoise the entire buffer.
+ *
+ * The buffer parameters denote the effective parameters used during rendering. They could be
+ * a lower resolution render into a bigger allocated buffer, which is used in the viewport
+ * during navigation with a non-unit pixel size. Use them instead of render_buffers->params.
+ *
+ * The buffer might be coming from a "foreign" device, different from the one this denoiser
+ * was created for. This means that in the general case the denoiser will make sure the input
+ * data is available on the denoiser device, perform denoising, and put the data back to the
+ * device where the buffer came from.
+ *
+ * The `num_samples` corresponds to the number of samples in the render buffers. It is used
+ * to scale buffers down to the "final" value in algorithms which don't do automatic exposure,
+ * or which need the "final" value for data passes.
+ *
+ * The `allow_inplace_modification` flag means that the denoiser is allowed to do in-place
+ * modification of the input passes (e.g. scaling them down). This will lower the memory
+ * footprint of the denoiser but will make the input passes "invalid" from the path tracer's
+ * point of view.
+ *
+ * Returns true when all passes are denoised. Will return false if there is a denoiser error
+ * (for example, caused by a misconfigured denoiser) or when the user requested to cancel rendering. */
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) = 0;
+
+ /* Get a device which is used to perform actual denoising.
+ *
+ * Notes:
+ *
+ * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then.
+ *
+ * - The device can be different from the path tracing device. This happens, for example, when
+ * using the OptiX denoiser and rendering on the CPU.
+ *
+ * - No thread safety is ensured in this call. This means that it is up to the caller to ensure
+ * that there is no threading conflict between the denoising task lazily initializing the device
+ * and access to this device. */
+ Device *get_denoiser_device() const;
+
+ function<bool(void)> is_cancelled_cb;
+
+ bool is_cancelled() const
+ {
+ if (!is_cancelled_cb) {
+ return false;
+ }
+ return is_cancelled_cb();
+ }
+
+ protected:
+ Denoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ /* Make sure denoising device is initialized. */
+ virtual Device *ensure_denoiser_device(Progress *progress);
+
+ /* Get the device type mask which is used to filter available devices when a new device needs
+ * to be created. */
+ virtual uint get_device_type_mask() const = 0;
+
+ Device *path_trace_device_;
+ DenoiseParams params_;
+
+ /* Cached pointer to the device on which denoising will happen.
+ * Used to avoid lookup of a device for every denoising request. */
+ Device *denoiser_device_ = nullptr;
+
+ /* Denoiser device which was created to perform denoising in the case none of the rendering
+ * devices is capable of denoising. */
+ unique_ptr<Device> local_denoiser_device_;
+ bool device_creation_attempted_ = false;
+};
+
+CCL_NAMESPACE_END
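
The `is_cancelled_cb` hook above is a plain callback that long-running denoising work polls between steps. A minimal standalone sketch of the same pattern with `std::function`, using illustrative names rather than the Cycles classes:

#include <atomic>
#include <functional>
#include <iostream>

/* Minimal stand-in for the cancellation hook: the owner installs a callback, and the worker
 * polls is_cancelled() between units of work. All names here are illustrative. */
class Worker {
 public:
  std::function<bool(void)> is_cancelled_cb;

  bool is_cancelled() const
  {
    return is_cancelled_cb ? is_cancelled_cb() : false;
  }

  bool run()
  {
    for (int step = 0; step < 100; ++step) {
      if (is_cancelled()) {
        return false; /* Abort between steps, mirroring the per-pass checks in the denoiser. */
      }
      /* ... do one unit of work ... */
    }
    return true;
  }
};

int main()
{
  std::atomic<bool> cancel_requested{false};

  Worker worker;
  worker.is_cancelled_cb = [&]() { return cancel_requested.load(); };

  cancel_requested = true; /* E.g. set from a UI thread. */
  std::cout << (worker.run() ? "finished" : "cancelled") << std::endl;
  return 0;
}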
diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp
new file mode 100644
index 00000000000..8088cfd7800
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_device.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+}
+
+DeviceDenoiser::~DeviceDenoiser()
+{
+ /* Explicit implementation, to allow forward declaration of Device in the header. */
+}
+
+bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ Device *denoiser_device = get_denoiser_device();
+ if (!denoiser_device) {
+ return false;
+ }
+
+ DeviceDenoiseTask task;
+ task.params = params_;
+ task.num_samples = num_samples;
+ task.buffer_params = buffer_params;
+ task.allow_inplace_modification = allow_inplace_modification;
+
+ RenderBuffers local_render_buffers(denoiser_device);
+ bool local_buffer_used = false;
+
+ if (denoiser_device == render_buffers->buffer.device) {
+ /* The device can access an existing buffer pointer. */
+ local_buffer_used = false;
+ task.render_buffers = render_buffers;
+ }
+ else {
+ VLOG(3) << "Creating temporary buffer on denoiser device.";
+
+ DeviceQueue *queue = denoiser_device->get_denoise_queue();
+
+ /* Create a buffer which is accessible by the device used by the denoiser. */
+
+ /* TODO(sergey): Optimize data transfers. For example, only copy denoising-related passes,
+ * ignoring other light and data passes. */
+
+ local_buffer_used = true;
+
+ render_buffers->copy_from_device();
+
+ local_render_buffers.reset(buffer_params);
+
+ /* NOTE: The local buffer is allocated for the exact effective render size, while the input
+ * render buffer is allocated for the lowest resolution divider possible. So it is important
+ * to only copy the actually needed part of the input buffer. */
+ memcpy(local_render_buffers.buffer.data(),
+ render_buffers->buffer.data(),
+ sizeof(float) * local_render_buffers.buffer.size());
+
+ queue->copy_to_device(local_render_buffers.buffer);
+
+ task.render_buffers = &local_render_buffers;
+ task.allow_inplace_modification = true;
+ }
+
+ const bool denoise_result = denoiser_device->denoise_buffer(task);
+
+ if (local_buffer_used) {
+ local_render_buffers.copy_from_device();
+
+ render_buffers_host_copy_denoised(
+ render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
+
+ render_buffers->copy_to_device();
+ }
+
+ return denoise_result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h
new file mode 100644
index 00000000000..0fd934dba79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser,
+ * which is implemented as part of the driver of a specific device.
+ *
+ * This implementation makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes the denoising kernel via the device API. */
+class DeviceDenoiser : public Denoiser {
+ public:
+ DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
+ ~DeviceDenoiser();
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
new file mode 100644
index 00000000000..1b5a012ec87
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_oidn.h"
+
+#include <array>
+
+#include "device/device.h"
+#include "device/device_queue.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "render/buffers.h"
+#include "util/util_array.h"
+#include "util/util_logging.h"
+#include "util/util_openimagedenoise.h"
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+thread_mutex OIDNDenoiser::mutex_;
+
+OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+ DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE);
+
+ DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform.";
+}
+
+#ifdef WITH_OPENIMAGEDENOISE
+static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/)
+{
+ OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr);
+ return !oidn_denoiser->is_cancelled();
+}
+#endif
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+class OIDNPass {
+ public:
+ OIDNPass() = default;
+
+ OIDNPass(const BufferParams &buffer_params,
+ const char *name,
+ PassType type,
+ PassMode mode = PassMode::NOISY)
+ : name(name), type(type), mode(mode)
+ {
+ offset = buffer_params.get_pass_offset(type, mode);
+ need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ inline operator bool() const
+ {
+ return name[0] != '\0';
+ }
+
+ /* Name of an image which will be passed to the OIDN library.
+ * Should be one of the following: color, albedo, normal, output.
+ * The albedo and normal images are optional. */
+ const char *name = "";
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ int num_components = -1;
+ bool use_compositing = false;
+ bool use_denoising_albedo = true;
+
+ /* Offset of beginning of this pass in the render buffers. */
+ int offset = -1;
+
+ /* Denotes whether the data is to be scaled down with the number of samples.
+ * This is required for albedo and normal passes. For the color pass OIDN will perform
+ * auto-exposure, so scaling is not needed unless adaptive sampling is used.
+ *
+ * NOTE: Do not scale the output pass, as it is required to be a pointer into the original
+ * buffer. All the scaling on the output needed for integration with adaptive sampling will
+ * happen outside of generic pass handling. */
+ bool need_scale = false;
+
+ /* The content of the pass has been pre-filtered. */
+ bool is_filtered = false;
+
+ /* For the scaled passes, the data which holds values of scaled pixels. */
+ array<float> scaled_buffer;
+};
+
+class OIDNDenoiseContext {
+ public:
+ OIDNDenoiseContext(OIDNDenoiser *denoiser,
+ const DenoiseParams &denoise_params,
+ const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ const bool allow_inplace_modification)
+ : denoiser_(denoiser),
+ denoise_params_(denoise_params),
+ buffer_params_(buffer_params),
+ render_buffers_(render_buffers),
+ num_samples_(num_samples),
+ allow_inplace_modification_(allow_inplace_modification),
+ pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT))
+ {
+ if (denoise_params_.use_pass_albedo) {
+ oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO);
+ }
+
+ if (denoise_params_.use_pass_normal) {
+ oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL);
+ }
+ }
+
+ bool need_denoising() const
+ {
+ if (buffer_params_.width == 0 && buffer_params_.height == 0) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /* Make the guiding passes available for sequential denoising of various passes. */
+ void read_guiding_passes()
+ {
+ read_guiding_pass(oidn_albedo_pass_);
+ read_guiding_pass(oidn_normal_pass_);
+ }
+
+ void denoise_pass(const PassType pass_type)
+ {
+ OIDNPass oidn_color_pass(buffer_params_, "color", pass_type);
+ if (oidn_color_pass.offset == PASS_UNUSED) {
+ return;
+ }
+
+ if (oidn_color_pass.use_denoising_albedo) {
+ if (albedo_replaced_with_fake_) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+
+ OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED);
+ if (oidn_output_pass.offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass);
+
+ oidn::DeviceRef oidn_device = oidn::newDevice();
+ oidn_device.commit();
+
+ /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too.
+ */
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_input_pass(oidn_filter, oidn_color_access_pass);
+ set_guiding_passes(oidn_filter, oidn_color_pass);
+ set_output_pass(oidn_filter, oidn_output_pass);
+ oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_);
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE ||
+ denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) {
+ oidn_filter.set("cleanAux", true);
+ }
+ oidn_filter.commit();
+
+ filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_);
+ filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_);
+
+ /* Filter the beauty image. */
+ oidn_filter.execute();
+
+ /* Check for errors. */
+ const char *error_message;
+ const oidn::Error error = oidn_device.getError(error_message);
+ if (error != oidn::Error::None && error != oidn::Error::Cancelled) {
+ LOG(ERROR) << "OpenImageDenoise error: " << error_message;
+ }
+
+ postprocess_output(oidn_color_pass, oidn_output_pass);
+ }
+
+ protected:
+ void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass)
+ {
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass ||
+ oidn_pass.is_filtered) {
+ return;
+ }
+
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_pass(oidn_filter, oidn_pass);
+ set_output_pass(oidn_filter, oidn_pass);
+ oidn_filter.commit();
+ oidn_filter.execute();
+
+ oidn_pass.is_filtered = true;
+ }
+
+ /* Make pixels of a guiding pass available to the denoiser. */
+ void read_guiding_pass(OIDNPass &oidn_pass)
+ {
+ if (!oidn_pass) {
+ return;
+ }
+
+ DCHECK(!oidn_pass.use_compositing);
+
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE &&
+ !is_pass_scale_needed(oidn_pass)) {
+ /* Pass data is available as-is from the render buffers. */
+ return;
+ }
+
+ if (allow_inplace_modification_) {
+ scale_pass_in_render_buffers(oidn_pass);
+ return;
+ }
+
+ read_pass_pixels_into_buffer(oidn_pass);
+ }
+
+ /* Special reader of the input pass.
+ * To save memory it will read pixels into the output, and let the denoiser perform an
+ * in-place operation. */
+ OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ const bool use_compositing = oidn_input_pass.use_compositing;
+
+ /* Simple case: no compositing is involved, no scaling is needed.
+ * The pass pixels will be referenced as-is, without extra processing. */
+ if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) {
+ return oidn_input_pass;
+ }
+
+ float *buffer_data = render_buffers_->buffer.data();
+ float *pass_data = buffer_data + oidn_output_pass.offset;
+
+ PassAccessor::Destination destination(pass_data, 3);
+ destination.pixel_stride = buffer_params_.pass_stride;
+
+ read_pass_pixels(oidn_input_pass, destination);
+
+ OIDNPass oidn_input_pass_at_output = oidn_input_pass;
+ oidn_input_pass_at_output.offset = oidn_output_pass.offset;
+
+ return oidn_input_pass_at_output;
+ }
+
+ /* Read pass pixels using PassAccessor into the given destination. */
+ void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination)
+ {
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = oidn_pass.type;
+ pass_access_info.mode = oidn_pass.mode;
+ pass_access_info.offset = oidn_pass.offset;
+
+ /* Denoiser operates on passes which are used to calculate the approximation, and is never used
+ * on the approximation. The latter is not even possible because OIDN does not support
+ * denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* OIDN will perform auto-exposure, so it is not required to know the exact exposure
+ * configured by the user. What is important is to use the same exposure for read and write
+ * access of the pass pixels. */
+ const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_);
+
+ pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination);
+ }
+
+ /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. */
+ void read_pass_pixels_into_buffer(OIDNPass &oidn_pass)
+ {
+ VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " ("
+ << pass_type_as_string(oidn_pass.type) << ")";
+
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ array<float> &scaled_buffer = oidn_pass.scaled_buffer;
+ scaled_buffer.resize(width * height * 3);
+
+ const PassAccessor::Destination destination(scaled_buffer.data(), 3);
+
+ read_pass_pixels(oidn_pass, destination);
+ }
+
+ /* Set OIDN image to reference pixels from the given render buffer pass.
+ * No transform to the pixels is done, no additional memory is used. */
+ void set_pass_referenced(oidn::FilterRef &oidn_filter,
+ const char *name,
+ const OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+
+ const int64_t pixel_index = offset + x + y * stride;
+ const int64_t buffer_offset = pixel_index * pass_stride;
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ oidn_filter.setImage(name,
+ buffer_data + buffer_offset + oidn_pass.offset,
+ oidn::Format::Float3,
+ width,
+ height,
+ 0,
+ pass_stride * sizeof(float),
+ stride * pass_stride * sizeof(float));
+ }
+
+ void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ oidn_filter.setImage(
+ name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0);
+ }
+
+ void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+ void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ if (oidn_pass.scaled_buffer.empty()) {
+ set_pass_referenced(oidn_filter, name, oidn_pass);
+ }
+ else {
+ set_pass_from_buffer(oidn_filter, name, oidn_pass);
+ }
+ }
+
+ void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+
+ void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ if (oidn_albedo_pass_) {
+ if (oidn_pass.use_denoising_albedo) {
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+ else {
+ /* NOTE: The OpenImageDenoise library implicitly expects an albedo pass when a normal pass
+ * has been provided. */
+ set_fake_albedo_pass(oidn_filter);
+ }
+ }
+
+ if (oidn_normal_pass_) {
+ set_pass(oidn_filter, oidn_normal_pass_);
+ }
+ }
+
+ void set_fake_albedo_pass(oidn::FilterRef &oidn_filter)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ if (!albedo_replaced_with_fake_) {
+ const int64_t num_pixel_components = width * height * 3;
+ oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components);
+
+ for (int i = 0; i < num_pixel_components; ++i) {
+ oidn_albedo_pass_.scaled_buffer[i] = 0.5f;
+ }
+
+ albedo_replaced_with_fake_ = true;
+ }
+
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+
+ void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, "output", oidn_pass);
+ }
+
+ /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel
+ * back. */
+ void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components);
+
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+ const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing;
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *denoised_pixel = buffer_pixel + oidn_output_pass.offset;
+
+ if (need_scale) {
+ const float pixel_scale = has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_;
+
+ denoised_pixel[0] = denoised_pixel[0] * pixel_scale;
+ denoised_pixel[1] = denoised_pixel[1] * pixel_scale;
+ denoised_pixel[2] = denoised_pixel[2] * pixel_scale;
+ }
+
+ if (oidn_output_pass.num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!oidn_input_pass.use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). We implicitly rely on this, as it simplifies
+ * the logic and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assigning to zero since this is a default alpha value for 3-component passes, and it
+ * is an opaque pixel for 4 component passes. */
+ denoised_pixel[3] = 0;
+ }
+ }
+ }
+ }
+
+ bool is_pass_scale_needed(OIDNPass &oidn_pass) const
+ {
+ if (pass_sample_count_ != PASS_UNUSED) {
+ /* With adaptive sampling pixels will have a different number of samples in them, so we always
+ * need to scale the pass to make pixels uniformly sampled. */
+ return true;
+ }
+
+ if (!oidn_pass.need_scale) {
+ return false;
+ }
+
+ if (num_samples_ == 1) {
+ /* Avoid scaling if there is only one sample, to save time (so we don't divide the buffer
+ * by 1). */
+ return false;
+ }
+
+ return true;
+ }
+
+ void scale_pass_in_render_buffers(OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *pass_pixel = buffer_pixel + oidn_pass.offset;
+
+ const float pixel_scale = 1.0f / (has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_);
+
+ pass_pixel[0] = pass_pixel[0] * pixel_scale;
+ pass_pixel[1] = pass_pixel[1] * pixel_scale;
+ pass_pixel[2] = pass_pixel[2] * pixel_scale;
+ }
+ }
+ }
+
+ OIDNDenoiser *denoiser_ = nullptr;
+
+ const DenoiseParams &denoise_params_;
+ const BufferParams &buffer_params_;
+ RenderBuffers *render_buffers_ = nullptr;
+ int num_samples_ = 0;
+ bool allow_inplace_modification_ = false;
+ int pass_sample_count_ = PASS_UNUSED;
+
+ /* Optional albedo and normal passes, reused by denoising of different pass types. */
+ OIDNPass oidn_albedo_pass_;
+ OIDNPass oidn_normal_pass_;
+
+ /* For passes which don't need the albedo channel for denoising we replace the actual albedo
+ * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+ * the fake values, and denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake_ = false;
+};
+#endif
+
+static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers)
+{
+ Device *device = render_buffers->buffer.device;
+ if (device->info.has_gpu_queue) {
+ return device->gpu_queue_create();
+ }
+ return nullptr;
+}
+
+static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_from_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_from_device();
+ }
+}
+
+static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_to_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_to_device();
+ }
+}
+
+bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ thread_scoped_lock lock(mutex_);
+
+ /* Make sure the host-side data is available for denoising. */
+ unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers);
+ copy_render_buffers_from_device(queue, render_buffers);
+
+#ifdef WITH_OPENIMAGEDENOISE
+ OIDNDenoiseContext context(
+ this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification);
+
+ if (context.need_denoising()) {
+ context.read_guiding_passes();
+
+ const std::array<PassType, 3> passes = {
+ {/* Passes which will use real albedo when it is available. */
+ PASS_COMBINED,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ /* Passes which do not need albedo and hence, if the real one is present, it needs to
+ * become fake. */
+ PASS_SHADOW_CATCHER}};
+
+ for (const PassType pass_type : passes) {
+ context.denoise_pass(pass_type);
+ if (is_cancelled()) {
+ return false;
+ }
+ }
+
+ /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code
+ * copies data from the device it doesn't overwrite the denoiser buffers. */
+ copy_render_buffers_to_device(queue, render_buffers);
+ }
+#endif
+
+ /* This code is not supposed to run when compiled without OIDN support, so we can assume that
+ * if we made it here all passes are properly denoised. */
+ return true;
+}
+
+uint OIDNDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_CPU;
+}
+
+CCL_NAMESPACE_END
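
For reference, the OpenImageDenoise calls issued by `OIDNDenoiseContext` boil down to a standard `RT` filter setup. A minimal self-contained sketch against the public OIDN C++ API, using plain heap buffers instead of Cycles render buffers (the image size and the hdr/srgb settings are illustrative assumptions):

#include <OpenImageDenoise/oidn.hpp>
#include <iostream>
#include <vector>

int main()
{
  const int width = 640, height = 480;

  /* Interleaved RGB buffers; in Cycles these live inside the render buffer instead. */
  std::vector<float> color(width * height * 3, 0.5f);
  std::vector<float> albedo(width * height * 3, 0.5f);
  std::vector<float> normal(width * height * 3, 0.0f);
  std::vector<float> output(width * height * 3, 0.0f);

  oidn::DeviceRef device = oidn::newDevice();
  device.commit();

  /* Generic ray tracing denoising filter, as used for the combined pass above. */
  oidn::FilterRef filter = device.newFilter("RT");
  filter.setImage("color", color.data(), oidn::Format::Float3, width, height);
  filter.setImage("albedo", albedo.data(), oidn::Format::Float3, width, height);
  filter.setImage("normal", normal.data(), oidn::Format::Float3, width, height);
  filter.setImage("output", output.data(), oidn::Format::Float3, width, height);
  filter.set("hdr", true); /* The beauty pass is HDR. */
  filter.set("srgb", false);
  filter.commit();

  filter.execute();

  const char *error_message;
  if (device.getError(error_message) != oidn::Error::None) {
    std::cerr << "OpenImageDenoise error: " << error_message << std::endl;
    return 1;
  }
  return 0;
}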
diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h
new file mode 100644
index 00000000000..566e761ae79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of denoising API which uses OpenImageDenoise library. */
+class OIDNDenoiser : public Denoiser {
+ public:
+ /* Forward-declared state which might use compile-flag-specific fields, such as
+ * OpenImageDenoise device and filter handles. */
+ class State;
+
+ OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+
+ /* We only perform one denoising operation at a time, since OpenImageDenoise itself is
+ * multithreaded. Use this mutex whenever images are passed to OIDN and need to be denoised. */
+ static thread_mutex mutex_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_optix.cpp b/intern/cycles/integrator/denoiser_optix.cpp
new file mode 100644
index 00000000000..5f9de23bfe6
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_optix.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : DeviceDenoiser(path_trace_device, params)
+{
+}
+
+uint OptiXDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_OPTIX;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_optix.h b/intern/cycles/integrator/denoiser_optix.h
new file mode 100644
index 00000000000..a8df770ecf7
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser_device.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDenoiser : public DeviceDenoiser {
+ public:
+ OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
new file mode 100644
index 00000000000..87c048b1fa5
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/kernel_types.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Pass input information.
+ */
+
+PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass)
+ : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass destination.
+ */
+
+PassAccessor::Destination::Destination(float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
+ : Destination(pass_type)
+{
+ pixels_half_rgba = pixels;
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type)
+{
+ const PassInfo pass_info = Pass::get_info(pass_type);
+ num_components = pass_info.num_components;
+}
+
+/* --------------------------------------------------------------------
+ * Pass source.
+ */
+
+PassAccessor::Source::Source(const float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessor.
+ */
+
+PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples)
+ : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples)
+{
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ return get_render_tile_pixels(render_buffers, render_buffers->params, destination);
+}
+
+static void pad_pixels(const BufferParams &buffer_params,
+ const PassAccessor::Destination &destination,
+ const int src_num_components)
+{
+ /* When requesting a single-channel pass as RGBA, or an RGB pass as RGBA,
+ * fill in the additional components for convenience. */
+ const int dest_num_components = destination.num_components;
+
+ if (src_num_components >= dest_num_components) {
+ return;
+ }
+
+ const size_t size = buffer_params.width * buffer_params.height;
+ if (destination.pixels) {
+ float *pixel = destination.pixels;
+
+ for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[1] = pixel[0];
+ pixel[2] = pixel[0];
+ }
+ if (dest_num_components >= 4) {
+ pixel[3] = 1.0f;
+ }
+ }
+ }
+
+ if (destination.pixels_half_rgba) {
+ const half one = float_to_half(1.0f);
+ half4 *pixel = destination.pixels_half_rgba;
+
+ for (size_t i = 0; i < size; i++, pixel++) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[0].y = pixel[0].x;
+ pixel[0].z = pixel[0].x;
+ }
+ if (dest_num_components >= 4) {
+ pixel[0].w = one;
+ }
+ }
+ }
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ if (pass_access_info_.offset == PASS_UNUSED) {
+ return false;
+ }
+
+ const PassType type = pass_access_info_.type;
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
+
+ if (pass_info.num_components == 1) {
+ /* Single channel passes. */
+ if (mode == PassMode::DENOISED) {
+ /* Denoised passes store their final pixels, no need for a special calculation. */
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_RENDER_TIME) {
+ /* TODO(sergey): Needs implementation. */
+ }
+ else if (type == PASS_DEPTH) {
+ get_pass_depth(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_MIST) {
+ get_pass_mist(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SAMPLE_COUNT) {
+ get_pass_sample_count(render_buffers, buffer_params, destination);
+ }
+ else {
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ }
+ else if (type == PASS_MOTION) {
+ /* Motion pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components";
+ get_pass_motion(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_CRYPTOMATTE) {
+ /* Cryptomatte pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components";
+ get_pass_cryptomatte(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* RGB, RGBA and vector passes. */
+ DCHECK(destination.num_components == 3 || destination.num_components == 4)
+ << pass_type_as_string(type) << " pass must have 3 or 4 components";
+
+ if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) {
+ /* Denoised matte with shadow needs extra calculation (it will use the denoised shadow
+ * catcher pass to approximate the shadow). */
+ get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) {
+ /* Shadow catcher pass. */
+ get_pass_shadow_catcher(render_buffers, buffer_params, destination);
+ }
+ else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE ||
+ pass_info.indirect_type != PASS_NONE) &&
+ mode != PassMode::DENOISED) {
+ /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */
+ get_pass_light_path(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes that need no special computation, or denoised passes that already
+ * had the computation done. */
+ if (pass_info.num_components == 3) {
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (pass_info.num_components == 4) {
+ if (destination.num_components == 3) {
+ /* Special case for denoiser access of RGBA passes ignoring alpha channel. */
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER ||
+ type == PASS_SHADOW_CATCHER_MATTE) {
+ /* Passes with transparency as 4th component. */
+ get_pass_combined(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes with alpha as 4th component. */
+ get_pass_float4(render_buffers, buffer_params, destination);
+ }
+ }
+ }
+ }
+
+ pad_pixels(buffer_params, destination, pass_info.num_components);
+
+ return true;
+}
+
+void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ kfilm_convert->pass_offset = pass_access_info_.offset;
+ kfilm_convert->pass_stride = buffer_params.pass_stride;
+
+ kfilm_convert->pass_use_exposure = pass_info.use_exposure;
+ kfilm_convert->pass_use_filter = pass_info.use_filter;
+
+  /* TODO(sergey): Some of the passes need to become denoised when the denoised pass is accessed. */
+ if (pass_info.direct_type != PASS_NONE) {
+ kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type);
+ }
+ kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type);
+ kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type);
+
+ kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED);
+ kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset(
+ PASS_ADAPTIVE_AUX_BUFFER);
+ kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT);
+ kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode);
+ kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_MATTE, mode);
+
+ /* Background is not denoised, so always use noisy pass. */
+ kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);
+
+ if (pass_info.use_filter) {
+ kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+ }
+ else {
+ kfilm_convert->scale = 1.0f;
+ }
+
+ if (pass_info.use_exposure) {
+ kfilm_convert->exposure = exposure_;
+ }
+ else {
+ kfilm_convert->exposure = 1.0f;
+ }
+
+ kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure;
+
+ kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher;
+ kfilm_convert->use_approximate_shadow_catcher_background =
+ pass_access_info_.use_approximate_shadow_catcher_background;
+ kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels;
+
+ kfilm_convert->num_components = destination.num_components;
+ kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ kfilm_convert->is_denoised = (mode == PassMode::DENOISED);
+}
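As a concrete illustration of the scale and exposure setup above (a sketch with hypothetical values only): a filtered, exposure-affected pass accumulated over 64 samples at exposure 2.0 would result in the following kernel parameters.

const int num_samples = 64;                    /* Hypothetical sample count. */
const float exposure = 2.0f;                   /* Hypothetical film exposure. */
const float scale = 1.0f / num_samples;        /* 0.015625 */
const float scale_exposure = scale * exposure; /* 0.03125 */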
+
+bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source)
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ const BufferParams &buffer_params = render_buffers->params;
+
+ float *buffer_data = render_buffers->buffer.data();
+ const int size = buffer_params.width * buffer_params.height;
+
+ const int out_stride = buffer_params.pass_stride;
+ const int in_stride = source.num_components;
+ const int num_components_to_copy = min(source.num_components, pass_info.num_components);
+
+ float *out = buffer_data + pass_access_info_.offset;
+ const float *in = source.pixels + source.offset * in_stride;
+
+ for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
+ memcpy(out, in, sizeof(float) * num_components_to_copy);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
new file mode 100644
index 00000000000..624bf7d0b2c
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which provides access to pass data.
+ * It is designed to be created once, when the pass data is known, and then pixels get
+ * progressively updated from various render buffers. */
+class PassAccessor {
+ public:
+ class PassAccessInfo {
+ public:
+ PassAccessInfo() = default;
+ explicit PassAccessInfo(const BufferPass &pass);
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ bool include_albedo = false;
+ int offset = -1;
+
+ /* For the shadow catcher matte pass: whether to approximate shadow catcher pass into its
+ * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop.
+ */
+ bool use_approximate_shadow_catcher = false;
+
+    /* When the approximate shadow catcher matte is used, alpha-over the result on top of the background. */
+ bool use_approximate_shadow_catcher_background = false;
+
+ bool show_active_pixels = false;
+ };
+
+ class Destination {
+ public:
+ Destination() = default;
+ Destination(float *pixels, int num_components);
+ Destination(const PassType pass_type, half4 *pixels);
+
+ /* Destination will be initialized with the number of components which is native for the given
+ * pass type. */
+ explicit Destination(const PassType pass_type);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ float *pixels = nullptr;
+ half4 *pixels_half_rgba = nullptr;
+
+ /* Device-side pointers. */
+ device_ptr d_pixels = 0;
+ device_ptr d_pixels_half_rgba = 0;
+
+ /* Number of components per pixel in the floating-point destination.
+     * Ignored for the half4 destination (where the number of components is implied to be 4). */
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of pixels storage.
+     * Allows writing pixels of the render buffer into a partial slice of the destination. */
+ int offset = 0;
+
+    /* Number of floats per pixel. When zero, it is the same as `num_components`.
+ *
+ * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component
+ * half-floats. */
+ int pixel_stride = 0;
+
+ /* Row stride in pixel elements:
+ * - For the float destination stride is a number of floats per row.
+ * - For the half4 destination stride is a number of half4 per row. */
+ int stride = 0;
+ };
+
+ class Source {
+ public:
+ Source() = default;
+ Source(const float *pixels, int num_components);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ const float *pixels = nullptr;
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of pixels storage.
+     * Allows reading pixels from a partial slice of the source storage. */
+ int offset = 0;
+ };
+
+ PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples);
+
+ virtual ~PassAccessor() = default;
+
+ /* Get pass data from the given render buffers, perform needed filtering, and store result into
+ * the pixels.
+ * The result is stored sequentially starting from the very beginning of the pixels memory. */
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const;
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+ /* Set pass data for the given render buffers. Used for baking to read from passes. */
+ bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source);
+
+ protected:
+ virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const = 0;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+
+ PassAccessInfo pass_access_info_;
+
+ float exposure_ = 0.0f;
+ int num_samples_ = 0;
+};
+
+CCL_NAMESPACE_END
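To make the roles of `PassAccessInfo` and `Destination` concrete, here is a minimal sketch of reading the noisy combined pass into a packed float RGBA buffer through the CPU implementation (`PassAccessorCPU`, declared further down in this patch). `render_buffers` and `buffer_params` are assumed to exist elsewhere, and the pass offset is a hypothetical value that would normally come from the buffer parameters.

PassAccessor::PassAccessInfo access_info;
access_info.type = PASS_COMBINED;
access_info.mode = PassMode::NOISY;
access_info.offset = 0; /* Hypothetical pass offset. */

const PassAccessorCPU pass_accessor(access_info, 1.0f /* exposure */, 128 /* num_samples */);

vector<float> pixels(buffer_params.width * buffer_params.height * 4);
PassAccessor::Destination destination(pixels.data(), 4 /* num_components */);

pass_accessor.get_render_tile_pixels(render_buffers, buffer_params, destination);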
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
new file mode 100644
index 00000000000..3c6691f6d43
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_film.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Kernel processing.
+ */
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ if (destination.pixels) {
+ /* NOTE: No overlays are applied since they are not used for final renders.
+ * Can be supported via some sort of specialization to avoid code duplication. */
+
+ run_get_pass_kernel_processor_float(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+
+ if (destination.pixels_half_rgba) {
+ /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
+
+ if (destination.num_components == 1) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float pixel;
+ processor(kfilm_convert, buffer, &pixel);
+
+ pixel_rgba[0] = pixel;
+ pixel_rgba[1] = pixel;
+ pixel_rgba[2] = pixel;
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 3) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 4) {
+ run_get_pass_kernel_processor_half_rgba(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+ }
+}
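The `Processor` argument above is any callable with the `(kfilm_convert, buffer, pixel)` shape of the `film_get_pass_pixel_*` kernels. As a hedged sketch, a custom processor which simply copies the first channel of the accessed pass could be supplied like this (the indexing through `pass_offset` mirrors what the real kernels are assumed to do):

const auto copy_first_channel = [](const KernelFilmConvert *kfilm_convert,
                                   const float *buffer,
                                   float *pixel) {
  /* `buffer` points at the first pass element of the pixel; select this accessor's pass. */
  pixel[0] = buffer[kfilm_convert->pass_offset];
};

/* Usage sketch, from inside a PassAccessorCPU method:
 * run_get_pass_kernel_processor(render_buffers, buffer_params, destination, copy_first_channel);
 */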
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ const float *buffer_data = render_buffers->buffer.data();
+ const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+ float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+ }
+ });
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ const float *buffer_data = render_buffers->buffer.data();
+
+ half4 *dst_start = destination.pixels_half_rgba + destination.offset;
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ half4 *dst_row_start = dst_start + y * destination_stride;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ half4 *pixel_half_rgba = dst_row_start + x;
+ float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+ }
+ });
+}
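A worked example of the destination indexing above, with illustrative numbers: writing a big-tile slice that starts at row 100 of a full-width, 1920-pixel-wide half4 display buffer (`display_rgba_half` is a hypothetical `half4 *`).

PassAccessor::Destination destination(PASS_COMBINED, display_rgba_half);
destination.offset = 100 * 1920; /* Skip the first 100 rows of the full buffer. */
destination.stride = 1920;       /* Row stride of the full buffer, in half4 elements. */
/* Row y of the slice is then written at display_rgba_half + destination.offset + y * 1920. */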
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass) \
+ void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_get_pass_kernel_processor( \
+ render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth)
+DEFINE_PASS_ACCESSOR(mist)
+DEFINE_PASS_ACCESSOR(sample_count)
+DEFINE_PASS_ACCESSOR(float)
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path)
+DEFINE_PASS_ACCESSOR(shadow_catcher)
+DEFINE_PASS_ACCESSOR(float3)
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion)
+DEFINE_PASS_ACCESSOR(cryptomatte)
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+DEFINE_PASS_ACCESSOR(combined)
+DEFINE_PASS_ACCESSOR(float4)
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
new file mode 100644
index 00000000000..0313dc5bb0d
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelFilmConvert;
+
+/* Pass accessor implementation for CPU side. */
+class PassAccessorCPU : public PassAccessor {
+ public:
+ using PassAccessor::PassAccessor;
+
+ protected:
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
new file mode 100644
index 00000000000..eb80ba99655
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_gpu.h"
+
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples)
+ : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue)
+
+{
+}
+
+/* --------------------------------------------------------------------
+ * Kernel execution.
+ */
+
+void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ if (destination.d_pixels) {
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+ if (destination.d_pixels_half_rgba) {
+ const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel_half_float, work_size, args);
+ }
+
+ queue_->synchronize();
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \
+ void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_film_convert_kernels( \
+ DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth, DEPTH);
+DEFINE_PASS_ACCESSOR(mist, MIST);
+DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT);
+DEFINE_PASS_ACCESSOR(float, FLOAT);
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
+DEFINE_PASS_ACCESSOR(float3, FLOAT3);
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion, MOTION);
+DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE);
+DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER);
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW);
+DEFINE_PASS_ACCESSOR(combined, COMBINED);
+DEFINE_PASS_ACCESSOR(float4, FLOAT4);
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
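The `kernel + 1` step in `run_film_convert_kernels` relies on the half-RGBA variant of each film convert kernel being declared directly after its float variant in the `DeviceKernel` enum. A hypothetical compile-time check of that pairing is sketched below; the enumerator names are assumptions about `kernel_types.h` and are not verified here.

static_assert(DEVICE_KERNEL_FILM_CONVERT_COMBINED_HALF_RGBA ==
                  DEVICE_KERNEL_FILM_CONVERT_COMBINED + 1,
              "Half-RGBA film convert kernel is expected to follow the float variant");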
diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h
new file mode 100644
index 00000000000..bc37e4387f3
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+/* Pass accessor implementation for GPU side. */
+class PassAccessorGPU : public PassAccessor {
+ public:
+ PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples);
+
+ protected:
+ void run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth);
+ DECLARE_PASS_ACCESSOR(mist);
+ DECLARE_PASS_ACCESSOR(sample_count);
+ DECLARE_PASS_ACCESSOR(float);
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path);
+ DECLARE_PASS_ACCESSOR(float3);
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion);
+ DECLARE_PASS_ACCESSOR(cryptomatte);
+ DECLARE_PASS_ACCESSOR(shadow_catcher);
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow);
+ DECLARE_PASS_ACCESSOR(combined);
+ DECLARE_PASS_ACCESSOR(float4);
+
+#undef DECLARE_PASS_ACCESSOR
+
+ DeviceQueue *queue_;
+};
+
+CCL_NAMESPACE_END
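A sketch of the GPU-side counterpart to the earlier CPU example: the call site differs only in passing the device queue and device-side destination pointers. `queue`, `access_info`, `d_display_pixels`, `render_buffers` and `buffer_params` are assumed to exist; the values are illustrative.

const PassAccessorGPU pass_accessor(queue, access_info, 1.0f /* exposure */, 128 /* num_samples */);

PassAccessor::Destination destination;
destination.d_pixels_half_rgba = d_display_pixels; /* device_ptr to half4 storage. */
destination.num_components = 4;

pass_accessor.get_render_tile_pixels(render_buffers, buffer_params, destination);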
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
new file mode 100644
index 00000000000..6c02316ac2b
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -0,0 +1,1147 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace.h"
+
+#include "device/cpu/device.h"
+#include "device/device.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/render_scheduler.h"
+#include "render/gpu_display.h"
+#include "render/pass.h"
+#include "render/scene.h"
+#include "render/tile.h"
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTrace::PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager)
+ : device_(device),
+ device_scene_(device_scene),
+ render_scheduler_(render_scheduler),
+ tile_manager_(tile_manager)
+{
+ DCHECK_NE(device_, nullptr);
+
+ {
+ vector<DeviceInfo> cpu_devices;
+ device_cpu_info(cpu_devices);
+
+ cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler));
+ }
+
+ /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
+ * as possible. */
+ device_->foreach_device([&](Device *path_trace_device) {
+ path_trace_works_.emplace_back(PathTraceWork::create(
+ path_trace_device, film, device_scene, &render_cancel_.is_requested));
+ });
+
+ work_balance_infos_.resize(path_trace_works_.size());
+ work_balance_do_initial(work_balance_infos_);
+
+ render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+  /* Destroy any GPU resources which were used for graphics interop.
+   * Access to the GPUDisplay is needed, as it is the only source of the drawing context which is
+   * used for interop. */
+ if (gpu_display_) {
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->destroy_gpu_resources(gpu_display_.get());
+ }
+ }
+}
+
+void PathTrace::load_kernels()
+{
+ if (denoiser_) {
+ denoiser_->load_kernels(progress_);
+ }
+}
+
+void PathTrace::alloc_work_memory()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->alloc_work_memory();
+ }
+}
+
+bool PathTrace::ready_to_reset()
+{
+ /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+   * display. If there is no such display, the logic here will break. */
+ DCHECK(gpu_display_);
+
+  /* The logic here tries to provide behavior which feels the most interactive to artists.
+   * The general idea is to allow a reset as quickly as possible, while still keeping the
+   * feedback interactive.
+   *
+   * If the render result was ever drawn after the previous reset, consider that a reset is now
+   * possible. This way camera navigation gives the quickest feedback of rendered pixels,
+   * regardless of whether the CPU or GPU drawing pipeline is used.
+   *
+   * Gating a reset on a redraw keeps resets "slow" enough to not clog anything. This is a bit
+   * arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+ if (did_draw_after_reset_) {
+ return true;
+ }
+
+ return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+ if (big_tile_params_.modified(big_tile_params)) {
+ big_tile_params_ = big_tile_params;
+ render_state_.need_reset_params = true;
+ }
+
+ full_params_ = full_params;
+
+ /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
+   * It is still required to inform it about a reset whenever one happens, so that the redraw
+   * state tracking is properly updated. */
+ if (gpu_display_) {
+ gpu_display_->reset(full_params);
+ }
+
+ render_state_.has_denoised_result = false;
+ render_state_.tile_written = false;
+
+ did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+ /* Free render buffers used by the path trace work to reduce memory peak. */
+ BufferParams empty_params;
+ empty_params.pass_stride = 0;
+ empty_params.update_offset_stride();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->get_render_buffers()->reset(empty_params);
+ }
+ render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+ progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+ /* Indicate that rendering has started and that it can be requested to cancel. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+ render_cancel_.is_rendering = true;
+ }
+
+ render_pipeline(render_work);
+
+  /* Indicate that rendering has finished, making it so the thread which requested `cancel()` can
+   * carry on. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ render_cancel_.is_rendering = false;
+ render_cancel_.condition.notify_one();
+ }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+  /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+   * checked in Session, and in the event of a cancel the work is still to be finished here. */
+
+ render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+ 0);
+
+ render_init_kernel_execution();
+
+ render_scheduler_.report_work_begin(render_work);
+
+ init_render_buffers(render_work);
+
+ rebalance(render_work);
+
+ path_trace(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ adaptive_sample(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ cryptomatte_postprocess(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ denoise(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ write_tile_buffer(render_work);
+ update_display(render_work);
+
+ progress_update_if_needed(render_work);
+
+ finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->init_execution();
+ }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. Should not be a
+ * measurable performance impact at runtime, but will make compilation faster and binary somewhat
+ * smaller. */
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+ const vector<WorkBalanceInfo> &work_balance_infos,
+ const BufferParams &buffer_params,
+ const Callback &callback)
+{
+ const int num_works = path_trace_works.size();
+ const int height = buffer_params.height;
+
+ int current_y = 0;
+ for (int i = 0; i < num_works; ++i) {
+ const double weight = work_balance_infos[i].weight;
+ const int slice_height = max(lround(height * weight), 1);
+
+ /* Disallow negative values to deal with situations when there are more compute devices than
+ * scanlines. */
+ const int remaining_height = max(0, height - current_y);
+
+    BufferParams slice_params = buffer_params;
+    slice_params.full_y = buffer_params.full_y + current_y;
+    if (i < num_works - 1) {
+      slice_params.height = min(slice_height, remaining_height);
+    }
+    else {
+      slice_params.height = remaining_height;
+    }
+
+    slice_params.update_offset_stride();
+
+    callback(path_trace_works[i].get(), slice_params);
+
+    current_y += slice_params.height;
+ }
+}
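A worked example of the slicing above, with illustrative numbers: two devices with balance weights 0.8 and 0.2 splitting a 1080-row big tile.

const int height = 1080;
const double weights[2] = {0.8, 0.2};
const int first_slice_height = max((int)lround(height * weights[0]), 1); /* 864 rows at full_y. */
const int second_slice_height = max(0, height - first_slice_height);     /* 216 rows at full_y + 864. */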
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ big_tile_params_,
+ [](PathTraceWork *path_trace_work, const BufferParams &params) {
+ RenderBuffers *buffers = path_trace_work->get_render_buffers();
+ buffers->reset(params);
+ });
+}
+
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+ BufferParams scaled_params = params;
+
+ scaled_params.width = max(1, params.width / resolution_divider);
+ scaled_params.height = max(1, params.height / resolution_divider);
+ scaled_params.full_x = params.full_x / resolution_divider;
+ scaled_params.full_y = params.full_y / resolution_divider;
+ scaled_params.full_width = params.full_width / resolution_divider;
+ scaled_params.full_height = params.full_height / resolution_divider;
+
+ scaled_params.update_offset_stride();
+
+ return scaled_params;
+}
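For example (numbers illustrative), during viewport navigation a 1920x1080 big tile scheduled with resolution divider 4 ends up being rendered into a 480x270 buffer.

BufferParams params;
params.width = 1920;
params.height = 1080;
const BufferParams scaled = scale_buffer_params(params, /*resolution_divider=*/4);
/* scaled.width == 480, scaled.height == 270. */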
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+ const int resolution_divider = render_work.resolution_divider;
+
+ const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+ const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+ resolution_divider);
+
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ scaled_big_tile_params,
+                               [&](PathTraceWork *path_trace_work, const BufferParams &params) {
+ path_trace_work->set_effective_buffer_params(
+ scaled_full_params, scaled_big_tile_params, params);
+ });
+
+ render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+ if (render_state_.need_reset_params) {
+ update_allocated_work_buffer_params();
+ }
+
+ if (render_state_.need_reset_params ||
+ render_state_.resolution_divider != render_work.resolution_divider) {
+ update_effective_work_buffer_params(render_work);
+ }
+
+ render_state_.resolution_divider = render_work.resolution_divider;
+ render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+ update_work_buffer_params_if_needed(render_work);
+
+ /* Handle initialization scheduled by the render scheduler. */
+ if (render_work.init_render_buffers) {
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->zero_render_buffers();
+ });
+
+ tile_buffer_read();
+ }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+ if (!render_work.path_trace.num_samples) {
+ return;
+ }
+
+ VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+ << " samples at the resolution divider " << render_work.resolution_divider;
+
+ const double start_time = time_dt();
+
+ const int num_works = path_trace_works_.size();
+
+ tbb::parallel_for(0, num_works, [&](int i) {
+ const double work_start_time = time_dt();
+ const int num_samples = render_work.path_trace.num_samples;
+
+ PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+ PathTraceWork::RenderStatistics statistics;
+ path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+ const double work_time = time_dt() - work_start_time;
+ work_balance_infos_[i].time_spent += work_time;
+ work_balance_infos_[i].occupancy = statistics.occupancy;
+
+ VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ << work_time / num_samples
+ << " seconds per sample), occupancy: " << statistics.occupancy;
+ });
+
+ float occupancy_accum = 0.0f;
+ for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+ occupancy_accum += balance_info.occupancy;
+ }
+ const float occupancy = occupancy_accum / num_works;
+ render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+ render_scheduler_.report_path_trace_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+ if (!render_work.adaptive_sampling.filter) {
+ return;
+ }
+
+ bool did_reschedule_on_idle = false;
+
+ while (true) {
+ VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+ << render_work.adaptive_sampling.threshold;
+ if (render_work.adaptive_sampling.reset) {
+ VLOG(3) << "Will re-calculate convergency flag for currently converged pixels.";
+ }
+
+ const double start_time = time_dt();
+
+ uint num_active_pixels = 0;
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ const uint num_active_pixels_in_work =
+ path_trace_work->adaptive_sampling_converge_filter_count_active(
+ render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+ if (num_active_pixels_in_work) {
+ atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+ }
+ });
+
+ render_scheduler_.report_adaptive_filter_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+
+ if (num_active_pixels == 0) {
+ VLOG(3) << "All pixels converged.";
+ if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+ break;
+ }
+ VLOG(3) << "Continuing with lower threshold.";
+ }
+ else if (did_reschedule_on_idle) {
+ break;
+ }
+ else if (num_active_pixels < 128 * 128) {
+ /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep GPU busy so that
+ * there is no performance loss from the progressive noise floor feature.
+ *
+ * A better heuristic is possible here: for example, use maximum of 128^2 and percentage of
+ * the final resolution. */
+ if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+ VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+ break;
+ }
+ VLOG(3) << "Rescheduling lower threshold.";
+ did_reschedule_on_idle = true;
+ }
+ else {
+ break;
+ }
+ }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+ render_scheduler_.set_denoiser_params(params);
+
+ if (!params.use) {
+ denoiser_.reset();
+ return;
+ }
+
+ if (denoiser_) {
+ const DenoiseParams old_denoiser_params = denoiser_->get_params();
+ if (old_denoiser_params.type == params.type) {
+ denoiser_->set_params(params);
+ return;
+ }
+ }
+
+ denoiser_ = Denoiser::create(device_, params);
+ denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+ if (!render_work.cryptomatte.postprocess) {
+ return;
+ }
+ VLOG(3) << "Perform cryptomatte work.";
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->cryptomatte_postproces();
+ });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+ if (!render_work.tile.denoise) {
+ return;
+ }
+
+ if (!denoiser_) {
+ /* Denoiser was not configured, so nothing to do here. */
+ return;
+ }
+
+ VLOG(3) << "Perform denoising work.";
+
+ const double start_time = time_dt();
+
+ RenderBuffers *buffer_to_denoise = nullptr;
+
+ unique_ptr<RenderBuffers> multi_device_buffers;
+ bool allow_inplace_modification = false;
+
+ if (path_trace_works_.size() == 1) {
+ buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ }
+ else {
+ Device *denoiser_device = denoiser_->get_denoiser_device();
+ if (!denoiser_device) {
+ return;
+ }
+
+ multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+ multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+ buffer_to_denoise = multi_device_buffers.get();
+
+ copy_to_render_buffers(multi_device_buffers.get());
+
+ allow_inplace_modification = true;
+ }
+
+ if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+ buffer_to_denoise,
+ get_num_samples_in_buffer(),
+ allow_inplace_modification)) {
+ render_state_.has_denoised_result = true;
+ }
+
+ if (multi_device_buffers) {
+ multi_device_buffers->copy_from_device();
+ tbb::parallel_for_each(
+ path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+ });
+ }
+
+ render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+ gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+ if (gpu_display_) {
+ gpu_display_->clear();
+ }
+}
+
+void PathTrace::draw()
+{
+ if (!gpu_display_) {
+ return;
+ }
+
+ did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+ if (!render_work.display.update) {
+ return;
+ }
+
+ if (!gpu_display_ && !tile_buffer_update_cb) {
+ VLOG(3) << "Ignore display update.";
+ return;
+ }
+
+ if (full_params_.width == 0 || full_params_.height == 0) {
+ VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (tile_buffer_update_cb) {
+ VLOG(3) << "Invoke buffer update callback.";
+
+ tile_buffer_update_cb();
+ }
+
+ if (gpu_display_) {
+ VLOG(3) << "Perform copy to GPUDisplay work.";
+
+ const int resolution_divider = render_work.resolution_divider;
+ const int texture_width = max(1, full_params_.width / resolution_divider);
+ const int texture_height = max(1, full_params_.height / resolution_divider);
+ if (!gpu_display_->update_begin(texture_width, texture_height)) {
+ LOG(ERROR) << "Error beginning GPUDisplay update.";
+ return;
+ }
+
+ const PassMode pass_mode = render_work.display.use_denoised_result &&
+ render_state_.has_denoised_result ?
+ PassMode::DENOISED :
+ PassMode::NOISY;
+
+ /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+ * all works in parallel. */
+ const int num_samples = get_num_samples_in_buffer();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+ }
+
+ gpu_display_->update_end();
+ }
+
+ render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+ static const int kLogLevel = 3;
+
+ if (!render_work.rebalance) {
+ return;
+ }
+
+ const int num_works = path_trace_works_.size();
+
+ if (num_works == 1) {
+ VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Perform rebalance work.";
+ VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].time_spent;
+ }
+ }
+
+ const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Calculated per-device weights for works:";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].weight;
+ }
+ }
+
+ if (!did_rebalance) {
+ VLOG(kLogLevel) << "Balance in path trace works did not change.";
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+ return;
+ }
+
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ render_state_.need_reset_params = true;
+ update_work_buffer_params_if_needed(render_work);
+
+ copy_from_render_buffers(&big_tile_cpu_buffers);
+
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+ if (!render_work.tile.write) {
+ return;
+ }
+
+ VLOG(3) << "Write tile result.";
+
+ render_state_.tile_written = true;
+
+ const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+ /* Write render tile result, but only if not using tiled rendering.
+ *
+ * Tiles are written to a file during rendering, and written to the software at the end
+   * of rendering (either when all tiles are finished, or when rendering was requested to be
+ * cancelled).
+ *
+   * The important thing is: the tile should be written to the software via the callback only once. */
+ if (!has_multiple_tiles) {
+ VLOG(3) << "Write tile result via buffer write callback.";
+ tile_buffer_write();
+ }
+
+ /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+ */
+ if (has_multiple_tiles) {
+ VLOG(3) << "Write tile result into .";
+ tile_buffer_write_to_disk();
+ }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+ if (!render_work.full.write) {
+ return;
+ }
+
+ VLOG(3) << "Handle full-frame render buffer work.";
+
+ if (!tile_manager_.has_written_tiles()) {
+ VLOG(3) << "No tiles on disk.";
+ return;
+ }
+
+ /* Make sure writing to the file is fully finished.
+   * This will include writing all possibly missing tiles, ensuring the validity of the file. */
+ tile_manager_.finish_write_tiles();
+
+ /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after
+ * all scenes and layers are rendered by the Session (which happens after freeing Session memory,
+ * so that we never hold scene and full-frame buffer in memory at the same time). */
+}
+
+void PathTrace::cancel()
+{
+ thread_scoped_lock lock(render_cancel_.mutex);
+
+ render_cancel_.is_requested = true;
+
+ while (render_cancel_.is_rendering) {
+ render_cancel_.condition.wait(lock);
+ }
+
+ render_cancel_.is_requested = false;
+}
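The `render()`/`cancel()` pair above is a standard condition-variable handshake; a minimal standalone sketch of the same pattern, using plain `std` types instead of Cycles' thread wrappers, could look like this.

#include <condition_variable>
#include <mutex>

struct CancelState {
  std::mutex mutex;
  std::condition_variable condition;
  bool is_rendering = false;
  bool is_requested = false;
};

/* Worker side: bail out if a cancel was requested, otherwise mark rendering as active,
 * do the work, and notify a possibly waiting cancel(). */
void render_once(CancelState &state)
{
  {
    std::lock_guard<std::mutex> lock(state.mutex);
    if (state.is_requested) {
      return;
    }
    state.is_rendering = true;
  }

  /* ... perform the actual rendering work here ... */

  {
    std::lock_guard<std::mutex> lock(state.mutex);
    state.is_rendering = false;
  }
  state.condition.notify_one();
}

/* Cancelling side: request the cancel and block until any in-flight render has finished. */
void cancel(CancelState &state)
{
  std::unique_lock<std::mutex> lock(state.mutex);
  state.is_requested = true;
  state.condition.wait(lock, [&state] { return !state.is_rendering; });
  state.is_requested = false;
}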
+
+int PathTrace::get_num_samples_in_buffer()
+{
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+ if (render_cancel_.is_requested) {
+ return true;
+ }
+
+ if (progress_ != nullptr) {
+ if (progress_->get_cancel()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+ if (!tile_buffer_write_cb) {
+ return;
+ }
+
+ tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+ if (!tile_buffer_read_cb) {
+ return;
+ }
+
+ if (tile_buffer_read_cb()) {
+ tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_render_buffers_to_device();
+ });
+ }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+ /* Sample count pass is required to support per-tile partial results stored in the file. */
+ DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+ const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+ if (num_rendered_samples == 0) {
+ /* The tile has zero samples, no need to write it. */
+ return;
+ }
+
+ /* Get access to the CPU-side render buffers of the current big tile. */
+ RenderBuffers *buffers;
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+
+ if (path_trace_works_.size() == 1) {
+ path_trace_works_[0]->copy_render_buffers_from_device();
+ buffers = path_trace_works_[0]->get_render_buffers();
+ }
+ else {
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ buffers = &big_tile_cpu_buffers;
+ }
+
+ if (!tile_manager_.write_tile(*buffers)) {
+ LOG(ERROR) << "Error writing tile to file.";
+ }
+}
+
+void PathTrace::progress_update_if_needed(const RenderWork &render_work)
+{
+ if (progress_ != nullptr) {
+ const int2 tile_size = get_render_tile_size();
+ const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
+ const int current_sample = render_work.path_trace.start_sample +
+ render_work.path_trace.num_samples;
+ progress_->add_samples(num_samples_added, current_sample);
+ }
+
+ if (progress_update_cb) {
+ progress_update_cb();
+ }
+}
+
+void PathTrace::progress_set_status(const string &status, const string &substatus)
+{
+ if (progress_ != nullptr) {
+ progress_->set_status(status, substatus);
+ }
+}
+
+void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_to_render_buffers(render_buffers);
+ });
+ render_buffers->copy_to_device();
+}
+
+void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers)
+{
+ render_buffers->copy_from_device();
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_render_buffers(render_buffers);
+ });
+}
+
+bool PathTrace::copy_render_tile_from_device()
+{
+ if (full_frame_state_.render_buffers) {
+ /* Full-frame buffer is always allocated on CPU. */
+ return true;
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->copy_render_buffers_from_device()) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+static string get_layer_view_name(const RenderBuffers &buffers)
+{
+ string result;
+
+ if (buffers.params.layer.size()) {
+ result += string(buffers.params.layer);
+ }
+
+ if (buffers.params.view.size()) {
+ if (!result.empty()) {
+ result += ", ";
+ }
+ result += string(buffers.params.view);
+ }
+
+ return result;
+}
+
+void PathTrace::process_full_buffer_from_disk(string_view filename)
+{
+ VLOG(3) << "Processing full frame buffer file " << filename;
+
+ progress_set_status("Reading full buffer from disk");
+
+ RenderBuffers full_frame_buffers(cpu_device_.get());
+
+ DenoiseParams denoise_params;
+ if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
+ LOG(ERROR) << "Error reading tiles from file.";
+ return;
+ }
+
+ const string layer_view_name = get_layer_view_name(full_frame_buffers);
+
+ render_state_.has_denoised_result = false;
+
+ if (denoise_params.use) {
+ progress_set_status(layer_view_name, "Denoising");
+
+ /* Re-use the denoiser as much as possible, avoiding possible device re-initialization.
+ *
+ * It will not conflict with the regular rendering as:
+ * - Rendering is supposed to be finished here.
+ * - The next rendering will go via Session's `run_update_for_next_iteration` which will
+ * ensure proper denoiser is used. */
+ set_denoiser_params(denoise_params);
+
+    /* The number of samples doesn't matter too much, since the sample count pass will be used. */
+ denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+ render_state_.has_denoised_result = true;
+ }
+
+ full_frame_state_.render_buffers = &full_frame_buffers;
+
+ progress_set_status(layer_view_name, "Finishing");
+
+ /* Write the full result pretending that there is a single tile.
+   * Requires some state change, but allows using the same communication API with the software. */
+ tile_buffer_write();
+
+ full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+ if (full_frame_state_.render_buffers) {
+    /* If the full-frame buffer is read from disk, the number of samples is not used, as the
+     * buffer contains a sample count pass for that. Just avoid accessing the poorly defined
+     * state of the path trace in this case. */
+ return 0;
+ }
+
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ if (full_frame_state_.render_buffers) {
+ return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(full_frame_state_.render_buffers->params.width,
+ full_frame_state_.render_buffers->params.height);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(0, 0);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params;
+ }
+
+ return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+ return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+ switch (type) {
+ case DEVICE_NONE:
+ return "None";
+
+ case DEVICE_CPU:
+ return "CPU";
+ case DEVICE_CUDA:
+ return "CUDA";
+ case DEVICE_OPTIX:
+ return "OptiX";
+ case DEVICE_DUMMY:
+ return "Dummy";
+ case DEVICE_MULTI:
+ return "Multi";
+ }
+
+ return "UNKNOWN";
+}
+
+/* Construct description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making it a more reusable utility. */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+ string full_description = device_info.description;
+
+ full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+ if (device_info.display_device) {
+ full_description += " (display)";
+ }
+
+ if (device_info.type == DEVICE_CPU) {
+ full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+ }
+
+ full_description += " [" + device_info.id + "]";
+
+ return full_description;
+}
+
+/* Construct a string which will contain information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ * Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ * Message: Full First Device Description
+ * Full Second Device Description
+ *
+ * Note that the newlines are placed in such a way that the result can be easily concatenated to the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+ string result = "\n" + message + ": ";
+ const string pad(message.length() + 2, ' ');
+
+ if (device_info.multi_devices.empty()) {
+ result += full_device_info_description(device_info) + "\n";
+ return result;
+ }
+
+ bool is_first = true;
+ for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+ if (!is_first) {
+ result += pad;
+ }
+
+ result += full_device_info_description(sub_device_info) + "\n";
+
+ is_first = false;
+ }
+
+ return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+ DeviceInfo device_info;
+ device_info.type = DEVICE_MULTI;
+
+ for (auto &&path_trace_work : path_trace_works) {
+ device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+ }
+
+ return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+ if (!denoiser) {
+ return "";
+ }
+
+ if (!denoiser->get_params().use) {
+ return "";
+ }
+
+ const Device *denoiser_device = denoiser->get_denoiser_device();
+ if (!denoiser_device) {
+ return "";
+ }
+
+ return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+ string result = "\nFull path tracing report\n";
+
+ result += path_trace_devices_report(path_trace_works_);
+ result += denoiser_device_report(denoiser_.get());
+
+ /* Report from the render scheduler, which includes:
+ * - Render mode (interactive, offline, headless)
+ * - Adaptive sampling and denoiser parameters
+ * - Breakdown of timing. */
+ result += render_scheduler_.full_report();
+
+ return result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..78ca68c1198
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* The PathTrace class takes care of the kernel graph and scheduling on a (multi-)device. It
+ * handles all the common steps of path tracing which are not device-specific. The list of tasks
+ * includes but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queues management.
+ * - Adaptive stopping. */
+class PathTrace {
+ public:
+ /* Render scheduler is used to report timing information and access things like start/finish
+ * sample. */
+ PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager);
+ ~PathTrace();
+
+ /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+   * The progress is reported to the currently configured progress object (via `set_progress`). */
+ void load_kernels();
+
+ /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+   * more accurately which scene device memory may need to be allocated on the host. */
+ void alloc_work_memory();
+
+  /* Check whether it is a good time to reset rendering.
+   * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+   * intermediate render result. */
+ bool ready_to_reset();
+
+ void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+ void device_free();
+
+ /* Set progress tracker.
+   * Used to communicate details about the progress to the outer world, and to check whether
+   * rendering is to be canceled.
+ *
+ * The path tracer writes to this object, and then at a convenient moment runs
+ * progress_update_cb() callback. */
+ void set_progress(Progress *progress);
+
+  /* NOTE: This is a blocking call: it will not return until the given number of samples has been
+   * rendered (or until rendering is requested to be cancelled). */
+ void render(const RenderWork &render_work);
+
+  /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it
+   * is convenient to have it here because then it is easy to access the render buffer. But the
+   * downside is that this pulls in too many entities which could live separately behind a clear
+   * API. */
+
+ /* Set denoiser parameters.
+ * Use this to configure the denoiser before rendering any samples. */
+ void set_denoiser_params(const DenoiseParams &params);
+
+ /* Set parameters used for adaptive sampling.
+ * Use this to configure the adaptive sampler before rendering any samples. */
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ /* Set GPU display which takes care of drawing the render result. */
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+ /* Clear the GPU display by filling it in with all zeroes. */
+ void clear_gpu_display();
+
+ /* Perform drawing of the current state of the GPUDisplay. */
+ void draw();
+
+  /* Cancel the rendering process as soon as possible, without waiting for the full tile to be
+   * sampled. Used in cases like a reset of the render session.
+   *
+   * This is a blocking call, which returns as soon as there is no running `render_samples()` call.
+ */
+ void cancel();
+
+ /* Copy an entire render buffer to/from the path trace. */
+
+ /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and
+ * the data will be copied to the device of the given render buffers. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+  /* Copy happens via CPU side buffer: data will be copied from the device of the given render
+ * buffers and will be copied to all devices of the path trace. */
+ void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+  /* Copy render buffers of the big tile from the device to the host.
+ * Return true if all copies are successful. */
+ bool copy_render_tile_from_device();
+
+  /* Read the given full-frame file from disk, perform the needed processing, and write it to the
+   * software via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
+
+ /* Get number of samples in the current big tile render buffers. */
+ int get_num_render_tile_samples() const;
+
+ /* Get pass data of the entire big tile.
+ * This call puts pass render result from all devices into the final pixels storage.
+ *
+ * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+ *
+ * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Check whether denoiser was run and denoised passes are available. */
+ bool has_denoised_result() const;
+
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+   * In the case of tiled rendering this will return the full frame after all tiles have been rendered.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
+
+ /* Get buffer parameters of the current tile.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ const BufferParams &get_render_tile_params() const;
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+  /* Callback which communicates an updated state of the render buffer of the current big tile.
+ * Is called during path tracing to communicate work-in-progress state of the final buffer. */
+ function<void(void)> tile_buffer_update_cb;
+
+  /* Callback which communicates the final rendered buffer. Is called after path tracing is done. */
+ function<void(void)> tile_buffer_write_cb;
+
+  /* Callback which initializes the rendered buffer. Is called before path tracing starts.
+ *
+ * This is used for baking. */
+ function<bool(void)> tile_buffer_read_cb;
+
+ /* Callback which is called to report current rendering progress.
+ *
+ * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+   * Additionally, it might be called from the middle of a wavefront (meaning, it is not guaranteed
+ * that the buffer is "uniformly" sampled at the moment of this callback). */
+ function<void(void)> progress_update_cb;
+
+ protected:
+ /* Actual implementation of the rendering pipeline.
+   * Calls the steps in order, checking in between whether cancellation has been requested.
+   *
+   * Is separate from `render()` to simplify dealing with early exits and keeping
+   * `render_cancel_` in a consistent state. */
+ void render_pipeline(RenderWork render_work);
+
+ /* Initialize kernel execution on all integrator queues. */
+ void render_init_kernel_execution();
+
+ /* Make sure both allocated and effective buffer parameters of path tracer works are up to date
+ * with the current big tile parameters, performance-dependent slicing, and resolution divider.
+ */
+ void update_work_buffer_params_if_needed(const RenderWork &render_work);
+ void update_allocated_work_buffer_params();
+ void update_effective_work_buffer_params(const RenderWork &render_work);
+
+ /* Perform various steps of the render work.
+ *
+ * Note that some steps might modify the work, forcing some steps to happen within this iteration
+ * of rendering. */
+ void init_render_buffers(const RenderWork &render_work);
+ void path_trace(RenderWork &render_work);
+ void adaptive_sample(RenderWork &render_work);
+ void denoise(const RenderWork &render_work);
+ void cryptomatte_postprocess(const RenderWork &render_work);
+ void update_display(const RenderWork &render_work);
+ void rebalance(const RenderWork &render_work);
+ void write_tile_buffer(const RenderWork &render_work);
+ void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+ /* Get number of samples in the current state of the render buffers. */
+ int get_num_samples_in_buffer();
+
+  /* Check whether the user requested to cancel rendering, so that path tracing is to be finished as
+ * soon as possible. */
+ bool is_cancel_requested();
+
+ /* Write the big tile render buffer via the write callback. */
+ void tile_buffer_write();
+
+ /* Read the big tile render buffer via the read callback. */
+ void tile_buffer_read();
+
+ /* Write current tile into the file on disk. */
+ void tile_buffer_write_to_disk();
+
+ /* Run the progress_update_cb callback if it is needed. */
+ void progress_update_if_needed(const RenderWork &render_work);
+
+ void progress_set_status(const string &status, const string &substatus = "");
+
+ /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+ * are configured this is a `MultiDevice`. */
+ Device *device_ = nullptr;
+
+ /* CPU device for creating temporary render buffers on the CPU side. */
+ unique_ptr<Device> cpu_device_;
+
+ DeviceScene *device_scene_;
+
+ RenderScheduler &render_scheduler_;
+ TileManager &tile_manager_;
+
+ unique_ptr<GPUDisplay> gpu_display_;
+
+ /* Per-compute device descriptors of work which is responsible for path tracing on its configured
+ * device. */
+ vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+ /* Per-path trace work information needed for multi-device balancing. */
+ vector<WorkBalanceInfo> work_balance_infos_;
+
+ /* Render buffer parameters of the full frame and current big tile. */
+ BufferParams full_params_;
+ BufferParams big_tile_params_;
+
+ /* Denoiser which takes care of denoising the big tile. */
+ unique_ptr<Denoiser> denoiser_;
+
+ /* State which is common for all the steps of the render work.
+   * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+ * rendering the work. */
+ struct {
+    /* Denotes whether render buffer parameters of the path trace works are to be reset for the new
+ * value of the big tile parameters. */
+ bool need_reset_params = false;
+
+ /* Divider of the resolution for faster previews.
+ *
+     * Allows re-using the same render buffer, but with fewer pixels rendered into it. Think of the
+     * render buffer in this case as an over-allocated array: the resolution divider affects both
+     * the resolution and the stride as seen by the integrator kernels. */
+ int resolution_divider = 0;
+
+    /* Parameters of the big tile with the current resolution divider applied. */
+ BufferParams effective_big_tile_params;
+
+    /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
+ bool has_denoised_result = false;
+
+    /* Current tile has been written (to either disk or callback).
+ * Indicates that no more work will be done on this tile. */
+ bool tile_written = false;
+ } render_state_;
+
+ /* Progress object which is used to communicate sample progress. */
+ Progress *progress_;
+
+ /* Fields required for canceling render on demand, as quickly as possible. */
+ struct {
+ /* Indicates whether there is an on-going `render_samples()` call. */
+ bool is_rendering = false;
+
+ /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+ bool is_requested = false;
+
+ /* Synchronization between thread which does `render_samples()` and thread which does
+ * `cancel()`. */
+ thread_mutex mutex;
+ thread_condition_variable condition;
+ } render_cancel_;
+
+  /* Indicates whether a render result was drawn after the latest session reset.
+   * Used by `ready_to_reset()` to implement the logic which feels the most interactive. */
+ bool did_draw_after_reset_ = true;
+
+ /* State of the full frame processing and writing to the software. */
+ struct {
+ RenderBuffers *render_buffers = nullptr;
+ } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/path_trace_work_cpu.h"
+#include "integrator/path_trace_work_gpu.h"
+#include "render/buffers.h"
+#include "render/film.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+{
+ if (device->info.type == DEVICE_CPU) {
+ return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
+ }
+
+ return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
+}
+
+PathTraceWork::PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : device_(device),
+ film_(film),
+ device_scene_(device_scene),
+ buffers_(make_unique<RenderBuffers>(device)),
+ effective_buffer_params_(buffers_->params),
+ cancel_requested_flag_(cancel_requested_flag)
+{
+}
+
+PathTraceWork::~PathTraceWork()
+{
+}
+
+RenderBuffers *PathTraceWork::get_render_buffers()
+{
+ return buffers_.get();
+}
+
+void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params)
+{
+ effective_full_params_ = effective_full_params;
+ effective_big_tile_params_ = effective_big_tile_params;
+ effective_buffer_params_ = effective_buffer_params;
+}
+
+bool PathTraceWork::has_multiple_works() const
+{
+  /* Assume that if there are multiple works working on the same big tile, none of the works gets
+   * the entire big tile to work on. */
+ return !(effective_big_tile_params_.width == effective_buffer_params_.width &&
+ effective_big_tile_params_.height == effective_buffer_params_.height &&
+ effective_big_tile_params_.full_x == effective_buffer_params_.full_x &&
+ effective_big_tile_params_.full_y == effective_buffer_params_.full_y);
+}
+
+void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ copy_render_buffers_from_device();
+
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = buffers_->buffer.data();
+ float *dst = render_buffers->buffer.data() + offset_in_floats;
+
+ memcpy(dst, src, data_size);
+}
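+
+/* A minimal standalone sketch of the slice-copy arithmetic above, assuming a flat row-major
+ * layout with `pass_stride` floats per pixel. The names `big_tile` and `slice` are hypothetical
+ * and only illustrate how `offset_y` selects the destination rows inside the big tile buffer:
+ *
+ *   #include <cstdint>
+ *   #include <cstring>
+ *   #include <vector>
+ *
+ *   void copy_slice_into_big_tile(std::vector<float> &big_tile,
+ *                                 const std::vector<float> &slice,
+ *                                 const int64_t width,
+ *                                 const int64_t slice_height,
+ *                                 const int64_t pass_stride,
+ *                                 const int64_t offset_y)
+ *   {
+ *     const int64_t row_stride = width * pass_stride;
+ *     std::memcpy(big_tile.data() + offset_y * row_stride,
+ *                 slice.data(),
+ *                 slice_height * row_stride * sizeof(float));
+ *   }
+ */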
+
+void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = render_buffers->buffer.data() + offset_in_floats;
+ float *dst = buffers_->buffer.data();
+
+ memcpy(dst, src, data_size);
+
+ copy_render_buffers_to_device();
+}
+
+void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset = offset_y * width;
+
+ render_buffers_host_copy_denoised(
+ buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+ copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Destination slice_destination = destination;
+ slice_destination.offset += offset_y * width;
+
+ return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Source slice_source = source;
+ slice_source.offset += offset_y * width;
+
+ return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+ const KernelBackground &kbackground = device_scene_->data.background;
+
+ const BufferParams &params = buffers_->params;
+
+ const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = display_pass->type;
+ pass_access_info.offset = PASS_UNUSED;
+
+ if (pass_mode == PassMode::DENOISED) {
+ pass_access_info.mode = PassMode::DENOISED;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+ }
+
+ if (pass_access_info.offset == PASS_UNUSED) {
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+ }
+
+ pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+ return pass_access_info;
+}
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const
+{
+ PassAccessor::Destination destination(film_->get_display_pass());
+
+ const int2 display_texture_size = gpu_display->get_texture_size();
+ const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+ const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
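+  /* The destination covers the full display texture; the offset selects the texel at which this
+   * work's slice starts. For example, a slice whose buffer starts at full_y = 300 in a 1920-wide
+   * frame gets offset = 300 * 1920 = 576000, i.e. row 300 of the texture (numbers illustrative). */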
+ destination.offset = texture_y * display_texture_size.x + texture_x;
+ destination.stride = display_texture_size.x;
+
+ return destination;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..97b97f3d888
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+ struct RenderStatistics {
+ float occupancy = 1.0f;
+ };
+
+  /* Create the path trace work which best fits the device.
+   *
+   * The cancel request flag is used for a cheap check whether cancellation is to be performed as
+   * soon as possible. This could be, for example, a request to cancel rendering on camera
+   * navigation in the viewport. */
+ static unique_ptr<PathTraceWork> create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual ~PathTraceWork();
+
+ /* Access the render buffers.
+ *
+ * Is only supposed to be used by the PathTrace to update buffer allocation and slicing to
+ * correspond to the big tile size and relative device performance. */
+ RenderBuffers *get_render_buffers();
+
+ /* Set effective parameters of the big tile and the work itself. */
+ void set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params);
+
+ /* Check whether the big tile is being worked on by multiple path trace works. */
+ bool has_multiple_works() const;
+
+ /* Allocate working memory for execution. Must be called before init_execution(). */
+ virtual void alloc_work_memory(){};
+
+ /* Initialize execution of kernels.
+ * Will ensure that all device queues are initialized for execution.
+ *
+   * This method is to be called after any change in the scene. It is not needed to call it prior
+   * to every call of `render_samples()`. */
+ virtual void init_execution() = 0;
+
+ /* Render given number of samples as a synchronous blocking call.
+ * The samples are added to the render buffer associated with this work. */
+ virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+ /* Copy render result from this work to the corresponding place of the GPU display.
+ *
+   * The `pass_mode` indicates whether to access the denoised or the noisy version of the display
+   * pass. The noisy pass mode will be passed here when it is known that the buffer does not have
+   * denoised passes yet (because the denoiser did not run). If the denoised pass is requested and
+   * the denoiser is not used then this function will fall back to the noisy pass instead. */
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) = 0;
+
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+ /* Copy data from/to given render buffers.
+ * Will copy pixels from a corresponding place (from multi-device point of view) of the render
+ * buffers, and copy work's render buffers to the corresponding place of the destination. */
+
+  /* Notes:
+   * - Copies the work's render buffer from its device.
+   * - Copies into the CPU-side buffer of the given render buffers.
+   * - Does not copy the given buffer to its device. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Notes:
+ * - Does not copy given render buffers from the device.
+ * - Copies work's render buffer to its device. */
+ void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+  /* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the
+   * given render buffers, leaving the rest of the passes untouched.
+   *
+   * The same notes about device copying apply to this call as well. */
+ void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Copy render buffers to/from device using an appropriate device queue when needed so that
+ * things are executed in order with the `render_samples()`. */
+ virtual bool copy_render_buffers_from_device() = 0;
+ virtual bool copy_render_buffers_to_device() = 0;
+
+  /* Zero the render buffers, using an appropriate device queue when needed so that things are
+   * executed in order with `render_samples()`. */
+ virtual bool zero_render_buffers() = 0;
+
+  /* Access pixels rendered by this work and copy them to the corresponding location in the
+ * destination.
+ *
+ * NOTE: Does not perform copy of buffers from the device. Use `copy_render_tile_from_device()`
+ * to update host-side data. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Perform convergence test on the render buffer, and filter the convergence mask.
+ * Returns number of active pixels (the ones which did not converge yet). */
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+ /* Run cryptomatte pass post-processing kernels. */
+ virtual void cryptomatte_postproces() = 0;
+
+  /* Cheap-ish check of whether cancellation has been requested, so that rendering can be stopped
+   * as soon as possible, without waiting for any samples to be finished. */
+ inline bool is_cancel_requested() const
+ {
+    /* NOTE: Rely on the fact that on x86 CPUs reading a scalar can happen without atomics even in
+     * a threaded environment. */
+ return *cancel_requested_flag_;
+ }
+
+  /* Access the device which this work is path traced on. */
+ Device *get_device() const
+ {
+ return device_;
+ }
+
+ protected:
+ PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+  /* Get a destination whose offset and stride are configured so that writing to it will write to
+   * the proper location of the GPU display texture, taking the current tile and device slice into
+   * account. */
+ PassAccessor::Destination get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const;
+
+ /* Device which will be used for path tracing.
+   * Note that it is an actual render device (and is never a multi-device). */
+ Device *device_;
+
+ /* Film is used to access display pass configuration for GPU display update.
+ * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+ Film *film_;
+
+ /* Device side scene storage, that may be used for integrator logic. */
+ DeviceScene *device_scene_;
+
+  /* Render buffers into which sampling is accumulated, allocated for the fraction of the big tile
+   * which is being rendered by this work.
+   * They also define the possible subset of the big tile in the case of multi-device rendering. */
+ unique_ptr<RenderBuffers> buffers_;
+
+ /* Effective parameters of the full, big tile, and current work render buffer.
+ * The latter might be different from buffers_->params when there is a resolution divider
+ * involved. */
+ BufferParams effective_full_params_;
+ BufferParams effective_big_tile_params_;
+ BufferParams effective_buffer_params_;
+
+ bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+ /* TODO: limit this to number of threads of CPU device, it may be smaller than
+ * the system number of threads when we reduce the number of CPU threads in
+ * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
+ return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ DCHECK_GE(thread_index, 0);
+ DCHECK_LE(thread_index, kernel_thread_globals.size());
+
+ return &kernel_thread_globals[thread_index];
+}
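+
+/* A minimal sketch of the arena/thread-index pattern used throughout this file, assuming TBB is
+ * available. The `per_thread` vector is hypothetical and stands in for the per-thread kernel
+ * globals: each worker thread indexes its own slot, so no locking is needed.
+ *
+ *   #include <cstdint>
+ *   #include <vector>
+ *   #include <tbb/parallel_for.h>
+ *   #include <tbb/task_arena.h>
+ *
+ *   void example(const int num_threads, const int64_t work_size)
+ *   {
+ *     tbb::task_arena arena(num_threads);
+ *     std::vector<int> per_thread(num_threads, 0);
+ *     arena.execute([&]() {
+ *       tbb::parallel_for(int64_t(0), work_size, [&](int64_t) {
+ *         const int thread_index = tbb::this_task_arena::current_thread_index();
+ *         ++per_thread[thread_index];
+ *       });
+ *     });
+ *   }
+ */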
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ kernels_(*(device->get_cpu_kernels()))
+{
+ DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+ /* Cache per-thread kernel globals. */
+ device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
+}
+
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ const int64_t image_width = effective_buffer_params_.width;
+ const int64_t image_height = effective_buffer_params_.height;
+ const int64_t total_pixels_num = image_width * image_height;
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.start_profiling();
+ }
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
+ if (is_cancel_requested()) {
+ return;
+ }
+
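+      /* The flat work index maps to pixel coordinates in row-major order: for example, with
+       * image_width = 1920 a work_index of 4000 gives y = 2 and x = 160 (numbers illustrative). */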
+ const int y = work_index / image_width;
+ const int x = work_index - y * image_width;
+
+ KernelWorkTile work_tile;
+ work_tile.x = effective_buffer_params_.full_x + x;
+ work_tile.y = effective_buffer_params_.full_y + y;
+ work_tile.w = 1;
+ work_tile.h = 1;
+ work_tile.start_sample = start_sample;
+ work_tile.num_samples = 1;
+ work_tile.offset = effective_buffer_params_.offset;
+ work_tile.stride = effective_buffer_params_.stride;
+
+ CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+
+ render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
+ });
+ });
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.stop_profiling();
+ }
+
+ statistics.occupancy = 1.0f;
+}
+
+void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num)
+{
+ const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
+ const bool has_bake = device_scene_->data.bake.use;
+
+ IntegratorStateCPU integrator_states[2] = {};
+
+ IntegratorStateCPU *state = &integrator_states[0];
+ IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+
+ KernelWorkTile sample_work_tile = work_tile;
+ float *render_buffer = buffers_->buffer.data();
+
+ for (int sample = 0; sample < samples_num; ++sample) {
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ if (has_bake) {
+ if (!kernels_.integrator_init_from_bake(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+ else {
+ if (!kernels_.integrator_init_from_camera(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+
+ kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
+
+ if (has_shadow_catcher) {
+ kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
+ }
+
+ ++sample_work_tile.start_sample;
+ }
+}
+
+void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ half4 *rgba_half = gpu_display->map_texture_buffer();
+ if (!rgba_half) {
+ /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
+     * some implementations of GPUDisplay which cannot map memory? */
+ return;
+ }
+
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+ const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.pixels_half_rgba = rgba_half;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+ });
+
+ gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+ return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+ buffers_->buffer.copy_to_device();
+ return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+ buffers_->zero();
+ return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int offset = effective_buffer_params_.offset;
+ const int stride = effective_buffer_params_.stride;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ uint num_active_pixels = 0;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_y, full_y + height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+ bool row_converged = true;
+ uint num_row_pixels_active = 0;
+ for (int x = 0; x < width; ++x) {
+ if (!kernels_.adaptive_sampling_convergence_check(
+ kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+ ++num_row_pixels_active;
+ row_converged = false;
+ }
+ }
+
+ atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+ if (!row_converged) {
+ kernels_.adaptive_sampling_filter_x(
+ kernel_globals, render_buffer, y, full_x, width, offset, stride);
+ }
+ });
+ });
+
+ if (num_active_pixels) {
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_x, full_x + width, [&](int x) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ kernels_.adaptive_sampling_filter_y(
+ kernel_globals, render_buffer, x, full_y, height, offset, stride);
+ });
+ });
+ }
+
+ return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Post-process one full row of pixels per task in a single `parallel_for`, to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(0, height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ int pixel_index = y * width;
+
+ for (int x = 0; x < width; ++x, ++pixel_index) {
+ kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+ }
+ });
+ });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+ PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+  /* Core path tracing routine. Renders the given work tile. */
+ void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num);
+
+ /* CPU kernels. */
+ const CPUKernels &kernels_;
+
+ /* Copy of kernel globals which is suitable for concurrent access from multiple threads.
+ *
+   * More specifically, the `kernel_globals_` is local to each thread and nobody else is
+ * accessing it, but some "localization" is required to decouple from kernel globals stored
+ * on the device level. */
+ vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..10baf869aa6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
+ integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+ integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+ integrator_shader_raytrace_sort_counter_(
+ device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_next_shadow_catcher_path_index_(
+ device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+ queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+ num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+ work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+ gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+ min_num_active_paths_(queue_->num_concurrent_busy_states()),
+ max_active_path_index_(0)
+{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+  /* Limit the number of active paths to half of the overall number of states. This is due to the
+   * logic in the path compaction which relies on the fact that regeneration does not happen
+   * sooner than half of the states are available again. */
+ min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
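+  /* Example of the limit above (numbers illustrative): with max_num_paths_ = 1M and
+   * num_concurrent_busy_states() returning 700k, min_num_active_paths_ becomes 500k, so, per the
+   * note above, regeneration does not start until at least half of the states are available
+   * again. */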
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+  /* IntegratorState allocated as a structure of arrays. */
+
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+ return;
+ }
+ integrator_state_soa_kernel_features_ = kernel_features;
+
+  /* Allocate a device-only memory buffer for each struct member, and then
+   * write the pointers into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_END(name) \
+ break; \
+ }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ if (array_index == array_size - 1) { \
+ break; \
+ } \
+ }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
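+
+/* The macros above expand to one device-only array per state member, i.e. a structure-of-arrays
+ * (SoA) layout. A minimal host-side sketch of the idea, with hypothetical members `t` and
+ * `throughput_x` (one array per member rather than one struct per path), so that neighboring
+ * path indices touch neighboring memory:
+ *
+ *   #include <vector>
+ *
+ *   struct IntegratorStateSoAExample {
+ *     std::vector<float> t;
+ *     std::vector<float> throughput_x;
+ *
+ *     explicit IntegratorStateSoAExample(const int max_num_paths)
+ *         : t(max_num_paths), throughput_x(max_num_paths)
+ *     {
+ *     }
+ *   };
+ */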
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+ if (integrator_queue_counter_.size() == 0) {
+ integrator_queue_counter_.alloc(1);
+ integrator_queue_counter_.zero_to_device();
+ integrator_queue_counter_.copy_from_device();
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
+ }
+
+ /* Allocate data for active path index arrays. */
+ if (num_queued_paths_.size() == 0) {
+ num_queued_paths_.alloc(1);
+ num_queued_paths_.zero_to_device();
+ }
+
+ if (queued_paths_.size() == 0) {
+ queued_paths_.alloc(max_num_paths_);
+    /* TODO: this could be skipped if we had a function to just allocate on the device. */
+ queued_paths_.zero_to_device();
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+ /* Allocate arrays for shader sorting. */
+ const int max_shaders = device_scene_->data.max_shaders;
+ if (integrator_shader_sort_counter_.size() < max_shaders) {
+ integrator_shader_sort_counter_.alloc(max_shaders);
+ integrator_shader_sort_counter_.zero_to_device();
+
+ integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_shader_sort_counter_.device_pointer;
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+ (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+ if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+ return;
+ }
+
+ integrator_next_shadow_catcher_path_index_.alloc(1);
+ /* TODO(sergey): Use queue? */
+ integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+ integrator_state_gpu_.next_shadow_catcher_path_index =
+ (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+ alloc_integrator_soa();
+ alloc_integrator_queue();
+ alloc_integrator_sorting();
+ alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+ queue_->init_execution();
+
+ /* Copy to device side struct in constant memory. */
+ device_->const_copy_to(
+ "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+  /* Limit the number of states for the tile and rely on a greedy scheduling of tiles. This allows
+   * adding more work (because tiles are smaller, so there is a higher chance that more paths will
+   * become busy after adding new tiles). This is especially important for the shadow catcher,
+   * which schedules work in halves of the available number of paths. */
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
+
+ work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+ enqueue_reset();
+
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
+ /* TODO: set a hard limit in case of undetected kernel failures? */
+ while (true) {
+ /* Enqueue work from the scheduler, on start or when there are not enough
+ * paths to keep the device occupied. */
+ bool finished;
+ if (enqueue_work_tiles(finished)) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ /* Stop if no more work remaining. */
+ if (finished) {
+ break;
+ }
+
+    /* Enqueue one of the path iteration kernels. */
+ if (enqueue_path_iteration()) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
+ }
+
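+  /* Occupancy is the average number of busy states per iteration, relative to the total number of
+   * path states. For example (numbers illustrative), an average of 250k busy states out of
+   * max_num_paths_ = 1M gives an occupancy of 0.25, which the scheduler can react to. */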
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int max_num_queued = 0;
+ DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ if (queue_counter->num_queued[i] > max_num_queued) {
+ kernel = (DeviceKernel)i;
+ max_num_queued = queue_counter->num_queued[i];
+ }
+ }
+
+ return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+ queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+  /* Tile enqueue needs to know the number of active paths, which is based on this counter. Zero
+   * the counter on the host side because `zero_to_device()` does not do it. */
+ if (integrator_queue_counter_.host_pointer) {
+ memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+ }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+  /* Count the total number of queued paths across all kernels. */
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_active_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ num_active_paths += queue_counter->num_queued[i];
+ }
+
+ if (num_active_paths == 0) {
+ return false;
+ }
+
+ /* Find kernel to execute, with max number of queued paths. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel == DEVICE_KERNEL_NUM) {
+ return false;
+ }
+
+ /* Finish shadows before potentially adding more shadow rays. We can only
+ * store one shadow ray in the integrator state. */
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
+ if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return true;
+ }
+ else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return true;
+ }
+ }
+
+ /* Schedule kernel with maximum number of queued items. */
+ enqueue_path_iteration(kernel);
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+{
+ void *d_path_index = (void *)NULL;
+
+ /* Create array of path indices for which this kernel is queued to be executed. */
+ int work_size = max_active_path_index_;
+
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ int num_queued = queue_counter->num_queued[kernel];
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ /* Compute array of active paths, sorted by shader. */
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ }
+ else if (num_queued < work_size) {
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Compute array of active shadow paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
+ }
+ else {
+ /* Compute array of active paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
+ }
+ }
+
+ DCHECK_LE(work_size, max_num_paths_);
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ /* Ray intersection kernels with integrator state. */
+ void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ /* Shading kernels with integrator state and render buffer. */
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+ void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+
+ default:
+ LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
+ << " used for path iteration, should never happen.";
+ break;
+ }
+}
+
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
+
+ /* Compute prefix sum of number of active paths with each shader. */
+ {
+ const int work_size = 1;
+ int max_shaders = device_scene_->data.max_shaders;
+ void *args[] = {&d_counter, &max_shaders};
+ queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ /* Launch kernel to fill the active paths arrays. */
+ {
+ /* TODO: this could be smaller for terminated paths based on amount of work we want
+ * to schedule. */
+ const int work_size = max_active_path_index_;
+
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size),
+ &d_queued_paths,
+ &d_num_queued_paths,
+ &d_counter,
+ &d_queued_kernel};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ }
+ else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+ }
+ else {
+ assert(0);
+ }
+}
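+
+/* What the kernels above achieve, as a small host-side sketch: count the paths per sort key, turn
+ * the counts into start offsets with a prefix sum, then scatter the path indices so that paths
+ * with the same key end up adjacent. The names are illustrative and not the device kernel API:
+ *
+ *   #include <numeric>
+ *   #include <vector>
+ *
+ *   std::vector<int> sort_paths_by_key(const std::vector<int> &path_key, const int num_keys)
+ *   {
+ *     std::vector<int> offsets(num_keys, 0);
+ *     for (const int key : path_key) {
+ *       ++offsets[key];
+ *     }
+ *     std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), 0);
+ *
+ *     std::vector<int> sorted(path_key.size());
+ *     for (int path = 0; path < int(path_key.size()); ++path) {
+ *       sorted[offsets[path_key[path]]++] = path;
+ *     }
+ *     return sorted;
+ *   }
+ */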
+
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+
+ /* Launch kernel to fill the active paths arrays. */
+ const int work_size = max_active_path_index_;
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(kernel, work_size, args);
+}
+
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+ if (num_active_paths == 0) {
+ max_active_path_index_ = 0;
+ }
+
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
+ if (max_active_path_index_ == num_active_paths) {
+ return;
+ }
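+
+  /* Worked example with hypothetical numbers: say num_active_paths = 3 and
+   * max_active_path_index_ = 5, with live states at indices {0, 3, 4} and terminated gaps at
+   * {1, 2}. The first pass below collects the terminated slots {1, 2} inside [0, 3), the second
+   * pass collects the live states {3, 4} at index 3 or above, and the final kernel moves states
+   * 3 and 4 into slots 1 and 2, after which max_active_path_index_ can shrink to 3. */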
+
+ void *d_compact_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+ /* Create array with terminated paths that we can write to. */
+ {
+ /* TODO: can the work size be reduced here? */
+ int offset = num_active_paths;
+ int work_size = num_active_paths;
+ void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ }
+
+ /* Create array of paths that we need to compact, where the path index is bigger
+ * than the number of active paths. */
+ {
+ int work_size = max_active_path_index_;
+ void *args[] = {
+ &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ }
+
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ int num_compact_paths = num_queued_paths_.data()[0];
+
+ /* Move paths into gaps. */
+ if (num_compact_paths > 0) {
+ int work_size = num_compact_paths;
+ int active_states_offset = 0;
+ int terminated_states_offset = num_active_paths;
+ void *args[] = {
+ &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ }
+
+ queue_->synchronize();
+
+  /* Adjust the max active path index now that we know which part of the array is actually used. */
+ max_active_path_index_ = num_active_paths;
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+  /* If there are existing paths, wait for them to reach the intersect closest kernel, which will
+   * align the wavefront of the existing and newly added paths. */
+  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+   * performance. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+ return false;
+ }
+
+ int num_active_paths = get_num_active_paths();
+
+ /* Don't schedule more work if cancelling. */
+ if (is_cancel_requested()) {
+ if (num_active_paths == 0) {
+ finished = true;
+ }
+ return false;
+ }
+
+ finished = false;
+
+ vector<KernelWorkTile> work_tiles;
+
+ int max_num_camera_paths = max_num_paths_;
+ int num_predicted_splits = 0;
+
+ if (has_shadow_catcher()) {
+    /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
+     * make sure there is enough space in the path states array to fit the split states.
+     *
+     * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+     * that all the new paths can be split.
+     *
+     * Note that it is possible that some of the current states can still split, so we need to
+     * make sure there is enough space for them as well. */
+
+ /* Number of currently in-flight states which can still split. */
+ const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+ const int num_available_paths = max_num_paths_ - num_active_paths;
+ const int num_new_paths = num_available_paths / 2;
+ max_num_camera_paths = max(num_active_paths,
+ num_active_paths + num_new_paths - num_scheduled_possible_split);
+ num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+ }
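+
+  /* Numeric illustration of the above (hypothetical values): with max_num_paths_ = 1000,
+   * num_active_paths = 200 and 50 in-flight states that can still split, there are 800 available
+   * states, so num_new_paths = 400, max_num_camera_paths = max(200, 200 + 400 - 50) = 550 and
+   * num_predicted_splits = 50 + 400 = 450, which together fill exactly max_num_paths_. */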
+
+ /* Schedule when we're out of paths or there are too few paths to keep the
+ * device occupied. */
+ int num_paths = num_active_paths;
+ if (num_paths == 0 || num_paths < min_num_active_paths_) {
+ /* Get work tiles until the maximum number of path is reached. */
+ while (num_paths < max_num_camera_paths) {
+ KernelWorkTile work_tile;
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+ work_tiles.push_back(work_tile);
+ num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we couldn't get any more tiles, we're done. */
+ if (work_tiles.size() == 0 && num_paths == 0) {
+ finished = true;
+ return false;
+ }
+ }
+
+ /* Initialize paths from work tiles. */
+ if (work_tiles.size() == 0) {
+ return false;
+ }
+
+  /* Compact the state array when the number of paths becomes small relative to the known maximum
+   * path index, which makes computing active index arrays slow. */
+ compact_states(num_active_paths);
+
+ if (has_shadow_catcher()) {
+ integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+ queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+ }
+
+ enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+ work_tiles.data(),
+ work_tiles.size(),
+ num_active_paths,
+ num_predicted_splits);
+
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits)
+{
+ /* Copy work tiles to device. */
+ if (work_tiles_.size() < num_work_tiles) {
+ work_tiles_.alloc(num_work_tiles);
+ }
+
+ int path_index_offset = num_active_paths;
+ int max_tile_work_size = 0;
+ for (int i = 0; i < num_work_tiles; i++) {
+ KernelWorkTile &work_tile = work_tiles_.data()[i];
+ work_tile = work_tiles[i];
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ work_tile.path_index_offset = path_index_offset;
+ work_tile.work_size = tile_work_size;
+
+ path_index_offset += tile_work_size;
+
+ max_tile_work_size = max(max_tile_work_size, tile_work_size);
+ }
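+
+  /* Example with hypothetical sizes: two tiles with work sizes 100 and 50 and
+   * num_active_paths = 30 get path_index_offset 30 and 130 respectively; path_index_offset ends
+   * at 180, max_tile_work_size is 100, and max_active_path_index_ below becomes 180 plus the
+   * predicted splits. */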
+
+ queue_->copy_to_device(work_tiles_);
+
+ void *d_work_tiles = (void *)work_tiles_.device_pointer;
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ const_cast<int *>(&num_work_tiles),
+ &d_render_buffer,
+ const_cast<int *>(&max_tile_work_size)};
+
+ queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+ max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+ /* TODO: this is wrong, does not account for duplicates with shadow! */
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ DCHECK_GE(queue_counter->num_queued[i], 0)
+ << "Invalid number of queued states for kernel "
+ << device_kernel_as_string(static_cast<DeviceKernel>(i));
+ num_paths += queue_counter->num_queued[i];
+ }
+
+ return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+  /* There are a few complications with graphics interop when using multiple devices, caused by
+   * the fact that the GPUDisplay has a single texture:
+   *
+   * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+   * attempting to register an OpenGL PBO which has already been mapped, which makes sense,
+   * because otherwise one would run into a conflict about where the source of truth is. */
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+    /* Don't attempt to update the GPU display if the device has errors: the error state would
+     * lead to wrong decisions about interop, causing further chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+    /* If an error happens when trying to use graphics interop, fall back to the native
+     * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+   * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, shrink the
+ * allocated memory as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+     * transferring zeroes to the device. */
+ queue_->zero_to_device(gpu_display_rgba_half_);
+ }
+
+ PassAccessor::Destination destination(film_->get_display_pass());
+ destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ gpu_display_rgba_half_.copy_from_device();
+
+ gpu_display->copy_pixels_to_texture(
+ gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+}
+
+bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (!device_graphics_interop_) {
+ device_graphics_interop_ = queue_->graphics_interop_create();
+ }
+
+ const DeviceGraphicsInteropDestination graphics_interop_dst =
+ gpu_display->graphics_interop_get();
+ device_graphics_interop_->set_destination(graphics_interop_dst);
+
+ const device_ptr d_rgba_half = device_graphics_interop_->map();
+ if (!d_rgba_half) {
+ return false;
+ }
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.d_pixels_half_rgba = d_rgba_half;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ device_graphics_interop_->unmap();
+
+ return true;
+}
+
+void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+{
+ if (!device_graphics_interop_) {
+ return;
+ }
+ gpu_display->graphics_interop_activate();
+ device_graphics_interop_ = nullptr;
+ gpu_display->graphics_interop_deactivate();
+}
+
+void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+ const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
+
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+}
+
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
+
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
+
+ return num_active_pixels;
+}
+
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
+{
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
+
+ queue_->zero_to_device(num_active_pixels);
+
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&effective_buffer_params_.full_x),
+ const_cast<int *>(&effective_buffer_params_.full_y),
+ const_cast<int *>(&effective_buffer_params_.width),
+ const_cast<int *>(&effective_buffer_params_.height),
+ &threshold,
+ &reset,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride,
+ &num_active_pixels.device_pointer};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
+
+ queue_->copy_from_device(num_active_pixels);
+ queue_->synchronize();
+
+ return num_active_pixels.data()[0];
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
+{
+ const int work_size = effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: The direct device access to the buffers only happens within this path trace work. The
+ * rest of communication happens via API calls which involves `copy_render_buffers_from_device()`
+ * which will perform synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized
+ * to match the device queue's number of path states.
+ * This implementation best suits devices which have a lot of integrator states, such as GPUs. */
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+  /* Naive implementation of `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+  /* Implementation of `copy_to_gpu_display()` which uses the driver's OpenGL/GPU interoperability
+   * functionality, avoiding a copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run film conversion kernel and store display result in the given destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split. */
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+  /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+ /* Keep track of number of queued kernels. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+  /* Temporary buffer to get an array of queued paths for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+  /* Temporary buffer used by copy_to_gpu_display() whenever graphics interoperability is not
+   * available. Allocated on demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+  /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+   * below this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+  /* Maximum path index; the effective number of paths used may be smaller than the size of the
+   * integrator_state_ buffer, so we can avoid iterating over the full buffer. */
+ int max_active_path_index_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..4eb1dd941f9
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+  /* In background mode never do a lower resolution render preview, as it is not really supported
+   * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+    /* NOTE: Divide by 2 because of the way scheduling works: it advances the resolution divider
+     * first and then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet. */
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+  /* Reset all fields of the previous work, cancelling things like adaptive sampling filtering and
+   * denoising.
+   * However, we need to preserve write requests, since those cannot be recovered and writes are
+   * only supposed to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+  /* Do not write the tile if it has zero samples in it; treat it similarly to all other tiles
+   * which got cancelled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+  /* Update the current tile, but only if any sample was rendered.
+   * This allows the latest state of the tile to stay visible while the full buffer is being
+   * processed.
+   *
+   * Note that if there are no samples in the current tile, its render buffer might still contain
+   * pixels from a previous state.
+   *
+   * If the full result was written, then no further updates were made to the render buffers.
+   * The buffers might also have been freed from the device, so a display update is not
+   * possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+  /* NOTE: Rebalance scheduling requires that the current number of samples has not been advanced
+   * yet. */
+ render_work.rebalance = work_need_rebalance();
+
+  /* NOTE: Advance the number of samples now, so that the filter and denoising checks can see that
+   * all the samples are rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+  /* A fallback display update time, for the case when there is an error during the display
+   * update, or when there is no display at all. */
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+ /* There is only single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Knowing the time it took to complete a task at the current resolution divider, approximate how
+ * long it would have taken to complete it at the final resolution. */
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
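+  /* E.g. a resolution divider of 4 renders 1/16th of the pixels, so the measured time is scaled
+   * by 16 to approximate the full-resolution cost. */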
+ return time * resolution_divider_sq;
+}
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+  /* NOTE: The work might have an all-zero path trace part: this happens when post-processing
+   * work is scheduled after the path tracing. Checking just the start sample doesn't work here
+   * because it might wrongly be 0. Instead, check whether path tracing is actually happening, as
+   * it is expected to happen in the first work. */
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+  /* Move the display update moment further in time, so that the logic which checks when the last
+   * update happened has a more reliable point in time (excluding the path tracing and denoising
+   * parts of the render work). */
+ state_.last_display_update_time = time_dt();
+}
+
+void RenderScheduler::report_rebalance_time(const RenderWork &render_work,
+ double time,
+ bool balance_changed)
+{
+ rebalance_time_.add_wall(time);
+
+ if (work_report_reset_average(render_work)) {
+ rebalance_time_.reset_average();
+ }
+
+ rebalance_time_.add_average(time);
+
+ if (balance_changed) {
+ ++state_.num_rebalance_changes;
+ }
+
+ state_.last_rebalance_changed = balance_changed;
+
+ VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds.";
+}
+
+string RenderScheduler::full_report() const
+{
+ const double render_wall_time = state_.end_render_time - state_.start_render_time;
+ const int num_rendered_samples = get_num_rendered_samples();
+
+ string result = "\nRender Scheduler Summary\n\n";
+
+ {
+ string mode;
+ if (headless_) {
+ mode = "Headless";
+ }
+ else if (background_) {
+ mode = "Background";
+ }
+ else {
+ mode = "Interactive";
+ }
+ result += "Mode: " + mode + "\n";
+ }
+
+ result += "Resolution: " + to_string(buffer_params_.width) + "x" +
+ to_string(buffer_params_.height) + "\n";
+
+ result += "\nAdaptive sampling:\n";
+ result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n";
+ if (adaptive_sampling_.use) {
+ result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n";
+ result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n";
+ result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n";
+ }
+
+ result += "\nDenoiser:\n";
+ result += " Use: " + string_from_bool(denoiser_params_.use) + "\n";
+ if (denoiser_params_.use) {
+ result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n";
+ result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n";
+
+ string passes = "Color";
+ if (denoiser_params_.use_pass_albedo) {
+ passes += ", Albedo";
+ }
+ if (denoiser_params_.use_pass_normal) {
+ passes += ", Normal";
+ }
+
+ result += " Passes: " + passes + "\n";
+ }
+
+ if (state_.num_rebalance_requested) {
+ result += "\nRebalancer:\n";
+ result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) +
+ "\n";
+ result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) +
+ "\n";
+ }
+
+ result += "\nTime (in seconds):\n";
+ result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average");
+ result += string_printf(" %20s %20f %20f\n",
+ "Path Tracing",
+ path_trace_time_.get_wall(),
+ path_trace_time_.get_average());
+
+ if (adaptive_sampling_.use) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Adaptive Filter",
+ adaptive_filter_time_.get_wall(),
+ adaptive_filter_time_.get_average());
+ }
+
+ if (denoiser_params_.use) {
+ result += string_printf(
+ " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average());
+ }
+
+ result += string_printf(" %20s %20f %20f\n",
+ "Display Update",
+ display_update_time_.get_wall(),
+ display_update_time_.get_average());
+
+ if (state_.num_rebalance_requested) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Rebalance",
+ rebalance_time_.get_wall(),
+ rebalance_time_.get_average());
+ }
+
+ const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() +
+ denoise_time_.get_wall() + display_update_time_.get_wall();
+ result += "\n Total: " + to_string(total_time) + "\n";
+
+ result += string_printf(
+ "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time);
+
+  /* When adaptive sampling is used the average time becomes meaningless, because different
+   * samples will likely render a different number of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based
+ * on a more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+  /* TODO(sergey): Need a decision on whether this should use the number of samples rendered
+   * within the current render session, or the absolute number of samples with the start sample
+ * taken into account. It will depend on whether the start sample offset clears the render
+ * buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and number of samples rendered are used to figure out the display update interval.
+ * Render time is used to allow for fast display updates in the first few seconds of rendering
+ * on fast devices. Number of samples rendered is used to allow for potentially quicker display
+ * updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
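+  /* Rough example with hypothetical timings: pixel_size_ = 1, an average of 0.05 s per sample
+   * and a 0.25 s update interval give 1 / 0.05 * 0.25 = 5 samples scheduled per display update. */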
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding might go to a higher or lower value depending on which one is closer. This keeps the
+ * number of samples a power of two without diverging too much from the planned number of
+ * samples. */
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
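+  /* For example: 9 is closer to 8 (delta 1) than to 16 (delta 7) and rounds down, while 6 is
+   * equidistant between 4 and 8 and rounds up to 8. */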
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+  /* Always start the full resolution render with a single sample. This gives more instant
+   * feedback to artists, and allows gathering information for subsequent path tracing works. Do
+   * it in headless mode as well, to give some estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+  /* Round the number of samples to a power of two, so that the division of path states into tiles
+   * works out more evenly.
+   * This might make updates happen more rarely due to rounding up. In the test scenes this is not
+   * a huge deal because no more than 8 samples were observed to be rendered between updates. If
+   * that becomes a problem we can add extra rules, like never rounding up by more than N
+   * samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+  /* When enough statistics are available and an offline render is being done, prefer to keep the
+   * device occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+ * with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
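+
+  /* E.g. (hypothetical measurement): if the last work of 8 samples only reached an occupancy of
+   * 0.35, num_samples_to_occupy = lround(8 * 0.7 / 0.35) = 16, roughly doubling the batch to
+   * bring occupancy back towards the target. */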
+
+  /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+   * device fully occupied, without much overhead from display updates. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+ /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missing. This
+ * is to ensure that the final render is pixel-matched regardless of how many samples per second
+ * compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+  /* Special trick for fast navigation: schedule multiple samples during fast navigation
+   * (which will prefer to use a lower resolution to keep up with the refresh rate). This gives
+   * more usable visual feedback for artists. There are a couple of tricks though. */
+
+ if (is_denoise_active_during_update()) {
+    /* When denoising is used during navigation prefer using a higher resolution with fewer
+     * samples (scheduling fewer samples here makes the resolution_divider calculation use a lower
+     * value for the divider). This is because both OpenImageDenoiser and the OptiX denoiser give
+     * visually better results on a higher resolution image with fewer samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+    /* When the resolution divider is at or below the pixel size, schedule one sample. This
+     * doesn't affect the sample count at this resolution division, but instead assists in the
+     * calculation of the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+    /* When the resolution divider is one step away from the final resolution, schedule two
+     * samples. This is so that rendering at the lower resolution does not exceed the time it
+     * takes to render the first sample at the full resolution. */
+ return 2;
+ }
+
+  /* Always render 4 samples, even if the scene is configured for less.
+   * The idea here is to have enough information on the screen. A resolution divider of 2 allows
+   * 4 times the extra samples, so the overall worst case timing is the same as the final
+   * resolution at one sample. */
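+  /* E.g. at a resolution divider of 4 there are 16x fewer pixels, so these 4 samples cost roughly
+   * a quarter of a single sample at the full resolution. */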
+ return 4;
+}
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+    /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+    /* TODO(sergey): Follow similar logic to the viewport, giving an overview of how the final
+     * denoised image looks even for background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as
+ * final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in viewport after reaching a certain sample count and render time.
+ */
+ /* TODO(sergey): Consider making time interval and sample configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+    /* Force disable display updates in headless mode. There will be nothing to display the
+     * in-progress result on. */
+ return false;
+ }
+
+ if (denoiser_delayed) {
+    /* If the denoiser has been delayed the display cannot be updated, as it would not contain the
+     * up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+    /* When adaptive sampling is not used the work is scheduled in a way that keeps the render
+ * device busy for long enough, so that the display update can happen right after the
+ * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+    /* Make sure the initial and final results of adaptive sampling are communicated to the
+     * display. */
+ return true;
+ }
+
+  /* For development purposes of adaptive sampling it might be very useful to see all updates of
+   * active pixels after the convergence check. However, it would cause a slowdown for regular
+   * users. Possibly make it a debug panel option, allowing rapid updates to ease development
+   * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+  /* When adaptive sampling is used, it's possible that only a handful of samples of a very simple
+   * scene will be scheduled to a powerful device (in order to not "miss" any of the filtering
+   * points). We take care of skipping updates here based on when the previous display update
+   * happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum time, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+     * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+    /* The resolution divider has never been calculated before: use the default divider, so that
+     * we have somewhat good initial behavior, giving a chance to collect real numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate better resolution, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+  /* Allow some percentage of tolerance, so that if the render time is close enough to the higher
+   * resolution we prefer to use it instead of going to a much lower resolution with a time far
+   * below the desired one. */
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
+ * render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase resolution to higher than the pixel size (which is possible if the scene is
+ * simple and compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+    /* Use a lower value than in the non-denoised case to allow having more pixels to reconstruct
+     * the image from. With faster updates and the extra compute required, the resolution becomes
+     * too low to give usable feedback. */
+    /* NOTE: Based on the performance of OpenImageDenoise on the CPU. For the OptiX denoiser or
+     * other denoisers on the GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+  /* For the best match with Blender's viewport the refresh rate should be 60fps, which avoids
+   * "jelly" effects. However, on non-trivial scenes this can only be achieved with high values of
+   * the resolution divider, which does not give very pleasant updates during navigation.
+   * Choose less frequent updates to allow more noise-free and higher resolution updates. */
+
+  /* TODO(sergey): Can look into a heuristic which will allow 60fps if the resolution divider is
+   * not too high. Alternatively, synchronize Blender's overlay updates to Cycles updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+  /* When rendering at a non-final resolution divider the time average is not very useful because
+   * it will either bias the average down (due to lower render times on the smaller images) or give
+   * an incorrect result when estimating the time which would have been spent on the final
+   * resolution.
+   *
+   * So we only accumulate the average for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* Time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+  /* TODO(sergey): There should be a non-iterative analytical formula here. */
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions as
+ * the sample count can also change between resolution divisions. */
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
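A minimal standalone sketch of the same search, under the simplifying assumption that the sample count stays constant during navigation so each doubling of the divider quarters the per-update time (the helper name is illustrative only, not part of the patch):

    #include <cstdio>

    /* Illustrative model of the loop above: keep doubling the divider until the
     * estimated per-update time drops below the desired one, capped at max_divider. */
    static int divider_for_time(double desired_time, double actual_time, const int max_divider)
    {
      int divider = 1;
      while (actual_time > desired_time && divider < max_divider) {
        divider *= 2;
        actual_time /= 4.0; /* Quadratic dependency on the divider. */
      }
      return divider;
    }

    int main()
    {
      /* 0.4s per update against a 1/30s target settles on a divider of 4. */
      printf("%d\n", divider_for_time(1.0 / 30.0, 0.4, 8));
      return 0;
    }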
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
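As a worked example of the two helpers above (numbers chosen purely for illustration): calculate_resolution_divider_for_resolution(1920, 1080, 256) halves 1920x1080 to 960x540, 480x270 and 240x135, stopping once the pixel area drops below 256 * 256, so it returns 8; calculate_resolution_for_divider(1920, 1080, 8) returns lround(sqrt(1920 * 1080)) / 8 = 1440 / 8 = 180.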
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..9c2d107e46d
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+ /* Initialize render buffers.
+   * Includes steps like zeroing the buffer on the device, and optional reading of pixels from the
+ * baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+    /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+    /* Reset the convergence flag when filtering, forcing a re-check of whether the pixel did
+     * converge. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+  /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via user-provided
+ * callback into the rendering software, or via tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+  /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+   * Note that the scheduler does not know anything about devices, so if there is only a single
+   * device used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related works are to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+  /* Allows disabling scheduling of re-balancing works, so that as much work as possible is
+   * scheduled to a single device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+  /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get number of samples rendered within the current scheduling session.
+ *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+   * Note that this is based on the scheduling information: as soon as work to render samples has
+   * been requested from the scheduler, those samples are considered done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset scheduler, indicating that rendering will happen from scratch.
+ * Resets current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+  /* Reset the scheduler upon switching to the next tile.
+   * Will keep the same number of samples and full-frame render parameters, but will reset progress
+   * and allow scheduling render works from the beginning of the new tile. */
+ void reset_for_next_tile();
+
+ /* Reschedule adaptive sampling work when all pixels did converge.
+ * If there is nothing else to be done for the adaptive sampling (pixels did converge to the
+ * final threshold) then false is returned and the render scheduler will stop scheduling path
+ * tracing works. Otherwise will modify the work's adaptive sampling settings to continue with
+ * a lower threshold. */
+ bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+  /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+   * converged yet.
+ * If re-scheduling is not possible (adaptive sampling is happening with the final threshold, and
+ * the path tracer is to finish the current pixels) then false is returned. */
+ bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+ /* Reschedule work when rendering has been requested to cancel.
+ *
+ * Will skip all work which is not needed anymore because no more samples will be added (for
+ * example, adaptive sampling filtering and convergence check will be skipped).
+ * Will enable all work needed to make sure all passes are communicated to the software.
+ *
+ * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+ void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+ RenderWork get_render_work();
+
+ /* Report that the path tracer started to work, after scene update and loading kernels. */
+ void report_work_begin(const RenderWork &render_work);
+
+ /* Report time (in seconds) which corresponding part of work took. */
+ void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+ void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_denoise_time(const RenderWork &render_work, double time);
+ void report_display_update_time(const RenderWork &render_work, double time);
+ void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ protected:
+ /* Check whether all work has been scheduled and time limit was not exceeded.
+ *
+   * NOTE: Tricky bit: if the time limit was reached then done() is considered to be true, but some
+   * extra work still needs to be scheduled to denoise and write the final result. */
+ bool done() const;
+
+  /* Update scheduling state for newly scheduled work.
+   * Takes care of things like checking whether the work was ever denoised, whether the tile was
+   * written, and similar state. */
+ void update_state_for_render_work(const RenderWork &render_work);
+
+ /* Returns true if any work was scheduled. */
+ bool set_postprocess_render_work(RenderWork *render_work);
+
+  /* Set work which is to be performed after all tiles have been rendered. */
+ void set_full_frame_render_work(RenderWork *render_work);
+
+  /* Update the start resolution divider based on the accumulated timing information, preserving a
+   * nice navigation feel. */
+ void update_start_resolution_divider();
+
+ /* Calculate desired update interval in seconds based on the current timings and settings.
+ * Will give an interval which provides good feeling updates during viewport navigation. */
+ double guess_viewport_navigation_update_interval_in_seconds() const;
+
+  /* Check whether denoising is active during interactive update while the resolution divider is
+   * not 1. */
+ bool is_denoise_active_during_update() const;
+
+  /* Heuristic which aims to give a perceptually pleasant display update interval: at lower sample
+   * counts and near the beginning of rendering updates happen more often, while at higher sample
+   * counts and later in the render updates happen less often but device occupancy goes up. */
+ double guess_display_update_interval_in_seconds() const;
+ double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+ double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const;
+
+  /* Calculate the number of samples which can be rendered within the current desired update
+   * interval which is calculated by `guess_display_update_interval_in_seconds()`. */
+ int calculate_num_samples_per_update() const;
+
+  /* Get the start sample and the number of samples which are to be path traced in the current
+   * work. */
+ int get_start_sample_to_path_trace() const;
+ int get_num_samples_to_path_trace() const;
+
+ /* Calculate how many samples there are to be rendered for the very first path trace after reset.
+ */
+  int get_num_samples_during_navigation(int resolution_divider) const;
+
+ /* Whether adaptive sampling convergence check and filter is to happen. */
+ bool work_need_adaptive_filter() const;
+
+  /* Calculate the threshold for adaptive sampling. */
+ float work_adaptive_threshold() const;
+
+  /* Check whether the current work needs denoising.
+   * Denoising is not needed if the denoiser is not configured, or when denoising would happen too
+   * often.
+   *
+   * `delayed` will be true when the denoiser is configured for use, but denoising was delayed to a
+   * later sample to reduce overhead.
+   *
+   * `ready_to_display` will be false if we may have a denoised result that is outdated due to
+   * increased samples. */
+ bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+  /* Check whether the current work needs to update the display.
+ *
+ * The `denoiser_delayed` is what `work_need_denoise()` returned as delayed denoiser flag. */
+ bool work_need_update_display(const bool denoiser_delayed);
+
+  /* Check whether it is time to perform rebalancing for the render work. */
+ bool work_need_rebalance();
+
+  /* Check whether timings of the given work are usable to store in `first_render_time_` for the
+   * resolution divider calculation. */
+ bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+  /* Check whether the timing report for the given work needs to reset the accumulated average
+   * time. */
+ bool work_report_reset_average(const RenderWork &render_work);
+
+  /* Check whether the render time limit has been reached (or exceeded), and if so store related
+   * information in the state so that rendering is considered finished, and it is possible to
+   * report average render time information. */
+ void check_time_limit_reached();
+
+ /* Helper class to keep track of task timing.
+ *
+   * Contains two parts: wall time and average. The wall time is the actual wall time of how long
+   * it took to complete all tasks of a type. It is always advanced when the PathTracer reports a
+   * time update.
+ *
+   * The average time is used for scheduling purposes. It estimates how long it takes to perform
+   * the task at the final resolution. */
+ class TimeWithAverage {
+ public:
+ inline void reset()
+ {
+ total_wall_time_ = 0.0;
+
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ inline void add_wall(double time)
+ {
+ total_wall_time_ += time;
+ }
+
+ inline void add_average(double time, int num_measurements = 1)
+ {
+ average_time_accumulator_ += time;
+ num_average_times_ += num_measurements;
+ }
+
+ inline double get_wall() const
+ {
+ return total_wall_time_;
+ }
+
+ inline double get_average() const
+ {
+ if (num_average_times_ == 0) {
+ return 0;
+ }
+ return average_time_accumulator_ / num_average_times_;
+ }
+
+ inline void reset_average()
+ {
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ protected:
+ double total_wall_time_ = 0.0;
+
+ double average_time_accumulator_ = 0.0;
+ int num_average_times_ = 0;
+ };
+
+ struct {
+ int resolution_divider = 1;
+
+ /* Number of rendered samples on top of the start sample. */
+ int num_rendered_samples = 0;
+
+ /* Point in time the latest GPUDisplay work has been scheduled. */
+ double last_display_update_time = 0.0;
+ /* Value of -1 means display was never updated. */
+ int last_display_update_sample = -1;
+
+ /* Point in time at which last rebalance has been performed. */
+ double last_rebalance_time = 0.0;
+
+    /* Number of rebalance works which have been requested to be performed.
+ * The path tracer might ignore the work if there is a single device rendering. */
+ int num_rebalance_requested = 0;
+
+ /* Number of rebalance works handled which did change balance across devices. */
+ int num_rebalance_changes = 0;
+
+ bool need_rebalance_at_next_work = false;
+
+    /* Denotes whether the latest performed rebalance work caused an actual rebalance of work
+     * across devices. */
+ bool last_rebalance_changed = false;
+
+    /* Threshold for adaptive sampling which will be scheduled for work when not using the
+     * progressive noise floor. */
+ float adaptive_sampling_threshold = 0.0f;
+
+ bool last_work_tile_was_denoised = false;
+ bool tile_result_was_written = false;
+ bool postprocess_work_scheduled = false;
+ bool full_frame_work_scheduled = false;
+ bool full_frame_was_written = false;
+
+ bool path_trace_finished = false;
+ bool time_limit_reached = false;
+
+ /* Time at which rendering started and finished. */
+ double start_render_time = 0.0;
+ double end_render_time = 0.0;
+
+    /* Occupancy of the render devices, normalized to the number of samples.
+     *
+     * In a way it is "trailing": when scheduling new work this occupancy is what was measured
+     * while the previous work was rendered. */
+ int occupancy_num_samples = 0;
+ float occupancy = 1.0f;
+ } state_;
+
+  /* Timing of tasks which were performed for the very first render work at 100% of the
+   * resolution. This timing information is used to estimate the resolution divider for fast
+   * navigation. */
+ struct {
+ double path_trace_per_sample;
+ double denoise_time;
+ double display_update_time;
+ } first_render_time_;
+
+ TimeWithAverage path_trace_time_;
+ TimeWithAverage adaptive_filter_time_;
+ TimeWithAverage denoise_time_;
+ TimeWithAverage display_update_time_;
+ TimeWithAverage rebalance_time_;
+
+ /* Whether cryptomatte-related work will be scheduled. */
+ bool need_schedule_cryptomatte_ = false;
+
+ /* Whether to schedule device load rebalance works.
+ * Rebalancing requires some special treatment for update intervals and such, so if it's known
+   * that the rebalance will be ignored (e.g. due to single-device rendering), it is better to
+   * fully ignore the rebalancing logic. */
+ bool need_schedule_rebalance_works_ = false;
+
+ /* Path tracing work will be scheduled for samples from within
+ * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusively. */
+ int start_sample_ = 0;
+ int num_samples_ = 0;
+
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit_ = 0.0;
+
+ /* Headless rendering without interface. */
+ bool headless_;
+
+ /* Background (offline) rendering. */
+ bool background_;
+
+  /* Pixel size is used to force a lower resolution render for the final pass. Useful for Retina or
+   * other types of hi-dpi displays. */
+ int pixel_size_ = 1;
+
+ TileManager &tile_manager_;
+
+ BufferParams buffer_params_;
+ DenoiseParams denoiser_params_;
+
+ AdaptiveSampling adaptive_sampling_;
+
+ /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise
+ * level. */
+ bool use_progressive_noise_floor_ = false;
+
+ /* Default value for the resolution divider which will be used when there is no render time
+ * information available yet.
+ * It is also what defines the upper limit of the automatically calculated resolution divider. */
+ int default_start_resolution_divider_ = 1;
+
+ /* Initial resolution divider which will be used on render scheduler reset. */
+ int start_resolution_divider_ = 0;
+
+  /* Calculate the smallest resolution divider which will bring the actual rendering time below the
+   * desired one. This call assumes a linear dependency of render time on the number of pixels
+   * (quadratic dependency on the resolution divider): a resolution divider of 2 brings the render
+   * time down by a factor of 4. */
+ int calculate_resolution_divider_for_time(double desired_time, double actual_time);
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
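A rough sketch of how a caller is expected to drive the scheduler declared above (a hypothetical fragment based only on this header; variable names such as tile_manager, session_params, buffer_params, num_samples and path_trace_seconds are placeholders, and the real PathTrace integration is more involved):

    RenderScheduler scheduler(tile_manager, session_params);
    scheduler.reset(buffer_params, num_samples);

    while (true) {
      RenderWork work = scheduler.get_render_work();
      if (!work) {
        break; /* Nothing left: samples, denoising and writes have all been scheduled. */
      }
      scheduler.report_work_begin(work);

      /* Perform work.path_trace, work.tile, work.display etc. on the device(s), then feed the
       * measured timings back, for example:
       *   scheduler.report_path_trace_time(work, path_trace_seconds, false); */
    }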
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
new file mode 100644
index 00000000000..465b4a8d4da
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/shader_eval.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
+{
+ DCHECK_NE(device_, nullptr);
+}
+
+bool ShaderEval::eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output)
+{
+ bool first_device = true;
+ bool success = true;
+
+ device_->foreach_device([&](Device *device) {
+ if (!first_device) {
+ LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
+ "single device.";
+ return;
+ }
+ first_device = false;
+
+ device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
+ device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+
+ /* Allocate and copy device buffers. */
+ DCHECK_EQ(input.device, device);
+ DCHECK_EQ(output.device, device);
+ DCHECK_LE(output.size(), input.size());
+
+ input.alloc(max_num_points);
+ int num_points = fill_input(input);
+ if (num_points == 0) {
+ return;
+ }
+
+ input.copy_to_device();
+ output.alloc(num_points);
+ output.zero_to_device();
+
+ /* Evaluate on CPU or GPU. */
+ success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
+ eval_gpu(device, type, input, output);
+
+ /* Copy data back from device if not cancelled. */
+ if (success) {
+ output.copy_from_device(0, 1, output.size());
+ read_output(output);
+ }
+
+ input.free();
+ output.free();
+ });
+
+ return success;
+}
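A hypothetical caller of eval(), showing how the two callbacks are meant to be used (the surrounding variables such as device, progress and num_points are placeholders, not from this patch):

    ShaderEval shader_eval(device, progress);

    const bool ok = shader_eval.eval(
        SHADER_EVAL_BACKGROUND,
        num_points,
        [&](device_vector<KernelShaderEvalInput> &input) {
          KernelShaderEvalInput *in = input.data();
          /* Fill in[0..num_points) with the points to be shaded; return how many were filled. */
          (void)in;
          return num_points;
        },
        [&](device_vector<float4> &output) {
          const float4 *colors = output.data();
          /* Copy colors[0..num_points) into the caller's own storage. */
          (void)colors;
        });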
+
+bool ShaderEval::eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ vector<CPUKernelThreadGlobals> kernel_thread_globals;
+ device->get_cpu_kernel_thread_globals(kernel_thread_globals);
+
+ /* Find required kernel function. */
+ const CPUKernels &kernels = *(device->get_cpu_kernels());
+
+ /* Simple parallel_for over all work items. */
+ const int64_t work_size = output.size();
+ KernelShaderEvalInput *input_data = input.data();
+ float4 *output_data = output.data();
+ bool success = true;
+
+ tbb::task_arena local_arena(device->info.cpu_threads);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
+ /* TODO: is this fast enough? */
+ if (progress_.get_cancel()) {
+ success = false;
+ return;
+ }
+
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ KernelGlobals *kg = &kernel_thread_globals[thread_index];
+
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernels.shader_eval_displace(kg, input_data, output_data, work_index);
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernels.shader_eval_background(kg, input_data, output_data, work_index);
+ break;
+ }
+ });
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ /* Find required kernel function. */
+ DeviceKernel kernel;
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+ break;
+  }
+
+ /* Create device queue. */
+ unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+ queue->init_execution();
+
+  /* Execute work on the GPU in chunks, so we can cancel.
+   * TODO: query appropriate chunk size from the device. */
+ const int chunk_size = 65536;
+
+ const int work_size = output.size();
+ void *d_input = (void *)input.device_pointer;
+ void *d_output = (void *)output.device_pointer;
+
+ for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+ int d_work_size = min(chunk_size, work_size - d_offset);
+ void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+ queue->enqueue(kernel, d_work_size, args);
+ queue->synchronize();
+
+ if (progress_.get_cancel()) {
+ return false;
+ }
+ }
+
+ return true;
+}
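As a concrete example of the chunking above: with a work_size of 200000 and the 65536-element chunk size, the loop enqueues chunks of 65536, 65536, 65536 and finally 3392 elements, synchronizing after each one so cancellation is checked at chunk granularity rather than only at the end of the whole evaluation.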
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+ SHADER_EVAL_DISPLACE,
+ SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+ ShaderEval(Device *device, Progress &progress);
+
+ /* Evaluate shader at points specified by KernelShaderEvalInput and write out
+ * RGBA colors to output. */
+ bool eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output);
+
+ protected:
+ bool eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+ bool eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+
+ Device *device_;
+ Progress &progress_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/tile.h"
+
+#include "util/util_logging.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size)
+{
+ os << "size: (" << tile_size.width << ", " << tile_size.height << ")";
+ os << ", num_samples: " << tile_size.num_samples;
+ return os;
+}
+
+ccl_device_inline uint round_down_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return prev_power_of_two(x);
+}
+
+ccl_device_inline uint round_up_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return next_power_of_two(x);
+}
+
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states)
+{
+ if (max_num_path_states == 1) {
+ /* Simple case: avoid any calculation, which could cause rounding issues. */
+ return TileSize(1, 1, 1);
+ }
+
+ const int64_t num_pixels = image_size.x * image_size.y;
+ const int64_t num_pixel_samples = num_pixels * num_samples;
+
+ if (max_num_path_states >= num_pixel_samples) {
+ /* Image fully fits into the state (could be border render, for example). */
+ return TileSize(image_size.x, image_size.y, num_samples);
+ }
+
+  /* The idea here is to keep the number of samples per tile as high as possible to improve
+   * coherency across threads.
+   *
+   * Some general ideas:
+   * - Prefer smaller tiles with more samples, which improves spatial coherency of paths.
+   * - Keep values a power of two, so that tiles fit an integer number of times into the maximum
+   *   number of paths. */
+
+ TileSize tile_size;
+
+  /* Calculate the tile size as the largest one for which the entire range of samples still fits
+   * into the available path states. The idea here is to keep tiles as small as possible, and keep
+   * the device occupied by scheduling multiple tiles with the same coordinates rendering different
+   * samples. */
+ const int num_path_states_per_sample = max_num_path_states / num_samples;
+ if (num_path_states_per_sample != 0) {
+ tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample)));
+ tile_size.height = tile_size.width;
+ }
+ else {
+ tile_size.width = tile_size.height = 1;
+ }
+
+ if (num_samples == 1) {
+ tile_size.num_samples = 1;
+ }
+ else {
+    /* The heuristic here is to have a more uniform division of the sample range: for example,
+     * prefer [32 <38 times>, 8] over [1024, 200]. This allows greedily adding more tiles early
+     * on. */
+ tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))),
+ static_cast<uint>(num_samples));
+
+    const int tile_area = tile_size.width * tile_size.height;
+ tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area);
+ }
+
+ DCHECK_GE(tile_size.width, 1);
+ DCHECK_GE(tile_size.height, 1);
+ DCHECK_GE(tile_size.num_samples, 1);
+ DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states);
+
+ return tile_size;
+}
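For a feel of the numbers (purely illustrative): with a 1920x1080 image, 32 samples and max_num_path_states of 1048576, num_path_states_per_sample is 32768, so the tile becomes 128x128 pixels; the sample heuristic gives round_up_to_power_of_two(lround(sqrt(32 / 2))) = 4 samples per tile, and 128 * 128 * 4 = 65536 path states stays well within the limit.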
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+ TileSize() = default;
+
+ inline TileSize(int width, int height, int num_samples)
+ : width(width), height(height), num_samples(num_samples)
+ {
+ }
+
+ inline bool operator==(const TileSize &other) const
+ {
+ return width == other.width && height == other.height && num_samples == other.num_samples;
+ }
+ inline bool operator!=(const TileSize &other) const
+ {
+ return !(*this == other);
+ }
+
+ int width = 0, height = 0;
+ int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+/* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Will attempt to provide a best guess to keep path tracing threads of a device as localized as
+ * possible, and have as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ if (num_infos == 1) {
+ work_balance_infos[0].weight = 1.0;
+ return;
+ }
+
+ /* There is no statistics available, so start with an equal distribution. */
+ const double weight = 1.0 / num_infos;
+ for (WorkBalanceInfo &balance_info : work_balance_infos) {
+ balance_info.weight = weight;
+ }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+ double total_time = 0;
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ total_time += info.time_spent;
+ }
+ return total_time;
+}
+
+/* The balance is based on equalizing the time which devices spent performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how big a difference in the work scheduling is needed. */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ const double total_time = calculate_total_time(work_balance_infos);
+ const double time_average = total_time / num_infos;
+
+ double total_weight = 0;
+ vector<double> new_weights;
+ new_weights.reserve(num_infos);
+
+  /* Equalize the overall average time. This means that we don't assign every device an amount of
+   * work based on the current average, but that after the weights change the times will equalize.
+   * Think of it this way: if one of the devices is 10% faster than another, then one device needs
+   * to do 5% less of the current work, and the other needs to do 5% more. */
+ const double lerp_weight = 1.0 / num_infos;
+
+ bool has_big_difference = false;
+
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+ const double new_weight = info.weight * time_target / info.time_spent;
+ new_weights.push_back(new_weight);
+ total_weight += new_weight;
+
+ if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+ has_big_difference = true;
+ }
+ }
+
+ if (!has_big_difference) {
+ return false;
+ }
+
+ const double total_weight_inv = 1.0 / total_weight;
+ for (int i = 0; i < num_infos; ++i) {
+ WorkBalanceInfo &info = work_balance_infos[i];
+ info.weight = new_weights[i] * total_weight_inv;
+ info.time_spent = 0;
+ }
+
+ return true;
+}
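A small worked example of the update above (numbers made up): two devices with equal weights of 0.5 report times of 1.0s and 1.5s. The average is 1.25s and lerp_weight is 0.5, so the time targets become 1.125s and 1.375s, giving raw new weights of 0.5 * 1.125 / 1.0 = 0.5625 and 0.5 * 1.375 / 1.5 = about 0.458. After normalization the faster device receives roughly 55% of the work and the slower one 45%, and since |1 - 1.125 / 1.25| = 0.1 exceeds the 2% threshold the function reports that the balance changed.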
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..94e20ecf054
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+ /* Time spent performing corresponding work. */
+ double time_spent = 0;
+
+ /* Average occupancy of the device while performing the work. */
+ float occupancy = 1.0f;
+
+  /* Normalized weight, which is ready to be used for work balancing (like calculating the fraction
+   * of the big tile which is to be rendered on the device). */
+ double weight = 1.0;
+};
+
+/* Balance work for an initial render iteration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+/* Rebalance work after statistics have been accumulated.
+ * Returns true if the balance did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..3fc99d5b74d
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+ max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+ /* Image buffer parameters. */
+ image_full_offset_px_.x = buffer_params.full_x;
+ image_full_offset_px_.y = buffer_params.full_y;
+
+ image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+ offset_ = buffer_params.offset;
+ stride_ = buffer_params.stride;
+
+ /* Samples parameters. */
+ sample_start_ = sample_start;
+ samples_num_ = samples_num;
+
+ /* Initialize new scheduling. */
+ reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+ tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+ VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+ if (VLOG_IS_ON(3)) {
+ /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling
+ * and purely focusing on the number of used path states. */
+ const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+ tile_size_.num_samples;
+ const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+ VLOG(3) << "Number of unused path states: "
+ << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+ }
+
+ num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+ num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+ total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+ num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+ next_work_index_ = 0;
+ total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
+}
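To illustrate the bookkeeping above with made-up numbers: a 1920x1080 image with 128x128 tiles carrying 4 samples each gives num_tiles_x_ = 15, num_tiles_y_ = 9 and total_tiles_num_ = 135; with samples_num_ = 32 the 4-sample tiles stack into num_tiles_per_sample_range_ = 8, so total_work_size_ = 135 * 8 = 1080 schedulable work units.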
+
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+ /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+   * the path trace work can decide to use smaller tile sizes and greedily schedule multiple tiles,
+ * improving overall device occupancy.
+ * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling
+ * limit". */
+
+ DCHECK_NE(max_num_path_states_, 0);
+
+ const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+ if (work_index >= total_work_size_) {
+ return false;
+ }
+
+ const int sample_range_index = work_index % num_tiles_per_sample_range_;
+ const int start_sample = sample_range_index * tile_size_.num_samples;
+ const int tile_index = work_index / num_tiles_per_sample_range_;
+ const int tile_y = tile_index / num_tiles_x_;
+ const int tile_x = tile_index - tile_y * num_tiles_x_;
+
+ KernelWorkTile work_tile;
+ work_tile.x = tile_x * tile_size_.width;
+ work_tile.y = tile_y * tile_size_.height;
+ work_tile.w = tile_size_.width;
+ work_tile.h = tile_size_.height;
+ work_tile.start_sample = sample_start_ + start_sample;
+ work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+ work_tile.offset = offset_;
+ work_tile.stride = stride_;
+
+ work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+ work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+ work_tile.x += image_full_offset_px_.x;
+ work_tile.y += image_full_offset_px_.y;
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ DCHECK_GT(tile_work_size, 0);
+
+ if (max_work_size && tile_work_size > max_work_size) {
+ /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+     * allowing others (or ourselves later on) to pick it up.
+     *
+     * TODO: Such a temporary decrement is not ideal, since it might lead to a situation where
+     * another device sees there is nothing to be done, finishes its work, and leaves all remaining
+     * work to be done by us. */
+ atomic_fetch_and_add_int32(&next_work_index_, -1);
+ return false;
+ }
+
+ *work_tile_ = work_tile;
+
+ return true;
+}
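Continuing the made-up numbers from reset_scheduler_state() above: a work_index of 13 with num_tiles_per_sample_range_ = 8 and num_tiles_x_ = 15 decomposes into sample_range_index = 13 % 8 = 5 (a start_sample offset of 5 * 4 = 20) and tile_index = 13 / 8 = 1, i.e. tile_y = 0 and tile_x = 1, so the returned tile covers x in [128, 255] and samples [sample_start_ + 20, sample_start_ + 23].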
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..e4c8f701259
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+/* Scheduler of device work tiles.
+ * Takes care of feeding multiple devices running in parallel with work which needs to be done. */
+class WorkTileScheduler {
+ public:
+ WorkTileScheduler();
+
+  /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+ *
+ * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
+ * this number of states. */
+ void set_max_num_path_states(int max_num_path_states);
+
+  /* Scheduling will happen for pixels within a big tile denoted by its parameters. */
+ void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+ /* Get work for a device.
+   * Returns true if there is still work to be done and initializes the work tile with all
+   * parameters of this work. If there is nothing remaining to be done, returns false and the
+ * work tile is kept unchanged.
+ *
+ * Optionally pass max_work_size to do nothing if there is no tile small enough. */
+ bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
+
+ protected:
+ void reset_scheduler_state();
+
+ /* Maximum allowed path states to be used.
+ *
+ * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+ * number of path states is kind of a detail. Is there a more generic term from the scheduler
+ * point of view? */
+ int max_num_path_states_ = 0;
+
+ /* Offset in pixels within a global buffer. */
+ int2 image_full_offset_px_ = make_int2(0, 0);
+
+  /* Dimensions of the currently rendered image, in pixels. */
+ int2 image_size_px_ = make_int2(0, 0);
+
+  /* Offset and stride of the buffer within which scheduling is happening.
+ * Will be passed over to the KernelWorkTile. */
+ int offset_, stride_;
+
+  /* Start sample index and the number of samples which are to be rendered.
+   * The scheduler will cover the sample range of [start, start + num - 1] over the entire image
+   * (splitting it into smaller work tiles). */
+ int sample_start_ = 0;
+ int samples_num_ = 0;
+
+  /* Tile size which will be scheduled for rendering. */
+ TileSize tile_size_;
+
+ /* Number of tiles in X and Y axis of the image. */
+ int num_tiles_x_, num_tiles_y_;
+
+ /* Total number of tiles on the image.
+ * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in the `get_work()`.
+ *
+   * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating the value in
+   * `get_work()` is not even measurable? */
+ int total_tiles_num_ = 0;
+
+  /* In the case when the number of samples in the `tile_size_` is lower than `samples_num_`, this
+   * denotes how many tiles are to be "stacked" to cover the entire requested range of samples. */
+ int num_tiles_per_sample_range_ = 0;
+
+ int next_work_index_ = 0;
+ int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END