Diffstat (limited to 'intern/cycles/integrator')
35 files changed, 8108 insertions, 0 deletions
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt new file mode 100644 index 00000000000..bfabd35d7c3 --- /dev/null +++ b/intern/cycles/integrator/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright 2011-2021 Blender Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(INC + .. +) + +set(SRC + adaptive_sampling.cpp + denoiser.cpp + denoiser_device.cpp + denoiser_oidn.cpp + denoiser_optix.cpp + path_trace.cpp + tile.cpp + pass_accessor.cpp + pass_accessor_cpu.cpp + pass_accessor_gpu.cpp + path_trace_work.cpp + path_trace_work_cpu.cpp + path_trace_work_gpu.cpp + render_scheduler.cpp + shader_eval.cpp + work_balancer.cpp + work_tile_scheduler.cpp +) + +set(SRC_HEADERS + adaptive_sampling.h + denoiser.h + denoiser_device.h + denoiser_oidn.h + denoiser_optix.h + path_trace.h + tile.h + pass_accessor.h + pass_accessor_cpu.h + pass_accessor_gpu.h + path_trace_work.h + path_trace_work_cpu.h + path_trace_work_gpu.h + render_scheduler.h + shader_eval.h + work_balancer.h + work_tile_scheduler.h +) + +set(LIB + # NOTE: This is required for RenderBuffers access. Might consider moving files around a bit to + # avoid such a cyclic dependency. + cycles_render + + cycles_util +) + +if(WITH_OPENIMAGEDENOISE) + list(APPEND LIB + ${OPENIMAGEDENOISE_LIBRARIES} + ) +endif() + +include_directories(${INC}) + +include_directories(SYSTEM ${INC_SYS}) + +cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp new file mode 100644 index 00000000000..23fbcfea5c2 --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/adaptive_sampling.h" + +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +AdaptiveSampling::AdaptiveSampling() +{ +} + +int AdaptiveSampling::align_samples(int start_sample, int num_samples) const +{ + if (!use) { + return num_samples; + } + + /* + * The naive implementation goes as follows: + * + * int count = 1; + * while (!need_filter(start_sample + count - 1) && count < num_samples) { + * ++count; + * } + * return count; + */ + + /* 0-based sample index at which first filtering will happen. */ + const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1); + + /* Allow as many samples as possible until the first filter sample. 
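+ * Example (illustrative): with min_samples = 3 and adaptive_step = 4 (the bit masks here assume + * adaptive_step is a power of two), first_filter_sample = (3 + 1) | 3 = 7, which agrees with + * need_filter(): sample 7 is the first sample past min_samples for which + * (sample & (adaptive_step - 1)) == adaptive_step - 1. 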
*/ + if (start_sample + num_samples <= first_filter_sample) { + return num_samples; + } + + const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1)); + + const int num_samples_until_filter = next_filter_sample - start_sample + 1; + + return min(num_samples_until_filter, num_samples); +} + +bool AdaptiveSampling::need_filter(int sample) const +{ + if (!use) { + return false; + } + + if (sample <= min_samples) { + return false; + } + + return (sample & (adaptive_step - 1)) == (adaptive_step - 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h new file mode 100644 index 00000000000..d98edd9894c --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +class AdaptiveSampling { + public: + AdaptiveSampling(); + + /* Align the number of samples so that they align with the adaptive filtering. + * + * Returns the new value for `num_samples` so that after rendering that many samples on top + * of `start_sample`, filtering is required. + * + * The alignment happens in a way that allows rendering as many samples as possible without + * missing any filtering point. This means that the result is "clamped" by the nearest sample + * at which filtering is needed. This is part of the mechanism which ensures that all devices + * will perform the exact same filtering and adaptive sampling, regardless of their performance. + * + * `start_sample` is the 0-based index of the sample. + * + * NOTE: The start sample is included into the number of samples to render. This means that + * if the number of samples is 1, then the path tracer will render samples [start_sample], + * if the number of samples is 2, then the path tracer will render samples [start_sample, + * start_sample + 1] and so on. */ + int align_samples(int start_sample, int num_samples) const; + + /* Check whether the adaptive sampling filter should happen at this sample. + * Returns false if adaptive sampling is not used. + * + * `sample` is the 0-based index of the sample. */ + bool need_filter(int sample) const; + + bool use = false; + int adaptive_step = 0; + int min_samples = 0; + float threshold = 0.0f; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp new file mode 100644 index 00000000000..598bbd497a5 --- /dev/null +++ b/intern/cycles/integrator/denoiser.cpp @@ -0,0 +1,204 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser.h" + +#include "device/device.h" +#include "integrator/denoiser_oidn.h" +#include "integrator/denoiser_optix.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params) +{ + DCHECK(params.use); + + switch (params.type) { + case DENOISER_OPTIX: + return make_unique<OptiXDenoiser>(path_trace_device, params); + + case DENOISER_OPENIMAGEDENOISE: + return make_unique<OIDNDenoiser>(path_trace_device, params); + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + /* pass */ + break; + } + + LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen."; + + return nullptr; +} + +Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params) + : path_trace_device_(path_trace_device), params_(params) +{ + DCHECK(params.use); +} + +void Denoiser::set_params(const DenoiseParams &params) +{ + DCHECK_EQ(params.type, params_.type); + + if (params.type == params_.type) { + params_ = params; + } + else { + LOG(ERROR) << "Attempt to change denoiser type."; + } +} + +const DenoiseParams &Denoiser::get_params() const +{ + return params_; +} + +bool Denoiser::load_kernels(Progress *progress) +{ + const Device *denoiser_device = ensure_denoiser_device(progress); + + if (!denoiser_device) { + path_trace_device_->set_error("No device available to denoise on"); + return false; + } + + VLOG(3) << "Will denoise on " << denoiser_device->info.description << " (" + << denoiser_device->info.id << ")"; + + return true; +} + +Device *Denoiser::get_denoiser_device() const +{ + return denoiser_device_; +} + +/* Check whether the given device is a single device (not a MultiDevice) and supports the requested denoiser. */ +static bool is_single_supported_device(Device *device, DenoiserType type) +{ + if (device->info.type == DEVICE_MULTI) { + /* Assume a multi-device is never created with a single sub-device. + * If one requests such a configuration it should be checked on the session level. */ + return false; + } + + if (!device->info.multi_devices.empty()) { + /* Some configurations will use multi_devices, but keep the type of an individual device. + * This does simplify checks for homogeneous setups, but here we really need a single device. */ + return false; + } + + /* Check the denoiser type is supported. */ + return (device->info.denoisers & type); +} + +/* Find the best suitable device to perform denoising on. Will iterate over the possible + * sub-devices of a multi-device. + * + * If there is no device available which supports the given denoiser type, nullptr is returned. */ +static Device *find_best_device(Device *device, DenoiserType type) +{ + Device *best_device = nullptr; + + device->foreach_device([&](Device *sub_device) { + if ((sub_device->info.denoisers & type) == 0) { + return; + } + if (!best_device) { + best_device = sub_device; + } + else { + /* TODO(sergey): Choose the fastest device from the available ones, taking into account + * device performance and data transfer cost. 
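+ * For now the first capable sub-device encountered is used, and the rest are ignored. 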
*/ + } + }); + + return best_device; +} + +static unique_ptr<Device> create_denoiser_device(Device *path_trace_device, + const uint device_type_mask) +{ + const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask); + if (device_infos.empty()) { + return nullptr; + } + + /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on + * a physical CUDA device which is already used for rendering. */ + + /* TODO(sergey): Choose the fastest device for denoising. */ + + const DeviceInfo denoiser_device_info = device_infos.front(); + + unique_ptr<Device> denoiser_device( + Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler)); + + if (!denoiser_device) { + return nullptr; + } + + if (denoiser_device->have_error()) { + return nullptr; + } + + /* Only the denoising feature is needed, everything else is unused. */ + if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) { + return nullptr; + } + + return denoiser_device; +} + +Device *Denoiser::ensure_denoiser_device(Progress *progress) +{ + /* The best device has been found already, avoid sequential lookups. + * Additionally, avoid device re-creation if it has failed once. */ + if (denoiser_device_ || device_creation_attempted_) { + return denoiser_device_; + } + + /* Simple case: rendering happens on a single device which also supports the denoiser. */ + if (is_single_supported_device(path_trace_device_, params_.type)) { + denoiser_device_ = path_trace_device_; + return denoiser_device_; + } + + /* Find the best device among the ones which are already used for rendering. */ + denoiser_device_ = find_best_device(path_trace_device_, params_.type); + if (denoiser_device_) { + return denoiser_device_; + } + + if (progress) { + progress->set_status("Loading denoising kernels (may take a few minutes the first time)"); + } + + device_creation_attempted_ = true; + + const uint device_type_mask = get_device_type_mask(); + local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask); + denoiser_device_ = local_denoiser_device_.get(); + + return denoiser_device_; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h new file mode 100644 index 00000000000..3101b45e31b --- /dev/null +++ b/intern/cycles/integrator/denoiser.h @@ -0,0 +1,135 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files + * around if a better place is figured out. */ + +#include "device/device.h" +#include "device/device_denoise.h" +#include "util/util_function.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; +class Device; +class RenderBuffers; +class Progress; + +/* Implementation of a specific denoising algorithm. 
+ * + * This class takes care of breaking down the denoising algorithm into a series of device calls or + * calls to an external API to denoise the given input. + * + * TODO(sergey): Are we better off with a device or a queue here? */ +class Denoiser { + public: + /* Create a denoiser for the given path trace device. + * + * Notes: + * - The denoiser must be configured. This means that `params.use` must be true. + * This is checked in debug builds. + * - The device might be MultiDevice. */ + static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params); + + virtual ~Denoiser() = default; + + void set_params(const DenoiseParams &params); + const DenoiseParams &get_params() const; + + /* Create devices and load kernels needed for denoising. + * The progress is used to communicate state when kernels actually need to be loaded. + * + * NOTE: The `progress` is an optional argument, can be nullptr. */ + virtual bool load_kernels(Progress *progress); + + /* Denoise the entire buffer. + * + * Buffer parameters denote the effective parameters used during rendering. It could be + * a lower resolution render into a bigger allocated buffer, which is used in the viewport + * during navigation with a non-unit pixel size. Use that instead of render_buffers->params. + * + * The buffer might be coming from a "foreign" device, different from the one this denoiser was + * created for. This means that in the general case the denoiser will make sure the input data + * is available on the denoiser device, perform denoising, and put the data back on the device + * where the buffer came from. + * + * The `num_samples` corresponds to the number of samples in the render buffers. It is used + * to scale buffers down to the "final" value in algorithms which don't do automatic exposure, + * or which need the "final" value for data passes. + * + * The `allow_inplace_modification` means that the denoiser is allowed to do in-place + * modification of the input passes (e.g. scaling them down). This lowers the memory + * footprint of the denoiser but makes the input passes "invalid" from the path tracer's point + * of view. + * + * Returns true when all passes are denoised. Will return false if there is a denoiser error + * (for example, caused by a misconfigured denoiser) or when the user requested to cancel + * rendering. */ + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) = 0; + + /* Get the device which is used to perform the actual denoising. + * + * Notes: + * + * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then. + * + * - The device can be different from the path tracing device. This happens, for example, when + * using the OptiX denoiser and rendering on the CPU. + * + * - No thread safety is ensured in this call. This means that it is up to the caller to ensure + * that there is no threading conflict between the denoising task lazily initializing the + * device and other access to this device. */ + Device *get_denoiser_device() const; + + function<bool(void)> is_cancelled_cb; + + bool is_cancelled() const + { + if (!is_cancelled_cb) { + return false; + } + return is_cancelled_cb(); + } + + protected: + Denoiser(Device *path_trace_device, const DenoiseParams &params); + + /* Make sure the denoising device is initialized. */ + virtual Device *ensure_denoiser_device(Progress *progress); + + /* Get the device type mask which is used to filter available devices when a new device needs + * to be created. 
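+ * For example, in this patch the OpenImageDenoise denoiser returns DEVICE_MASK_CPU and the + * OptiX denoiser returns DEVICE_MASK_OPTIX. 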
*/ + virtual uint get_device_type_mask() const = 0; + + Device *path_trace_device_; + DenoiseParams params_; + + /* Cached pointer to the device on which denoising will happen. + * Used to avoid lookup of a device for every denoising request. */ + Device *denoiser_device_ = nullptr; + + /* Denoiser device which was created to perform denoising in case none of the rendering + * devices is capable of denoising. */ + unique_ptr<Device> local_denoiser_device_; + bool device_creation_attempted_ = false; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp new file mode 100644 index 00000000000..8088cfd7800 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.cpp @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser_device.h" + +#include "device/device.h" +#include "device/device_denoise.h" +#include "device/device_memory.h" +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params) + : Denoiser(path_trace_device, params) +{ +} + +DeviceDenoiser::~DeviceDenoiser() +{ + /* Explicit implementation, to allow forward declaration of Device in the header. */ +} + +bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + Device *denoiser_device = get_denoiser_device(); + if (!denoiser_device) { + return false; + } + + DeviceDenoiseTask task; + task.params = params_; + task.num_samples = num_samples; + task.buffer_params = buffer_params; + task.allow_inplace_modification = allow_inplace_modification; + + RenderBuffers local_render_buffers(denoiser_device); + bool local_buffer_used = false; + + if (denoiser_device == render_buffers->buffer.device) { + /* The device can access an existing buffer pointer. */ + local_buffer_used = false; + task.render_buffers = render_buffers; + } + else { + VLOG(3) << "Creating temporary buffer on denoiser device."; + + DeviceQueue *queue = denoiser_device->get_denoise_queue(); + + /* Create a buffer which is accessible by the device used by the denoiser. */ + + /* TODO(sergey): Optimize data transfers. For example, only copy denoising related passes, + * ignoring other light and data passes. */ + + local_buffer_used = true; + + render_buffers->copy_from_device(); + + local_render_buffers.reset(buffer_params); + + /* NOTE: The local buffer is allocated for the exact size of the effective render size, while + * the input render buffer is allocated for the lowest resolution divider possible. So it is + * important to only copy the actually needed part of the input buffer. 
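+ * This is why the memcpy below copies local_render_buffers.buffer.size() floats, and not the + * size of the input render buffer. 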
*/ + memcpy(local_render_buffers.buffer.data(), + render_buffers->buffer.data(), + sizeof(float) * local_render_buffers.buffer.size()); + + queue->copy_to_device(local_render_buffers.buffer); + + task.render_buffers = &local_render_buffers; + task.allow_inplace_modification = true; + } + + const bool denoise_result = denoiser_device->denoise_buffer(task); + + if (local_buffer_used) { + local_render_buffers.copy_from_device(); + + render_buffers_host_copy_denoised( + render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params); + + render_buffers->copy_to_device(); + } + + return denoise_result; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h new file mode 100644 index 00000000000..0fd934dba79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser, + * which is implemented as part of the driver of a specific device. + * + * This implementation makes sure the to-be-denoised buffer is available on the denoising device + * and invokes the denoising kernel via the device API. */ +class DeviceDenoiser : public Denoiser { + public: + DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params); + ~DeviceDenoiser(); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp new file mode 100644 index 00000000000..1b5a012ec87 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.cpp @@ -0,0 +1,628 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/denoiser_oidn.h" + +#include <array> + +#include "device/device.h" +#include "device/device_queue.h" +#include "integrator/pass_accessor_cpu.h" +#include "render/buffers.h" +#include "util/util_array.h" +#include "util/util_logging.h" +#include "util/util_openimagedenoise.h" + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +thread_mutex OIDNDenoiser::mutex_; + +OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : Denoiser(path_trace_device, params) +{ + DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE); + + DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform."; +} + +#ifdef WITH_OPENIMAGEDENOISE +static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/) +{ + OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr); + return !oidn_denoiser->is_cancelled(); +} +#endif + +#ifdef WITH_OPENIMAGEDENOISE + +class OIDNPass { + public: + OIDNPass() = default; + + OIDNPass(const BufferParams &buffer_params, + const char *name, + PassType type, + PassMode mode = PassMode::NOISY) + : name(name), type(type), mode(mode) + { + offset = buffer_params.get_pass_offset(type, mode); + need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + inline operator bool() const + { + return name[0] != '\0'; + } + + /* Name of an image which will be passed to the OIDN library. + * Should be one of the following: color, albedo, normal, output. + * The albedo and normal images are optional. */ + const char *name = ""; + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + int num_components = -1; + bool use_compositing = false; + bool use_denoising_albedo = true; + + /* Offset of beginning of this pass in the render buffers. */ + int offset = -1; + + /* Denotes whether the data is to be scaled down with the number of passes. + * Is required for albedo and normal passes. The color pass OIDN will perform auto-exposure, so + * scaling is not needed for the color pass unless adaptive sampling is used. + * + * NOTE: Do not scale the outout pass, as that requires to be a pointer in the original buffer. + * All the scaling on the output needed for integration with adaptive sampling will happen + * outside of generic pass handling. */ + bool need_scale = false; + + /* The content of the pass has been pre-filtered. */ + bool is_filtered = false; + + /* For the scaled passes, the data which holds values of scaled pixels. 
*/ + array<float> scaled_buffer; +}; + +class OIDNDenoiseContext { + public: + OIDNDenoiseContext(OIDNDenoiser *denoiser, + const DenoiseParams &denoise_params, + const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + const bool allow_inplace_modification) + : denoiser_(denoiser), + denoise_params_(denoise_params), + buffer_params_(buffer_params), + render_buffers_(render_buffers), + num_samples_(num_samples), + allow_inplace_modification_(allow_inplace_modification), + pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT)) + { + if (denoise_params_.use_pass_albedo) { + oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO); + } + + if (denoise_params_.use_pass_normal) { + oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL); + } + } + + bool need_denoising() const + { + if (buffer_params_.width == 0 && buffer_params_.height == 0) { + return false; + } + + return true; + } + + /* Make the guiding passes available by a sequential denoising of various passes. */ + void read_guiding_passes() + { + read_guiding_pass(oidn_albedo_pass_); + read_guiding_pass(oidn_normal_pass_); + } + + void denoise_pass(const PassType pass_type) + { + OIDNPass oidn_color_pass(buffer_params_, "color", pass_type); + if (oidn_color_pass.offset == PASS_UNUSED) { + return; + } + + if (oidn_color_pass.use_denoising_albedo) { + if (albedo_replaced_with_fake_) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + + OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED); + if (oidn_output_pass.offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass); + + oidn::DeviceRef oidn_device = oidn::newDevice(); + oidn_device.commit(); + + /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too. + */ + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_input_pass(oidn_filter, oidn_color_access_pass); + set_guiding_passes(oidn_filter, oidn_color_pass); + set_output_pass(oidn_filter, oidn_output_pass); + oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE || + denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) { + oidn_filter.set("cleanAux", true); + } + oidn_filter.commit(); + + filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_); + filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_); + + /* Filter the beauty image. */ + oidn_filter.execute(); + + /* Check for errors. 
*/ + const char *error_message; + const oidn::Error error = oidn_device.getError(error_message); + if (error != oidn::Error::None && error != oidn::Error::Cancelled) { + LOG(ERROR) << "OpenImageDenoise error: " << error_message; + } + + postprocess_output(oidn_color_pass, oidn_output_pass); + } + + protected: + void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass) + { + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass || + oidn_pass.is_filtered) { + return; + } + + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_pass(oidn_filter, oidn_pass); + set_output_pass(oidn_filter, oidn_pass); + oidn_filter.commit(); + oidn_filter.execute(); + + oidn_pass.is_filtered = true; + } + + /* Make pixels of a guiding pass available to the denoiser. */ + void read_guiding_pass(OIDNPass &oidn_pass) + { + if (!oidn_pass) { + return; + } + + DCHECK(!oidn_pass.use_compositing); + + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE && + !is_pass_scale_needed(oidn_pass)) { + /* Pass data is available as-is from the render buffers. */ + return; + } + + if (allow_inplace_modification_) { + scale_pass_in_render_buffers(oidn_pass); + return; + } + + read_pass_pixels_into_buffer(oidn_pass); + } + + /* Special reader of the input pass. + * To save memory it will read pixels into the output, and let the denoiser perform an + * in-place operation. */ + OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + const bool use_compositing = oidn_input_pass.use_compositing; + + /* Simple case: no compositing is involved, no scaling is needed. + * The pass pixels will be referenced as-is, without extra processing. */ + if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) { + return oidn_input_pass; + } + + float *buffer_data = render_buffers_->buffer.data(); + float *pass_data = buffer_data + oidn_output_pass.offset; + + PassAccessor::Destination destination(pass_data, 3); + destination.pixel_stride = buffer_params_.pass_stride; + + read_pass_pixels(oidn_input_pass, destination); + + OIDNPass oidn_input_pass_at_output = oidn_input_pass; + oidn_input_pass_at_output.offset = oidn_output_pass.offset; + + return oidn_input_pass_at_output; + } + + /* Read pass pixels using PassAccessor into the given destination. */ + void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination) + { + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = oidn_pass.type; + pass_access_info.mode = oidn_pass.mode; + pass_access_info.offset = oidn_pass.offset; + + /* The denoiser operates on the passes which are used to calculate the approximation, and is + * never used on the approximation itself. The latter is not even possible because OIDN does + * not support denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* OIDN will perform an auto-exposure, so it is not required to know the exact exposure + * configured by the user. What is important is to use the same exposure for read and write + * access of the pass pixels. */ + const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_); + + pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination); + } + + /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. 
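+ * The temporary buffer is RGB-only (3 floats per pixel), matching the oidn::Format::Float3 + * layout used when binding images to the filter. 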
*/ + void read_pass_pixels_into_buffer(OIDNPass &oidn_pass) + { + VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " (" + << pass_type_as_string(oidn_pass.type) << ")"; + + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + array<float> &scaled_buffer = oidn_pass.scaled_buffer; + scaled_buffer.resize(width * height * 3); + + const PassAccessor::Destination destination(scaled_buffer.data(), 3); + + read_pass_pixels(oidn_pass, destination); + } + + /* Set OIDN image to reference pixels from the given render buffer pass. + * No transform to the pixels is done, no additional memory is used. */ + void set_pass_referenced(oidn::FilterRef &oidn_filter, + const char *name, + const OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + + const int64_t pixel_index = offset + x + y * stride; + const int64_t buffer_offset = pixel_index * pass_stride; + + float *buffer_data = render_buffers_->buffer.data(); + + oidn_filter.setImage(name, + buffer_data + buffer_offset + oidn_pass.offset, + oidn::Format::Float3, + width, + height, + 0, + pass_stride * sizeof(float), + stride * pass_stride * sizeof(float)); + } + + void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + oidn_filter.setImage( + name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0); + } + + void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, oidn_pass.name, oidn_pass); + } + void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass) + { + if (oidn_pass.scaled_buffer.empty()) { + set_pass_referenced(oidn_filter, name, oidn_pass); + } + else { + set_pass_from_buffer(oidn_filter, name, oidn_pass); + } + } + + void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass); + } + + void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + if (oidn_albedo_pass_) { + if (oidn_pass.use_denoising_albedo) { + set_pass(oidn_filter, oidn_albedo_pass_); + } + else { + /* NOTE: OpenImageDenoise library implicitly expects albedo pass when normal pass has been + * provided. */ + set_fake_albedo_pass(oidn_filter); + } + } + + if (oidn_normal_pass_) { + set_pass(oidn_filter, oidn_normal_pass_); + } + } + + void set_fake_albedo_pass(oidn::FilterRef &oidn_filter) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + if (!albedo_replaced_with_fake_) { + const int64_t num_pixel_components = width * height * 3; + oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components); + + for (int i = 0; i < num_pixel_components; ++i) { + oidn_albedo_pass_.scaled_buffer[i] = 0.5f; + } + + albedo_replaced_with_fake_ = true; + } + + set_pass(oidn_filter, oidn_albedo_pass_); + } + + void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, "output", oidn_pass); + } + + /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel + * back. 
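+ * With adaptive sampling the per-pixel scale is the per-pixel sample count (read via + * __float_as_uint() from the sample count pass); otherwise the constant num_samples_ is used. 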
*/ + void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components); + + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing; + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *denoised_pixel = buffer_pixel + oidn_output_pass.offset; + + if (need_scale) { + const float pixel_scale = has_pass_sample_count ? + __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_; + + denoised_pixel[0] = denoised_pixel[0] * pixel_scale; + denoised_pixel[1] = denoised_pixel[1] * pixel_scale; + denoised_pixel[2] = denoised_pixel[2] * pixel_scale; + } + + if (oidn_output_pass.num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!oidn_input_pass.use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). We implicitly rely on this, as it + * simplifies the logic and avoids extra memory allocation. */ + const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning zero since this is the default alpha value for 3-component passes, and it + * is an opaque pixel for 4-component passes. */ + denoised_pixel[3] = 0; + } + } + } + } + + bool is_pass_scale_needed(OIDNPass &oidn_pass) const + { + if (pass_sample_count_ != PASS_UNUSED) { + /* With adaptive sampling pixels will have a different number of samples in them, so the + * pass always needs to be scaled to make pixels uniformly sampled. */ + return true; + } + + if (!oidn_pass.need_scale) { + return false; + } + + if (num_samples_ == 1) { + /* Avoid scaling if there is only one sample, to save time (so we don't divide the + * buffer by 1). 
*/ + return false; + } + + return true; + } + + void scale_pass_in_render_buffers(OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *pass_pixel = buffer_pixel + oidn_pass.offset; + + const float pixel_scale = 1.0f / (has_pass_sample_count ? + __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_); + + pass_pixel[0] = pass_pixel[0] * pixel_scale; + pass_pixel[1] = pass_pixel[1] * pixel_scale; + pass_pixel[2] = pass_pixel[2] * pixel_scale; + } + } + } + + OIDNDenoiser *denoiser_ = nullptr; + + const DenoiseParams &denoise_params_; + const BufferParams &buffer_params_; + RenderBuffers *render_buffers_ = nullptr; + int num_samples_ = 0; + bool allow_inplace_modification_ = false; + int pass_sample_count_ = PASS_UNUSED; + + /* Optional albedo and normal passes, reused by denoising of different pass types. */ + OIDNPass oidn_albedo_pass_; + OIDNPass oidn_normal_pass_; + + /* For passes which don't need the albedo channel for denoising, we replace the actual albedo + * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values, and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake_ = false; +}; +#endif + +static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers) +{ + Device *device = render_buffers->buffer.device; + if (device->info.has_gpu_queue) { + return device->gpu_queue_create(); + } + return nullptr; +} + +static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_from_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_from_device(); + } +} + +static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_to_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_to_device(); + } +} + +bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + thread_scoped_lock lock(mutex_); + + /* Make sure the host-side data is available for denoising. */ + unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers); + copy_render_buffers_from_device(queue, render_buffers); + +#ifdef WITH_OPENIMAGEDENOISE + OIDNDenoiseContext context( + this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification); + + if (context.need_denoising()) { + context.read_guiding_passes(); + + const std::array<PassType, 3> passes = { + {/* Passes which will use the real albedo when it is available. 
*/ + PASS_COMBINED, + PASS_SHADOW_CATCHER_MATTE, + + /* Passes which do not need albedo, and hence, if the real one is present, it needs to + * become fake. */ + PASS_SHADOW_CATCHER}}; + + for (const PassType pass_type : passes) { + context.denoise_pass(pass_type); + if (is_cancelled()) { + return false; + } + } + + /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code + * copies data from the device it doesn't overwrite the denoiser buffers. */ + copy_render_buffers_to_device(queue, render_buffers); + } +#endif + + /* This code is not supposed to run when compiled without OIDN support, so we can assume that + * if we made it here, all passes are properly denoised. */ + return true; +} + +uint OIDNDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_CPU; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h new file mode 100644 index 00000000000..566e761ae79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_thread.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Implementation of the denoising API which uses the OpenImageDenoise library. */ +class OIDNDenoiser : public Denoiser { + public: + /* Forward-declared state which might use compile-flag-specific fields, such as + * OpenImageDenoise device and filter handles. */ + class State; + + OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; + + protected: + virtual uint get_device_type_mask() const override; + + /* We only perform one denoising operation at a time, since OpenImageDenoise itself is + * multithreaded. Use this mutex whenever images are passed to OIDN and need to be denoised. */ + static thread_mutex mutex_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_optix.cpp b/intern/cycles/integrator/denoiser_optix.cpp new file mode 100644 index 00000000000..5f9de23bfe6 --- /dev/null +++ b/intern/cycles/integrator/denoiser_optix.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/denoiser_optix.h" + +#include "device/device.h" +#include "device/device_denoise.h" + +CCL_NAMESPACE_BEGIN + +OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : DeviceDenoiser(path_trace_device, params) +{ +} + +uint OptiXDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_OPTIX; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_optix.h b/intern/cycles/integrator/denoiser_optix.h new file mode 100644 index 00000000000..a8df770ecf7 --- /dev/null +++ b/intern/cycles/integrator/denoiser_optix.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser_device.h" + +CCL_NAMESPACE_BEGIN + +class OptiXDenoiser : public DeviceDenoiser { + public: + OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + + protected: + virtual uint get_device_type_mask() const override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp new file mode 100644 index 00000000000..87c048b1fa5 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.cpp @@ -0,0 +1,318 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor.h" + +#include "render/buffers.h" +#include "util/util_logging.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/kernel_types.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Pass input information. + */ + +PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass) + : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset) +{ +} + +/* -------------------------------------------------------------------- + * Pass destination. + */ + +PassAccessor::Destination::Destination(float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels) + : Destination(pass_type) +{ + pixels_half_rgba = pixels; +} + +PassAccessor::Destination::Destination(const PassType pass_type) +{ + const PassInfo pass_info = Pass::get_info(pass_type); + num_components = pass_info.num_components; +} + +/* -------------------------------------------------------------------- + * Pass source. 
+ */ + +PassAccessor::Source::Source(const float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +/* -------------------------------------------------------------------- + * Pass accessor. + */ + +PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples) + : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples) +{ +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + return get_render_tile_pixels(render_buffers, render_buffers->params, destination); +} + +static void pad_pixels(const BufferParams &buffer_params, + const PassAccessor::Destination &destination, + const int src_num_components) +{ + /* When requesting a single channel pass as RGBA, or RGB pass as RGBA, + * fill in the additional components for convenience. */ + const int dest_num_components = destination.num_components; + + if (src_num_components >= dest_num_components) { + return; + } + + const size_t size = buffer_params.width * buffer_params.height; + if (destination.pixels) { + float *pixel = destination.pixels; + + for (size_t i = 0; i < size; i++, pixel += dest_num_components) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[1] = pixel[0]; + pixel[2] = pixel[0]; + } + if (dest_num_components >= 4) { + pixel[3] = 1.0f; + } + } + } + + if (destination.pixels_half_rgba) { + const half one = float_to_half(1.0f); + half4 *pixel = destination.pixels_half_rgba; + + for (size_t i = 0; i < size; i++, pixel++) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[0].y = pixel[0].x; + pixel[0].z = pixel[0].x; + } + if (dest_num_components >= 4) { + pixel[0].w = one; + } + } + } +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + if (pass_access_info_.offset == PASS_UNUSED) { + return false; + } + + const PassType type = pass_access_info_.type; + const PassMode mode = pass_access_info_.mode; + const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo); + + if (pass_info.num_components == 1) { + /* Single channel passes. */ + if (mode == PassMode::DENOISED) { + /* Denoised passes store their final pixels, no need in special calculation. */ + get_pass_float(render_buffers, buffer_params, destination); + } + else if (type == PASS_RENDER_TIME) { + /* TODO(sergey): Needs implementation. */ + } + else if (type == PASS_DEPTH) { + get_pass_depth(render_buffers, buffer_params, destination); + } + else if (type == PASS_MIST) { + get_pass_mist(render_buffers, buffer_params, destination); + } + else if (type == PASS_SAMPLE_COUNT) { + get_pass_sample_count(render_buffers, buffer_params, destination); + } + else { + get_pass_float(render_buffers, buffer_params, destination); + } + } + else if (type == PASS_MOTION) { + /* Motion pass. */ + DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components"; + get_pass_motion(render_buffers, buffer_params, destination); + } + else if (type == PASS_CRYPTOMATTE) { + /* Cryptomatte pass. 
*/ + DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components"; + get_pass_cryptomatte(render_buffers, buffer_params, destination); + } + else { + /* RGB, RGBA and vector passes. */ + DCHECK(destination.num_components == 3 || destination.num_components == 4) + << pass_type_as_string(type) << " pass must have 3 or 4 components"; + + if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) { + /* Denoised matte with shadow needs to do calculation (will use denoised shadow catcher pass + * to approximate shadow with). */ + get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination); + } + else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) { + /* Shadow catcher pass. */ + get_pass_shadow_catcher(render_buffers, buffer_params, destination); + } + else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE || + pass_info.indirect_type != PASS_NONE) && + mode != PassMode::DENOISED) { + /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */ + get_pass_light_path(render_buffers, buffer_params, destination); + } + else { + /* Passes that need no special computation, or denoised passes that already + * had the computation done. */ + if (pass_info.num_components == 3) { + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (pass_info.num_components == 4) { + if (destination.num_components == 3) { + /* Special case for denoiser access of RGBA passes ignoring alpha channel. */ + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER || + type == PASS_SHADOW_CATCHER_MATTE) { + /* Passes with transparency as 4th component. */ + get_pass_combined(render_buffers, buffer_params, destination); + } + else { + /* Passes with alpha as 4th component. */ + get_pass_float4(render_buffers, buffer_params, destination); + } + } + } + } + + pad_pixels(buffer_params, destination, pass_info.num_components); + + return true; +} + +void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const +{ + const PassMode mode = pass_access_info_.mode; + const PassInfo &pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + kfilm_convert->pass_offset = pass_access_info_.offset; + kfilm_convert->pass_stride = buffer_params.pass_stride; + + kfilm_convert->pass_use_exposure = pass_info.use_exposure; + kfilm_convert->pass_use_filter = pass_info.use_filter; + + /* TODO(sergey): Some of the passes needs to become denoised when denoised pass is accessed. 
*/ + if (pass_info.direct_type != PASS_NONE) { + kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type); + } + kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type); + kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type); + + kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED); + kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset( + PASS_ADAPTIVE_AUX_BUFFER); + kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT); + kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode); + kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_SAMPLE_COUNT); + kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_MATTE, mode); + + /* Background is not denoised, so always use noisy pass. */ + kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND); + + if (pass_info.use_filter) { + kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f; + } + else { + kfilm_convert->scale = 1.0f; + } + + if (pass_info.use_exposure) { + kfilm_convert->exposure = exposure_; + } + else { + kfilm_convert->exposure = 1.0f; + } + + kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure; + + kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher; + kfilm_convert->use_approximate_shadow_catcher_background = + pass_access_info_.use_approximate_shadow_catcher_background; + kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels; + + kfilm_convert->num_components = destination.num_components; + kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride : + destination.num_components; + + kfilm_convert->is_denoised = (mode == PassMode::DENOISED); +} + +bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source) +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + const PassInfo pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + const BufferParams &buffer_params = render_buffers->params; + + float *buffer_data = render_buffers->buffer.data(); + const int size = buffer_params.width * buffer_params.height; + + const int out_stride = buffer_params.pass_stride; + const int in_stride = source.num_components; + const int num_components_to_copy = min(source.num_components, pass_info.num_components); + + float *out = buffer_data + pass_access_info_.offset; + const float *in = source.pixels + source.offset * in_stride; + + for (int i = 0; i < size; i++, out += out_stride, in += in_stride) { + memcpy(out, in, sizeof(float) * num_components_to_copy); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h new file mode 100644 index 00000000000..624bf7d0b2c --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.h @@ -0,0 +1,160 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which provides access to pass data.
+ * It is designed to be created once when the pass data is known; pixels then get progressively
+ * updated from various render buffers. */
+class PassAccessor {
+ public:
+  class PassAccessInfo {
+   public:
+    PassAccessInfo() = default;
+    explicit PassAccessInfo(const BufferPass &pass);
+
+    PassType type = PASS_NONE;
+    PassMode mode = PassMode::NOISY;
+    bool include_albedo = false;
+    int offset = -1;
+
+    /* For the shadow catcher matte pass: whether to approximate shadow catcher pass into its
+     * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop.
+     */
+    bool use_approximate_shadow_catcher = false;
+
+    /* When the approximate shadow catcher matte is used, alpha-over the result on top of the
+     * background. */
+    bool use_approximate_shadow_catcher_background = false;
+
+    bool show_active_pixels = false;
+  };
+
+  class Destination {
+   public:
+    Destination() = default;
+    Destination(float *pixels, int num_components);
+    Destination(const PassType pass_type, half4 *pixels);
+
+    /* Destination will be initialized with the number of components which is native for the given
+     * pass type. */
+    explicit Destination(const PassType pass_type);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+    float *pixels = nullptr;
+    half4 *pixels_half_rgba = nullptr;
+
+    /* Device-side pointers. */
+    device_ptr d_pixels = 0;
+    device_ptr d_pixels_half_rgba = 0;
+
+    /* Number of components per pixel in the floating-point destination.
+     * It is ignored for the half4 destination (where the number of components is implied to be
+     * 4). */
+    int num_components = 0;
+
+    /* Offset in pixels from the beginning of pixels storage.
+     * Allows writing pixels of the render buffer into a partial slice of the destination. */
+    int offset = 0;
+
+    /* Number of floats per pixel. When zero, it is the same as `num_components`.
+     *
+     * NOTE: Is ignored for the half4 destination, as the half4 pixels are always 4-component
+     * half-floats. */
+    int pixel_stride = 0;
+
+    /* Row stride in pixel elements:
+     * - For the float destination the stride is the number of floats per row.
+     * - For the half4 destination the stride is the number of half4 elements per row. */
+    int stride = 0;
+  };
+
+  class Source {
+   public:
+    Source() = default;
+    Source(const float *pixels, int num_components);
+
+    /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+    const float *pixels = nullptr;
+    int num_components = 0;
+
+    /* Offset in pixels from the beginning of pixels storage.
+     * Allows reading pixels from a partial slice of the source storage.
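+     *
+     * For illustration, a sketch (using the names from this header) of how
+     * `set_render_tile_pixels()` resolves the read pointer for the first copied pixel:
+     *
+     *   const float *in = pixels + offset * num_components;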
*/ + int offset = 0; + }; + + PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples); + + virtual ~PassAccessor() = default; + + /* Get pass data from the given render buffers, perform needed filtering, and store result into + * the pixels. + * The result is stored sequentially starting from the very beginning of the pixels memory. */ + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const; + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + /* Set pass data for the given render buffers. Used for baking to read from passes. */ + bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source); + + protected: + virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const = 0; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR + + PassAccessInfo pass_access_info_; + + float exposure_ = 0.0f; + int num_samples_ = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp new file mode 100644 index 00000000000..3c6691f6d43 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -0,0 +1,183 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor_cpu.h" + +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_film.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Kernel processing. 
+ */ + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + if (destination.pixels) { + /* NOTE: No overlays are applied since they are not used for final renders. + * Can be supported via some sort of specialization to avoid code duplication. */ + + run_get_pass_kernel_processor_float( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + + if (destination.pixels_half_rgba) { + /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */ + + if (destination.num_components == 1) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float pixel; + processor(kfilm_convert, buffer, &pixel); + + pixel_rgba[0] = pixel; + pixel_rgba[1] = pixel; + pixel_rgba[2] = pixel; + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 3) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 4) { + run_get_pass_kernel_processor_half_rgba( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + } +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_float( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + const float *buffer_data = render_buffers->buffer.data(); + const int pixel_stride = destination.pixel_stride ? destination.pixel_stride : + destination.num_components; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride; + + processor(kfilm_convert, buffer, pixel); + } + }); +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + const float *buffer_data = render_buffers->buffer.data(); + + half4 *dst_start = destination.pixels_half_rgba + destination.offset; + const int destination_stride = destination.stride != 0 ? 
destination.stride : + buffer_params.width; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + half4 *dst_row_start = dst_start + y * destination_stride; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + half4 *pixel_half_rgba = dst_row_start + x; + float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + } + }); +} + +/* -------------------------------------------------------------------- + * Pass accessors. + */ + +#define DEFINE_PASS_ACCESSOR(pass) \ + void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_get_pass_kernel_processor( \ + render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth) +DEFINE_PASS_ACCESSOR(mist) +DEFINE_PASS_ACCESSOR(sample_count) +DEFINE_PASS_ACCESSOR(float) + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path) +DEFINE_PASS_ACCESSOR(shadow_catcher) +DEFINE_PASS_ACCESSOR(float3) + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion) +DEFINE_PASS_ACCESSOR(cryptomatte) +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) +DEFINE_PASS_ACCESSOR(combined) +DEFINE_PASS_ACCESSOR(float4) + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h new file mode 100644 index 00000000000..0313dc5bb0d --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/pass_accessor.h" + +CCL_NAMESPACE_BEGIN + +struct KernelFilmConvert; + +/* Pass accessor implementation for CPU side. 
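+ *
+ * A hypothetical usage sketch (names and values are illustrative, using the declarations from
+ * pass_accessor.h):
+ *
+ *   PassAccessor::PassAccessInfo access_info;
+ *   access_info.type = PASS_COMBINED;
+ *
+ *   PassAccessorCPU pass_accessor(access_info, 1.0f, 128);
+ *
+ *   PassAccessor::Destination destination(pixels, 4);
+ *   pass_accessor.get_render_tile_pixels(render_buffers, destination);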
*/ +class PassAccessorCPU : public PassAccessor { + public: + using PassAccessor::PassAccessor; + + protected: + template<typename Processor> + inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp new file mode 100644 index 00000000000..eb80ba99655 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.cpp @@ -0,0 +1,118 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor_gpu.h" + +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples) + : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue) + +{ +} + +/* -------------------------------------------------------------------- + * Kernel execution. + */ + +void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + const int work_size = buffer_params.width * buffer_params.height; + + const int destination_stride = destination.stride != 0 ? 
destination.stride : + buffer_params.width; + + if (destination.d_pixels) { + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel, work_size, args); + } + if (destination.d_pixels_half_rgba) { + const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1); + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels_half_rgba), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel_half_float, work_size, args); + } + + queue_->synchronize(); +} + +/* -------------------------------------------------------------------- + * Pass accessors. + */ + +#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \ + void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_film_convert_kernels( \ + DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth, DEPTH); +DEFINE_PASS_ACCESSOR(mist, MIST); +DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT); +DEFINE_PASS_ACCESSOR(float, FLOAT); + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH); +DEFINE_PASS_ACCESSOR(float3, FLOAT3); + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion, MOTION); +DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE); +DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER); +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW); +DEFINE_PASS_ACCESSOR(combined, COMBINED); +DEFINE_PASS_ACCESSOR(float4, FLOAT4); + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h new file mode 100644 index 00000000000..bc37e4387f3 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/pass_accessor.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +/* Pass accessor implementation for GPU side. 
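+ *
+ * A sketch of the difference from the CPU accessor (illustrative only): the constructor
+ * additionally takes the device queue which the film convert kernels are enqueued to:
+ *
+ *   PassAccessorGPU pass_accessor(queue, access_info, 1.0f, 128);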
*/ +class PassAccessorGPU : public PassAccessor { + public: + PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples); + + protected: + void run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth); + DECLARE_PASS_ACCESSOR(mist); + DECLARE_PASS_ACCESSOR(sample_count); + DECLARE_PASS_ACCESSOR(float); + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path); + DECLARE_PASS_ACCESSOR(float3); + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion); + DECLARE_PASS_ACCESSOR(cryptomatte); + DECLARE_PASS_ACCESSOR(shadow_catcher); + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow); + DECLARE_PASS_ACCESSOR(combined); + DECLARE_PASS_ACCESSOR(float4); + +#undef DECLARE_PASS_ACCESSOR + + DeviceQueue *queue_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp new file mode 100644 index 00000000000..6c02316ac2b --- /dev/null +++ b/intern/cycles/integrator/path_trace.cpp @@ -0,0 +1,1147 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace.h" + +#include "device/cpu/device.h" +#include "device/device.h" +#include "integrator/pass_accessor.h" +#include "integrator/render_scheduler.h" +#include "render/gpu_display.h" +#include "render/pass.h" +#include "render/scene.h" +#include "render/tile.h" +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +PathTrace::PathTrace(Device *device, + Film *film, + DeviceScene *device_scene, + RenderScheduler &render_scheduler, + TileManager &tile_manager) + : device_(device), + device_scene_(device_scene), + render_scheduler_(render_scheduler), + tile_manager_(tile_manager) +{ + DCHECK_NE(device_, nullptr); + + { + vector<DeviceInfo> cpu_devices; + device_cpu_info(cpu_devices); + + cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler)); + } + + /* Create path tracing work in advance, so that it can be reused by incremental sampling as much + * as possible. 
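+   *
+   * For example (an illustration, not an exhaustive list): rendering on an OptiX device plus the
+   * CPU results in one PathTraceWork per device, each responsible for its own slice of the big
+   * tile.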
+   */
+  device_->foreach_device([&](Device *path_trace_device) {
+    path_trace_works_.emplace_back(PathTraceWork::create(
+        path_trace_device, film, device_scene, &render_cancel_.is_requested));
+  });
+
+  work_balance_infos_.resize(path_trace_works_.size());
+  work_balance_do_initial(work_balance_infos_);
+
+  render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+  /* Destroy any GPU resource which was used for graphics interop.
+   * Need to have access to the GPUDisplay as it is the only source of drawing context which is
+   * used for interop. */
+  if (gpu_display_) {
+    for (auto &&path_trace_work : path_trace_works_) {
+      path_trace_work->destroy_gpu_resources(gpu_display_.get());
+    }
+  }
+}
+
+void PathTrace::load_kernels()
+{
+  if (denoiser_) {
+    denoiser_->load_kernels(progress_);
+  }
+}
+
+void PathTrace::alloc_work_memory()
+{
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->alloc_work_memory();
+  }
+}
+
+bool PathTrace::ready_to_reset()
+{
+  /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+   * display. If there is no such display, the logic here will break. */
+  DCHECK(gpu_display_);
+
+  /* The logic here tries to provide the behavior which feels most interactive to artists.
+   * The general idea is to reset as quickly as possible, while still providing an interactive
+   * feel.
+   *
+   * If the render result was ever drawn after the previous reset, consider that a reset is now
+   * possible. This way camera navigation gives the quickest feedback of rendered pixels,
+   * regardless of whether the CPU or GPU drawing pipeline is used.
+   *
+   * Consider a reset happening after a redraw "slow" enough to not clog anything. This is a bit
+   * arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+  if (did_draw_after_reset_) {
+    return true;
+  }
+
+  return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+  if (big_tile_params_.modified(big_tile_params)) {
+    big_tile_params_ = big_tile_params;
+    render_state_.need_reset_params = true;
+  }
+
+  full_params_ = full_params;
+
+  /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
+   * It is still required to be informed about a reset whenever it happens, so that the redraw
+   * state tracking is properly updated. */
+  if (gpu_display_) {
+    gpu_display_->reset(full_params);
+  }
+
+  render_state_.has_denoised_result = false;
+  render_state_.tile_written = false;
+
+  did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+  /* Free render buffers used by the path trace work to reduce memory peak. */
+  BufferParams empty_params;
+  empty_params.pass_stride = 0;
+  empty_params.update_offset_stride();
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->get_render_buffers()->reset(empty_params);
+  }
+  render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+  progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+  /* Indicate that rendering has started and that it can be requested to cancel. */
+  {
+    thread_scoped_lock lock(render_cancel_.mutex);
+    if (render_cancel_.is_requested) {
+      return;
+    }
+    render_cancel_.is_rendering = true;
+  }
+
+  render_pipeline(render_work);
+
+  /* Indicate that rendering has finished, making it so the thread which requested `cancel()` can
+   * carry on.
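+   *
+   * A sketch of the handshake with `cancel()` (see below), for illustration:
+   *
+   *   render() thread                cancel() thread
+   *     is_rendering = true            is_requested = true
+   *     render_pipeline(...)           while (is_rendering)
+   *     is_rendering = false             condition.wait(lock)
+   *     condition.notify_one()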
+   */
+  {
+    thread_scoped_lock lock(render_cancel_.mutex);
+    render_cancel_.is_rendering = false;
+    render_cancel_.condition.notify_one();
+  }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+  /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+   * checked in Session, and in that case the current work is still to be finished here. */
+
+  render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+                                                  0);
+
+  render_init_kernel_execution();
+
+  render_scheduler_.report_work_begin(render_work);
+
+  init_render_buffers(render_work);
+
+  rebalance(render_work);
+
+  path_trace(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  adaptive_sample(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  cryptomatte_postprocess(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  denoise(render_work);
+  if (render_cancel_.is_requested) {
+    return;
+  }
+
+  write_tile_buffer(render_work);
+  update_display(render_work);
+
+  progress_update_if_needed(render_work);
+
+  finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+  for (auto &&path_trace_work : path_trace_works_) {
+    path_trace_work->init_execution();
+  }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. Should not be a
+ * measurable performance impact at runtime, but will make compilation faster and the binary
+ * somewhat smaller. */
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+                                         const vector<WorkBalanceInfo> &work_balance_infos,
+                                         const BufferParams &buffer_params,
+                                         const Callback &callback)
+{
+  const int num_works = path_trace_works.size();
+  const int height = buffer_params.height;
+
+  int current_y = 0;
+  for (int i = 0; i < num_works; ++i) {
+    const double weight = work_balance_infos[i].weight;
+    const int slice_height = max(lround(height * weight), 1);
+
+    /* Disallow negative values to deal with situations when there are more compute devices than
+     * scanlines.
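+     *
+     * As a worked example (illustrative numbers): with height = 100 and two works weighted
+     * 0.75 and 0.25, the first slice gets max(lround(100 * 0.75), 1) = 75 scanlines and the last
+     * slice gets the remaining 25.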
+     */
+    const int remaining_height = max(0, height - current_y);
+
+    BufferParams slice_params = buffer_params;
+    slice_params.full_y = buffer_params.full_y + current_y;
+    if (i < num_works - 1) {
+      slice_params.height = min(slice_height, remaining_height);
+    }
+    else {
+      slice_params.height = remaining_height;
+    }
+
+    slice_params.update_offset_stride();
+
+    callback(path_trace_works[i].get(), slice_params);
+
+    current_y += slice_params.height;
+  }
+}
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+  foreach_sliced_buffer_params(path_trace_works_,
+                               work_balance_infos_,
+                               big_tile_params_,
+                               [](PathTraceWork *path_trace_work, const BufferParams &params) {
+                                 RenderBuffers *buffers = path_trace_work->get_render_buffers();
+                                 buffers->reset(params);
+                               });
+}
+
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+  BufferParams scaled_params = params;
+
+  scaled_params.width = max(1, params.width / resolution_divider);
+  scaled_params.height = max(1, params.height / resolution_divider);
+  scaled_params.full_x = params.full_x / resolution_divider;
+  scaled_params.full_y = params.full_y / resolution_divider;
+  scaled_params.full_width = params.full_width / resolution_divider;
+  scaled_params.full_height = params.full_height / resolution_divider;
+
+  scaled_params.update_offset_stride();
+
+  return scaled_params;
+}
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+  const int resolution_divider = render_work.resolution_divider;
+
+  const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+  const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+                                                                  resolution_divider);
+
+  foreach_sliced_buffer_params(path_trace_works_,
+                               work_balance_infos_,
+                               scaled_big_tile_params,
+                               [&](PathTraceWork *path_trace_work, const BufferParams params) {
+                                 path_trace_work->set_effective_buffer_params(
+                                     scaled_full_params, scaled_big_tile_params, params);
+                               });
+
+  render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+  if (render_state_.need_reset_params) {
+    update_allocated_work_buffer_params();
+  }
+
+  if (render_state_.need_reset_params ||
+      render_state_.resolution_divider != render_work.resolution_divider) {
+    update_effective_work_buffer_params(render_work);
+  }
+
+  render_state_.resolution_divider = render_work.resolution_divider;
+  render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+  update_work_buffer_params_if_needed(render_work);
+
+  /* Handle initialization scheduled by the render scheduler. */
+  if (render_work.init_render_buffers) {
+    tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+      path_trace_work->zero_render_buffers();
+    });
+
+    tile_buffer_read();
+  }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+  if (!render_work.path_trace.num_samples) {
+    return;
+  }
+
+  VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+          << " samples at the resolution divider " << render_work.resolution_divider;
+
+  const double start_time = time_dt();
+
+  const int num_works = path_trace_works_.size();
+
+  tbb::parallel_for(0, num_works, [&](int i) {
+    const double work_start_time = time_dt();
+    const int num_samples = render_work.path_trace.num_samples;
+
+    PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+    PathTraceWork::RenderStatistics statistics;
+    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+    const double work_time = time_dt() - work_start_time;
+    work_balance_infos_[i].time_spent += work_time;
+    work_balance_infos_[i].occupancy = statistics.occupancy;
+
+    VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+            << work_time / num_samples
+            << " seconds per sample), occupancy: " << statistics.occupancy;
+  });
+
+  float occupancy_accum = 0.0f;
+  for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+    occupancy_accum += balance_info.occupancy;
+  }
+  const float occupancy = occupancy_accum / num_works;
+  render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+  render_scheduler_.report_path_trace_time(
+      render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+  if (!render_work.adaptive_sampling.filter) {
+    return;
+  }
+
+  bool did_reschedule_on_idle = false;
+
+  while (true) {
+    VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+            << render_work.adaptive_sampling.threshold;
+    if (render_work.adaptive_sampling.reset) {
+      VLOG(3) << "Will re-calculate convergence flag for currently converged pixels.";
+    }
+
+    const double start_time = time_dt();
+
+    uint num_active_pixels = 0;
+    tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+      const uint num_active_pixels_in_work =
+          path_trace_work->adaptive_sampling_converge_filter_count_active(
+              render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+      if (num_active_pixels_in_work) {
+        atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+      }
+    });
+
+    render_scheduler_.report_adaptive_filter_time(
+        render_work, time_dt() - start_time, is_cancel_requested());
+
+    if (num_active_pixels == 0) {
+      VLOG(3) << "All pixels converged.";
+      if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+        break;
+      }
+      VLOG(3) << "Continuing with lower threshold.";
+    }
+    else if (did_reschedule_on_idle) {
+      break;
+    }
+    else if (num_active_pixels < 128 * 128) {
+      /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep the GPU busy so
+       * that there is no performance loss from the progressive noise floor feature.
+       *
+       * A better heuristic is possible here: for example, use the maximum of 128^2 and a
+       * percentage of the final resolution.
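+       *
+       * Such a heuristic could look like this (a sketch, not what is implemented here):
+       *
+       *   const int min_active_pixels = max(128 * 128, width * height / 100);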
+       */
+      if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+        VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+        break;
+      }
+      VLOG(3) << "Rescheduling with a lower threshold.";
+      did_reschedule_on_idle = true;
+    }
+    else {
+      break;
+    }
+  }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+  render_scheduler_.set_denoiser_params(params);
+
+  if (!params.use) {
+    denoiser_.reset();
+    return;
+  }
+
+  if (denoiser_) {
+    const DenoiseParams old_denoiser_params = denoiser_->get_params();
+    if (old_denoiser_params.type == params.type) {
+      denoiser_->set_params(params);
+      return;
+    }
+  }
+
+  denoiser_ = Denoiser::create(device_, params);
+  denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+  render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+  if (!render_work.cryptomatte.postprocess) {
+    return;
+  }
+  VLOG(3) << "Perform cryptomatte work.";
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    path_trace_work->cryptomatte_postproces();
+  });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+  if (!render_work.tile.denoise) {
+    return;
+  }
+
+  if (!denoiser_) {
+    /* Denoiser was not configured, so nothing to do here. */
+    return;
+  }
+
+  VLOG(3) << "Perform denoising work.";
+
+  const double start_time = time_dt();
+
+  RenderBuffers *buffer_to_denoise = nullptr;
+
+  unique_ptr<RenderBuffers> multi_device_buffers;
+  bool allow_inplace_modification = false;
+
+  if (path_trace_works_.size() == 1) {
+    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  }
+  else {
+    Device *denoiser_device = denoiser_->get_denoiser_device();
+    if (!denoiser_device) {
+      return;
+    }
+
+    multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+    multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+    buffer_to_denoise = multi_device_buffers.get();
+
+    copy_to_render_buffers(multi_device_buffers.get());
+
+    allow_inplace_modification = true;
+  }
+
+  if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+                                buffer_to_denoise,
+                                get_num_samples_in_buffer(),
+                                allow_inplace_modification)) {
+    render_state_.has_denoised_result = true;
+  }
+
+  if (multi_device_buffers) {
+    multi_device_buffers->copy_from_device();
+    tbb::parallel_for_each(
+        path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+          path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+        });
+  }
+
+  render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+  gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+  if (gpu_display_) {
+    gpu_display_->clear();
+  }
+}
+
+void PathTrace::draw()
+{
+  if (!gpu_display_) {
+    return;
+  }
+
+  did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+  if (!render_work.display.update) {
+    return;
+  }
+
+  if (!gpu_display_ && !tile_buffer_update_cb) {
+    VLOG(3) << "Ignore display update.";
+    return;
+  }
+
+  if (full_params_.width == 0 || full_params_.height == 0) {
+    VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+    return;
+  }
+
+  const double start_time = time_dt();
+
+  if (tile_buffer_update_cb) {
+    VLOG(3) << "Invoke buffer update callback.";
+
+    tile_buffer_update_cb();
+  }
+
+  if (gpu_display_) {
+    VLOG(3) << "Perform copy to GPUDisplay work.";
+
+    const int resolution_divider = render_work.resolution_divider;
+    const int texture_width = max(1, full_params_.width / resolution_divider);
+    const int texture_height = max(1, full_params_.height / resolution_divider);
+    if (!gpu_display_->update_begin(texture_width, texture_height)) {
+      LOG(ERROR) << "Error beginning GPUDisplay update.";
+      return;
+    }
+
+    const PassMode pass_mode = render_work.display.use_denoised_result &&
+                                       render_state_.has_denoised_result ?
+                                   PassMode::DENOISED :
+                                   PassMode::NOISY;
+
+    /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+     * all works in parallel. */
+    const int num_samples = get_num_samples_in_buffer();
+    for (auto &&path_trace_work : path_trace_works_) {
+      path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+    }
+
+    gpu_display_->update_end();
+  }
+
+  render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+  static const int kLogLevel = 3;
+
+  if (!render_work.rebalance) {
+    return;
+  }
+
+  const int num_works = path_trace_works_.size();
+
+  if (num_works == 1) {
+    VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+    return;
+  }
+
+  const double start_time = time_dt();
+
+  if (VLOG_IS_ON(kLogLevel)) {
+    VLOG(kLogLevel) << "Perform rebalance work.";
+    VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+    for (int i = 0; i < num_works; ++i) {
+      VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+                      << work_balance_infos_[i].time_spent;
+    }
+  }
+
+  const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+  if (VLOG_IS_ON(kLogLevel)) {
+    VLOG(kLogLevel) << "Calculated per-device weights for works:";
+    for (int i = 0; i < num_works; ++i) {
+      VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+                      << work_balance_infos_[i].weight;
+    }
+  }
+
+  if (!did_rebalance) {
+    VLOG(kLogLevel) << "Balance in path trace works did not change.";
+    render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+    return;
+  }
+
+  RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+  big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+  copy_to_render_buffers(&big_tile_cpu_buffers);
+
+  render_state_.need_reset_params = true;
+  update_work_buffer_params_if_needed(render_work);
+
+  copy_from_render_buffers(&big_tile_cpu_buffers);
+
+  render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+  if (!render_work.tile.write) {
+    return;
+  }
+
+  VLOG(3) << "Write tile result.";
+
+  render_state_.tile_written = true;
+
+  const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+  /* Write render tile result, but only if not using tiled rendering.
+   *
+   * Tiles are written to a file during rendering, and written to the software at the end
+   * of rendering (either when all tiles are finished, or when rendering was requested to be
+   * cancelled).
+   *
+   * The important thing is: the tile should be written to the software via the callback only
+   * once. */
+  if (!has_multiple_tiles) {
+    VLOG(3) << "Write tile result via buffer write callback.";
+    tile_buffer_write();
+  }
+
+  /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+   */
+  if (has_multiple_tiles) {
+    VLOG(3) << "Write tile result into file on disk.";
+    tile_buffer_write_to_disk();
+  }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+  if (!render_work.full.write) {
+    return;
+  }
+
+  VLOG(3) << "Handle full-frame render buffer work.";
+
+  if (!tile_manager_.has_written_tiles()) {
+    VLOG(3) << "No tiles on disk.";
+    return;
+  }
+
+  /* Make sure writing to the file is fully finished.
+   * This will include writing all possible missing tiles, ensuring validity of the file. */
+  tile_manager_.finish_write_tiles();
+
+  /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after
+   * all scenes and layers are rendered by the Session (which happens after freeing Session memory,
+   * so that we never hold scene and full-frame buffer in memory at the same time). */
+}
+
+void PathTrace::cancel()
+{
+  thread_scoped_lock lock(render_cancel_.mutex);
+
+  render_cancel_.is_requested = true;
+
+  while (render_cancel_.is_rendering) {
+    render_cancel_.condition.wait(lock);
+  }
+
+  render_cancel_.is_requested = false;
+}
+
+int PathTrace::get_num_samples_in_buffer()
+{
+  return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+  if (render_cancel_.is_requested) {
+    return true;
+  }
+
+  if (progress_ != nullptr) {
+    if (progress_->get_cancel()) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+  if (!tile_buffer_write_cb) {
+    return;
+  }
+
+  tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+  if (!tile_buffer_read_cb) {
+    return;
+  }
+
+  if (tile_buffer_read_cb()) {
+    tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+      path_trace_work->copy_render_buffers_to_device();
+    });
+  }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+  /* Sample count pass is required to support per-tile partial results stored in the file. */
+  DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+  const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+  if (num_rendered_samples == 0) {
+    /* The tile has zero samples, no need to write it. */
+    return;
+  }
+
+  /* Get access to the CPU-side render buffers of the current big tile.
*/ + RenderBuffers *buffers; + RenderBuffers big_tile_cpu_buffers(cpu_device_.get()); + + if (path_trace_works_.size() == 1) { + path_trace_works_[0]->copy_render_buffers_from_device(); + buffers = path_trace_works_[0]->get_render_buffers(); + } + else { + big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params); + copy_to_render_buffers(&big_tile_cpu_buffers); + + buffers = &big_tile_cpu_buffers; + } + + if (!tile_manager_.write_tile(*buffers)) { + LOG(ERROR) << "Error writing tile to file."; + } +} + +void PathTrace::progress_update_if_needed(const RenderWork &render_work) +{ + if (progress_ != nullptr) { + const int2 tile_size = get_render_tile_size(); + const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples; + const int current_sample = render_work.path_trace.start_sample + + render_work.path_trace.num_samples; + progress_->add_samples(num_samples_added, current_sample); + } + + if (progress_update_cb) { + progress_update_cb(); + } +} + +void PathTrace::progress_set_status(const string &status, const string &substatus) +{ + if (progress_ != nullptr) { + progress_->set_status(status, substatus); + } +} + +void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + tbb::parallel_for_each(path_trace_works_, + [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_to_render_buffers(render_buffers); + }); + render_buffers->copy_to_device(); +} + +void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers) +{ + render_buffers->copy_from_device(); + tbb::parallel_for_each(path_trace_works_, + [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_from_render_buffers(render_buffers); + }); +} + +bool PathTrace::copy_render_tile_from_device() +{ + if (full_frame_state_.render_buffers) { + /* Full-frame buffer is always allocated on CPU. */ + return true; + } + + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->copy_render_buffers_from_device()) { + success = false; + } + }); + + return success; +} + +static string get_layer_view_name(const RenderBuffers &buffers) +{ + string result; + + if (buffers.params.layer.size()) { + result += string(buffers.params.layer); + } + + if (buffers.params.view.size()) { + if (!result.empty()) { + result += ", "; + } + result += string(buffers.params.view); + } + + return result; +} + +void PathTrace::process_full_buffer_from_disk(string_view filename) +{ + VLOG(3) << "Processing full frame buffer file " << filename; + + progress_set_status("Reading full buffer from disk"); + + RenderBuffers full_frame_buffers(cpu_device_.get()); + + DenoiseParams denoise_params; + if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) { + LOG(ERROR) << "Error reading tiles from file."; + return; + } + + const string layer_view_name = get_layer_view_name(full_frame_buffers); + + render_state_.has_denoised_result = false; + + if (denoise_params.use) { + progress_set_status(layer_view_name, "Denoising"); + + /* Re-use the denoiser as much as possible, avoiding possible device re-initialization. + * + * It will not conflict with the regular rendering as: + * - Rendering is supposed to be finished here. + * - The next rendering will go via Session's `run_update_for_next_iteration` which will + * ensure proper denoiser is used. 
 */
+    set_denoiser_params(denoise_params);
+
+    /* The number of samples doesn't matter too much, since the sample count pass will be used. */
+    denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+    render_state_.has_denoised_result = true;
+  }
+
+  full_frame_state_.render_buffers = &full_frame_buffers;
+
+  progress_set_status(layer_view_name, "Finishing");
+
+  /* Write the full result pretending that there is a single tile.
+   * Requires some state change, but allows using the same communication API with the software. */
+  tile_buffer_write();
+
+  full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+  if (full_frame_state_.render_buffers) {
+    /* If the full-frame buffer is read from disk the number of samples is not used, as there is a
+     * sample count pass for that in the buffer. Just avoid accessing the badly defined path
+     * state. */
+    return 0;
+  }
+
+  return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+                                       const PassAccessor::Destination &destination)
+{
+  if (full_frame_state_.render_buffers) {
+    return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+  }
+
+  bool success = true;
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    if (!success) {
+      return;
+    }
+    if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+      success = false;
+    }
+  });
+
+  return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+                                       const PassAccessor::Source &source)
+{
+  bool success = true;
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    if (!success) {
+      return;
+    }
+    if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+      success = false;
+    }
+  });
+
+  return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+  if (full_frame_state_.render_buffers) {
+    return make_int2(full_frame_state_.render_buffers->params.width,
+                     full_frame_state_.render_buffers->params.height);
+  }
+
+  const Tile &tile = tile_manager_.get_current_tile();
+  return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+  if (full_frame_state_.render_buffers) {
+    return make_int2(0, 0);
+  }
+
+  const Tile &tile = tile_manager_.get_current_tile();
+  return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+  if (full_frame_state_.render_buffers) {
+    return full_frame_state_.render_buffers->params;
+  }
+
+  return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+  return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+  switch (type) {
+    case DEVICE_NONE:
+      return "None";
+
+    case DEVICE_CPU:
+      return "CPU";
+    case DEVICE_CUDA:
+      return "CUDA";
+    case DEVICE_OPTIX:
+      return "OptiX";
+    case DEVICE_DUMMY:
+      return "Dummy";
+    case DEVICE_MULTI:
+      return "Multi";
+  }
+
+  return "UNKNOWN";
+}
+
+/* Construct description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making it a more reusable utility.
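+ *
+ * For illustration, with the current format the description of a hypothetical CPU device would
+ * look like:
+ *
+ *   AMD Ryzen 9 5950X (CPU) (32 threads) [CPU]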
+ */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+  string full_description = device_info.description;
+
+  full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+  if (device_info.display_device) {
+    full_description += " (display)";
+  }
+
+  if (device_info.type == DEVICE_CPU) {
+    full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+  }
+
+  full_description += " [" + device_info.id + "]";
+
+  return full_description;
+}
+
+/* Construct a string which will contain information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ *   Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ *   Message: Full First Device Description
+ *            Full Second Device Description
+ *
+ * Note that the newlines are placed so that the result can be easily concatenated into the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+  string result = "\n" + message + ": ";
+  const string pad(message.length() + 2, ' ');
+
+  if (device_info.multi_devices.empty()) {
+    result += full_device_info_description(device_info) + "\n";
+    return result;
+  }
+
+  bool is_first = true;
+  for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+    if (!is_first) {
+      result += pad;
+    }
+
+    result += full_device_info_description(sub_device_info) + "\n";
+
+    is_first = false;
+  }
+
+  return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+  DeviceInfo device_info;
+  device_info.type = DEVICE_MULTI;
+
+  for (auto &&path_trace_work : path_trace_works) {
+    device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+  }
+
+  return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+  if (!denoiser) {
+    return "";
+  }
+
+  if (!denoiser->get_params().use) {
+    return "";
+  }
+
+  const Device *denoiser_device = denoiser->get_denoiser_device();
+  if (!denoiser_device) {
+    return "";
+  }
+
+  return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+  string result = "\nFull path tracing report\n";
+
+  result += path_trace_devices_report(path_trace_works_);
+  result += denoiser_device_report(denoiser_.get());
+
+  /* Report from the render scheduler, which includes:
+   * - Render mode (interactive, offline, headless)
+   * - Adaptive sampling and denoiser parameters
+   * - Breakdown of timing. */
+  result += render_scheduler_.full_report();
+
+  return result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..78ca68c1198
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* The PathTrace class takes care of the kernel graph and scheduling on a (multi)device. It takes
+ * care of all the common steps of path tracing which are not device-specific. The list of tasks
+ * includes but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queues management.
+ * - Adaptive stopping. */
+class PathTrace {
+ public:
+  /* Render scheduler is used to report timing information and access things like start/finish
+   * sample. */
+  PathTrace(Device *device,
+            Film *film,
+            DeviceScene *device_scene,
+            RenderScheduler &render_scheduler,
+            TileManager &tile_manager);
+  ~PathTrace();
+
+  /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+   * The progress is reported to the currently configured progress object (via `set_progress`). */
+  void load_kernels();
+
+  /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+   * more accurately which scene device memory may need to be allocated on the host. */
+  void alloc_work_memory();
+
+  /* Check whether now is a good time to reset rendering.
+   * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+   * intermediate render result. */
+  bool ready_to_reset();
+
+  void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+  void device_free();
+
+  /* Set progress tracker.
+   * Used to communicate details about the progress to the outer world, and to check whether
+   * rendering is to be canceled.
+   *
+   * The path tracer writes to this object, and then at a convenient moment runs the
+   * progress_update_cb() callback. */
+  void set_progress(Progress *progress);
+
+  /* NOTE: This is a blocking call. Meaning, it will not return until the given number of samples
+   * is rendered (or until rendering is requested to be cancelled). */
+  void render(const RenderWork &render_work);
+
+  /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it
+   * is convenient to have it here because then it's easy to access the render buffer. But the
+   * downside is that this adds too many entities which could live separately behind some clear
+   * API. */
+
+  /* Set denoiser parameters.
+   * Use this to configure the denoiser before rendering any samples. */
+  void set_denoiser_params(const DenoiseParams &params);
+
+  /* Set parameters used for adaptive sampling.
+   * Use this to configure the adaptive sampler before rendering any samples. */
+  void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+  /* Set GPU display which takes care of drawing the render result. */
+  void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+  /* Clear the GPU display by filling it in with all zeroes. */
+  void clear_gpu_display();
+
+  /* Perform drawing of the current state of the GPUDisplay. */
+   */
+  void draw();
+
+  /* Cancel the rendering process as soon as possible, without waiting for the full tile to be
+   * sampled. Used in cases like reset of the render session.
+   *
+   * This is a blocking call, which returns as soon as there is no running `render_samples()`
+   * call. */
+  void cancel();
+
+  /* Copy an entire render buffer to/from the path trace. */
+
+  /* Copy happens via a CPU-side buffer: data will be copied from every device of the path trace,
+   * and the data will be copied to the device of the given render buffers. */
+  void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+  /* Copy happens via a CPU-side buffer: data will be copied from the device of the given render
+   * buffers and will be copied to all devices of the path trace. */
+  void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+  /* Copy render buffers of the big tile from the device to the host.
+   * Return true if all copies are successful. */
+  bool copy_render_tile_from_device();
+
+  /* Read the given full-frame file from disk, perform the needed processing and write it to the
+   * software via the write callback. */
+  void process_full_buffer_from_disk(string_view filename);
+
+  /* Get number of samples in the current big tile render buffers. */
+  int get_num_render_tile_samples() const;
+
+  /* Get pass data of the entire big tile.
+   * This call puts pass render result from all devices into the final pixels storage.
+   *
+   * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+   *
+   * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+  bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+                              const PassAccessor::Destination &destination);
+
+  /* Set pass data for baking. */
+  bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+  /* Check whether the denoiser was run and denoised passes are available. */
+  bool has_denoised_result() const;
+
+  /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+   * In the case of tiled rendering this will return the full frame after all tiles have been
+   * rendered.
+   *
+   * NOTE: If the full-frame buffer processing is in progress, returns parameters of the
+   * full-frame instead. */
+  int2 get_render_tile_size() const;
+  int2 get_render_tile_offset() const;
+
+  /* Get buffer parameters of the current tile.
+   *
+   * NOTE: If the full-frame buffer processing is in progress, returns parameters of the
+   * full-frame instead. */
+  const BufferParams &get_render_tile_params() const;
+
+  /* Generate a full multi-line report of the rendering process, including rendering parameters,
+   * times, and so on. */
+  string full_report() const;
+
+  /* Callback which communicates an updated state of the render buffer of the current big tile.
+   * Is called during path tracing to communicate the work-in-progress state of the final
+   * buffer. */
+  function<void(void)> tile_buffer_update_cb;
+
+  /* Callback which communicates the final rendered buffer. Is called after path tracing is
+   * done. */
+  function<void(void)> tile_buffer_write_cb;
+
+  /* Callback which initializes the rendered buffer. Is called before path tracing starts.
+   *
+   * This is used for baking. */
+  function<bool(void)> tile_buffer_read_cb;
+
+  /* Callback which is called to report current rendering progress.
+   *
+   * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+   * Additionally, it might be called from the middle of a wavefront (meaning, it is not
+   * guaranteed that the buffer is "uniformly" sampled at the moment of this callback). */
+  function<void(void)> progress_update_cb;
+
+ protected:
+  /* Actual implementation of the rendering pipeline.
+   * Calls steps in order, checking in between whether cancellation has been requested.
+   *
+   * Kept separate from `render()` to simplify dealing with early exits and keeping
+   * `render_cancel_` in a consistent state. */
+  void render_pipeline(RenderWork render_work);
+
+  /* Initialize kernel execution on all integrator queues. */
+  void render_init_kernel_execution();
+
+  /* Make sure both allocated and effective buffer parameters of the path tracer works are up to
+   * date with the current big tile parameters, performance-dependent slicing, and resolution
+   * divider. */
+  void update_work_buffer_params_if_needed(const RenderWork &render_work);
+  void update_allocated_work_buffer_params();
+  void update_effective_work_buffer_params(const RenderWork &render_work);
+
+  /* Perform various steps of the render work.
+   *
+   * Note that some steps might modify the work, forcing some steps to happen within this
+   * iteration of rendering. */
+  void init_render_buffers(const RenderWork &render_work);
+  void path_trace(RenderWork &render_work);
+  void adaptive_sample(RenderWork &render_work);
+  void denoise(const RenderWork &render_work);
+  void cryptomatte_postprocess(const RenderWork &render_work);
+  void update_display(const RenderWork &render_work);
+  void rebalance(const RenderWork &render_work);
+  void write_tile_buffer(const RenderWork &render_work);
+  void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+  /* Get number of samples in the current state of the render buffers. */
+  int get_num_samples_in_buffer();
+
+  /* Check whether the user requested to cancel rendering, in which case path tracing is to be
+   * finished as soon as possible. */
+  bool is_cancel_requested();
+
+  /* Write the big tile render buffer via the write callback. */
+  void tile_buffer_write();
+
+  /* Read the big tile render buffer via the read callback. */
+  void tile_buffer_read();
+
+  /* Write the current tile into the file on disk. */
+  void tile_buffer_write_to_disk();
+
+  /* Run the progress_update_cb callback if it is needed. */
+  void progress_update_if_needed(const RenderWork &render_work);
+
+  void progress_set_status(const string &status, const string &substatus = "");
+
+  /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+   * are configured this is a `MultiDevice`. */
+  Device *device_ = nullptr;
+
+  /* CPU device for creating temporary render buffers on the CPU side. */
+  unique_ptr<Device> cpu_device_;
+
+  DeviceScene *device_scene_;
+
+  RenderScheduler &render_scheduler_;
+  TileManager &tile_manager_;
+
+  unique_ptr<GPUDisplay> gpu_display_;
+
+  /* Per-compute-device descriptors of the work which is responsible for path tracing on its
+   * configured device. */
+  vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+  /* Per-path trace work information needed for multi-device balancing. */
+  vector<WorkBalanceInfo> work_balance_infos_;
+
+  /* Render buffer parameters of the full frame and current big tile. */
+  BufferParams full_params_;
+  BufferParams big_tile_params_;
+
+  /* Denoiser which takes care of denoising the big tile. */
+  unique_ptr<Denoiser> denoiser_;
+
+  /* State which is common for all the steps of the render work.
+   * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+   * rendering the work. */
+  struct {
+    /* Denotes whether render buffer parameters of the path trace works are to be reset for the
+     * new value of the big tile parameters. */
+    bool need_reset_params = false;
+
+    /* Divider of the resolution for faster previews.
+     *
+     * Allows reusing the same render buffer while rendering fewer pixels into it. The render
+     * buffer in this case can be thought of as an over-allocated array: the resolution divider
+     * affects both resolution and stride as visible by the integrator kernels. */
+    int resolution_divider = 0;
+
+    /* Parameters of the big tile with the current resolution divider applied. */
+    BufferParams effective_big_tile_params;
+
+    /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
+    bool has_denoised_result = false;
+
+    /* Current tile has been written (to either disk or a callback).
+     * Indicates that no more work will be done on this tile. */
+    bool tile_written = false;
+  } render_state_;
+
+  /* Progress object which is used to communicate sample progress. */
+  Progress *progress_;
+
+  /* Fields required for canceling the render on demand, as quickly as possible. */
+  struct {
+    /* Indicates whether there is an on-going `render_samples()` call. */
+    bool is_rendering = false;
+
+    /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+    bool is_requested = false;
+
+    /* Synchronization between the thread which does `render_samples()` and the thread which does
+     * `cancel()`. */
+    thread_mutex mutex;
+    thread_condition_variable condition;
+  } render_cancel_;
+
+  /* Indicates whether a render result was drawn after the latest session reset.
+   * Used by `ready_to_reset()` to implement logic which feels the most interactive. */
+  bool did_draw_after_reset_ = true;
+
+  /* State of the full-frame processing and writing to the software. */
+  struct {
+    RenderBuffers *render_buffers = nullptr;
+  } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "device/device.h" + +#include "integrator/path_trace_work.h" +#include "integrator/path_trace_work_cpu.h" +#include "integrator/path_trace_work_gpu.h" +#include "render/buffers.h" +#include "render/film.h" +#include "render/gpu_display.h" +#include "render/scene.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<PathTraceWork> PathTraceWork::create(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) +{ + if (device->info.type == DEVICE_CPU) { + return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag); + } + + return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag); +} + +PathTraceWork::PathTraceWork(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : device_(device), + film_(film), + device_scene_(device_scene), + buffers_(make_unique<RenderBuffers>(device)), + effective_buffer_params_(buffers_->params), + cancel_requested_flag_(cancel_requested_flag) +{ +} + +PathTraceWork::~PathTraceWork() +{ +} + +RenderBuffers *PathTraceWork::get_render_buffers() +{ + return buffers_.get(); +} + +void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params, + const BufferParams &effective_big_tile_params, + const BufferParams &effective_buffer_params) +{ + effective_full_params_ = effective_full_params; + effective_big_tile_params_ = effective_big_tile_params; + effective_buffer_params_ = effective_buffer_params; +} + +bool PathTraceWork::has_multiple_works() const +{ + /* Assume if there are multiple works working on the same big tile none of the works gets the + * entire big tile to work on. */ + return !(effective_big_tile_params_.width == effective_buffer_params_.width && + effective_big_tile_params_.height == effective_buffer_params_.height && + effective_big_tile_params_.full_x == effective_buffer_params_.full_x && + effective_big_tile_params_.full_y == effective_buffer_params_.full_y); +} + +void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + copy_render_buffers_from_device(); + + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = buffers_->buffer.data(); + float *dst = render_buffers->buffer.data() + offset_in_floats; + + memcpy(dst, src, data_size); +} + +void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers) +{ + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = render_buffers->buffer.data() + offset_in_floats; + float *dst = buffers_->buffer.data(); + + memcpy(dst, src, data_size); + + copy_render_buffers_to_device(); +} + +void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers 
*render_buffers)
+{
+  const int64_t width = effective_buffer_params_.width;
+  const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+  const int64_t offset = offset_y * width;
+
+  render_buffers_host_copy_denoised(
+      buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+  copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+                                           const PassAccessor::Destination &destination)
+{
+  const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+  const int width = effective_buffer_params_.width;
+
+  PassAccessor::Destination slice_destination = destination;
+  slice_destination.offset += offset_y * width;
+
+  return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+                                           const PassAccessor::Source &source)
+{
+  const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+  const int width = effective_buffer_params_.width;
+
+  PassAccessor::Source slice_source = source;
+  slice_source.offset += offset_y * width;
+
+  return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+  const KernelFilm &kfilm = device_scene_->data.film;
+  const KernelBackground &kbackground = device_scene_->data.background;
+
+  const BufferParams &params = buffers_->params;
+
+  const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+  PassAccessor::PassAccessInfo pass_access_info;
+  pass_access_info.type = display_pass->type;
+  pass_access_info.offset = PASS_UNUSED;
+
+  if (pass_mode == PassMode::DENOISED) {
+    pass_access_info.mode = PassMode::DENOISED;
+    pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+  }
+
+  if (pass_access_info.offset == PASS_UNUSED) {
+    pass_access_info.mode = PassMode::NOISY;
+    pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+  }
+
+  pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+  pass_access_info.use_approximate_shadow_catcher_background =
+      kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+  return pass_access_info;
+}
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+    const GPUDisplay *gpu_display) const
+{
+  PassAccessor::Destination destination(film_->get_display_pass());
+
+  const int2 display_texture_size = gpu_display->get_texture_size();
+  const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+  const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
+  destination.offset = texture_y * display_texture_size.x + texture_x;
+  destination.stride = display_texture_size.x;
+
+  return destination;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..97b97f3d888
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+  struct RenderStatistics {
+    float occupancy = 1.0f;
+  };
+
+  /* Create the path trace work which best fits the device.
+   *
+   * The cancel request flag is used for a cheap check whether cancel is to be performed as soon
+   * as possible. This could be, for example, a request to cancel rendering on camera navigation
+   * in the viewport. */
+  static unique_ptr<PathTraceWork> create(Device *device,
+                                          Film *film,
+                                          DeviceScene *device_scene,
+                                          bool *cancel_requested_flag);
+
+  virtual ~PathTraceWork();
+
+  /* Access the render buffers.
+   *
+   * Is only supposed to be used by the PathTrace to update buffer allocation and slicing to
+   * correspond to the big tile size and relative device performance. */
+  RenderBuffers *get_render_buffers();
+
+  /* Set effective parameters of the big tile and the work itself. */
+  void set_effective_buffer_params(const BufferParams &effective_full_params,
+                                   const BufferParams &effective_big_tile_params,
+                                   const BufferParams &effective_buffer_params);
+
+  /* Check whether the big tile is being worked on by multiple path trace works. */
+  bool has_multiple_works() const;
+
+  /* Allocate working memory for execution. Must be called before init_execution(). */
+  virtual void alloc_work_memory(){};
+
+  /* Initialize execution of kernels.
+   * Will ensure that all device queues are initialized for execution.
+   *
+   * This method is to be called after any change in the scene. It is not needed prior to every
+   * call of `render_samples()`. */
+  virtual void init_execution() = 0;
+
+  /* Render the given number of samples as a synchronous blocking call.
+   * The samples are added to the render buffer associated with this work. */
+  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+  /* Copy render result from this work to the corresponding place of the GPU display.
+   *
+   * The `pass_mode` indicates whether to access the denoised or noisy version of the display
+   * pass. The noisy pass mode will be passed here when it is known that the buffer does not have
+   * denoised passes yet (because the denoiser did not run). If the denoised pass is requested
+   * and the denoiser is not used then this function will fall back to the noisy pass instead. */
+  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+                                   PassMode pass_mode,
+                                   int num_samples) = 0;
+
+  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+  /* Copy data from/to the given render buffers.
+   * Will copy pixels from a corresponding place (from the multi-device point of view) of the
+   * render buffers, and copy the work's render buffers to the corresponding place of the
+   * destination. */
+
+  /* Notes:
+   * - Copies the work's render buffer from the device.
+   * - Copies into the CPU-side buffer of the given render buffers.
+   * - Does not copy the given render buffers to their device. */
+  void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+  /* Notes:
+   * - Does not copy the given render buffers from their device.
+   * - Copies the work's render buffer to its device. */
+  void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+  /* Special version of `copy_from_render_buffers()` which only copies denoised passes from the
+   * given render buffers, leaving the rest of the passes untouched.
+   *
+   * The same notes about device copying apply to this call as well. */
+  void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+  /* Copy render buffers to/from the device, using an appropriate device queue when needed so
+   * that things are executed in order with `render_samples()`. */
+  virtual bool copy_render_buffers_from_device() = 0;
+  virtual bool copy_render_buffers_to_device() = 0;
+
+  /* Zero the render buffers on the device, using an appropriate device queue when needed so
+   * that things are executed in order with `render_samples()`. */
+  virtual bool zero_render_buffers() = 0;
+
+  /* Access pixels rendered by this work and copy them to the corresponding location in the
+   * destination.
+   *
+   * NOTE: Does not copy buffers from the device. Use `copy_render_tile_from_device()` to update
+   * the host-side data. */
+  bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+                              const PassAccessor::Destination &destination);
+
+  /* Set pass data for baking. */
+  bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+  /* Perform a convergence test on the render buffer, and filter the convergence mask.
+   * Returns the number of active pixels (the ones which did not converge yet). */
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+  /* Run cryptomatte pass post-processing kernels. */
+  virtual void cryptomatte_postproces() = 0;
+
+  /* Cheap-ish check whether rendering has been requested to stop as soon as possible, without
+   * waiting for any samples to be finished. */
+  inline bool is_cancel_requested() const
+  {
+    /* NOTE: Rely on the fact that on an x86 CPU reading a scalar can happen without atomics,
+     * even in a threaded environment. */
+    return *cancel_requested_flag_;
+  }
+
+  /* Access the device which is used to path trace this work. */
+  Device *get_device() const
+  {
+    return device_;
+  }
+
+ protected:
+  PathTraceWork(Device *device,
+                Film *film,
+                DeviceScene *device_scene,
+                bool *cancel_requested_flag);
+
+  PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+  /* Get a destination whose offset and stride are configured so that writing to it will write to
+   * the proper location of the GPU display texture, taking the current tile and device slice
+   * into account. */
+  PassAccessor::Destination get_gpu_display_destination_template(
+      const GPUDisplay *gpu_display) const;
+
+  /* Device which will be used for path tracing.
+   * Note that it is an actual render device (and never a multi-device). */
+  Device *device_;
+
+  /* Film is used to access display pass configuration for the GPU display update.
+   * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+  Film *film_;
+
+  /* Device-side scene storage, that may be used for integrator logic. */
+  DeviceScene *device_scene_;
+
+  /* Render buffers where sampling is being accumulated into, allocated for a fraction of the big
+   * tile which is being rendered by this work.
+   * It also defines the possible subset of the big tile in the case of multi-device rendering. */
+  unique_ptr<RenderBuffers> buffers_;
+
+  /* Effective parameters of the full, big tile, and current work render buffer.
+   * The latter might be different from buffers_->params when there is a resolution divider
+   * involved. */
+  BufferParams effective_full_params_;
+  BufferParams effective_big_tile_params_;
+  BufferParams effective_buffer_params_;
+
+  bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create a TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+  /* TODO: limit this to the number of threads of the CPU device, which may be smaller than the
+   * system number of threads when we reduce the number of CPU threads in CPU + GPU rendering to
+   * dedicate some cores to handling the GPU device. */
+  return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+  const int thread_index = tbb::this_task_arena::current_thread_index();
+  DCHECK_GE(thread_index, 0);
+  DCHECK_LT(thread_index, kernel_thread_globals.size());
+
+  return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+                                   Film *film,
+                                   DeviceScene *device_scene,
+                                   bool *cancel_requested_flag)
+    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+      kernels_(*(device->get_cpu_kernels()))
+{
+  DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+  /* Cache per-thread kernel globals.
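+   *
+   * Each worker thread of the TBB arena then picks up its own copy, indexed by
+   * `tbb::this_task_arena::current_thread_index()` (see `kernel_thread_globals_get()` above).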
*/ + device_->get_cpu_kernel_thread_globals(kernel_thread_globals_); +} + +void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + const int64_t image_width = effective_buffer_params_.width; + const int64_t image_height = effective_buffer_params_.height; + const int64_t total_pixels_num = image_width * image_height; + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.start_profiling(); + } + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) { + if (is_cancel_requested()) { + return; + } + + const int y = work_index / image_width; + const int x = work_index - y * image_width; + + KernelWorkTile work_tile; + work_tile.x = effective_buffer_params_.full_x + x; + work_tile.y = effective_buffer_params_.full_y + y; + work_tile.w = 1; + work_tile.h = 1; + work_tile.start_sample = start_sample; + work_tile.num_samples = 1; + work_tile.offset = effective_buffer_params_.offset; + work_tile.stride = effective_buffer_params_.stride; + + CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_); + + render_samples_full_pipeline(kernel_globals, work_tile, samples_num); + }); + }); + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.stop_profiling(); + } + + statistics.occupancy = 1.0f; +} + +void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals, + const KernelWorkTile &work_tile, + const int samples_num) +{ + const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher; + const bool has_bake = device_scene_->data.bake.use; + + IntegratorStateCPU integrator_states[2] = {}; + + IntegratorStateCPU *state = &integrator_states[0]; + IntegratorStateCPU *shadow_catcher_state = &integrator_states[1]; + + KernelWorkTile sample_work_tile = work_tile; + float *render_buffer = buffers_->buffer.data(); + + for (int sample = 0; sample < samples_num; ++sample) { + if (is_cancel_requested()) { + break; + } + + if (has_bake) { + if (!kernels_.integrator_init_from_bake( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + else { + if (!kernels_.integrator_init_from_camera( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + + kernels_.integrator_megakernel(kernel_globals, state, render_buffer); + + if (has_shadow_catcher) { + kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer); + } + + ++sample_work_tile.start_sample; + } +} + +void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + half4 *rgba_half = gpu_display->map_texture_buffer(); + if (!rgba_half) { + /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for + * some implementations of GPUDisplay which can not map memory? 
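+     * (For reference, a fallback of this kind already exists on the GPU side: see
+     * `copy_to_gpu_display_naive()` in PathTraceWorkGPU, which stages pixels in a host-side
+     * buffer and pushes them with `gpu_display->copy_pixels_to_texture()`.)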
+     */
+    return;
+  }
+
+  const KernelFilm &kfilm = device_scene_->data.film;
+
+  const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+  const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+  PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+  destination.pixels_half_rgba = rgba_half;
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+  local_arena.execute([&]() {
+    pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+  });
+
+  gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+  return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+  buffers_->buffer.copy_to_device();
+  return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+  buffers_->zero();
+  return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+  const int full_x = effective_buffer_params_.full_x;
+  const int full_y = effective_buffer_params_.full_y;
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+  const int offset = effective_buffer_params_.offset;
+  const int stride = effective_buffer_params_.stride;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  uint num_active_pixels = 0;
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading
+   * overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(full_y, full_y + height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+      bool row_converged = true;
+      uint num_row_pixels_active = 0;
+      for (int x = 0; x < width; ++x) {
+        if (!kernels_.adaptive_sampling_convergence_check(
+                kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+          ++num_row_pixels_active;
+          row_converged = false;
+        }
+      }
+
+      atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+      if (!row_converged) {
+        kernels_.adaptive_sampling_filter_x(
+            kernel_globals, render_buffer, y, full_x, width, offset, stride);
+      }
+    });
+  });
+
+  if (num_active_pixels) {
+    local_arena.execute([&]() {
+      tbb::parallel_for(full_x, full_x + width, [&](int x) {
+        CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+        kernels_.adaptive_sampling_filter_y(
+            kernel_globals, render_buffer, x, full_y, height, offset, stride);
+      });
+    });
+  }
+
+  return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Process the pixels row by row in a single `parallel_for`, to reduce threading overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(0, height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+      int pixel_index = y * width;
+
+      for (int x = 0; x < width; ++x, ++pixel_index) {
+        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+      }
+    });
+  });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+  PathTraceWorkCPU(Device *device,
+                   Film *film,
+                   DeviceScene *device_scene,
+                   bool *cancel_requested_flag);
+
+  virtual void init_execution() override;
+
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num) override;
+
+  virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+                                   PassMode pass_mode,
+                                   int num_samples) override;
+  virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+  virtual bool copy_render_buffers_from_device() override;
+  virtual bool copy_render_buffers_to_device() override;
+  virtual bool zero_render_buffers() override;
+
+  virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+  virtual void cryptomatte_postproces() override;
+
+ protected:
+  /* Core path tracing routine. Renders the given work tile on the given queue. */
+  void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+                                    const KernelWorkTile &work_tile,
+                                    const int samples_num);
+
+  /* CPU kernels. */
+  const CPUKernels &kernels_;
+
+  /* Copies of kernel globals which are suitable for concurrent access from multiple threads.
+   *
+   * More specifically, each entry of `kernel_thread_globals_` is local to one thread and nobody
+   * else accesses it, but some "localization" is required to decouple from the kernel globals
+   * stored on the device level.
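+   *
+   * A per-thread copy also gives every thread its own profiling state: `render_samples()`
+   * brackets the actual work with `start_profiling()` / `stop_profiling()` on each entry.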
+   */
+  vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..10baf869aa6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+                                   Film *film,
+                                   DeviceScene *device_scene,
+                                   bool *cancel_requested_flag)
+    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+      queue_(device->gpu_queue_create()),
+      integrator_state_soa_kernel_features_(0),
+      integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+      integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+      integrator_shader_raytrace_sort_counter_(
+          device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_next_shadow_catcher_path_index_(
+          device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+      queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+      num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+      work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+      gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+      max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+      min_num_active_paths_(queue_->num_concurrent_busy_states()),
+      max_active_path_index_(0)
+{
+  memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+  /* Limit the number of active paths to half of the overall state. This is due to the logic in
+   * the path compaction which relies on the fact that regeneration does not happen until at
+   * least half of the states are available again. */
+  min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+  /* IntegratorState allocated as a structure of arrays. */
+
+  /* Check if we already allocated memory for the required features. */
+  const uint kernel_features = device_scene_->data.kernel_features;
+  if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+    return;
+  }
+  integrator_state_soa_kernel_features_ = kernel_features;
+
+  /* Allocate a device-only memory buffer for each struct member, and then write the pointers
+   * into a struct that resides in constant memory.
+   *
+   * TODO: store float3 in separate XYZ arrays.
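+   *
+   * As a rough illustration (with a hypothetical member, not taken from the actual state
+   * template): an entry such as
+   *
+   *   KERNEL_STRUCT_BEGIN(path)
+   *     KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+   *   KERNEL_STRUCT_END(path)
+   *
+   * expands via the macros below into the allocation of a `device_only_memory<uint32_t>` array
+   * of `max_num_paths_` elements whose device pointer is stored in
+   * `integrator_state_gpu_.path.flag`.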
+   */
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+    device_only_memory<type> *array = new device_only_memory<type>(device_, \
+                                                                   "integrator_state_" #name); \
+    array->alloc_to_device(max_num_paths_); \
+    integrator_state_soa_.emplace_back(array); \
+    integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+  }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  if ((kernel_features & feature) && \
+      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+    device_only_memory<type> *array = new device_only_memory<type>(device_, \
+                                                                   "integrator_state_" #name); \
+    array->alloc_to_device(max_num_paths_); \
+    integrator_state_soa_.emplace_back(array); \
+    integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+  }
+#define KERNEL_STRUCT_END(name) \
+  break; \
+  }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+  if (array_index == array_size - 1) { \
+    break; \
+  } \
+  }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+  if (integrator_queue_counter_.size() == 0) {
+    integrator_queue_counter_.alloc(1);
+    integrator_queue_counter_.zero_to_device();
+    integrator_queue_counter_.copy_from_device();
+    integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+                                              integrator_queue_counter_.device_pointer;
+  }
+
+  /* Allocate data for active path index arrays. */
+  if (num_queued_paths_.size() == 0) {
+    num_queued_paths_.alloc(1);
+    num_queued_paths_.zero_to_device();
+  }
+
+  if (queued_paths_.size() == 0) {
+    queued_paths_.alloc(max_num_paths_);
+    /* TODO: this could be skipped if we had a function to just allocate on the device. */
+    queued_paths_.zero_to_device();
+  }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+  /* Allocate arrays for shader sorting. */
+  const int max_shaders = device_scene_->data.max_shaders;
+  if (integrator_shader_sort_counter_.size() < max_shaders) {
+    integrator_shader_sort_counter_.alloc(max_shaders);
+    integrator_shader_sort_counter_.zero_to_device();
+
+    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+    integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+        (int *)integrator_shader_sort_counter_.device_pointer;
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+        (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+  }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+  if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+    return;
+  }
+
+  integrator_next_shadow_catcher_path_index_.alloc(1);
+  /* TODO(sergey): Use queue? */
+  integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+  integrator_state_gpu_.next_shadow_catcher_path_index =
+      (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+  alloc_integrator_soa();
+  alloc_integrator_queue();
+  alloc_integrator_sorting();
+  alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+  queue_->init_execution();
+
+  /* Copy to the device-side struct in constant memory. */
+  device_->const_copy_to(
+      "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
+{
+  /* Limit the number of states for the tile and rely on a greedy scheduling of tiles. This
+   * allows adding more work (because tiles are smaller, so there is a higher chance that more
+   * paths will become busy after adding new tiles). This is especially important for the shadow
+   * catcher, which schedules work in halves of the available number of paths. */
+  work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
+
+  work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+  enqueue_reset();
+
+  int num_iterations = 0;
+  uint64_t num_busy_accum = 0;
+
+  /* TODO: set a hard limit in case of undetected kernel failures? */
+  while (true) {
+    /* Enqueue work from the scheduler, on start or when there are not enough
+     * paths to keep the device occupied. */
+    bool finished;
+    if (enqueue_work_tiles(finished)) {
+      /* Copy stats from the device. */
+      queue_->copy_from_device(integrator_queue_counter_);
+
+      if (!queue_->synchronize()) {
+        break; /* Stop on error. */
+      }
+    }
+
+    if (is_cancel_requested()) {
+      break;
+    }
+
+    /* Stop if no more work remains. */
+    if (finished) {
+      break;
+    }
+
+    /* Enqueue one of the path iteration kernels. */
+    if (enqueue_path_iteration()) {
+      /* Copy stats from the device. */
+      queue_->copy_from_device(integrator_queue_counter_);
+
+      if (!queue_->synchronize()) {
+        break; /* Stop on error. */
+      }
+    }
+
+    if (is_cancel_requested()) {
+      break;
+    }
+
+    num_busy_accum += get_num_active_paths();
+    ++num_iterations;
+  }
+
+  statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+  const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+  int max_num_queued = 0;
+  DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+    if (queue_counter->num_queued[i] > max_num_queued) {
+      kernel = (DeviceKernel)i;
+      max_num_queued = queue_counter->num_queued[i];
+    }
+  }
+
+  return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+  void *args[] = {&max_num_paths_};
+  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+  queue_->zero_to_device(integrator_queue_counter_);
+  queue_->zero_to_device(integrator_shader_sort_counter_);
+  queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+  /* Tile enqueuing needs to know the number of active paths, which is based on this counter.
+   * Zero the counter on the host side because `zero_to_device()` does not do it. */
+  if (integrator_queue_counter_.host_pointer) {
+    memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+  }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+  /* Count the number of paths queued across all kernels; if none are queued there is nothing to
+   * iterate on.
*/ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_active_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + num_active_paths += queue_counter->num_queued[i]; + } + + if (num_active_paths == 0) { + return false; + } + + /* Find kernel to execute, with max number of queued paths. */ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel == DEVICE_KERNEL_NUM) { + return false; + } + + /* Finish shadows before potentially adding more shadow rays. We can only + * store one shadow ray in the integrator state. */ + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) { + if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return true; + } + else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return true; + } + } + + /* Schedule kernel with maximum number of queued items. */ + enqueue_path_iteration(kernel); + return true; +} + +void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel) +{ + void *d_path_index = (void *)NULL; + + /* Create array of path indices for which this kernel is queued to be executed. */ + int work_size = max_active_path_index_; + + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + int num_queued = queue_counter->num_queued[kernel]; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + /* Compute array of active paths, sorted by shader. */ + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel); + } + else if (num_queued < work_size) { + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Compute array of active shadow paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel); + } + else { + /* Compute array of active paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel); + } + } + + DCHECK_LE(work_size, max_num_paths_); + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + /* Ray intersection kernels with integrator state. */ + void *args[] = {&d_path_index, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + /* Shading kernels with integrator state and render buffer. 
*/ + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + + default: + LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel) + << " used for path iteration, should never happen."; + break; + } +} + +void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel]; + assert(d_counter != nullptr); + + /* Compute prefix sum of number of active paths with each shader. */ + { + const int work_size = 1; + int max_shaders = device_scene_->data.max_shaders; + void *args[] = {&d_counter, &max_shaders}; + queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); + } + + queue_->zero_to_device(num_queued_paths_); + + /* Launch kernel to fill the active paths arrays. */ + { + /* TODO: this could be smaller for terminated paths based on amount of work we want + * to schedule. */ + const int work_size = max_active_path_index_; + + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), + &d_queued_paths, + &d_num_queued_paths, + &d_counter, + &d_queued_kernel}; + + queue_->enqueue(kernel, work_size, args); + } + + if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) { + queue_->zero_to_device(integrator_shader_sort_counter_); + } + else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + else { + assert(0); + } +} + +void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + + /* Launch kernel to fill the active paths arrays. */ + const int work_size = max_active_path_index_; + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = { + const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel}; + + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(kernel, work_size, args); +} + +void PathTraceWorkGPU::compact_states(const int num_active_paths) +{ + if (num_active_paths == 0) { + max_active_path_index_ = 0; + } + + /* Compact fragmented path states into the start of the array, moving any paths + * with index higher than the number of active paths into the gaps. */ + if (max_active_path_index_ == num_active_paths) { + return; + } + + void *d_compact_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + + /* Create array with terminated paths that we can write to. */ + { + /* TODO: can the work size be reduced here? */ + int offset = num_active_paths; + int work_size = num_active_paths; + void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args); + } + + /* Create array of paths that we need to compact, where the path index is bigger + * than the number of active paths. 
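+   *
+   * Worked example with hypothetical numbers: if `max_active_path_index_` is 8 and
+   * `num_active_paths` is 5, any still-active states at indices 5..7 are gathered here and then
+   * moved by the compaction kernel into the terminated gaps below index 5.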
+   */
+  {
+    int work_size = max_active_path_index_;
+    void *args[] = {
+        &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+    queue_->zero_to_device(num_queued_paths_);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+  }
+
+  queue_->copy_from_device(num_queued_paths_);
+  queue_->synchronize();
+
+  int num_compact_paths = num_queued_paths_.data()[0];
+
+  /* Move paths into gaps. */
+  if (num_compact_paths > 0) {
+    int work_size = num_compact_paths;
+    int active_states_offset = 0;
+    int terminated_states_offset = num_active_paths;
+    void *args[] = {
+        &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+  }
+
+  queue_->synchronize();
+
+  /* Adjust the max active path index now that we know which part of the array is actually
+   * used. */
+  max_active_path_index_ = num_active_paths;
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+  /* If there are existing paths, wait for them to reach the intersect-closest kernel, which will
+   * align the wavefront of the existing and newly added paths. */
+  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+   * performance. */
+  const DeviceKernel kernel = get_most_queued_kernel();
+  if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+    return false;
+  }
+
+  int num_active_paths = get_num_active_paths();
+
+  /* Don't schedule more work if cancelling. */
+  if (is_cancel_requested()) {
+    if (num_active_paths == 0) {
+      finished = true;
+    }
+    return false;
+  }
+
+  finished = false;
+
+  vector<KernelWorkTile> work_tiles;
+
+  int max_num_camera_paths = max_num_paths_;
+  int num_predicted_splits = 0;
+
+  if (has_shadow_catcher()) {
+    /* When there are shadow catchers in the scene, a bounce from them will split the state. So
+     * we make sure there is enough space in the path states array to fit the split states.
+     *
+     * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+     * that all the new paths can be split.
+     *
+     * Note that it is possible that some of the current states can still split, so we need to
+     * make sure there is enough space for them as well. */
+
+    /* Number of currently in-flight states which can still split. */
+    const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+    const int num_available_paths = max_num_paths_ - num_active_paths;
+    const int num_new_paths = num_available_paths / 2;
+    max_num_camera_paths = max(num_active_paths,
+                               num_active_paths + num_new_paths - num_scheduled_possible_split);
+    num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+  }
+
+  /* Schedule when we're out of paths or there are too few paths to keep the
+   * device occupied. */
+  int num_paths = num_active_paths;
+  if (num_paths == 0 || num_paths < min_num_active_paths_) {
+    /* Get work tiles until the maximum number of paths is reached. */
+    while (num_paths < max_num_camera_paths) {
+      KernelWorkTile work_tile;
+      if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+        work_tiles.push_back(work_tile);
+        num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+      }
+      else {
+        break;
+      }
+    }
+
+    /* If we couldn't get any more tiles, we're done. */
+    if (work_tiles.size() == 0 && num_paths == 0) {
+      finished = true;
+      return false;
+    }
+  }
+
+  /* Initialize paths from work tiles.
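+   *
+   * For example (hypothetical numbers): with 100 paths currently active, two new tiles with
+   * work sizes 2048 and 1024 get `path_index_offset` values of 100 and 2148 respectively, so
+   * the new states are laid out contiguously after the existing ones (see the
+   * `enqueue_work_tiles()` overload below).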
+   */
+  if (work_tiles.size() == 0) {
+    return false;
+  }
+
+  /* Compact the state array when the number of paths becomes small relative to the
+   * known maximum path index, which makes computing active index arrays slow. */
+  compact_states(num_active_paths);
+
+  if (has_shadow_catcher()) {
+    integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+    queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+  }
+
+  enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+                                                      DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+                     work_tiles.data(),
+                     work_tiles.size(),
+                     num_active_paths,
+                     num_predicted_splits);
+
+  return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+                                          const KernelWorkTile work_tiles[],
+                                          const int num_work_tiles,
+                                          const int num_active_paths,
+                                          const int num_predicted_splits)
+{
+  /* Copy work tiles to the device. */
+  if (work_tiles_.size() < num_work_tiles) {
+    work_tiles_.alloc(num_work_tiles);
+  }
+
+  int path_index_offset = num_active_paths;
+  int max_tile_work_size = 0;
+  for (int i = 0; i < num_work_tiles; i++) {
+    KernelWorkTile &work_tile = work_tiles_.data()[i];
+    work_tile = work_tiles[i];
+
+    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+    work_tile.path_index_offset = path_index_offset;
+    work_tile.work_size = tile_work_size;
+
+    path_index_offset += tile_work_size;
+
+    max_tile_work_size = max(max_tile_work_size, tile_work_size);
+  }
+
+  queue_->copy_to_device(work_tiles_);
+
+  void *d_work_tiles = (void *)work_tiles_.device_pointer;
+  void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+  /* Launch kernel. */
+  void *args[] = {&d_work_tiles,
+                  const_cast<int *>(&num_work_tiles),
+                  &d_render_buffer,
+                  const_cast<int *>(&max_tile_work_size)};
+
+  queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+  max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+  /* TODO: this is wrong, does not account for duplicates with shadow! */
+  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+  int num_paths = 0;
+  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+    DCHECK_GE(queue_counter->num_queued[i], 0)
+        << "Invalid number of queued states for kernel "
+        << device_kernel_as_string(static_cast<DeviceKernel>(i));
+    num_paths += queue_counter->num_queued[i];
+  }
+
+  return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+  /* There are a few issues with graphics interop when using multiple devices, caused by the fact
+   * that the GPUDisplay has a single texture:
+   *
+   * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+   * attempting to register an OpenGL PBO which has been mapped. This makes sense, because
+   * otherwise one would run into a conflict over where the source of truth is.
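+   *
+   * With a single device there is exactly one writer for the display texture, so the mapping is
+   * unambiguous; hence interop is simply not used whenever this work shares the big tile with
+   * other works.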
*/
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+ /* Don't attempt to update the GPU display if the device has errors: the error state will
+ * lead to wrong decisions about interop, causing more chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+ /* If an error happens when trying to use graphics interop, fall back to the native
+ * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+ * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, shrink the
+ * allocated memory as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+ * transferring zeroes to the device.
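+ *
+ * (The zeroing below is also what currently guarantees the device-side
+ * allocation exists before the first partial update: with, for example, a
+ * resolution divider of 2 only a quarter of the final-size buffer is written
+ * by the film conversion, and the remaining texels would otherwise be read
+ * back uninitialized.)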
*/ + queue_->zero_to_device(gpu_display_rgba_half_); + } + + PassAccessor::Destination destination(film_->get_display_pass()); + destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + gpu_display_rgba_half_.copy_from_device(); + + gpu_display->copy_pixels_to_texture( + gpu_display_rgba_half_.data(), texture_x, texture_y, width, height); +} + +bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (!device_graphics_interop_) { + device_graphics_interop_ = queue_->graphics_interop_create(); + } + + const DeviceGraphicsInteropDestination graphics_interop_dst = + gpu_display->graphics_interop_get(); + device_graphics_interop_->set_destination(graphics_interop_dst); + + const device_ptr d_rgba_half = device_graphics_interop_->map(); + if (!d_rgba_half) { + return false; + } + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.d_pixels_half_rgba = d_rgba_half; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + device_graphics_interop_->unmap(); + + return true; +} + +void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display) +{ + if (!device_graphics_interop_) { + return; + } + gpu_display->graphics_interop_activate(); + device_graphics_interop_ = nullptr; + gpu_display->graphics_interop_deactivate(); +} + +void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples) +{ + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples); + + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); +} + +int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset); + + if (num_active_pixels) { + enqueue_adaptive_sampling_filter_x(); + enqueue_adaptive_sampling_filter_y(); + queue_->synchronize(); + } + + return num_active_pixels; +} + +int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset) +{ + device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE); + num_active_pixels.alloc(1); + + queue_->zero_to_device(num_active_pixels); + + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&effective_buffer_params_.full_x), + const_cast<int *>(&effective_buffer_params_.full_y), + const_cast<int *>(&effective_buffer_params_.width), + const_cast<int *>(&effective_buffer_params_.height), + &threshold, + &reset, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride, + &num_active_pixels.device_pointer}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args); + + queue_->copy_from_device(num_active_pixels); + queue_->synchronize(); + + return num_active_pixels.data()[0]; +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x() +{ + const int work_size = effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + 
&effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: The direct device access to the buffers only happens within this path trace work. The
+ * rest of the communication happens via API calls which involve `copy_render_buffers_from_device()`,
+ * which will perform synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized
+ * to match the device queue's number of path states.
+ * This implementation best suits devices which have a lot of integrator states, such as GPUs. */
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+ /* Naive implementation of `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Implementation of `copy_to_gpu_display()` which uses the driver's OpenGL/GPU interoperability
+ * functionality, avoiding a copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run the film conversion kernel and store the display result in the given
+ * destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split.
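+ * (On the device side this is served by the
+ * DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS kernel; see the
+ * enqueue in path_trace_work_gpu.cpp.)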
*/
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+ /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+ /* Keep track of number of queued kernels. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+ /* Temporary buffer to get an array of queued paths for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to the kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+ /* Temporary buffer used by copy_to_gpu_display() whenever graphics interoperability is not
+ * available. Allocated on demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+ /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+ * below this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+ /* Maximum path index; the effective number of paths used may be smaller than
+ * the size of the integrator_state_ buffer, so we can avoid iterating over the
+ * full buffer. */
+ int max_active_path_index_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..4eb1dd941f9
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+ /* In background mode never do a lower-resolution render preview, as it is not really supported
+ * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+ /* NOTE: Divide by 2 because of the way scheduling works: it advances the resolution divider
+ * first and then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose a better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet.
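+ *
+ * (This value acts as the starting point of the progressive noise floor: it is
+ * halved on every idle reschedule, e.g. 0.4 -> 0.2 -> 0.1 -> ... until it
+ * reaches adaptive_sampling_.threshold; see render_work_reschedule_on_idle().)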
*/
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+ /* Reset all fields of the previous work, cancelling things like adaptive sampling filtering
+ * and denoising.
+ * However, we need to preserve write requests, since those will not be possible to recover and
+ * writes are only to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+ /* Do not write the tile if it has zero samples in it, treat it similarly to all other tiles
+ * which got cancelled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+ /* Update the current tile, but only if any sample was rendered.
+ * Allows the latest state of the tile to stay visible while the full buffer is being processed.
+ *
+ * Note that if there are no samples in the current tile its render buffer might have pixels
+ * remaining from a previous state.
+ *
+ * If the full result was written, then there is no way any updates were made to the render
+ * buffers. And the buffers might have been freed from the device, so display update is not
+ * possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+ /* NOTE: The rebalance scheduler requires the current number of samples to not be advanced
+ * forward. */
+ render_work.rebalance = work_need_rebalance();
+
+ /* NOTE: Advance the number of samples now, so that the filter and denoising checks can see
+ * that all the samples are rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+ /* A fallback display update time, for the case when there is an error during the display
+ * update, or when there is no display at all.
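+ *
+ * (If the update succeeds, report_display_update_time() will overwrite this
+ * timestamp with the moment the update actually finished; the value recorded
+ * here only matters when that report never arrives.)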
*/
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+ /* There is only a single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Knowing the time it took to complete a task at the current resolution divider, approximate how
+ * long it would have taken to complete it at the final resolution. */
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
+ return time * resolution_divider_sq;
+}
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+ * NOTE: The work might have its path tracing part all zero: this happens when a post-processing
+ * work is scheduled after the path tracing. Checking just the start sample doesn't work here
+ * because it might be wrongly 0. Check for whether path tracing is actually happening as it is
+ * expected to happen in the first work.
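+ *
+ * (For example, a post-processing work scheduled by set_postprocess_render_work()
+ * has path_trace.num_samples == 0 while its start_sample may still read 0, which
+ * would otherwise be mistaken for the first sample of the render.)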
*/
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+ /* Move the display update moment further in time, so that the logic which checks when the last
+ * update happened has a more reliable point in time (without the path tracing and denoising
+ * parts of the render work).
*/ + state_.last_display_update_time = time_dt(); +} + +void RenderScheduler::report_rebalance_time(const RenderWork &render_work, + double time, + bool balance_changed) +{ + rebalance_time_.add_wall(time); + + if (work_report_reset_average(render_work)) { + rebalance_time_.reset_average(); + } + + rebalance_time_.add_average(time); + + if (balance_changed) { + ++state_.num_rebalance_changes; + } + + state_.last_rebalance_changed = balance_changed; + + VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds."; +} + +string RenderScheduler::full_report() const +{ + const double render_wall_time = state_.end_render_time - state_.start_render_time; + const int num_rendered_samples = get_num_rendered_samples(); + + string result = "\nRender Scheduler Summary\n\n"; + + { + string mode; + if (headless_) { + mode = "Headless"; + } + else if (background_) { + mode = "Background"; + } + else { + mode = "Interactive"; + } + result += "Mode: " + mode + "\n"; + } + + result += "Resolution: " + to_string(buffer_params_.width) + "x" + + to_string(buffer_params_.height) + "\n"; + + result += "\nAdaptive sampling:\n"; + result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n"; + if (adaptive_sampling_.use) { + result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n"; + result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n"; + result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n"; + } + + result += "\nDenoiser:\n"; + result += " Use: " + string_from_bool(denoiser_params_.use) + "\n"; + if (denoiser_params_.use) { + result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n"; + result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n"; + + string passes = "Color"; + if (denoiser_params_.use_pass_albedo) { + passes += ", Albedo"; + } + if (denoiser_params_.use_pass_normal) { + passes += ", Normal"; + } + + result += " Passes: " + passes + "\n"; + } + + if (state_.num_rebalance_requested) { + result += "\nRebalancer:\n"; + result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) + + "\n"; + result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) + + "\n"; + } + + result += "\nTime (in seconds):\n"; + result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average"); + result += string_printf(" %20s %20f %20f\n", + "Path Tracing", + path_trace_time_.get_wall(), + path_trace_time_.get_average()); + + if (adaptive_sampling_.use) { + result += string_printf(" %20s %20f %20f\n", + "Adaptive Filter", + adaptive_filter_time_.get_wall(), + adaptive_filter_time_.get_average()); + } + + if (denoiser_params_.use) { + result += string_printf( + " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average()); + } + + result += string_printf(" %20s %20f %20f\n", + "Display Update", + display_update_time_.get_wall(), + display_update_time_.get_average()); + + if (state_.num_rebalance_requested) { + result += string_printf(" %20s %20f %20f\n", + "Rebalance", + rebalance_time_.get_wall(), + rebalance_time_.get_average()); + } + + const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() + + denoise_time_.get_wall() + display_update_time_.get_wall(); + result += "\n Total: " + to_string(total_time) + "\n"; + + result += string_printf( + "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time); + + /* When adaptive sampling is used the 
average time becomes meaningless, because different samples
+ * will likely render a different number of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based
+ * on more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+ /* TODO(sergey): Need a decision on whether this should be using the number of samples rendered
+ * within the current render session, or use the absolute number of samples with the start
+ * sample taken into account. It will depend on whether the start sample offset clears the
+ * render buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and the number of samples rendered are used to figure out the display update
+ * interval. Render time is used to allow for fast display updates in the first few seconds of
+ * rendering on fast devices. The number of samples rendered is used to allow for potentially
+ * quicker display updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding might happen to a higher or lower value depending on which one is closer. Such
+ * behavior allows the number of samples to be a power of two without diverging from the planned
+ * number of samples too much.
*/
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+ /* Always start a full resolution render with a single sample. Gives more instant feedback to
+ * artists, and allows gathering information for subsequent path tracing works. Do it in
+ * headless mode as well, to give some estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+ /* Round the number of samples to a power of two, so that the division of path states into
+ * tiles works out to integer counts.
+ * This might make updates happen more rarely due to rounding up. In the test scenes this is
+ * not a huge deal, because it was not observed that more than 8 samples can be rendered
+ * between updates. If that becomes a problem we can add some extra rules, like never allowing
+ * to round up by more than N samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+ /* When enough statistics are available and doing an offline render, prefer to keep the device
+ * occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match
+ * scenes with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
+
+ /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+ * device fully occupied, without much overhead of display updates. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+ /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missed. This
+ * is to ensure that the final render is pixel-matched regardless of how many samples per second
+ * the compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+ /* Special trick for fast navigation: schedule multiple samples during fast navigation
+ * (which will prefer to use a lower resolution to keep up with the refresh rate). This gives
+ * more usable visual feedback for artists. There are a couple of tricks though.
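+ *
+ * Summarizing the branches below: denoising during navigation -> 1 sample,
+ * divider at or below the pixel size -> 1 sample, divider at twice the pixel
+ * size -> 2 samples, anything coarser -> 4 samples. Since each doubling of the
+ * divider quarters the pixel count, the total cost stays bounded by a single
+ * full-resolution sample.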
*/
+
+ if (is_denoise_active_during_update()) {
+ /* When denoising is used during navigation prefer using a higher resolution with fewer
+ * samples (scheduling fewer samples here will make it so the resolution_divider calculation
+ * will use a lower value for the divider). This is because both OpenImageDenoiser and the
+ * OptiX denoiser give visually better results on a higher resolution image with fewer
+ * samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+ /* When the resolution divider is at or below the pixel size, schedule one sample. This
+ * doesn't affect the sample count at this resolution division, but instead assists in the
+ * calculation of the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+ /* When the resolution divider is the previous step to the final resolution, schedule two
+ * samples. This is so that rendering on the lower resolution does not exceed the time that
+ * it takes to render the first sample at the full resolution. */
+ return 2;
+ }
+
+ /* Always render 4 samples, even if the scene is configured for less.
+ * The idea here is to have enough information on the screen. A resolution divider of 2 allows
+ * us to have 4 times extra samples, so the overall worst case timing is the same as the final
+ * resolution at one sample. */
+ return 4;
+}
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+ /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+ /* TODO(sergey): Follow similar logic to the viewport, giving an overview of how the final
+ * denoised image looks even for background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted
+ * as final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or the last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in the viewport after reaching a certain sample count and render
+ * time. */
+ /* TODO(sergey): Consider making the time interval and the sample count configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+ /* Force disable display update in headless mode. There will be nowhere to display the
+ * in-progress result.
*/
+ return false;
+ }
+
+ if (denoiser_delayed) {
+ /* If the denoiser has been delayed the display can not be updated, as it would not contain
+ * an up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+ /* When adaptive sampling is not used the work is scheduled in a way that keeps the render
+ * device busy for long enough, so that the display update can happen right after the
+ * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+ /* Make sure the initial and final results of adaptive sampling are communicated to the
+ * display. */
+ return true;
+ }
+
+ /* For development purposes of adaptive sampling it might be very useful to see all updates of
+ * active pixels after the convergence check. However, it would cause a slowdown for regular
+ * users. Possibly, make it a debug panel option to allow rapid updates, easing development
+ * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+ /* When adaptive sampling is used, it's possible that only a handful of samples of a very
+ * simple scene will be scheduled to a powerful device (in order to not "miss" any of the
+ * filtering points). We take care of skipping updates here based on when the previous display
+ * update happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum time, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+ * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+ /* The resolution divider has never been calculated before: use the default resolution
+ * divider, so that we have somewhat good initial behavior, giving a chance to collect real
+ * numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate a better resolution divider, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+ /* Allow some percentage of tolerance, so that if the render time is close enough to the higher
+ * resolution we prefer to use it, instead of going to a much lower resolution with a time far
+ * below the desired one.
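+ *
+ * A worked example with illustrative numbers: with a desired interval of
+ * 1/30 s the search below is allowed to target 1.4 / 30 ~= 0.047 s, so a
+ * configuration that is up to ~40% slower than desired keeps its finer
+ * resolution instead of dropping a whole divider step (a 4x change in pixel
+ * count).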
*/
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid the resolution divider bouncing around when
+ * the actual render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase the resolution to higher than the pixel size (which is possible if the scene
+ * is simple and the compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+ /* Use a lower value than in the non-denoised case to allow having more pixels to reconstruct
+ * the image from. With the faster updates and the extra compute required, the resolution
+ * otherwise becomes too low to give usable feedback. */
+ /* NOTE: Based on the performance of OpenImageDenoiser on CPU. For the OptiX denoiser or
+ * other denoisers on GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+ /* For the best match with Blender's viewport the refresh rate should be 60fps. This will
+ * avoid "jelly" effects. However, on non-trivial scenes this can only be achieved with high
+ * values of the resolution divider, which does not give very pleasant updates during
+ * navigation. Choose less frequent updates to allow more noise-free and higher resolution
+ * updates. */
+
+ /* TODO(sergey): Could look into a heuristic which would allow 60fps if the resolution divider
+ * is not too high. Alternatively, synchronize Blender's overlay updates to Cycles updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+ /* When rendering at a non-final resolution divider the time average is not very useful,
+ * because it will either bias the average down (due to lower render times on the smaller
+ * images) or will give an incorrect result when trying to estimate the time which would have
+ * been spent on the final resolution.
+ *
+ * So we only accumulate the average for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* The time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+ /* TODO(sergey): There should be a non-iterative analytical formula here.
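+ *
+ * A rough closed-form candidate, assuming render time scales with the pixel
+ * count (so with 1 / divider^2):
+ *
+ *   divider ~= next_power_of_two((int)ceil(sqrt(actual_time / desired_time)))
+ *
+ * clamped to default_start_resolution_divider_. The iterative loop below is
+ * kept because the per-divider sample counts make the scaling non-uniform.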
*/
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions, as
+ * the sample count can also change between resolution divisions. */
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..9c2d107e46d
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+ /* Initialize render buffers.
+ * Includes steps like zeroing the buffer on the device, and optional reading of pixels from
+ * the baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+ /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+ /* Reset the convergence flag when filtering, forcing a re-check of whether the pixel
+ * converged. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+ /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via the user-provided
+ * callback into the rendering software, or via the tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+ /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write the full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize the render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use the denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+ * Note that the scheduler does not know anything about devices, so if there is only a single
+ * device used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related works are to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+ /* Allows disabling of the rebalancing works, so that as much as possible is scheduled to a
+ * single device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from the start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+ /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get the sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get the number of samples rendered within the current scheduling session.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+ * Note that this is based on the scheduling information. In practice this means that if work
+ * was requested to render, the scheduler considers it done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset the scheduler, indicating that rendering will happen from scratch.
+ * Resets the current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+ /* Reset the scheduler upon switching to the next tile.
+class RenderScheduler {
+ public:
+  RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+  /* Specify whether cryptomatte-related works are to be scheduled. */
+  void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+  /* Allows disabling of the rebalancing works, so that as much work as possible is scheduled to
+   * a single device. */
+  void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+  bool is_background() const;
+
+  void set_denoiser_params(const DenoiseParams &params);
+  void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+  bool is_adaptive_sampling_used() const;
+
+  /* Start sample for path tracing.
+   * The scheduler will schedule work using this sample as the first one. */
+  void set_start_sample(int start_sample);
+  int get_start_sample() const;
+
+  /* Number of samples to render, starting from the start sample.
+   * The scheduler will schedule work in the range of
+   * [start_sample, start_sample + num_samples - 1], inclusive. */
+  void set_num_samples(int num_samples);
+  int get_num_samples() const;
+
+  /* Time limit for the path tracing tasks, in seconds.
+   * Zero disables the limit. */
+  void set_time_limit(double time_limit);
+  double get_time_limit() const;
+
+  /* Get the sample up to which rendering has been done.
+   * This is an absolute 0-based value.
+   *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+   * return 14.
+   *
+   * If no samples were rendered, then the behavior is undefined. */
+  int get_rendered_sample() const;
+
+  /* Get the number of samples rendered within the current scheduling session.
+   *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+   * return 5.
+   *
+   * Note that this is based on the scheduling information. In practice this means that once the
+   * scheduler has handed out work for rendering, it considers that work done. */
+  int get_num_rendered_samples() const;
+
+  /* Reset the scheduler, indicating that rendering will happen from scratch.
+   * Resets the current rendered state, as well as scheduling information. */
+  void reset(const BufferParams &buffer_params, int num_samples);
+
+  /* Reset the scheduler upon switching to the next tile.
+   * Will keep the same number of samples and full-frame render parameters, but will reset
+   * progress and allow scheduling render works from the beginning of the new tile. */
+  void reset_for_next_tile();
+
+  /* Reschedule adaptive sampling work when all pixels have converged.
+   * If there is nothing else to be done for the adaptive sampling (pixels have converged at the
+   * final threshold) then false is returned and the render scheduler will stop scheduling path
+   * tracing works. Otherwise the work's adaptive sampling settings are modified to continue with
+   * a lower threshold. */
+  bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+  /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+   * converged yet.
+   * If re-scheduling is not possible (adaptive sampling is happening with the final threshold,
+   * and the path tracer is to finish the current pixels) then false is returned. */
+  bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+  /* Reschedule work when rendering has been requested to cancel.
+   *
+   * Will skip all work which is not needed anymore because no more samples will be added (for
+   * example, adaptive sampling filtering and convergence check will be skipped).
+   * Will enable all work needed to make sure all passes are communicated to the software.
+   *
+   * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+  void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+  RenderWork get_render_work();
+
+  /* Report that the path tracer started to work, after scene update and loading kernels. */
+  void report_work_begin(const RenderWork &render_work);
+
+  /* Report the time (in seconds) which the corresponding part of the work took. */
+  void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+  void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+  void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+  void report_denoise_time(const RenderWork &render_work, double time);
+  void report_display_update_time(const RenderWork &render_work, double time);
+  void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+  /* Generate a full multi-line report of the rendering process, including rendering parameters,
+   * times, and so on. */
+  string full_report() const;
+ protected:
+  /* Check whether all work has been scheduled and the time limit was not exceeded.
+   *
+   * NOTE: Tricky bit: if the time limit was reached, done() is considered to be true, but some
+   * extra work needs to be scheduled to denoise and write the final result. */
+  bool done() const;
+
+  /* Update scheduling state for newly scheduled work.
+   * Takes care of things like checking whether the work was ever denoised, whether the tile was
+   * written, and so on. */
+  void update_state_for_render_work(const RenderWork &render_work);
+
+  /* Returns true if any work was scheduled. */
+  bool set_postprocess_render_work(RenderWork *render_work);
+
+  /* Set work which is to be performed after all tiles have been rendered. */
+  void set_full_frame_render_work(RenderWork *render_work);
+
+  /* Update the start resolution divider based on the accumulated timing information, preserving
+   * a nice navigation feel. */
+  void update_start_resolution_divider();
+
+  /* Calculate the desired update interval in seconds based on the current timings and settings.
+   * Will give an interval which provides good-feeling updates during viewport navigation. */
+  double guess_viewport_navigation_update_interval_in_seconds() const;
+
+  /* Check whether denoising is active during interactive update while the resolution divider is
+   * not 1. */
+  bool is_denoise_active_during_update() const;
+
+  /* Heuristic which aims to give a perceptually pleasant display update interval, in a way that
+   * at lower samples and near the beginning of rendering updates happen more often, while with a
+   * higher number of samples and later in the render updates happen less often but device
+   * occupancy goes higher. */
+  double guess_display_update_interval_in_seconds() const;
+  double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+  double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+      int num_rendered_samples) const;
+
+  /* Calculate the number of samples which can be rendered within the current desired update
+   * interval, which is calculated by `guess_display_update_interval_in_seconds()`. */
+  int calculate_num_samples_per_update() const;
+
+  /* Get the start sample and the number of samples which are to be path traced in the current
+   * work. */
+  int get_start_sample_to_path_trace() const;
+  int get_num_samples_to_path_trace() const;
+
+  /* Calculate how many samples there are to be rendered for the very first path trace after
+   * reset. */
+  int get_num_samples_during_navigation(int resolution_divider) const;
+
+  /* Whether the adaptive sampling convergence check and filter is to happen. */
+  bool work_need_adaptive_filter() const;
+
+  /* Calculate the threshold for adaptive sampling. */
+  float work_adaptive_threshold() const;
+
+  /* Check whether the current work needs denoising.
+   * Denoising is not needed if the denoiser is not configured, or when denoising would happen
+   * too often.
+   *
+   * The `delayed` flag will be true when the denoiser is configured for use, but denoising was
+   * delayed to a later sample, to reduce overhead.
+   *
+   * `ready_to_display` will be false if we may have a denoised result that is outdated due to
+   * increased samples. */
+  bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+  /* Check whether the current work needs to update the display.
+   *
+   * The `denoiser_delayed` is what `work_need_denoise()` returned as the delayed denoiser
+   * flag. */
+  bool work_need_update_display(const bool denoiser_delayed);
+
+  /* Check whether it is time to perform rebalancing for the render work. */
+  bool work_need_rebalance();
+
+  /* Check whether the timing of the given work is usable for storing timings in
+   * `first_render_time_` for the resolution divider calculation. */
+  bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+  /* Check whether the timing report about the given work needs to reset the accumulated average
+   * time. */
+  bool work_report_reset_average(const RenderWork &render_work);
+
+  /* Check whether the render time limit has been reached (or exceeded), and if so store related
+   * information in the state so that rendering is considered finished, and it is possible to
+   * report average render time information. */
+  void check_time_limit_reached();
+
+  /* Helper class to keep track of task timing.
+   *
+   * Contains two parts: wall time and average. The wall time is the actual wall time of how long
+   * it took to complete all tasks of a type. It is always advanced when the PathTracer reports a
+   * time update.
+   *
+   * The average time is used for scheduling purposes.
It is an estimate of how long it
+   * takes to perform the task at the final resolution. */
+  class TimeWithAverage {
+   public:
+    inline void reset()
+    {
+      total_wall_time_ = 0.0;
+
+      average_time_accumulator_ = 0.0;
+      num_average_times_ = 0;
+    }
+
+    inline void add_wall(double time)
+    {
+      total_wall_time_ += time;
+    }
+
+    inline void add_average(double time, int num_measurements = 1)
+    {
+      average_time_accumulator_ += time;
+      num_average_times_ += num_measurements;
+    }
+
+    inline double get_wall() const
+    {
+      return total_wall_time_;
+    }
+
+    inline double get_average() const
+    {
+      if (num_average_times_ == 0) {
+        return 0;
+      }
+      return average_time_accumulator_ / num_average_times_;
+    }
+
+    inline void reset_average()
+    {
+      average_time_accumulator_ = 0.0;
+      num_average_times_ = 0;
+    }
+
+   protected:
+    double total_wall_time_ = 0.0;
+
+    double average_time_accumulator_ = 0.0;
+    int num_average_times_ = 0;
+  };
+
+  struct {
+    int resolution_divider = 1;
+
+    /* Number of rendered samples on top of the start sample. */
+    int num_rendered_samples = 0;
+
+    /* Point in time at which the latest GPUDisplay work was scheduled. */
+    double last_display_update_time = 0.0;
+    /* A value of -1 means the display was never updated. */
+    int last_display_update_sample = -1;
+
+    /* Point in time at which the last rebalance was performed. */
+    double last_rebalance_time = 0.0;
+
+    /* Number of rebalance works which have been requested to be performed.
+     * The path tracer might ignore the work if there is a single device rendering. */
+    int num_rebalance_requested = 0;
+
+    /* Number of handled rebalance works which did change the balance across devices. */
+    int num_rebalance_changes = 0;
+
+    bool need_rebalance_at_next_work = false;
+
+    /* Denotes whether the latest performed rebalance work caused an actual rebalance of work
+     * across devices. */
+    bool last_rebalance_changed = false;
+
+    /* Threshold for adaptive sampling which will be scheduled to work when not using the
+     * progressive noise floor. */
+    float adaptive_sampling_threshold = 0.0f;
+
+    bool last_work_tile_was_denoised = false;
+    bool tile_result_was_written = false;
+    bool postprocess_work_scheduled = false;
+    bool full_frame_work_scheduled = false;
+    bool full_frame_was_written = false;
+
+    bool path_trace_finished = false;
+    bool time_limit_reached = false;
+
+    /* Time at which rendering started and finished. */
+    double start_render_time = 0.0;
+    double end_render_time = 0.0;
+
+    /* Occupancy of the render devices, normalized to the number of samples.
+     *
+     * In a way it is "trailing": when scheduling new work, this is the occupancy measured while
+     * the previous work was rendered. */
+    int occupancy_num_samples = 0;
+    float occupancy = 1.0f;
+  } state_;
+
+  /* Timing of tasks which were performed at the very first render work at 100% of the
+   * resolution. This timing information is used to estimate the resolution divider for fast
+   * navigation. */
+  struct {
+    double path_trace_per_sample;
+    double denoise_time;
+    double display_update_time;
+  } first_render_time_;
+
+  TimeWithAverage path_trace_time_;
+  TimeWithAverage adaptive_filter_time_;
+  TimeWithAverage denoise_time_;
+  TimeWithAverage display_update_time_;
+  TimeWithAverage rebalance_time_;
+
+  /* Whether cryptomatte-related work will be scheduled. */
+  bool need_schedule_cryptomatte_ = false;
+
+  /* Whether to schedule device load rebalance works.
+   * Rebalancing requires some special treatment for update intervals and such, so if it is known
+   * that the rebalance will be ignored (e.g. due to single-device rendering) it is better to
+   * fully ignore the rebalancing logic. */
+  bool need_schedule_rebalance_works_ = false;
+
+  /* Path tracing work will be scheduled for samples from within the
+   * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusive. */
+  int start_sample_ = 0;
+  int num_samples_ = 0;
+
+  /* Limit in seconds for how long path tracing is allowed to happen.
+   * Zero means no limit is applied. */
+  double time_limit_ = 0.0;
+
+  /* Headless rendering without an interface. */
+  bool headless_;
+
+  /* Background (offline) rendering. */
+  bool background_;
+
+  /* Pixel size is used to force a lower resolution render for the final pass. Useful for retina
+   * or other types of hi-dpi displays. */
+  int pixel_size_ = 1;
+
+  TileManager &tile_manager_;
+
+  BufferParams buffer_params_;
+  DenoiseParams denoiser_params_;
+
+  AdaptiveSampling adaptive_sampling_;
+
+  /* Progressively lower the adaptive sampling threshold level, keeping the image at a uniform
+   * noise level. */
+  bool use_progressive_noise_floor_ = false;
+
+  /* Default value for the resolution divider which will be used when there is no render time
+   * information available yet.
+   * It also defines the upper limit of the automatically calculated resolution divider. */
+  int default_start_resolution_divider_ = 1;
+
+  /* Initial resolution divider which will be used on render scheduler reset. */
+  int start_resolution_divider_ = 0;
+
+  /* Calculate the smallest resolution divider which will bring the actual rendering time below
+   * the desired one. This call assumes a linear dependence of render time on the number of
+   * pixels (quadratic dependence on the resolution divider): a resolution divider of 2 brings
+   * render time down by a factor of 4. */
+  int calculate_resolution_divider_for_time(double desired_time, double actual_time);
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
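The `TimeWithAverage` helper above keeps two independent accumulators: wall time, which only ever grows, and an average that can be reset when old measurements stop being predictive (for example after the resolution divider changes). A small sketch of the intended semantics; note the class is a protected member type of RenderScheduler, so this is for illustration only:

/* Semantics sketch (TimeWithAverage is protected inside RenderScheduler,
 * so this would not compile as-is; shown only to illustrate the two parts). */
TimeWithAverage path_trace_time;

path_trace_time.add_wall(0.5);
path_trace_time.add_average(0.5); /* One measurement of 0.5 seconds. */
path_trace_time.add_wall(0.7);
path_trace_time.add_average(0.7);

/* get_wall() == 1.2, get_average() == 0.6. */

path_trace_time.reset_average();
/* get_wall() is still 1.2, but get_average() is back to 0. */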
+ */ + +#include "integrator/shader_eval.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" + +CCL_NAMESPACE_BEGIN + +ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress) +{ + DCHECK_NE(device_, nullptr); +} + +bool ShaderEval::eval(const ShaderEvalType type, + const int max_num_points, + const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input, + const function<void(device_vector<float4> &)> &read_output) +{ + bool first_device = true; + bool success = true; + + device_->foreach_device([&](Device *device) { + if (!first_device) { + LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a " + "single device."; + return; + } + first_device = false; + + device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY); + device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE); + + /* Allocate and copy device buffers. */ + DCHECK_EQ(input.device, device); + DCHECK_EQ(output.device, device); + DCHECK_LE(output.size(), input.size()); + + input.alloc(max_num_points); + int num_points = fill_input(input); + if (num_points == 0) { + return; + } + + input.copy_to_device(); + output.alloc(num_points); + output.zero_to_device(); + + /* Evaluate on CPU or GPU. */ + success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) : + eval_gpu(device, type, input, output); + + /* Copy data back from device if not cancelled. */ + if (success) { + output.copy_from_device(0, 1, output.size()); + read_output(output); + } + + input.free(); + output.free(); + }); + + return success; +} + +bool ShaderEval::eval_cpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + vector<CPUKernelThreadGlobals> kernel_thread_globals; + device->get_cpu_kernel_thread_globals(kernel_thread_globals); + + /* Find required kernel function. */ + const CPUKernels &kernels = *(device->get_cpu_kernels()); + + /* Simple parallel_for over all work items. */ + const int64_t work_size = output.size(); + KernelShaderEvalInput *input_data = input.data(); + float4 *output_data = output.data(); + bool success = true; + + tbb::task_arena local_arena(device->info.cpu_threads); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) { + /* TODO: is this fast enough? */ + if (progress_.get_cancel()) { + success = false; + return; + } + + const int thread_index = tbb::this_task_arena::current_thread_index(); + KernelGlobals *kg = &kernel_thread_globals[thread_index]; + + switch (type) { + case SHADER_EVAL_DISPLACE: + kernels.shader_eval_displace(kg, input_data, output_data, work_index); + break; + case SHADER_EVAL_BACKGROUND: + kernels.shader_eval_background(kg, input_data, output_data, work_index); + break; + } + }); + }); + + return success; +} + +bool ShaderEval::eval_gpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + /* Find required kernel function. */ + DeviceKernel kernel; + switch (type) { + case SHADER_EVAL_DISPLACE: + kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE; + break; + case SHADER_EVAL_BACKGROUND: + kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND; + break; + }; + + /* Create device queue. 
+bool ShaderEval::eval_gpu(Device *device,
+                          const ShaderEvalType type,
+                          device_vector<KernelShaderEvalInput> &input,
+                          device_vector<float4> &output)
+{
+  /* Find required kernel function. */
+  DeviceKernel kernel;
+  switch (type) {
+    case SHADER_EVAL_DISPLACE:
+      kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+      break;
+    case SHADER_EVAL_BACKGROUND:
+      kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+      break;
+  }
+
+  /* Create device queue. */
+  unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+  queue->init_execution();
+
+  /* Execute work on the GPU in chunks, so we can cancel.
+   * TODO: query the appropriate size from the device. */
+  const int chunk_size = 65536;
+
+  const int work_size = output.size();
+  void *d_input = (void *)input.device_pointer;
+  void *d_output = (void *)output.device_pointer;
+
+  for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+    int d_work_size = min(chunk_size, work_size - d_offset);
+    void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+    queue->enqueue(kernel, d_work_size, args);
+    queue->synchronize();
+
+    if (progress_.get_cancel()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+  SHADER_EVAL_DISPLACE,
+  SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+  ShaderEval(Device *device, Progress &progress);
+
+  /* Evaluate the shader at points specified by KernelShaderEvalInput and write out
+   * RGBA colors to the output. */
+  bool eval(const ShaderEvalType type,
+            const int max_num_points,
+            const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+            const function<void(device_vector<float4> &)> &read_output);
+
+ protected:
+  bool eval_cpu(Device *device,
+                const ShaderEvalType type,
+                device_vector<KernelShaderEvalInput> &input,
+                device_vector<float4> &output);
+  bool eval_gpu(Device *device,
+                const ShaderEvalType type,
+                device_vector<KernelShaderEvalInput> &input,
+                device_vector<float4> &output);
+
+  Device *device_;
+  Progress &progress_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "integrator/tile.h" + +#include "util/util_logging.h" +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +std::ostream &operator<<(std::ostream &os, const TileSize &tile_size) +{ + os << "size: (" << tile_size.width << ", " << tile_size.height << ")"; + os << ", num_samples: " << tile_size.num_samples; + return os; +} + +ccl_device_inline uint round_down_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return prev_power_of_two(x); +} + +ccl_device_inline uint round_up_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return next_power_of_two(x); +} + +TileSize tile_calculate_best_size(const int2 &image_size, + const int num_samples, + const int max_num_path_states) +{ + if (max_num_path_states == 1) { + /* Simple case: avoid any calculation, which could cause rounding issues. */ + return TileSize(1, 1, 1); + } + + const int64_t num_pixels = image_size.x * image_size.y; + const int64_t num_pixel_samples = num_pixels * num_samples; + + if (max_num_path_states >= num_pixel_samples) { + /* Image fully fits into the state (could be border render, for example). */ + return TileSize(image_size.x, image_size.y, num_samples); + } + + /* The idea here is to keep number of samples per tile as much as possible to improve coherency + * across threads. + * + * Some general ideas: + * - Prefer smaller tiles with more samples, which improves spatial coherency of paths. + * - Keep values a power of two, for more integer fit into the maximum number of paths. */ + + TileSize tile_size; + + /* Calculate tile size as if it is the most possible one to fit an entire range of samples. + * The idea here is to keep tiles as small as possible, and keep device occupied by scheduling + * multiple tiles with the same coordinates rendering different samples. */ + const int num_path_states_per_sample = max_num_path_states / num_samples; + if (num_path_states_per_sample != 0) { + tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample))); + tile_size.height = tile_size.width; + } + else { + tile_size.width = tile_size.height = 1; + } + + if (num_samples == 1) { + tile_size.num_samples = 1; + } + else { + /* Heuristic here is to have more uniform division of the sample range: for example prefer + * [32 <38 times>, 8] over [1024, 200]. This allows to greedily add more tiles early on. */ + tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))), + static_cast<uint>(num_samples)); + + const int tile_area = tile_size.width / tile_size.height; + tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area); + } + + DCHECK_GE(tile_size.width, 1); + DCHECK_GE(tile_size.height, 1); + DCHECK_GE(tile_size.num_samples, 1); + DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states); + + return tile_size; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h new file mode 100644 index 00000000000..d0824843ddb --- /dev/null +++ b/intern/cycles/integrator/tile.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+  TileSize() = default;
+
+  inline TileSize(int width, int height, int num_samples)
+      : width(width), height(height), num_samples(num_samples)
+  {
+  }
+
+  inline bool operator==(const TileSize &other) const
+  {
+    return width == other.width && height == other.height && num_samples == other.num_samples;
+  }
+  inline bool operator!=(const TileSize &other) const
+  {
+    return !(*this == other);
+  }
+
+  int width = 0, height = 0;
+  int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+/* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Will attempt to provide the best guess to keep path tracing threads of a device as localized
+ * as possible, and to have as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+                                  const int num_samples,
+                                  const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+  const int num_infos = work_balance_infos.size();
+
+  if (num_infos == 1) {
+    work_balance_infos[0].weight = 1.0;
+    return;
+  }
+
+  /* There are no statistics available, so start with an equal distribution. */
+  const double weight = 1.0 / num_infos;
+  for (WorkBalanceInfo &balance_info : work_balance_infos) {
+    balance_info.weight = weight;
+  }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+  double total_time = 0;
+  for (const WorkBalanceInfo &info : work_balance_infos) {
+    total_time += info.time_spent;
+  }
+  return total_time;
+}
+
+/* The balance is based on equalizing the time which devices spent performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how much of a difference in the work scheduling is needed.
+ */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+  const int num_infos = work_balance_infos.size();
+
+  const double total_time = calculate_total_time(work_balance_infos);
+  const double time_average = total_time / num_infos;
+
+  double total_weight = 0;
+  vector<double> new_weights;
+  new_weights.reserve(num_infos);
+
+  /* Equalize the overall average time. This means we do not size every work directly from the
+   * current average, but pick weights such that after the change the times will equalize.
+   * Think of it this way: if one of the devices is 10% faster than another, then one device
+   * needs to do 5% less of the current work, and the other needs to do 5% more. */
+  const double lerp_weight = 1.0 / num_infos;
+
+  bool has_big_difference = false;
+
+  for (const WorkBalanceInfo &info : work_balance_infos) {
+    const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+    const double new_weight = info.weight * time_target / info.time_spent;
+    new_weights.push_back(new_weight);
+    total_weight += new_weight;
+
+    if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+      has_big_difference = true;
+    }
+  }
+
+  if (!has_big_difference) {
+    return false;
+  }
+
+  const double total_weight_inv = 1.0 / total_weight;
+  for (int i = 0; i < num_infos; ++i) {
+    WorkBalanceInfo &info = work_balance_infos[i];
+    info.weight = new_weights[i] * total_weight_inv;
+    info.time_spent = 0;
+  }
+
+  return true;
+}
+
+CCL_NAMESPACE_END
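A worked example of the math above, with two devices that started at equal weights but finished their shares in 6 and 10 seconds:

vector<WorkBalanceInfo> infos(2);
infos[0].weight = 0.5;
infos[0].time_spent = 6.0;
infos[1].weight = 0.5;
infos[1].time_spent = 10.0;

work_balance_do_rebalance(infos); /* Returns true: the difference exceeds 2%. */

/* time_average = 8 and lerp_weight = 0.5, so the per-device targets are 7s
 * and 9s. Raw weights: 0.5 * 7/6 = 0.583 and 0.5 * 9/10 = 0.45; after
 * normalization the faster device gets ~0.565 of the work and the slower
 * one ~0.435, and both time_spent fields are reset to 0. */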
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..94e20ecf054
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+  /* Time spent performing the corresponding work. */
+  double time_spent = 0;
+
+  /* Average occupancy of the device while performing the work. */
+  float occupancy = 1.0f;
+
+  /* Normalized weight, which is ready to be used for work balancing (like calculating the
+   * fraction of the big tile which is to be rendered on the device). */
+  double weight = 1.0;
+};
+
+/* Balance work for an initial render iteration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+/* Rebalance work after statistics have been accumulated.
+ * Returns true if the balance did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..3fc99d5b74d
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+  max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+  /* Image buffer parameters. */
+  image_full_offset_px_.x = buffer_params.full_x;
+  image_full_offset_px_.y = buffer_params.full_y;
+
+  image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+  offset_ = buffer_params.offset;
+  stride_ = buffer_params.stride;
+
+  /* Samples parameters. */
+  sample_start_ = sample_start;
+  samples_num_ = samples_num;
+
+  /* Initialize new scheduling. */
+  reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+  tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+  VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+  if (VLOG_IS_ON(3)) {
+    /* The logging is based on multiple tiles scheduled, ignoring the overhead of multi-tile
+     * scheduling and purely focusing on the number of used path states. */
+    const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+                                        tile_size_.num_samples;
+    const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+    VLOG(3) << "Number of unused path states: "
+            << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+  }
+
+  num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+  num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+  total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+  num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+  next_work_index_ = 0;
+  total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
+}
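The `get_work()` function below turns a single atomic counter into a (tile, sample range) pair. Continuing the earlier 1920x1080 example with 128x128x8 tiles: num_tiles_x_ = 15, num_tiles_y_ = 9, num_tiles_per_sample_range_ = 64 / 8 = 8, so total_work_size_ = 135 * 8 = 1080, and for instance:

/* Decomposition of work_index = 100 by the arithmetic in get_work():
 *   sample_range_index = 100 % 8 = 4   ->  start_sample = 4 * 8 = 32
 *   tile_index         = 100 / 8 = 12  ->  tile_y = 12 / 15 = 0, tile_x = 12
 * So this work item renders the tile at pixel (1536, 0) with samples 32..39
 * (offset by sample_start_). */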
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+  /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+   * the path trace work can decide to use smaller tile sizes and greedily schedule multiple
+   * tiles, improving overall device occupancy.
+   * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a
+   * "scheduling limit". */
+
+  DCHECK_NE(max_num_path_states_, 0);
+
+  const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+  if (work_index >= total_work_size_) {
+    return false;
+  }
+
+  const int sample_range_index = work_index % num_tiles_per_sample_range_;
+  const int start_sample = sample_range_index * tile_size_.num_samples;
+  const int tile_index = work_index / num_tiles_per_sample_range_;
+  const int tile_y = tile_index / num_tiles_x_;
+  const int tile_x = tile_index - tile_y * num_tiles_x_;
+
+  KernelWorkTile work_tile;
+  work_tile.x = tile_x * tile_size_.width;
+  work_tile.y = tile_y * tile_size_.height;
+  work_tile.w = tile_size_.width;
+  work_tile.h = tile_size_.height;
+  work_tile.start_sample = sample_start_ + start_sample;
+  work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+  work_tile.offset = offset_;
+  work_tile.stride = stride_;
+
+  work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+  work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+  work_tile.x += image_full_offset_px_.x;
+  work_tile.y += image_full_offset_px_.y;
+
+  const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+  DCHECK_GT(tile_work_size, 0);
+
+  if (max_work_size && tile_work_size > max_work_size) {
+    /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+     * allowing others (or ourselves later on) to pick it up.
+     *
+     * TODO: Such a temporary decrement is not ideal, since it might lead to a situation when
+     * another device sees there is nothing to be done, finishing its work and leaving all the
+     * remaining work to be done by us. */
+    atomic_fetch_and_add_int32(&next_work_index_, -1);
+    return false;
+  }
+
+  *work_tile_ = work_tile;
+
+  return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..e4c8f701259
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+/* Scheduler of device work tiles.
+ * Takes care of feeding work which needs to be done to multiple devices running in parallel. */
+class WorkTileScheduler {
+ public:
+  WorkTileScheduler();
+
+  /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+   *
+   * Affects the scheduled work size: the work size will be as big as possible, but will not
+   * exceed this number of states. */
+  void set_max_num_path_states(int max_num_path_states);
+
+  /* Scheduling will happen for pixels within a big tile denoted by its parameters. */
+  void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+  /* Get work for a device.
+   * Returns true if there is still work to be done, and initializes the work tile with all
+   * parameters of this work. If there is nothing remaining to be done, returns false and the
+   * work tile is kept unchanged.
+   *
+   * Optionally pass max_work_size to do nothing if there is no tile small enough. */
+  bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
+
+ protected:
+  void reset_scheduler_state();
+
+  /* Maximum allowed path states to be used.
+   *
+   * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+   * number of path states is kind of a detail. Is there a more generic term from the scheduler
+   * point of view? */
+  int max_num_path_states_ = 0;
+
+  /* Offset in pixels within a global buffer. */
+  int2 image_full_offset_px_ = make_int2(0, 0);
+
+  /* Dimensions of the currently rendered image in pixels. */
+  int2 image_size_px_ = make_int2(0, 0);
+
+  /* Offset and stride of the buffer within which scheduling is happening.
+   * Will be passed over to the KernelWorkTile. */
+  int offset_, stride_;
+
+  /* Start sample index and number of samples which are to be rendered.
+   * The scheduler will cover the sample range of [start, start + num] over the entire image
+   * (splitting it into smaller work tiles). */
+  int sample_start_ = 0;
+  int samples_num_ = 0;
+
+  /* Tile size which will be scheduled for rendering. */
+  TileSize tile_size_;
+
+  /* Number of tiles in the X and Y axes of the image. */
+  int num_tiles_x_, num_tiles_y_;
+
+  /* Total number of tiles on the image.
+   * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in `get_work()`.
+   *
+   * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating the value in
+   * `get_work()` is unmeasurable? */
+  int total_tiles_num_ = 0;
+
+  /* In the case when the number of samples in `tile_size_` is lower than `samples_num_`, denotes
+   * how many tiles are to be "stacked" to cover the entire requested range of samples. */
+  int num_tiles_per_sample_range_ = 0;
+
+  int next_work_index_ = 0;
+  int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END
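Taken together, a device worker can simply drain this scheduler in a loop. A hedged sketch; only `get_work()` comes from this header, and the rendering call is a hypothetical stand-in:

/* get_work() claims indices with an atomic add, so several device threads
 * can safely pull from the same scheduler after a single reset(). */
void render_thread(WorkTileScheduler &scheduler)
{
  KernelWorkTile work_tile;
  while (scheduler.get_work(&work_tile)) {
    /* Render work_tile.w x work_tile.h pixels starting at (work_tile.x, work_tile.y),
     * samples [work_tile.start_sample, work_tile.start_sample + work_tile.num_samples). */
    render_tile_on_device(work_tile); /* Hypothetical. */
  }
}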