git.blender.org/blender.git

Diffstat (limited to 'intern/cycles/integrator/path_trace_work_gpu.cpp')
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  933
1 file changed, 933 insertions, 0 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..10baf869aa6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
+ integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+ integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+ integrator_shader_raytrace_sort_counter_(
+ device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_next_shadow_catcher_path_index_(
+ device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+ queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+ num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+ work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+ gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+ min_num_active_paths_(queue_->num_concurrent_busy_states()),
+ max_active_path_index_(0)
+{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+  /* Limit the number of active paths to half of the overall state count. This is due to the
+   * logic in the path compaction, which relies on the fact that regeneration does not happen
+   * sooner than half of the states are available again. */
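+  /* For illustration (hypothetical numbers, not taken from any specific device): if the queue
+   * reports 1M concurrent states but 700K concurrently busy states, the clamp below lowers
+   * min_num_active_paths_ to 500K, so new tiles are only scheduled once at least half of the
+   * states are free again. */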
+ min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+ /* IntegrateState allocated as structure of arrays. */
+
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+ return;
+ }
+ integrator_state_soa_kernel_features_ = kernel_features;
+
+  /* Allocate a device-only memory buffer for each struct member, and then write the pointers
+   * into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
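+  /* Illustrative sketch (the member name and type are examples only, not an actual state field):
+   * for a hypothetical member `path.flag` of type `uint32_t`, a single KERNEL_STRUCT_MEMBER
+   * expansion below behaves roughly like:
+   *
+   *   device_only_memory<uint32_t> *array = new device_only_memory<uint32_t>(
+   *       device_, "integrator_state_flag");
+   *   array->alloc_to_device(max_num_paths_);
+   *   integrator_state_soa_.emplace_back(array);
+   *   integrator_state_gpu_.path.flag = (uint32_t *)array->device_pointer;
+   *
+   * guarded by the corresponding kernel feature flag check. */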
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_END(name) \
+ break; \
+ }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ if (array_index == array_size - 1) { \
+ break; \
+ } \
+ }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+ if (integrator_queue_counter_.size() == 0) {
+ integrator_queue_counter_.alloc(1);
+ integrator_queue_counter_.zero_to_device();
+ integrator_queue_counter_.copy_from_device();
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
+ }
+
+ /* Allocate data for active path index arrays. */
+ if (num_queued_paths_.size() == 0) {
+ num_queued_paths_.alloc(1);
+ num_queued_paths_.zero_to_device();
+ }
+
+ if (queued_paths_.size() == 0) {
+ queued_paths_.alloc(max_num_paths_);
+    /* TODO: this could be skipped if we had a function to just allocate on device. */
+ queued_paths_.zero_to_device();
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+ /* Allocate arrays for shader sorting. */
+ const int max_shaders = device_scene_->data.max_shaders;
+ if (integrator_shader_sort_counter_.size() < max_shaders) {
+ integrator_shader_sort_counter_.alloc(max_shaders);
+ integrator_shader_sort_counter_.zero_to_device();
+
+ integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_shader_sort_counter_.device_pointer;
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+ (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+ if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+ return;
+ }
+
+ integrator_next_shadow_catcher_path_index_.alloc(1);
+ /* TODO(sergey): Use queue? */
+ integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+ integrator_state_gpu_.next_shadow_catcher_path_index =
+ (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+ alloc_integrator_soa();
+ alloc_integrator_queue();
+ alloc_integrator_sorting();
+ alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+ queue_->init_execution();
+
+ /* Copy to device side struct in constant memory. */
+ device_->const_copy_to(
+ "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+  /* Limit the number of states per tile and rely on greedy scheduling of tiles. This allows more
+   * work to be added later (because tiles are smaller, there is a higher chance that more paths
+   * will become busy after adding new tiles). This is especially important for the shadow
+   * catcher, which schedules work in halves of the available number of paths. */
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
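+  /* For example (hypothetical numbers): with max_num_paths_ = 1M, the limit above caps each tile
+   * at 128K path states, so roughly eight tiles' worth of work can be in flight and new tiles
+   * can be scheduled as paths terminate. */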
+
+ work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+ enqueue_reset();
+
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
+ /* TODO: set a hard limit in case of undetected kernel failures? */
+ while (true) {
+ /* Enqueue work from the scheduler, on start or when there are not enough
+ * paths to keep the device occupied. */
+    bool finished = false;
+ if (enqueue_work_tiles(finished)) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ /* Stop if no more work remaining. */
+ if (finished) {
+ break;
+ }
+
+    /* Enqueue one of the path iteration kernels. */
+ if (enqueue_path_iteration()) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
+ }
+
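+  /* Occupancy is the average number of busy paths per iteration relative to the total number of
+   * available path states. */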
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int max_num_queued = 0;
+ DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ if (queue_counter->num_queued[i] > max_num_queued) {
+ kernel = (DeviceKernel)i;
+ max_num_queued = queue_counter->num_queued[i];
+ }
+ }
+
+ return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+ queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+  /* Tile enqueueing needs to know the number of active paths, which is based on this counter.
+   * Zero the counter on the host side as well, because `zero_to_device()` does not do that. */
+ if (integrator_queue_counter_.host_pointer) {
+ memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+ }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+  /* Count the number of currently queued paths across all integrator kernels. */
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_active_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ num_active_paths += queue_counter->num_queued[i];
+ }
+
+ if (num_active_paths == 0) {
+ return false;
+ }
+
+ /* Find kernel to execute, with max number of queued paths. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel == DEVICE_KERNEL_NUM) {
+ return false;
+ }
+
+ /* Finish shadows before potentially adding more shadow rays. We can only
+ * store one shadow ray in the integrator state. */
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
+ if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return true;
+ }
+ else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return true;
+ }
+ }
+
+ /* Schedule kernel with maximum number of queued items. */
+ enqueue_path_iteration(kernel);
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+{
+  void *d_path_index = nullptr;
+
+ /* Create array of path indices for which this kernel is queued to be executed. */
+ int work_size = max_active_path_index_;
+
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ int num_queued = queue_counter->num_queued[kernel];
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ /* Compute array of active paths, sorted by shader. */
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ }
+ else if (num_queued < work_size) {
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Compute array of active shadow paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
+ }
+ else {
+ /* Compute array of active paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
+ }
+ }
+
+ DCHECK_LE(work_size, max_num_paths_);
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ /* Ray intersection kernels with integrator state. */
+ void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ /* Shading kernels with integrator state and render buffer. */
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+ void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+
+ default:
+ LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
+ << " used for path iteration, should never happen.";
+ break;
+ }
+}
+
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
+
+ /* Compute prefix sum of number of active paths with each shader. */
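+  /* Illustration (assuming an exclusive prefix sum): per-shader counters {3, 0, 2} become
+   * offsets {0, 3, 3}, giving each shader its starting position in the sorted path index
+   * array. */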
+ {
+ const int work_size = 1;
+ int max_shaders = device_scene_->data.max_shaders;
+ void *args[] = {&d_counter, &max_shaders};
+ queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ /* Launch kernel to fill the active paths arrays. */
+ {
+    /* TODO: this could be smaller for terminated paths based on the amount of work we want
+     * to schedule. */
+ const int work_size = max_active_path_index_;
+
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size),
+ &d_queued_paths,
+ &d_num_queued_paths,
+ &d_counter,
+ &d_queued_kernel};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ }
+ else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+ }
+ else {
+ assert(0);
+ }
+}
+
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+
+ /* Launch kernel to fill the active paths arrays. */
+ const int work_size = max_active_path_index_;
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(kernel, work_size, args);
+}
+
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+ if (num_active_paths == 0) {
+ max_active_path_index_ = 0;
+ }
+
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
+ if (max_active_path_index_ == num_active_paths) {
+ return;
+ }
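+  /* Worked example (hypothetical): with num_active_paths = 3 and active states at indices
+   * {0, 2, 5}, the first kernel below lists the terminated slots below index 3 (here {1}), the
+   * second lists the active states at index >= 3 (here {5}), and the move kernel copies state 5
+   * into slot 1 so that indices [0, 3) end up fully occupied. */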
+
+ void *d_compact_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+ /* Create array with terminated paths that we can write to. */
+ {
+ /* TODO: can the work size be reduced here? */
+ int offset = num_active_paths;
+ int work_size = num_active_paths;
+ void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ }
+
+ /* Create array of paths that we need to compact, where the path index is bigger
+ * than the number of active paths. */
+ {
+ int work_size = max_active_path_index_;
+ void *args[] = {
+ &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ }
+
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ int num_compact_paths = num_queued_paths_.data()[0];
+
+ /* Move paths into gaps. */
+ if (num_compact_paths > 0) {
+ int work_size = num_compact_paths;
+ int active_states_offset = 0;
+ int terminated_states_offset = num_active_paths;
+ void *args[] = {
+ &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ }
+
+ queue_->synchronize();
+
+ /* Adjust max active path index now we know which part of the array is actually used. */
+ max_active_path_index_ = num_active_paths;
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+  /* If there are existing paths, wait for them to reach the intersect closest kernel, which will
+   * align the wavefront of the existing and newly added paths. */
+  /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+   * the performance. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+ return false;
+ }
+
+ int num_active_paths = get_num_active_paths();
+
+ /* Don't schedule more work if cancelling. */
+ if (is_cancel_requested()) {
+ if (num_active_paths == 0) {
+ finished = true;
+ }
+ return false;
+ }
+
+ finished = false;
+
+ vector<KernelWorkTile> work_tiles;
+
+ int max_num_camera_paths = max_num_paths_;
+ int num_predicted_splits = 0;
+
+ if (has_shadow_catcher()) {
+    /* When there are shadow catchers in the scene, a bounce from them will split the state. So
+     * we make sure there is enough space in the path states array to fit the split states.
+ *
+ * Basically, when adding N new paths we ensure that there is 2*N available path states, so
+ * that all the new paths can be split.
+ *
+     * Note that it is possible that some of the current states can still split, so we need to
+     * make sure there is enough space for them as well. */
+
+ /* Number of currently in-flight states which can still split. */
+ const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+ const int num_available_paths = max_num_paths_ - num_active_paths;
+ const int num_new_paths = num_available_paths / 2;
+ max_num_camera_paths = max(num_active_paths,
+ num_active_paths + num_new_paths - num_scheduled_possible_split);
+ num_predicted_splits += num_scheduled_possible_split + num_new_paths;
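+    /* Hypothetical numbers: with max_num_paths_ = 1M and 200K currently active paths there are
+     * 800K free states, so at most 400K new camera paths are added (fewer if in-flight paths can
+     * still split) and the remaining free states are kept in reserve for the splits those new
+     * paths may require. */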
+ }
+
+ /* Schedule when we're out of paths or there are too few paths to keep the
+ * device occupied. */
+ int num_paths = num_active_paths;
+ if (num_paths == 0 || num_paths < min_num_active_paths_) {
+    /* Get work tiles until the maximum number of paths is reached. */
+ while (num_paths < max_num_camera_paths) {
+ KernelWorkTile work_tile;
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+ work_tiles.push_back(work_tile);
+ num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we couldn't get any more tiles, we're done. */
+ if (work_tiles.size() == 0 && num_paths == 0) {
+ finished = true;
+ return false;
+ }
+ }
+
+ /* Initialize paths from work tiles. */
+ if (work_tiles.size() == 0) {
+ return false;
+ }
+
+  /* Compact the state array when the number of active paths becomes small relative to the known
+   * maximum path index, since a sparse state array makes computing the active index arrays
+   * slow. */
+ compact_states(num_active_paths);
+
+ if (has_shadow_catcher()) {
+ integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+ queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+ }
+
+ enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+ work_tiles.data(),
+ work_tiles.size(),
+ num_active_paths,
+ num_predicted_splits);
+
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits)
+{
+ /* Copy work tiles to device. */
+ if (work_tiles_.size() < num_work_tiles) {
+ work_tiles_.alloc(num_work_tiles);
+ }
+
+ int path_index_offset = num_active_paths;
+ int max_tile_work_size = 0;
+ for (int i = 0; i < num_work_tiles; i++) {
+ KernelWorkTile &work_tile = work_tiles_.data()[i];
+ work_tile = work_tiles[i];
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ work_tile.path_index_offset = path_index_offset;
+ work_tile.work_size = tile_work_size;
+
+ path_index_offset += tile_work_size;
+
+ max_tile_work_size = max(max_tile_work_size, tile_work_size);
+ }
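+  /* Example (hypothetical tiles): starting from path_index_offset = num_active_paths, two 64x64
+   * tiles with 1 sample each receive offsets num_active_paths and num_active_paths + 4096, so
+   * their freshly initialized states occupy disjoint ranges of the state array. */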
+
+ queue_->copy_to_device(work_tiles_);
+
+ void *d_work_tiles = (void *)work_tiles_.device_pointer;
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ const_cast<int *>(&num_work_tiles),
+ &d_render_buffer,
+ const_cast<int *>(&max_tile_work_size)};
+
+ queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+ max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+ /* TODO: this is wrong, does not account for duplicates with shadow! */
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ DCHECK_GE(queue_counter->num_queued[i], 0)
+ << "Invalid number of queued states for kernel "
+ << device_kernel_as_string(static_cast<DeviceKernel>(i));
+ num_paths += queue_counter->num_queued[i];
+ }
+
+ return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+  /* There are a few complications with graphics interop when using multiple devices, caused by
+   * the fact that the GPUDisplay has a single texture:
+   *
+   * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+   * attempting to register an OpenGL PBO which has been mapped, which makes sense because
+   * otherwise one would run into a conflict about where the source of truth is. */
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+    /* Don't attempt to update the GPU display if the device has errors: the error state will
+     * lead to wrong decisions about interop, causing more chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+    /* If an error happens when trying to use graphics interop, fall back to the native
+     * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+   * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, shrink the
+ * allocated memory as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+     * transferring zeroes to the device. */
+ queue_->zero_to_device(gpu_display_rgba_half_);
+ }
+
+ PassAccessor::Destination destination(film_->get_display_pass());
+ destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ gpu_display_rgba_half_.copy_from_device();
+
+ gpu_display->copy_pixels_to_texture(
+ gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+}
+
+bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (!device_graphics_interop_) {
+ device_graphics_interop_ = queue_->graphics_interop_create();
+ }
+
+ const DeviceGraphicsInteropDestination graphics_interop_dst =
+ gpu_display->graphics_interop_get();
+ device_graphics_interop_->set_destination(graphics_interop_dst);
+
+ const device_ptr d_rgba_half = device_graphics_interop_->map();
+ if (!d_rgba_half) {
+ return false;
+ }
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.d_pixels_half_rgba = d_rgba_half;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ device_graphics_interop_->unmap();
+
+ return true;
+}
+
+void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+{
+ if (!device_graphics_interop_) {
+ return;
+ }
+ gpu_display->graphics_interop_activate();
+ device_graphics_interop_ = nullptr;
+ gpu_display->graphics_interop_deactivate();
+}
+
+void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+ const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
+
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+}
+
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
+
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
+
+ return num_active_pixels;
+}
+
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
+{
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
+
+ queue_->zero_to_device(num_active_pixels);
+
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&effective_buffer_params_.full_x),
+ const_cast<int *>(&effective_buffer_params_.full_y),
+ const_cast<int *>(&effective_buffer_params_.width),
+ const_cast<int *>(&effective_buffer_params_.height),
+ &threshold,
+ &reset,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride,
+ &num_active_pixels.device_pointer};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
+
+ queue_->copy_from_device(num_active_pixels);
+ queue_->synchronize();
+
+ return num_active_pixels.data()[0];
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
+{
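+  /* The X filter is launched with one work item per row of the buffer (work_size = height);
+   * enqueue_adaptive_sampling_filter_y() below mirrors this with one work item per column. */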
+ const int work_size = effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+  /* NOTE: The direct device access to the buffers only happens within this path trace work. The
+   * rest of the communication happens via API calls which involve
+   * `copy_render_buffers_from_device()`, and that will perform synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END