Diffstat (limited to 'intern/cycles/integrator/path_trace_work_gpu.cpp')
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 933 |
1 file changed, 933 insertions, 0 deletions
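The core of the new file shown in the diff below is the wavefront loop in render_samples(): it alternates between enqueue_work_tiles(), which adds new camera rays only when the device is about to run out of work, and enqueue_path_iteration(), which advances the in-flight paths by one kernel step, and finally reports an occupancy statistic. The standalone sketch here only models that control flow; the functions, numbers, and the per-step termination rate are illustrative stand-ins, not the Cycles API.

#include <algorithm>
#include <cstdio>

/* Toy stand-ins, NOT the Cycles API: a scalar model of the scheduling loop. */
static int active_paths = 0;     /* paths currently in flight on the "device" */
static int tile_work_left = 300; /* pixel-samples the tile scheduler can still hand out */

static const int max_num_paths = 128;        /* analogue of max_num_paths_ */
static const int min_num_active_paths = 32;  /* analogue of min_num_active_paths_ */

/* Models enqueue_work_tiles(): only add camera rays when the device would
 * otherwise run dry, and only up to the state-array capacity. */
static bool enqueue_work_tiles(bool &finished)
{
  finished = false;
  if (active_paths != 0 && active_paths >= min_num_active_paths) {
    return false;
  }
  const int added = std::min(max_num_paths - active_paths, tile_work_left);
  if (added == 0 && active_paths == 0) {
    finished = true; /* no tiles left and nothing in flight: we are done */
    return false;
  }
  tile_work_left -= added;
  active_paths += added;
  return added > 0;
}

/* Models enqueue_path_iteration(): advance every in-flight path one kernel
 * step; here a fixed fraction of paths terminates per step. */
static bool enqueue_path_iteration()
{
  if (active_paths == 0) {
    return false;
  }
  active_paths -= std::max(active_paths / 4, 1);
  return true;
}

int main()
{
  long busy_accum = 0;
  int iterations = 0;
  while (true) {
    bool finished = false;
    enqueue_work_tiles(finished);
    if (finished) {
      break;
    }
    enqueue_path_iteration();
    busy_accum += active_paths;
    ++iterations;
  }
  /* Same statistic as render_samples(): average busy paths over capacity. */
  printf("occupancy: %.2f\n", double(busy_accum) / iterations / max_num_paths);
  return 0;
}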
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp new file mode 100644 index 00000000000..10baf869aa6 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -0,0 +1,933 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace_work_gpu.h" + +#include "device/device.h" + +#include "integrator/pass_accessor_gpu.h" +#include "render/buffers.h" +#include "render/gpu_display.h" +#include "render/scene.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +PathTraceWorkGPU::PathTraceWorkGPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : PathTraceWork(device, film, device_scene, cancel_requested_flag), + queue_(device->gpu_queue_create()), + integrator_state_soa_kernel_features_(0), + integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE), + integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE), + integrator_shader_raytrace_sort_counter_( + device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE), + integrator_next_shadow_catcher_path_index_( + device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE), + queued_paths_(device, "queued_paths", MEM_READ_WRITE), + num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE), + work_tiles_(device, "work_tiles", MEM_READ_WRITE), + gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE), + max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))), + min_num_active_paths_(queue_->num_concurrent_busy_states()), + max_active_path_index_(0) +{ + memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_)); + + /* Limit number of active paths to the half of the overall state. This is due to the logic in the + * path compaction which relies on the fact that regeneration does not happen sooner than half of + * the states are available again. */ + min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2); +} + +void PathTraceWorkGPU::alloc_integrator_soa() +{ + /* IntegrateState allocated as structure of arrays. */ + + /* Check if we already allocated memory for the required features. */ + const uint kernel_features = device_scene_->data.kernel_features; + if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) { + return; + } + integrator_state_soa_kernel_features_ = kernel_features; + + /* Allocate a device only memory buffer before for each struct member, and then + * write the pointers into a struct that resides in constant memory. + * + * TODO: store float3 in separate XYZ arrays. 
*/ +#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + if ((kernel_features & feature) && \ + (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_END(name) \ + break; \ + } +#define KERNEL_STRUCT_END_ARRAY(name, array_size) \ + if (array_index == array_size - 1) { \ + break; \ + } \ + } +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY +} + +void PathTraceWorkGPU::alloc_integrator_queue() +{ + if (integrator_queue_counter_.size() == 0) { + integrator_queue_counter_.alloc(1); + integrator_queue_counter_.zero_to_device(); + integrator_queue_counter_.copy_from_device(); + integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *) + integrator_queue_counter_.device_pointer; + } + + /* Allocate data for active path index arrays. */ + if (num_queued_paths_.size() == 0) { + num_queued_paths_.alloc(1); + num_queued_paths_.zero_to_device(); + } + + if (queued_paths_.size() == 0) { + queued_paths_.alloc(max_num_paths_); + /* TODO: this could be skip if we had a function to just allocate on device. */ + queued_paths_.zero_to_device(); + } +} + +void PathTraceWorkGPU::alloc_integrator_sorting() +{ + /* Allocate arrays for shader sorting. */ + const int max_shaders = device_scene_->data.max_shaders; + if (integrator_shader_sort_counter_.size() < max_shaders) { + integrator_shader_sort_counter_.alloc(max_shaders); + integrator_shader_sort_counter_.zero_to_device(); + + integrator_shader_raytrace_sort_counter_.alloc(max_shaders); + integrator_shader_raytrace_sort_counter_.zero_to_device(); + + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = + (int *)integrator_shader_sort_counter_.device_pointer; + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = + (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + } +} + +void PathTraceWorkGPU::alloc_integrator_path_split() +{ + if (integrator_next_shadow_catcher_path_index_.size() != 0) { + return; + } + + integrator_next_shadow_catcher_path_index_.alloc(1); + /* TODO(sergey): Use queue? 
*/ + integrator_next_shadow_catcher_path_index_.zero_to_device(); + + integrator_state_gpu_.next_shadow_catcher_path_index = + (int *)integrator_next_shadow_catcher_path_index_.device_pointer; +} + +void PathTraceWorkGPU::alloc_work_memory() +{ + alloc_integrator_soa(); + alloc_integrator_queue(); + alloc_integrator_sorting(); + alloc_integrator_path_split(); +} + +void PathTraceWorkGPU::init_execution() +{ + queue_->init_execution(); + + /* Copy to device side struct in constant memory. */ + device_->const_copy_to( + "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_)); +} + +void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to + * add more work (because tiles are smaller, so there is higher chance that more paths will + * become busy after adding new tiles). This is especially important for the shadow catcher which + * schedules work in halves of available number of paths. */ + work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8); + + work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num); + + enqueue_reset(); + + int num_iterations = 0; + uint64_t num_busy_accum = 0; + + /* TODO: set a hard limit in case of undetected kernel failures? */ + while (true) { + /* Enqueue work from the scheduler, on start or when there are not enough + * paths to keep the device occupied. */ + bool finished; + if (enqueue_work_tiles(finished)) { + /* Copy stats from the device. */ + queue_->copy_from_device(integrator_queue_counter_); + + if (!queue_->synchronize()) { + break; /* Stop on error. */ + } + } + + if (is_cancel_requested()) { + break; + } + + /* Stop if no more work remaining. */ + if (finished) { + break; + } + + /* Enqueue on of the path iteration kernels. */ + if (enqueue_path_iteration()) { + /* Copy stats from the device. */ + queue_->copy_from_device(integrator_queue_counter_); + + if (!queue_->synchronize()) { + break; /* Stop on error. */ + } + } + + if (is_cancel_requested()) { + break; + } + + num_busy_accum += get_num_active_paths(); + ++num_iterations; + } + + statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_; +} + +DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const +{ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int max_num_queued = 0; + DeviceKernel kernel = DEVICE_KERNEL_NUM; + + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + if (queue_counter->num_queued[i] > max_num_queued) { + kernel = (DeviceKernel)i; + max_num_queued = queue_counter->num_queued[i]; + } + } + + return kernel; +} + +void PathTraceWorkGPU::enqueue_reset() +{ + void *args[] = {&max_num_paths_}; + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args); + queue_->zero_to_device(integrator_queue_counter_); + queue_->zero_to_device(integrator_shader_sort_counter_); + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + + /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the + * counter on the host side because `zero_to_device()` is not doing it. */ + if (integrator_queue_counter_.host_pointer) { + memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size()); + } +} + +bool PathTraceWorkGPU::enqueue_path_iteration() +{ + /* Find kernel to execute, with max number of queued paths. 
*/ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_active_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + num_active_paths += queue_counter->num_queued[i]; + } + + if (num_active_paths == 0) { + return false; + } + + /* Find kernel to execute, with max number of queued paths. */ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel == DEVICE_KERNEL_NUM) { + return false; + } + + /* Finish shadows before potentially adding more shadow rays. We can only + * store one shadow ray in the integrator state. */ + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) { + if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return true; + } + else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return true; + } + } + + /* Schedule kernel with maximum number of queued items. */ + enqueue_path_iteration(kernel); + return true; +} + +void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel) +{ + void *d_path_index = (void *)NULL; + + /* Create array of path indices for which this kernel is queued to be executed. */ + int work_size = max_active_path_index_; + + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + int num_queued = queue_counter->num_queued[kernel]; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + /* Compute array of active paths, sorted by shader. */ + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel); + } + else if (num_queued < work_size) { + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Compute array of active shadow paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel); + } + else { + /* Compute array of active paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel); + } + } + + DCHECK_LE(work_size, max_num_paths_); + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + /* Ray intersection kernels with integrator state. */ + void *args[] = {&d_path_index, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + /* Shading kernels with integrator state and render buffer. 
*/ + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + + default: + LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel) + << " used for path iteration, should never happen."; + break; + } +} + +void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel]; + assert(d_counter != nullptr); + + /* Compute prefix sum of number of active paths with each shader. */ + { + const int work_size = 1; + int max_shaders = device_scene_->data.max_shaders; + void *args[] = {&d_counter, &max_shaders}; + queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); + } + + queue_->zero_to_device(num_queued_paths_); + + /* Launch kernel to fill the active paths arrays. */ + { + /* TODO: this could be smaller for terminated paths based on amount of work we want + * to schedule. */ + const int work_size = max_active_path_index_; + + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), + &d_queued_paths, + &d_num_queued_paths, + &d_counter, + &d_queued_kernel}; + + queue_->enqueue(kernel, work_size, args); + } + + if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) { + queue_->zero_to_device(integrator_shader_sort_counter_); + } + else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + else { + assert(0); + } +} + +void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + + /* Launch kernel to fill the active paths arrays. */ + const int work_size = max_active_path_index_; + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = { + const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel}; + + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(kernel, work_size, args); +} + +void PathTraceWorkGPU::compact_states(const int num_active_paths) +{ + if (num_active_paths == 0) { + max_active_path_index_ = 0; + } + + /* Compact fragmented path states into the start of the array, moving any paths + * with index higher than the number of active paths into the gaps. */ + if (max_active_path_index_ == num_active_paths) { + return; + } + + void *d_compact_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + + /* Create array with terminated paths that we can write to. */ + { + /* TODO: can the work size be reduced here? */ + int offset = num_active_paths; + int work_size = num_active_paths; + void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args); + } + + /* Create array of paths that we need to compact, where the path index is bigger + * than the number of active paths. 
*/ + { + int work_size = max_active_path_index_; + void *args[] = { + &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args); + } + + queue_->copy_from_device(num_queued_paths_); + queue_->synchronize(); + + int num_compact_paths = num_queued_paths_.data()[0]; + + /* Move paths into gaps. */ + if (num_compact_paths > 0) { + int work_size = num_compact_paths; + int active_states_offset = 0; + int terminated_states_offset = num_active_paths; + void *args[] = { + &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size}; + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args); + } + + queue_->synchronize(); + + /* Adjust max active path index now we know which part of the array is actually used. */ + max_active_path_index_ = num_active_paths; +} + +bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished) +{ + /* If there are existing paths wait them to go to intersect closest kernel, which will align the + * wavefront of the existing and newely added paths. */ + /* TODO: Check whether counting new intersection kernels here will have positive affect on the + * performance. */ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) { + return false; + } + + int num_active_paths = get_num_active_paths(); + + /* Don't schedule more work if cancelling. */ + if (is_cancel_requested()) { + if (num_active_paths == 0) { + finished = true; + } + return false; + } + + finished = false; + + vector<KernelWorkTile> work_tiles; + + int max_num_camera_paths = max_num_paths_; + int num_predicted_splits = 0; + + if (has_shadow_catcher()) { + /* When there are shadow catchers in the scene bounce from them will split the state. So we + * make sure there is enough space in the path states array to fit split states. + * + * Basically, when adding N new paths we ensure that there is 2*N available path states, so + * that all the new paths can be split. + * + * Note that it is possible that some of the current states can still split, so need to make + * sure there is enough space for them as well. */ + + /* Number of currently in-flight states which can still split. */ + const int num_scheduled_possible_split = shadow_catcher_count_possible_splits(); + + const int num_available_paths = max_num_paths_ - num_active_paths; + const int num_new_paths = num_available_paths / 2; + max_num_camera_paths = max(num_active_paths, + num_active_paths + num_new_paths - num_scheduled_possible_split); + num_predicted_splits += num_scheduled_possible_split + num_new_paths; + } + + /* Schedule when we're out of paths or there are too few paths to keep the + * device occupied. */ + int num_paths = num_active_paths; + if (num_paths == 0 || num_paths < min_num_active_paths_) { + /* Get work tiles until the maximum number of path is reached. */ + while (num_paths < max_num_camera_paths) { + KernelWorkTile work_tile; + if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) { + work_tiles.push_back(work_tile); + num_paths += work_tile.w * work_tile.h * work_tile.num_samples; + } + else { + break; + } + } + + /* If we couldn't get any more tiles, we're done. */ + if (work_tiles.size() == 0 && num_paths == 0) { + finished = true; + return false; + } + } + + /* Initialize paths from work tiles. 
*/ + if (work_tiles.size() == 0) { + return false; + } + + /* Compact state array when number of paths becomes small relative to the + * known maximum path index, which makes computing active index arrays slow. */ + compact_states(num_active_paths); + + if (has_shadow_catcher()) { + integrator_next_shadow_catcher_path_index_.data()[0] = num_paths; + queue_->copy_to_device(integrator_next_shadow_catcher_path_index_); + } + + enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE : + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, + work_tiles.data(), + work_tiles.size(), + num_active_paths, + num_predicted_splits); + + return true; +} + +void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel, + const KernelWorkTile work_tiles[], + const int num_work_tiles, + const int num_active_paths, + const int num_predicted_splits) +{ + /* Copy work tiles to device. */ + if (work_tiles_.size() < num_work_tiles) { + work_tiles_.alloc(num_work_tiles); + } + + int path_index_offset = num_active_paths; + int max_tile_work_size = 0; + for (int i = 0; i < num_work_tiles; i++) { + KernelWorkTile &work_tile = work_tiles_.data()[i]; + work_tile = work_tiles[i]; + + const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples; + + work_tile.path_index_offset = path_index_offset; + work_tile.work_size = tile_work_size; + + path_index_offset += tile_work_size; + + max_tile_work_size = max(max_tile_work_size, tile_work_size); + } + + queue_->copy_to_device(work_tiles_); + + void *d_work_tiles = (void *)work_tiles_.device_pointer; + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, + const_cast<int *>(&num_work_tiles), + &d_render_buffer, + const_cast<int *>(&max_tile_work_size)}; + + queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args); + + max_active_path_index_ = path_index_offset + num_predicted_splits; +} + +int PathTraceWorkGPU::get_num_active_paths() +{ + /* TODO: this is wrong, does not account for duplicates with shadow! */ + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + DCHECK_GE(queue_counter->num_queued[i], 0) + << "Invalid number of queued states for kernel " + << device_kernel_as_string(static_cast<DeviceKernel>(i)); + num_paths += queue_counter->num_queued[i]; + } + + return num_paths; +} + +bool PathTraceWorkGPU::should_use_graphics_interop() +{ + /* There are few aspects with the graphics interop when using multiple devices caused by the fact + * that the GPUDisplay has a single texture: + * + * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when + * attempting to register OpenGL PBO which has been mapped. Which makes sense, because + * otherwise one would run into a conflict of where the source of truth is. 
*/ + if (has_multiple_works()) { + return false; + } + + if (!interop_use_checked_) { + Device *device = queue_->device; + interop_use_ = device->should_use_graphics_interop(); + + if (interop_use_) { + VLOG(2) << "Will be using graphics interop GPU display update."; + } + else { + VLOG(2) << "Will be using naive GPU display update."; + } + + interop_use_checked_ = true; + } + + return interop_use_; +} + +void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (device_->have_error()) { + /* Don't attempt to update GPU display if the device has errors: the error state will make + * wrong decisions to happen about interop, causing more chained bugs. */ + return; + } + + if (!buffers_->buffer.device_pointer) { + LOG(WARNING) << "Request for GPU display update without allocated render buffers."; + return; + } + + if (should_use_graphics_interop()) { + if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) { + return; + } + + /* If error happens when trying to use graphics interop fallback to the native implementation + * and don't attempt to use interop for the further updates. */ + interop_use_ = false; + } + + copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples); +} + +void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + const int full_x = effective_buffer_params_.full_x; + const int full_y = effective_buffer_params_.full_y; + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + const int final_width = buffers_->params.width; + const int final_height = buffers_->params.height; + + const int texture_x = full_x - effective_full_params_.full_x; + const int texture_y = full_y - effective_full_params_.full_y; + + /* Re-allocate display memory if needed, and make sure the device pointer is allocated. + * + * NOTE: allocation happens to the final resolution so that no re-allocation happens on every + * change of the resolution divider. However, if the display becomes smaller, shrink the + * allocated memory as well. */ + if (gpu_display_rgba_half_.data_width != final_width || + gpu_display_rgba_half_.data_height != final_height) { + gpu_display_rgba_half_.alloc(final_width, final_height); + /* TODO(sergey): There should be a way to make sure device-side memory is allocated without + * transfering zeroes to the device. 
*/ + queue_->zero_to_device(gpu_display_rgba_half_); + } + + PassAccessor::Destination destination(film_->get_display_pass()); + destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + gpu_display_rgba_half_.copy_from_device(); + + gpu_display->copy_pixels_to_texture( + gpu_display_rgba_half_.data(), texture_x, texture_y, width, height); +} + +bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (!device_graphics_interop_) { + device_graphics_interop_ = queue_->graphics_interop_create(); + } + + const DeviceGraphicsInteropDestination graphics_interop_dst = + gpu_display->graphics_interop_get(); + device_graphics_interop_->set_destination(graphics_interop_dst); + + const device_ptr d_rgba_half = device_graphics_interop_->map(); + if (!d_rgba_half) { + return false; + } + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.d_pixels_half_rgba = d_rgba_half; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + device_graphics_interop_->unmap(); + + return true; +} + +void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display) +{ + if (!device_graphics_interop_) { + return; + } + gpu_display->graphics_interop_activate(); + device_graphics_interop_ = nullptr; + gpu_display->graphics_interop_deactivate(); +} + +void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples) +{ + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples); + + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); +} + +int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset); + + if (num_active_pixels) { + enqueue_adaptive_sampling_filter_x(); + enqueue_adaptive_sampling_filter_y(); + queue_->synchronize(); + } + + return num_active_pixels; +} + +int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset) +{ + device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE); + num_active_pixels.alloc(1); + + queue_->zero_to_device(num_active_pixels); + + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&effective_buffer_params_.full_x), + const_cast<int *>(&effective_buffer_params_.full_y), + const_cast<int *>(&effective_buffer_params_.width), + const_cast<int *>(&effective_buffer_params_.height), + &threshold, + &reset, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride, + &num_active_pixels.device_pointer}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args); + + queue_->copy_from_device(num_active_pixels); + queue_->synchronize(); + + return num_active_pixels.data()[0]; +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x() +{ + const int work_size = effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + 
&effective_buffer_params_.full_y, + &effective_buffer_params_.width, + &effective_buffer_params_.height, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args); +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y() +{ + const int work_size = effective_buffer_params_.width; + + void *args[] = {&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + &effective_buffer_params_.full_y, + &effective_buffer_params_.width, + &effective_buffer_params_.height, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args); +} + +void PathTraceWorkGPU::cryptomatte_postproces() +{ + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&work_size), + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args); +} + +bool PathTraceWorkGPU::copy_render_buffers_from_device() +{ + queue_->copy_from_device(buffers_->buffer); + + /* Synchronize so that the CPU-side buffer is available at the exit of this function. */ + return queue_->synchronize(); +} + +bool PathTraceWorkGPU::copy_render_buffers_to_device() +{ + queue_->copy_to_device(buffers_->buffer); + + /* NOTE: The direct device access to the buffers only happens within this path trace work. The + * rest of communication happens via API calls which involves `copy_render_buffers_from_device()` + * which will perform synchronization as needed. */ + + return true; +} + +bool PathTraceWorkGPU::zero_render_buffers() +{ + queue_->zero_to_device(buffers_->buffer); + + return true; +} + +bool PathTraceWorkGPU::has_shadow_catcher() const +{ + return device_scene_->data.integrator.has_shadow_catcher; +} + +int PathTraceWorkGPU::shadow_catcher_count_possible_splits() +{ + if (max_active_path_index_ == 0) { + return 0; + } + + if (!has_shadow_catcher()) { + return 0; + } + + queue_->zero_to_device(num_queued_paths_); + + const int work_size = max_active_path_index_; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths}; + + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args); + queue_->copy_from_device(num_queued_paths_); + queue_->synchronize(); + + return num_queued_paths_.data()[0]; +} + +CCL_NAMESPACE_END |
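One part of the diff above that benefits from a worked example is compute_sorted_queued_paths(): per-shader counters are turned into output offsets with DEVICE_KERNEL_PREFIX_SUM, and the sorted-paths-array kernel then scatters path indices grouped by shader, so that the surface shading kernel runs over paths with coherent shader access. The CPU-side model below shows the same counting sort on toy data; it is a sketch only, with the sort-key computation simplified to a plain shader index and the atomic offset claims of the real GPU kernel replaced by a serial loop.

#include <cstdio>
#include <vector>

int main()
{
  /* Shader index of each queued path state (toy data, not taken from Cycles). */
  const std::vector<int> path_shader = {2, 0, 2, 1, 0, 2};
  const int max_shaders = 3;

  /* 1. Count queued paths per shader (integrator_shader_sort_counter_ analogue). */
  std::vector<int> counter(max_shaders, 0);
  for (int shader : path_shader) {
    counter[shader]++;
  }

  /* 2. Exclusive prefix sum turns counts into output offsets
   *    (DEVICE_KERNEL_PREFIX_SUM analogue). */
  int sum = 0;
  for (int i = 0; i < max_shaders; i++) {
    const int count = counter[i];
    counter[i] = sum;
    sum += count;
  }

  /* 3. Scatter path indices into the sorted array
   *    (DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY analogue); on the GPU each
   *    slot is claimed with an atomic add on the counter. */
  std::vector<int> queued_paths(path_shader.size());
  for (int path_index = 0; path_index < (int)path_shader.size(); path_index++) {
    queued_paths[counter[path_shader[path_index]]++] = path_index;
  }

  for (int path_index : queued_paths) {
    printf("%d ", path_index); /* paths grouped by shader: 1 4 3 0 2 5 */
  }
  printf("\n");
  return 0;
}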