Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r--  intern/cycles/integrator/denoiser.cpp             |  7
-rw-r--r--  intern/cycles/integrator/pass_accessor.cpp        |  6
-rw-r--r--  intern/cycles/integrator/path_trace.cpp           | 77
-rw-r--r--  intern/cycles/integrator/path_trace.h             |  4
-rw-r--r--  intern/cycles/integrator/path_trace_tile.cpp      |  2
-rw-r--r--  intern/cycles/integrator/path_trace_tile.h        |  2
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  | 68
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h    |  3
8 files changed, 119 insertions, 50 deletions
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
index 94991d63e4c..831bd3a4407 100644
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -101,10 +101,17 @@ static Device *find_best_device(Device *device, DenoiserType type)
     if ((sub_device->info.denoisers & type) == 0) {
       return;
     }
+
     if (!best_device) {
       best_device = sub_device;
     }
     else {
+      /* Prefer a device that can use graphics interop for faster display update. */
+      if (sub_device->should_use_graphics_interop() &&
+          !best_device->should_use_graphics_interop()) {
+        best_device = sub_device;
+      }
+
       /* TODO(sergey): Choose fastest device from available ones. Taking into account performance
        * of the device and data transfer cost. */
     }
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 05318b7545b..ab056e953c2 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -191,6 +191,12 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
      * had the computation done. */
     if (pass_info.num_components == 3) {
       get_pass_float3(render_buffers, buffer_params, destination);
+
+      /* Use alpha for colors passes. */
+      if (type == PASS_DIFFUSE_COLOR || type == PASS_GLOSSY_COLOR ||
+          type == PASS_TRANSMISSION_COLOR) {
+        num_written_components = destination.num_components;
+      }
     }
     else if (pass_info.num_components == 4) {
       if (destination.num_components == 3) {
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 9ad1c465725..3ec7b601d9f 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -26,6 +26,7 @@ PathTrace::PathTrace(Device *device,
                      RenderScheduler &render_scheduler,
                      TileManager &tile_manager)
     : device_(device),
+      film_(film),
       device_scene_(device_scene),
       render_scheduler_(render_scheduler),
       tile_manager_(tile_manager)
@@ -60,7 +61,17 @@ PathTrace::~PathTrace()
 void PathTrace::load_kernels()
 {
   if (denoiser_) {
+    /* Activate graphics interop while denoiser device is created, so that it can choose a device
+     * that supports interop for faster display updates. */
+    if (display_ && path_trace_works_.size() > 1) {
+      display_->graphics_interop_activate();
+    }
+
     denoiser_->load_kernels(progress_);
+
+    if (display_ && path_trace_works_.size() > 1) {
+      display_->graphics_interop_deactivate();
+    }
   }
 }

@@ -373,7 +384,7 @@ void PathTrace::path_trace(RenderWork &render_work)
     work_balance_infos_[i].time_spent += work_time;
     work_balance_infos_[i].occupancy = statistics.occupancy;

-    VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+    VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
               << work_time / num_samples
               << " seconds per sample), occupancy: " << statistics.occupancy;
   });
@@ -506,28 +517,30 @@ void PathTrace::denoise(const RenderWork &render_work)
   const double start_time = time_dt();

   RenderBuffers *buffer_to_denoise = nullptr;
-
-  unique_ptr<RenderBuffers> multi_device_buffers;
   bool allow_inplace_modification = false;

-  if (path_trace_works_.size() == 1) {
-    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  Device *denoiser_device = denoiser_->get_denoiser_device();
+  if (path_trace_works_.size() > 1 && denoiser_device && !big_tile_denoise_work_) {
+    big_tile_denoise_work_ = PathTraceWork::create(denoiser_device, film_, device_scene_, nullptr);
   }
-  else {
-    Device *denoiser_device = denoiser_->get_denoiser_device();
-    if (!denoiser_device) {
-      return;
-    }

-    multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
-    multi_device_buffers->reset(render_state_.effective_big_tile_params);
+  if (big_tile_denoise_work_) {
+    big_tile_denoise_work_->set_effective_buffer_params(render_state_.effective_big_tile_params,
+                                                        render_state_.effective_big_tile_params,
+                                                        render_state_.effective_big_tile_params);

-    buffer_to_denoise = multi_device_buffers.get();
+    buffer_to_denoise = big_tile_denoise_work_->get_render_buffers();
+    buffer_to_denoise->reset(render_state_.effective_big_tile_params);

-    copy_to_render_buffers(multi_device_buffers.get());
+    copy_to_render_buffers(buffer_to_denoise);

     allow_inplace_modification = true;
   }
+  else {
+    DCHECK_EQ(path_trace_works_.size(), 1);
+
+    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  }

   if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
                                 buffer_to_denoise,
@@ -536,14 +549,6 @@ void PathTrace::denoise(const RenderWork &render_work)
     render_state_.has_denoised_result = true;
   }

-  if (multi_device_buffers) {
-    multi_device_buffers->copy_from_device();
-    parallel_for_each(
-        path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
-          path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
-        });
-  }
-
   render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
 }

@@ -635,8 +640,13 @@ void PathTrace::update_display(const RenderWork &render_work)
   /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
    * all works in parallel. */
   const int num_samples = get_num_samples_in_buffer();
-  for (auto &&path_trace_work : path_trace_works_) {
-    path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    big_tile_denoise_work_->copy_to_display(display_.get(), pass_mode, num_samples);
+  }
+  else {
+    for (auto &&path_trace_work : path_trace_works_) {
+      path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+    }
   }

   display_->update_end();
@@ -721,11 +731,10 @@ void PathTrace::write_tile_buffer(const RenderWork &render_work)
     VLOG_WORK << "Write tile result via buffer write callback.";
     tile_buffer_write();
   }
-
   /* Write tile to disk, so that the render work's render buffer can be re-used for the next
    * tile. */
-  if (has_multiple_tiles) {
-    VLOG_WORK << "Write tile result into .";
+  else {
+    VLOG_WORK << "Write tile result to disk.";
     tile_buffer_write_to_disk();
   }
 }
@@ -901,6 +910,10 @@ bool PathTrace::copy_render_tile_from_device()
     return true;
   }

+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    return big_tile_denoise_work_->copy_render_buffers_from_device();
+  }
+
   bool success = true;

   parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1002,6 +1015,10 @@ bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
     return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
   }

+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    return big_tile_denoise_work_->get_render_tile_pixels(pass_accessor, destination);
+  }
+
   bool success = true;

   parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1082,6 +1099,10 @@ void PathTrace::destroy_gpu_resources()
     for (auto &&path_trace_work : path_trace_works_) {
       path_trace_work->destroy_gpu_resources(display_.get());
     }
+
+    if (big_tile_denoise_work_) {
+      big_tile_denoise_work_->destroy_gpu_resources(display_.get());
+    }
   }
 }

@@ -1103,6 +1124,8 @@ static const char *device_type_for_description(const DeviceType type)
       return "OptiX";
     case DEVICE_HIP:
       return "HIP";
+    case DEVICE_ONEAPI:
+      return "oneAPI";
     case DEVICE_DUMMY:
       return "Dummy";
     case DEVICE_MULTI:
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index a470a6e1402..59382b51d23 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -236,6 +236,7 @@ class PathTrace {
   /* CPU device for creating temporary render buffers on the CPU side. */
   unique_ptr<Device> cpu_device_;

+  Film *film_;
   DeviceScene *device_scene_;

   RenderScheduler &render_scheduler_;
@@ -261,6 +262,9 @@ class PathTrace {
   /* Denoiser which takes care of denoising the big tile. */
   unique_ptr<Denoiser> denoiser_;

+  /* Denoiser device descriptor which holds the denoised big tile for multi-device workloads. */
+  unique_ptr<PathTraceWork> big_tile_denoise_work_;
+
   /* State which is common for all the steps of the render work.
    * Is brought up to date in the `render()` call and is accessed from all the steps involved into
    * rendering the work. */
diff --git a/intern/cycles/integrator/path_trace_tile.cpp b/intern/cycles/integrator/path_trace_tile.cpp
index 2f1f4e810a3..dfe88695013 100644
--- a/intern/cycles/integrator/path_trace_tile.cpp
+++ b/intern/cycles/integrator/path_trace_tile.cpp
@@ -33,7 +33,7 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
   if (!copied_from_device_) {
     /* Copy from device on demand. */
     path_trace_.copy_render_tile_from_device();
-    const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
+    copied_from_device_ = true;
   }

   const BufferParams &buffer_params = path_trace_.get_render_tile_params();
diff --git a/intern/cycles/integrator/path_trace_tile.h b/intern/cycles/integrator/path_trace_tile.h
index 99ae08d04d1..223fa96e113 100644
--- a/intern/cycles/integrator/path_trace_tile.h
+++ b/intern/cycles/integrator/path_trace_tile.h
@@ -24,7 +24,7 @@ class PathTraceTile : public OutputDriver::Tile {
  private:
   PathTrace &path_trace_;

-  bool copied_from_device_;
+  mutable bool copied_from_device_;
 };

 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 0acaeace4b0..ee250a6916b 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -181,27 +181,49 @@ void PathTraceWorkGPU::alloc_integrator_queue()

 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
-  /* Allocate arrays for shader sorting. */
-  const int max_shaders = device_scene_->data.max_shaders;
-  if (integrator_shader_sort_counter_.size() < max_shaders) {
-    integrator_shader_sort_counter_.alloc(max_shaders);
-    integrator_shader_sort_counter_.zero_to_device();
+  /* Compute sort partitions, to balance between memory locality and coherence.
+   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+   */
+  num_sort_partitions_ = 1;
+  if (device_scene_->data.max_shaders < 300) {
+    const int num_elements = queue_->num_sort_partition_elements();
+    if (num_elements) {
+      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
+    }
+  }

-    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
-    integrator_shader_raytrace_sort_counter_.zero_to_device();
+  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+                                                                num_sort_partitions_);

-    integrator_shader_mnee_sort_counter_.alloc(max_shaders);
-    integrator_shader_mnee_sort_counter_.zero_to_device();
+  /* Allocate arrays for shader sorting. */
+  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+  if (integrator_shader_sort_counter_.size() < sort_buckets) {
+    integrator_shader_sort_counter_.alloc(sort_buckets);
+    integrator_shader_sort_counter_.zero_to_device();
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+        (int *)integrator_shader_sort_counter_.device_pointer;

-    integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
     integrator_shader_sort_prefix_sum_.zero_to_device();
+  }

-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
-        (int *)integrator_shader_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-        (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-        (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
+      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+      integrator_shader_raytrace_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+    }
+  }
+
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
+      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+      integrator_shader_mnee_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+    }
   }
 }

@@ -239,7 +261,7 @@ void PathTraceWorkGPU::init_execution()

   /* Copy to device side struct in constant memory. */
   device_->const_copy_to(
-      "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+      "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
 }

 void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
@@ -333,8 +355,12 @@ void PathTraceWorkGPU::enqueue_reset()
   queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
   queue_->zero_to_device(integrator_queue_counter_);
   queue_->zero_to_device(integrator_shader_sort_counter_);
-  queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+  }
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  }

   /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
    * counter on the host side because `zero_to_device()` is not doing it. */
@@ -486,9 +512,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
-    int max_shaders = device_scene_->data.max_shaders;
+    int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;

-    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);

     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 4c10a221a30..a805258d1b5 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork {
   bool interop_use_checked_ = false;
   bool interop_use_ = false;

+  /* Number of partitions to sort state indices into prior to material sort. */
+  int num_sort_partitions_;
+
   /* Maximum number of concurrent integrator states. */
   int max_num_paths_;
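Illustrative note (not part of the commit): the sorting change above sizes the shader sort counters as max_shaders * num_sort_partitions_ buckets and hands sort_partition_divisor to the kernel via integrator_state_gpu_. The self-contained C++ sketch below walks through that bucketing arithmetic; the SortPartitionSketch type and the sort_bucket_for_path() key formula are assumptions made for this sketch, not code taken from the diff.

// Standalone sketch of partitioned shader-sort bucketing (assumed helper names).
#include <algorithm>
#include <cstdio>

struct SortPartitionSketch {
  int max_shaders = 0;
  int max_num_paths = 0;
  int num_sort_partitions = 1;
  int sort_partition_divisor = 0;

  SortPartitionSketch(int shaders, int paths, int partition_elements)
      : max_shaders(shaders), max_num_paths(paths)
  {
    /* Same heuristic as alloc_integrator_sorting(): skip partitioning for large shader counts. */
    if (max_shaders < 300 && partition_elements > 0) {
      num_sort_partitions = std::max(max_num_paths / partition_elements, 1);
    }
    /* divide_up(max_num_paths, num_sort_partitions) */
    sort_partition_divisor = (max_num_paths + num_sort_partitions - 1) / num_sort_partitions;
  }

  /* One counter bucket per (partition, shader) pair, matching the array sizes in the diff. */
  int num_sort_buckets() const
  {
    return max_shaders * num_sort_partitions;
  }

  /* Hypothetical kernel-side key: group paths by partition first, then by shader, so that
   * paths which are close together in the state array stay close together after sorting. */
  int sort_bucket_for_path(int path_index, int shader_id) const
  {
    return (path_index / sort_partition_divisor) * max_shaders + shader_id;
  }
};

int main()
{
  /* Example numbers: 64 shaders, 1M paths, 64K elements per partition -> 16 partitions. */
  SortPartitionSketch sketch(64, 1 << 20, 64 * 1024);
  std::printf("partitions=%d divisor=%d buckets=%d\n",
              sketch.num_sort_partitions,
              sketch.sort_partition_divisor,
              sketch.num_sort_buckets());
  std::printf("path 70000 with shader 5 -> bucket %d\n", sketch.sort_bucket_for_path(70000, 5));
  return 0;
}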