
git.blender.org/blender.git
author     Patrick Mours <pmours@nvidia.com>  2022-08-12 16:49:30 +0300
committer  Patrick Mours <pmours@nvidia.com>  2022-08-12 17:00:54 +0300
commit     79787bf8e1e1d766e34dc6f8c5eda2efcceaa6cc (patch)
tree       95255c7ae7c5075abc07995126efc76701882050 /intern
parent     27105af938b14c48fb498a292e2b371cbd8faf31 (diff)
Cycles: Improve denoiser update performance when rendering with multiple GPUs
This patch causes the render buffers to be copied to the denoiser device only once before
denoising; output/display is then fed from that single buffer on the denoiser device. That way
usually all but one copy (from all the render devices to the denoiser device) can be eliminated,
provided that the denoiser device is also the display device (in which case interop is used to
update the display). As such this patch also adds some logic that tries to ensure the chosen
denoiser device is the same as the display device.

Differential Revision: https://developer.blender.org/D15657
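In short, with multiple render GPUs each device's buffer is copied once into a big-tile buffer
owned by the denoiser device, denoising runs there, and output/display read straight from that
denoised buffer; picking a denoiser device that can also drive the display via graphics interop
removes the remaining copy. Below is a minimal stand-alone C++ sketch of just the device-selection
preference, using simplified stand-in types (SimpleDevice and choose_denoiser_device are
hypothetical names for illustration, not Cycles API):

    /* Hypothetical sketch: prefer a denoise-capable device that can use graphics
     * interop, so the denoised big tile can feed the display without an extra copy. */
    #include <vector>

    struct SimpleDevice {
      bool supports_denoising = false;
      bool supports_graphics_interop = false; /* device also drives the display */
    };

    static SimpleDevice *choose_denoiser_device(std::vector<SimpleDevice> &devices)
    {
      SimpleDevice *best = nullptr;
      for (SimpleDevice &device : devices) {
        if (!device.supports_denoising) {
          continue;
        }
        /* Any denoise-capable device works, but an interop-capable one is better. */
        if (!best || (device.supports_graphics_interop && !best->supports_graphics_interop)) {
          best = &device;
        }
      }
      return best;
    }

    int main()
    {
      /* Two render GPUs; only the second one is wired to the display. */
      std::vector<SimpleDevice> devices = {{true, false}, {true, true}};
      SimpleDevice *denoiser = choose_denoiser_device(devices);
      /* Render buffers are copied once to this device, denoised there, and the
       * display is updated via interop instead of copying results back to every
       * render device. */
      return (denoiser && denoiser->supports_graphics_interop) ? 0 : 1;
    }

The corresponding preference in the actual patch lives in find_best_device() in
intern/cycles/integrator/denoiser.cpp, where should_use_graphics_interop() now breaks ties
between otherwise suitable sub-devices.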
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/device/cuda/device_impl.cpp    |  6
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp   |  4
-rw-r--r--  intern/cycles/integrator/denoiser.cpp        |  7
-rw-r--r--  intern/cycles/integrator/path_trace.cpp      | 73
-rw-r--r--  intern/cycles/integrator/path_trace.h        |  4
-rw-r--r--  intern/cycles/integrator/path_trace_tile.cpp |  2
-rw-r--r--  intern/cycles/integrator/path_trace_tile.h   |  2
-rw-r--r--  intern/cycles/session/session.cpp            | 15
8 files changed, 76 insertions(+), 37 deletions(-)
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 00851a8e91c..01c021551f3 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -1202,11 +1202,11 @@ bool CUDADevice::should_use_graphics_interop()
}
vector<CUdevice> gl_devices(num_all_devices);
- uint num_gl_devices;
+ uint num_gl_devices = 0;
cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
- for (CUdevice gl_device : gl_devices) {
- if (gl_device == cuDevice) {
+ for (uint i = 0; i < num_gl_devices; ++i) {
+ if (gl_devices[i] == cuDevice) {
return true;
}
}
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 151983667c0..94a46acaf18 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -39,6 +39,9 @@ CCL_NAMESPACE_BEGIN
// The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
namespace {
+# if OPTIX_ABI_VERSION >= 60
+using ::optixUtilDenoiserInvokeTiled;
+# else
static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
const OptixImage2D &output,
unsigned int overlapWindowSizeInPixels,
@@ -215,6 +218,7 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
}
return OPTIX_SUCCESS;
}
+# endif
# if OPTIX_ABI_VERSION >= 55
static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
index 94991d63e4c..831bd3a4407 100644
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -101,10 +101,17 @@ static Device *find_best_device(Device *device, DenoiserType type)
if ((sub_device->info.denoisers & type) == 0) {
return;
}
+
if (!best_device) {
best_device = sub_device;
}
else {
+ /* Prefer a device that can use graphics interop for faster display update. */
+ if (sub_device->should_use_graphics_interop() &&
+ !best_device->should_use_graphics_interop()) {
+ best_device = sub_device;
+ }
+
/* TODO(sergey): Choose fastest device from available ones. Taking into account performance
* of the device and data transfer cost. */
}
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index ed278821b46..3ec7b601d9f 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -26,6 +26,7 @@ PathTrace::PathTrace(Device *device,
RenderScheduler &render_scheduler,
TileManager &tile_manager)
: device_(device),
+ film_(film),
device_scene_(device_scene),
render_scheduler_(render_scheduler),
tile_manager_(tile_manager)
@@ -60,7 +61,17 @@ PathTrace::~PathTrace()
void PathTrace::load_kernels()
{
if (denoiser_) {
+ /* Activate graphics interop while denoiser device is created, so that it can choose a device
+ * that supports interop for faster display updates. */
+ if (display_ && path_trace_works_.size() > 1) {
+ display_->graphics_interop_activate();
+ }
+
denoiser_->load_kernels(progress_);
+
+ if (display_ && path_trace_works_.size() > 1) {
+ display_->graphics_interop_deactivate();
+ }
}
}
@@ -506,28 +517,30 @@ void PathTrace::denoise(const RenderWork &render_work)
const double start_time = time_dt();
RenderBuffers *buffer_to_denoise = nullptr;
-
- unique_ptr<RenderBuffers> multi_device_buffers;
bool allow_inplace_modification = false;
- if (path_trace_works_.size() == 1) {
- buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ Device *denoiser_device = denoiser_->get_denoiser_device();
+ if (path_trace_works_.size() > 1 && denoiser_device && !big_tile_denoise_work_) {
+ big_tile_denoise_work_ = PathTraceWork::create(denoiser_device, film_, device_scene_, nullptr);
}
- else {
- Device *denoiser_device = denoiser_->get_denoiser_device();
- if (!denoiser_device) {
- return;
- }
- multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
- multi_device_buffers->reset(render_state_.effective_big_tile_params);
+ if (big_tile_denoise_work_) {
+ big_tile_denoise_work_->set_effective_buffer_params(render_state_.effective_big_tile_params,
+ render_state_.effective_big_tile_params,
+ render_state_.effective_big_tile_params);
- buffer_to_denoise = multi_device_buffers.get();
+ buffer_to_denoise = big_tile_denoise_work_->get_render_buffers();
+ buffer_to_denoise->reset(render_state_.effective_big_tile_params);
- copy_to_render_buffers(multi_device_buffers.get());
+ copy_to_render_buffers(buffer_to_denoise);
allow_inplace_modification = true;
}
+ else {
+ DCHECK_EQ(path_trace_works_.size(), 1);
+
+ buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ }
if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
buffer_to_denoise,
@@ -536,14 +549,6 @@ void PathTrace::denoise(const RenderWork &render_work)
render_state_.has_denoised_result = true;
}
- if (multi_device_buffers) {
- multi_device_buffers->copy_from_device();
- parallel_for_each(
- path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
- path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
- });
- }
-
render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
}
@@ -635,8 +640,13 @@ void PathTrace::update_display(const RenderWork &render_work)
/* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
* all works in parallel. */
const int num_samples = get_num_samples_in_buffer();
- for (auto &&path_trace_work : path_trace_works_) {
- path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+ if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+ big_tile_denoise_work_->copy_to_display(display_.get(), pass_mode, num_samples);
+ }
+ else {
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+ }
}
display_->update_end();
@@ -721,11 +731,10 @@ void PathTrace::write_tile_buffer(const RenderWork &render_work)
VLOG_WORK << "Write tile result via buffer write callback.";
tile_buffer_write();
}
-
/* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
*/
- if (has_multiple_tiles) {
- VLOG_WORK << "Write tile result into .";
+ else {
+ VLOG_WORK << "Write tile result to disk.";
tile_buffer_write_to_disk();
}
}
@@ -901,6 +910,10 @@ bool PathTrace::copy_render_tile_from_device()
return true;
}
+ if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+ return big_tile_denoise_work_->copy_render_buffers_from_device();
+ }
+
bool success = true;
parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1002,6 +1015,10 @@ bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
}
+ if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+ return big_tile_denoise_work_->get_render_tile_pixels(pass_accessor, destination);
+ }
+
bool success = true;
parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1082,6 +1099,10 @@ void PathTrace::destroy_gpu_resources()
for (auto &&path_trace_work : path_trace_works_) {
path_trace_work->destroy_gpu_resources(display_.get());
}
+
+ if (big_tile_denoise_work_) {
+ big_tile_denoise_work_->destroy_gpu_resources(display_.get());
+ }
}
}
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index a470a6e1402..9531e4fb186 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -236,6 +236,7 @@ class PathTrace {
/* CPU device for creating temporary render buffers on the CPU side. */
unique_ptr<Device> cpu_device_;
+ Film *film_;
DeviceScene *device_scene_;
RenderScheduler &render_scheduler_;
@@ -261,6 +262,9 @@ class PathTrace {
/* Denoiser which takes care of denoising the big tile. */
unique_ptr<Denoiser> denoiser_;
+ /* Denoiser device descriptor which holds the denoised big tile for multi-device workloads. */
+ unique_ptr<PathTraceWork> big_tile_denoise_work_;
+
/* State which is common for all the steps of the render work.
* Is brought up to date in the `render()` call and is accessed from all the steps involved into
* rendering the work. */
diff --git a/intern/cycles/integrator/path_trace_tile.cpp b/intern/cycles/integrator/path_trace_tile.cpp
index 2f1f4e810a3..dfe88695013 100644
--- a/intern/cycles/integrator/path_trace_tile.cpp
+++ b/intern/cycles/integrator/path_trace_tile.cpp
@@ -33,7 +33,7 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
if (!copied_from_device_) {
/* Copy from device on demand. */
path_trace_.copy_render_tile_from_device();
- const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
+ copied_from_device_ = true;
}
const BufferParams &buffer_params = path_trace_.get_render_tile_params();
diff --git a/intern/cycles/integrator/path_trace_tile.h b/intern/cycles/integrator/path_trace_tile.h
index 99ae08d04d1..223fa96e113 100644
--- a/intern/cycles/integrator/path_trace_tile.h
+++ b/intern/cycles/integrator/path_trace_tile.h
@@ -24,7 +24,7 @@ class PathTraceTile : public OutputDriver::Tile {
private:
PathTrace &path_trace_;
- bool copied_from_device_;
+ mutable bool copied_from_device_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp
index e7de82a6e1b..c94b53535a7 100644
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -370,6 +370,14 @@ RenderWork Session::run_update_for_next_iteration()
if (update_scene(width, height)) {
profiler.reset(scene->shaders.size(), scene->objects.size());
}
+
+ /* Unlock scene mutex before loading denoiser kernels, since that may attempt to activate
+ * graphics interop, which can deadlock when the scene mutex is still being held. */
+ scene_lock.unlock();
+
+ path_trace_->load_kernels();
+ path_trace_->alloc_work_memory();
+
progress.add_skip_time(update_timer, params.background);
}
@@ -618,12 +626,7 @@ bool Session::update_scene(int width, int height)
Camera *cam = scene->camera;
cam->set_screen_size(width, height);
- const bool scene_update_result = scene->update(progress);
-
- path_trace_->load_kernels();
- path_trace_->alloc_work_memory();
-
- return scene_update_result;
+ return scene->update(progress);
}
static string status_append(const string &status, const string &suffix)