Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brecht@blender.org>2020-07-09 13:20:07 +0300
committerBrecht Van Lommel <brecht@blender.org>2020-07-10 18:10:05 +0300
commit6eeb32706aa28bd4d0f3c26f6a5965facd6c0d62 (patch)
treea49f451587d8b3516eba096e6f94dc6108853cf1 /intern/cycles/device/device_cpu.cpp
parent93791381fec898e6f74a189e4eeb25f66029f131 (diff)
Cycles: support OpenImageDenoise in final renders
Performance is not great currently due to the API not seeming to support efficient denoising of multiple tiles at the same time. So in many cases only one or a few threads will actually be denoising at the same time. In renders with many samples this is not a big problem, but for faster renders it's a significant overhead. We should still try to optimize this, possibly by batching denoising of a bigger neighborhood of multiple tiles at once.
Diffstat (limited to 'intern/cycles/device/device_cpu.cpp')
-rw-r--r--intern/cycles/device/device_cpu.cpp163
1 files changed, 143 insertions, 20 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 2e4761562a5..878301e8242 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -181,6 +181,7 @@ class CPUDevice : public Device {
#ifdef WITH_OPENIMAGEDENOISE
oidn::DeviceRef oidn_device;
oidn::FilterRef oidn_filter;
+ thread_spin_lock oidn_task_lock;
#endif
bool use_split_kernel;
@@ -948,12 +949,24 @@ class CPUDevice : public Device {
}
}
- void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+ void denoise_openimagedenoise_buffer(DeviceTask &task,
+ float *buffer,
+ size_t offset,
+ size_t stride,
+ size_t x,
+ size_t y,
+ size_t w,
+ size_t h)
{
#ifdef WITH_OPENIMAGEDENOISE
assert(openimagedenoise_supported());
- /* Only one at a time, since OpenImageDenoise itself is multithreaded. */
+ /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
+ * buffers, and for tiled rendering because creating multiple devices and filters
+ * is slow and memory hungry as well.
+ *
+ * TODO: optimize tiled rendering case, by batching together denoising of many
+ * tiles somehow? */
static thread_mutex mutex;
thread_scoped_lock lock(mutex);
@@ -964,11 +977,10 @@ class CPUDevice : public Device {
}
if (!oidn_filter) {
oidn_filter = oidn_device.newFilter("RT");
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
}
- /* Copy pixels from compute device to CPU (no-op for CPU device). */
- rtile.buffers->buffer.copy_from_device();
-
/* Set images with appropriate stride for our interleaved pass storage. */
const struct {
const char *name;
@@ -981,37 +993,131 @@ class CPUDevice : public Device {
0 }};
for (int i = 0; passes[i].name; i++) {
- const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride;
- const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float);
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset) *
+ sizeof(float);
const int64_t pixel_stride = task.pass_stride * sizeof(float);
- const int64_t row_stride = rtile.stride * pixel_stride;
+ const int64_t row_stride = stride * pixel_stride;
oidn_filter.setImage(passes[i].name,
- (char *)rtile.buffer + buffer_offset,
+ (char *)buffer + buffer_offset,
oidn::Format::Float3,
- rtile.w,
- rtile.h,
+ w,
+ h,
0,
pixel_stride,
row_stride);
}
/* Execute filter. */
- oidn_filter.set("hdr", true);
- oidn_filter.set("srgb", false);
oidn_filter.commit();
oidn_filter.execute();
-
- /* todo: it may be possible to avoid this copy, but we have to ensure that
- * when other code copies data from the device it doesn't overwrite the
- * denoiser buffers. */
- rtile.buffers->buffer.copy_to_device();
#else
(void)task;
- (void)rtile;
+ (void)buffer;
+ (void)offset;
+ (void)stride;
+ (void)x;
(void)y;
+ (void)w;
+ (void)h;
#endif
}
+ void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+ { /* Denoise rtile with OpenImageDenoise: in place for DENOISE_BUFFER tasks, otherwise per tile with overlap. */
+ if (task.type == DeviceTask::DENOISE_BUFFER) {
+ /* DENOISE_BUFFER task: the tile's region of its own render buffer is
+ * denoised in place.
+ *
+ * Copy pixels from compute device to CPU (no-op for CPU device). */
+ rtile.buffers->buffer.copy_from_device();
+
+ denoise_openimagedenoise_buffer(task,
+ (float *)rtile.buffer,
+ rtile.offset,
+ rtile.stride,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h);
+
+ /* TODO: it may be possible to avoid this copy, but we have to ensure that
+ * when other code copies data from the device it doesn't overwrite the
+ * denoiser buffers. */
+ rtile.buffers->buffer.copy_to_device();
+ }
+ else {
+ /* Per-tile denoising: the tile is denoised together with an overlap region
+ * gathered from its neighboring tiles, and only the result for the target
+ * tile is kept. */
+ rtile.sample = rtile.start_sample + rtile.num_samples;
+
+ /* Map neighboring tiles into one buffer for denoising. */
+ RenderTileNeighbors neighbors(rtile);
+ task.map_neighbor_tiles(neighbors, this);
+ RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+ rtile = center_tile; /* Caller's tile is overwritten with the mapped center tile. */
+
+ /* Calculate size of the tile to denoise (including overlap): the center
+ * tile bounds expanded by 64 (presumably pixels) on each side, clipped to the
+ * bounds covered by the neighbors. The overlap size was chosen empirically.
+ * OpenImageDenoise specifies an overlap size of 128 but this is significantly
+ * bigger than typical tile size. */
+ const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
+ const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
+
+ /* Adjacent tiles are in separate memory regions, copy into single buffer
+ * that holds task.pass_stride floats per pixel of the rect. */
+ array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
+
+ for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+ RenderTile &ntile = neighbors.tiles[i];
+ if (!ntile.buffer) { /* Neighbor has no buffer: nothing to copy. */
+ continue;
+ }
+ /* Intersection of this neighbor tile with the rect to denoise. */
+ const int xmin = max(ntile.x, rect.x);
+ const int ymin = max(ntile.y, rect.y);
+ const int xmax = min(ntile.x + ntile.w, rect.z);
+ const int ymax = min(ntile.y + ntile.h, rect.w);
+ /* First pixel of the intersection inside the neighbor's own buffer. */
+ const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+ const float *tile_buffer = (float *)ntile.buffer + tile_offset * task.pass_stride;
+ /* Same pixel inside the merged buffer, whose rows are rect_size.x pixels wide. */
+ const size_t merged_stride = rect_size.x;
+ const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+ float *merged_buffer = merged.data() + merged_offset * task.pass_stride;
+ /* Copy the intersection one row at a time, each buffer advancing by its own row stride. */
+ for (int y = ymin; y < ymax; y++) {
+ memcpy(merged_buffer, tile_buffer, sizeof(float) * task.pass_stride * (xmax - xmin));
+ tile_buffer += ntile.stride * task.pass_stride;
+ merged_buffer += merged_stride * task.pass_stride;
+ }
+ }
+
+ /* Denoise the merged buffer in place as one image covering the whole rect
+ * (offset 0, origin 0,0, row stride equal to the rect width). */
+ denoise_openimagedenoise_buffer(
+ task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y);
+
+ /* Copy back result from merged buffer. Only the part of the rect that
+ * overlaps neighbors.target is written; the rest of the merged buffer is a
+ * local temporary and is dropped. */
+ RenderTile &ntile = neighbors.target;
+ if (ntile.buffer) {
+ const int xmin = max(ntile.x, rect.x);
+ const int ymin = max(ntile.y, rect.y);
+ const int xmax = min(ntile.x + ntile.w, rect.z);
+ const int ymax = min(ntile.y + ntile.h, rect.w);
+
+ const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+ float *tile_buffer = (float *)ntile.buffer + tile_offset * task.pass_stride;
+
+ const size_t merged_stride = rect_size.x;
+ const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+ const float *merged_buffer = merged.data() + merged_offset * task.pass_stride;
+
+ for (int y = ymin; y < ymax; y++) {
+ memcpy(tile_buffer, merged_buffer, sizeof(float) * task.pass_stride * (xmax - xmin));
+ tile_buffer += ntile.stride * task.pass_stride;
+ merged_buffer += merged_stride * task.pass_stride;
+ }
+ }
+
+ task.unmap_neighbor_tiles(neighbors, this);
+ }
+ }
+
void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
{
ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
@@ -1070,10 +1176,23 @@ class CPUDevice : public Device {
}
}
+ /* NLM denoiser. */
DenoisingTask *denoising = NULL;
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ if (!oidn_task_lock.try_lock()) {
+ tile_types &= ~RenderTile::DENOISE;
+ hold_denoise_lock = true;
+ }
+ }
+
RenderTile tile;
- while (task.acquire_tile(this, tile, task.tile_types)) {
+ while (task.acquire_tile(this, tile, tile_types)) {
if (tile.task == RenderTile::PATH_TRACE) {
if (use_split_kernel) {
device_only_memory<uchar> void_buffer(this, "void_buffer");
@@ -1108,6 +1227,10 @@ class CPUDevice : public Device {
}
}
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
profiler.remove_state(&kg->profiler);
thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);