Cycles: support OpenImageDenoise in final renders

Performance is not great currently due to the API not seeming to support efficient denoising of multiple tiles at the same time. So in many cases only one or a few threads will actually be denoising at the same time. In renders with many samples this is not a big problem, but for faster renders it's a significant overhead. We should try to optimize this still, possibly by batching denoising of a bigger neighborhood of multiple tiles at once.
author: Brecht Van Lommel <brecht@blender.org> 2020-07-09 13:20:07 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2020-07-10 18:10:05 +0300
commit: 6eeb32706aa28bd4d0f3c26f6a5965facd6c0d62 (patch)
tree: a49f451587d8b3516eba096e6f94dc6108853cf1 /intern/cycles/device/device_cpu.cpp
parent: 93791381fec898e6f74a189e4eeb25f66029f131 (diff)
1 files changed, 143 insertions, 20 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 2e4761562a5..878301e8242 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -181,6 +181,7 @@ class CPUDevice : public Device {
 #ifdef WITH_OPENIMAGEDENOISE
   oidn::DeviceRef oidn_device;
   oidn::FilterRef oidn_filter;
+  thread_spin_lock oidn_task_lock;
 #endif
 
   bool use_split_kernel;
@@ -948,12 +949,24 @@ class CPUDevice : public Device {
     }
   }
 
-  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  void denoise_openimagedenoise_buffer(DeviceTask &task,
+                                       float *buffer,
+                                       size_t offset,
+                                       size_t stride,
+                                       size_t x,
+                                       size_t y,
+                                       size_t w,
+                                       size_t h)
   {
 #ifdef WITH_OPENIMAGEDENOISE
     assert(openimagedenoise_supported());
 
-    /* Only one at a time, since OpenImageDenoise itself is multithreaded. */
+    /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
+     * buffers, and for tiled rendering because creating multiple devices and filters
+     * is slow and memory hungry as well.
+     *
+     * TODO: optimize tiled rendering case, by batching together denoising of many
+     * tiles somehow? */
     static thread_mutex mutex;
     thread_scoped_lock lock(mutex);
 
@@ -964,11 +977,10 @@ class CPUDevice : public Device {
     }
     if (!oidn_filter) {
       oidn_filter = oidn_device.newFilter("RT");
+      oidn_filter.set("hdr", true);
+      oidn_filter.set("srgb", false);
     }
 
-    /* Copy pixels from compute device to CPU (no-op for CPU device). */
-    rtile.buffers->buffer.copy_from_device();
-
     /* Set images with appropriate stride for our interleaved pass storage. */
     const struct {
       const char *name;
@@ -981,37 +993,131 @@ class CPUDevice : public Device {
                     0 }};
 
     for (int i = 0; passes[i].name; i++) {
-      const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride;
-      const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float);
+      const int64_t pixel_offset = offset + x + y * stride;
+      const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset) *
+                                    sizeof(float);
       const int64_t pixel_stride = task.pass_stride * sizeof(float);
-      const int64_t row_stride = rtile.stride * pixel_stride;
+      const int64_t row_stride = stride * pixel_stride;
 
       oidn_filter.setImage(passes[i].name,
-                           (char *)rtile.buffer + buffer_offset,
+                           (char *)buffer + buffer_offset,
                            oidn::Format::Float3,
-                           rtile.w,
-                           rtile.h,
+                           w,
+                           h,
                            0,
                            pixel_stride,
                            row_stride);
     }
 
     /* Execute filter. */
-    oidn_filter.set("hdr", true);
-    oidn_filter.set("srgb", false);
     oidn_filter.commit();
     oidn_filter.execute();
-
-    /* todo: it may be possible to avoid this copy, but we have to ensure that
-     * when other code copies data from the device it doesn't overwrite the
-     * denoiser buffers. */
-    rtile.buffers->buffer.copy_to_device();
 #else
     (void)task;
-    (void)rtile;
+    (void)buffer;
+    (void)offset;
+    (void)stride;
+    (void)x;
+    (void)y;
+    (void)w;
+    (void)h;
 #endif
   }
 
+  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  {
+    if (task.type == DeviceTask::DENOISE_BUFFER) {
+      /* Full buffer: copy pixels from compute device to CPU (no-op for CPU device). */
+      rtile.buffers->buffer.copy_from_device();
+
+      denoise_openimagedenoise_buffer(task,
+                                      (float *)rtile.buffer,
+                                      rtile.offset,
+                                      rtile.stride,
+                                      rtile.x,
+                                      rtile.y,
+                                      rtile.w,
+                                      rtile.h);
+
+      /* TODO: it may be possible to avoid this copy, but we have to ensure that
+       * when other code copies data from the device it doesn't overwrite the
+       * denoiser buffers. */
+      rtile.buffers->buffer.copy_to_device();
+    }
+    else {
+      /* Per-tile denoising: used for every task type other than DENOISE_BUFFER. */
+      rtile.sample = rtile.start_sample + rtile.num_samples;
+
+      /* Map neighboring tiles into one buffer for denoising. */
+      RenderTileNeighbors neighbors(rtile);
+      task.map_neighbor_tiles(neighbors, this);
+      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+      rtile = center_tile;
+
+      /* Calculate size of the tile to denoise (including overlap). The overlap
+       * size of 64 was chosen empirically. OpenImageDenoise specifies an overlap
+       * size of 128 but this is significantly bigger than typical tile size. */
+      const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
+      const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
+
+      /* Adjacent tiles are in separate memory regions, copy into single buffer. */
+      array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
+
+      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+        RenderTile &ntile = neighbors.tiles[i];
+        if (!ntile.buffer) {
+          continue;
+        }
+
+        const int xmin = max(ntile.x, rect.x);
+        const int ymin = max(ntile.y, rect.y);
+        const int xmax = min(ntile.x + ntile.w, rect.z);
+        const int ymax = min(ntile.y + ntile.h, rect.w);
+
+        const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+        const float *tile_buffer = (float *)ntile.buffer + tile_offset * task.pass_stride;
+
+        const size_t merged_stride = rect_size.x;
+        const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+        float *merged_buffer = merged.data() + merged_offset * task.pass_stride;
+
+        for (int y = ymin; y < ymax; y++) {
+          memcpy(merged_buffer, tile_buffer, sizeof(float) * task.pass_stride * (xmax - xmin));
+          tile_buffer += ntile.stride * task.pass_stride;
+          merged_buffer += merged_stride * task.pass_stride;
+        }
+      }
+
+      /* Denoise the merged buffer; the result is read back from it below. */
+      denoise_openimagedenoise_buffer(
+          task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y);
+
+      /* Copy the result from the merged buffer back into the target tile. */
+      RenderTile &ntile = neighbors.target;
+      if (ntile.buffer) {
+        const int xmin = max(ntile.x, rect.x);
+        const int ymin = max(ntile.y, rect.y);
+        const int xmax = min(ntile.x + ntile.w, rect.z);
+        const int ymax = min(ntile.y + ntile.h, rect.w);
+
+        const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+        float *tile_buffer = (float *)ntile.buffer + tile_offset * task.pass_stride;
+
+        const size_t merged_stride = rect_size.x;
+        const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+        const float *merged_buffer = merged.data() + merged_offset * task.pass_stride;
+
+        for (int y = ymin; y < ymax; y++) {
+          memcpy(tile_buffer, merged_buffer, sizeof(float) * task.pass_stride * (xmax - xmin));
+          tile_buffer += ntile.stride * task.pass_stride;
+          merged_buffer += merged_stride * task.pass_stride;
+        }
+      }
+
+      task.unmap_neighbor_tiles(neighbors, this);
+    }
+  }
+
   void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
   {
     ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
@@ -1070,10 +1176,23 @@ class CPUDevice : public Device {
       }
     }
 
+    /* NLM denoiser. */
     DenoisingTask *denoising = NULL;
 
+    /* OpenImageDenoise: only one thread can denoise at a time, so rather than
+     * wait on the denoiser mutex, only the thread that wins this try_lock
+     * acquires denoising tiles. It releases the lock after its tile loop. */
+    uint tile_types = task.tile_types;
+    bool hold_denoise_lock = false;
+    if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+      hold_denoise_lock = oidn_task_lock.try_lock();
+      if (!hold_denoise_lock) {
+        tile_types &= ~RenderTile::DENOISE;
+      }
+    }
+
     RenderTile tile;
-    while (task.acquire_tile(this, tile, task.tile_types)) {
+    while (task.acquire_tile(this, tile, tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         if (use_split_kernel) {
           device_only_memory<uchar> void_buffer(this, "void_buffer");
@@ -1108,6 +1227,10 @@ class CPUDevice : public Device {
       }
     }
 
+    if (hold_denoise_lock) {
+      oidn_task_lock.unlock();
+    }
+
     profiler.remove_state(&kg->profiler);
 
     thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
author	Brecht Van Lommel <brecht@blender.org>	2020-07-09 13:20:07 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2020-07-10 18:10:05 +0300
commit	6eeb32706aa28bd4d0f3c26f6a5965facd6c0d62 (patch)
tree	a49f451587d8b3516eba096e6f94dc6108853cf1 /intern/cycles/device/device_cpu.cpp
parent	93791381fec898e6f74a189e4eeb25f66029f131 (diff)