diff options
Diffstat (limited to 'intern/cycles/device/device_cpu.cpp')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 235 |
1 files changed, 202 insertions, 33 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 8f68e66a1b4..ee3a3ddea64 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -182,6 +182,7 @@ class CPUDevice : public Device { oidn::DeviceRef oidn_device; oidn::FilterRef oidn_filter; #endif + thread_spin_lock oidn_task_lock; bool use_split_kernel; @@ -948,12 +949,25 @@ class CPUDevice : public Device { } } - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + void denoise_openimagedenoise_buffer(DeviceTask &task, + float *buffer, + const size_t offset, + const size_t stride, + const size_t x, + const size_t y, + const size_t w, + const size_t h, + const float scale) { #ifdef WITH_OPENIMAGEDENOISE assert(openimagedenoise_supported()); - /* Only one at a time, since OpenImageDenoise itself is multithreaded. */ + /* Only one at a time, since OpenImageDenoise itself is multithreaded for full + * buffers, and for tiled rendering because creating multiple devices and filters + * is slow and memory hungry as well. + * + * TODO: optimize tiled rendering case, by batching together denoising of many + * tiles somehow? */ static thread_mutex mutex; thread_scoped_lock lock(mutex); @@ -964,54 +978,192 @@ class CPUDevice : public Device { } if (!oidn_filter) { oidn_filter = oidn_device.newFilter("RT"); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); } - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - /* Set images with appropriate stride for our interleaved pass storage. */ - const struct { + struct { const char *name; - int offset; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR}, - {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL}, - {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO}, - {"output", 0}, + const int offset; + const bool scale; + const bool use; + array<float> scaled_buffer; + } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, + {"albedo", + task.pass_denoising_data + DENOISING_PASS_ALBEDO, + true, + task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, + {"normal", + task.pass_denoising_data + DENOISING_PASS_NORMAL, + true, + task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, + {"output", 0, false, true}, { NULL, 0 }}; for (int i = 0; passes[i].name; i++) { - const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride; - const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float); - const int64_t pixel_stride = task.pass_stride * sizeof(float); - const int64_t row_stride = rtile.stride * pixel_stride; + if (!passes[i].use) { + continue; + } - oidn_filter.setImage(passes[i].name, - (char *)rtile.buffer + buffer_offset, - oidn::Format::Float3, - rtile.w, - rtile.h, - 0, - pixel_stride, - row_stride); + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); + const int64_t pixel_stride = task.pass_stride; + const int64_t row_stride = stride * pixel_stride; + + if (passes[i].scale && scale != 1.0f) { + /* Normalize albedo and normal passes as they are scaled by the number of samples. + * For the color passes OIDN will perform auto-exposure making it unnecessary. */ + array<float> &scaled_buffer = passes[i].scaled_buffer; + scaled_buffer.resize(w * h * 3); + + for (int y = 0; y < h; y++) { + const float *pass_row = buffer + buffer_offset + y * row_stride; + float *scaled_row = scaled_buffer.data() + y * w * 3; + + for (int x = 0; x < w; x++) { + scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; + scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; + scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; + } + } + + oidn_filter.setImage( + passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); + } + else { + oidn_filter.setImage(passes[i].name, + buffer + buffer_offset, + oidn::Format::Float3, + w, + h, + 0, + pixel_stride * sizeof(float), + row_stride * sizeof(float)); + } } /* Execute filter. */ - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); oidn_filter.commit(); oidn_filter.execute(); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); #else (void)task; - (void)rtile; + (void)buffer; + (void)offset; + (void)stride; + (void)x; + (void)y; + (void)w; + (void)h; + (void)scale; #endif } + void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + { + if (task.type == DeviceTask::DENOISE_BUFFER) { + /* Copy pixels from compute device to CPU (no-op for CPU device). */ + rtile.buffers->buffer.copy_from_device(); + + denoise_openimagedenoise_buffer(task, + (float *)rtile.buffer, + rtile.offset, + rtile.stride, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + 1.0f / rtile.sample); + + /* todo: it may be possible to avoid this copy, but we have to ensure that + * when other code copies data from the device it doesn't overwrite the + * denoiser buffers. */ + rtile.buffers->buffer.copy_to_device(); + } + else { + /* Per-tile denoising. */ + rtile.sample = rtile.start_sample + rtile.num_samples; + const float scale = 1.0f / rtile.sample; + const float invscale = rtile.sample; + const size_t pass_stride = task.pass_stride; + + /* Map neighboring tiles into one buffer for denoising. */ + RenderTileNeighbors neighbors(rtile); + task.map_neighbor_tiles(neighbors, this); + RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; + rtile = center_tile; + + /* Calculate size of the tile to denoise (including overlap). The overlap + * size was chosen empirically. OpenImageDenoise specifies an overlap size + * of 128 but this is significantly bigger than typical tile size. */ + const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); + const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); + + /* Adjacent tiles are in separate memory regions, copy into single buffer. */ + array<float> merged(rect_size.x * rect_size.y * task.pass_stride); + + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &ntile = neighbors.tiles[i]; + if (!ntile.buffer) { + continue; + } + + const int xmin = max(ntile.x, rect.x); + const int ymin = max(ntile.y, rect.y); + const int xmax = min(ntile.x + ntile.w, rect.z); + const int ymax = min(ntile.y + ntile.h, rect.w); + + const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; + const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; + + const size_t merged_stride = rect_size.x; + const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; + float *merged_buffer = merged.data() + merged_offset * pass_stride; + + for (int y = ymin; y < ymax; y++) { + for (int x = 0; x < pass_stride * (xmax - xmin); x++) { + merged_buffer[x] = tile_buffer[x] * scale; + } + tile_buffer += ntile.stride * pass_stride; + merged_buffer += merged_stride * pass_stride; + } + } + + /* Denoise */ + denoise_openimagedenoise_buffer( + task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f); + + /* Copy back result from merged buffer. */ + RenderTile &ntile = neighbors.target; + if (ntile.buffer) { + const int xmin = max(ntile.x, rect.x); + const int ymin = max(ntile.y, rect.y); + const int xmax = min(ntile.x + ntile.w, rect.z); + const int ymax = min(ntile.y + ntile.h, rect.w); + + const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; + float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; + + const size_t merged_stride = rect_size.x; + const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; + const float *merged_buffer = merged.data() + merged_offset * pass_stride; + + for (int y = ymin; y < ymax; y++) { + for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { + tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; + tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; + tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; + } + tile_buffer += ntile.stride * pass_stride; + merged_buffer += merged_stride * pass_stride; + } + } + + task.unmap_neighbor_tiles(neighbors, this); + } + } + void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) { ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); @@ -1040,7 +1192,7 @@ class CPUDevice : public Device { denoising.render_buffer.samples = tile.sample; denoising.buffer.gpu_temporary_mem = false; - denoising.run_denoising(&tile); + denoising.run_denoising(tile); } void thread_render(DeviceTask &task) @@ -1070,10 +1222,23 @@ class CPUDevice : public Device { } } + /* NLM denoiser. */ DenoisingTask *denoising = NULL; + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. */ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { + while (task.acquire_tile(this, tile, tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel) { device_only_memory<uchar> void_buffer(this, "void_buffer"); @@ -1108,6 +1273,10 @@ class CPUDevice : public Device { } } + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + profiler.remove_state(&kg->profiler); thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); |