diff options
Diffstat (limited to 'intern/cycles/device/device_cpu.cpp')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 177 |
1 files changed, 129 insertions, 48 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index fc6febd8cee..8f68e66a1b4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -51,10 +51,12 @@ #include "util/util_function.h" #include "util/util_logging.h" #include "util/util_map.h" +#include "util/util_openimagedenoise.h" #include "util/util_opengl.h" #include "util/util_optimization.h" #include "util/util_progress.h" #include "util/util_system.h" +#include "util/util_task.h" #include "util/util_thread.h" CCL_NAMESPACE_BEGIN @@ -161,7 +163,7 @@ class CPUSplitKernel : public DeviceSplitKernel { virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); }; @@ -176,6 +178,10 @@ class CPUDevice : public Device { #ifdef WITH_OSL OSLGlobals osl_globals; #endif +#ifdef WITH_OPENIMAGEDENOISE + oidn::DeviceRef oidn_device; + oidn::FilterRef oidn_filter; +#endif bool use_split_kernel; @@ -332,7 +338,7 @@ class CPUDevice : public Device { ~CPUDevice() { - task_pool.stop(); + task_pool.cancel(); texture_info.free(); } @@ -344,17 +350,6 @@ class CPUDevice : public Device { virtual BVHLayoutMask get_bvh_layout_mask() const { BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH4; - } - /* MSVC does not support the -march=native switch and you always end up */ - /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */ - /* that kernel BVH8 even if the CPU flags would allow for it. */ -#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE)) - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH8; - } -#endif #ifdef WITH_EMBREE bvh_layout_mask |= BVH_LAYOUT_EMBREE; #endif /* WITH_EMBREE */ @@ -527,26 +522,18 @@ class CPUDevice : public Device { #endif } - void thread_run(DeviceTask *task) + void thread_run(DeviceTask &task) { - if (task->type == DeviceTask::RENDER) - thread_render(*task); - else if (task->type == DeviceTask::SHADER) - thread_shader(*task); - else if (task->type == DeviceTask::FILM_CONVERT) - thread_film_convert(*task); - else if (task->type == DeviceTask::DENOISE_BUFFER) - thread_denoise(*task); + if (task.type == DeviceTask::RENDER) + thread_render(task); + else if (task.type == DeviceTask::SHADER) + thread_shader(task); + else if (task.type == DeviceTask::FILM_CONVERT) + thread_film_convert(task); + else if (task.type == DeviceTask::DENOISE_BUFFER) + thread_denoise(task); } - class CPUDeviceTask : public DeviceTask { - public: - CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CPUDevice::thread_run, device, this); - } - }; - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, @@ -961,7 +948,71 @@ class CPUDevice : public Device { } } - void denoise(DenoisingTask &denoising, RenderTile &tile) + void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + { +#ifdef WITH_OPENIMAGEDENOISE + assert(openimagedenoise_supported()); + + /* Only one at a time, since OpenImageDenoise itself is multithreaded. */ + static thread_mutex mutex; + thread_scoped_lock lock(mutex); + + /* Create device and filter, cached for reuse. */ + if (!oidn_device) { + oidn_device = oidn::newDevice(); + oidn_device.commit(); + } + if (!oidn_filter) { + oidn_filter = oidn_device.newFilter("RT"); + } + + /* Copy pixels from compute device to CPU (no-op for CPU device). */ + rtile.buffers->buffer.copy_from_device(); + + /* Set images with appropriate stride for our interleaved pass storage. */ + const struct { + const char *name; + int offset; + } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR}, + {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL}, + {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO}, + {"output", 0}, + { NULL, + 0 }}; + + for (int i = 0; passes[i].name; i++) { + const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride; + const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float); + const int64_t pixel_stride = task.pass_stride * sizeof(float); + const int64_t row_stride = rtile.stride * pixel_stride; + + oidn_filter.setImage(passes[i].name, + (char *)rtile.buffer + buffer_offset, + oidn::Format::Float3, + rtile.w, + rtile.h, + 0, + pixel_stride, + row_stride); + } + + /* Execute filter. */ + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + oidn_filter.commit(); + oidn_filter.execute(); + + /* todo: it may be possible to avoid this copy, but we have to ensure that + * when other code copies data from the device it doesn't overwrite the + * denoiser buffers. */ + rtile.buffers->buffer.copy_to_device(); +#else + (void)task; + (void)rtile; +#endif + } + + void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) { ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); @@ -1019,15 +1070,14 @@ class CPUDevice : public Device { } } - RenderTile tile; - DenoisingTask denoising(this, task); - denoising.profiler = &kg->profiler; + DenoisingTask *denoising = NULL; + RenderTile tile; while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel) { device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); + split_kernel->path_trace(task, tile, kgbuffer, void_buffer); } else { render(task, tile, kg); @@ -1037,7 +1087,16 @@ class CPUDevice : public Device { render(task, tile, kg); } else if (tile.task == RenderTile::DENOISE) { - denoise(denoising, tile); + if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + denoise_openimagedenoise(task, tile); + } + else if (task.denoising.type == DENOISER_NLM) { + if (denoising == NULL) { + denoising = new DenoisingTask(this, task); + denoising->profiler = &kg->profiler; + } + denoise_nlm(*denoising, tile); + } task.update_progress(&tile, tile.w * tile.h); } @@ -1055,6 +1114,7 @@ class CPUDevice : public Device { kg->~KernelGlobals(); kgbuffer.free(); delete split_kernel; + delete denoising; } void thread_denoise(DeviceTask &task) @@ -1072,16 +1132,22 @@ class CPUDevice : public Device { tile.stride = task.stride; tile.buffers = task.buffers; - DenoisingTask denoising(this, task); + if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + denoise_openimagedenoise(task, tile); + } + else { + DenoisingTask denoising(this, task); - ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; + ProfilingState denoising_profiler_state; + profiler.add_state(&denoising_profiler_state); + denoising.profiler = &denoising_profiler_state; - denoise(denoising, tile); - task.update_progress(&tile, tile.w * tile.h); + denoise_nlm(denoising, tile); + + profiler.remove_state(&denoising_profiler_state); + } - profiler.remove_state(&denoising_profiler_state); + task.update_progress(&tile, tile.w * tile.h); } void thread_film_convert(DeviceTask &task) @@ -1155,13 +1221,24 @@ class CPUDevice : public Device { /* split task into smaller ones */ list<DeviceTask> tasks; - if (task.type == DeviceTask::SHADER) + if (task.type == DeviceTask::DENOISE_BUFFER && + task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + /* Denoise entire buffer at once with OIDN, it has own threading. */ + tasks.push_back(task); + } + else if (task.type == DeviceTask::SHADER) { task.split(tasks, info.cpu_threads, 256); - else + } + else { task.split(tasks, info.cpu_threads); + } - foreach (DeviceTask &task, tasks) - task_pool.push(new CPUDeviceTask(this, task)); + foreach (DeviceTask &task, tasks) { + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); + } } void task_wait() @@ -1326,7 +1403,7 @@ int2 CPUSplitKernel::split_kernel_local_size() int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, device_memory & /*data*/, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { return make_int2(1, 1); } @@ -1358,6 +1435,10 @@ void device_cpu_info(vector<DeviceInfo> &devices) info.has_osl = true; info.has_half_images = true; info.has_profiling = true; + info.denoisers = DENOISER_NLM; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } devices.insert(devices.begin(), info); } |