diff options
author | Brecht Van Lommel <brecht@blender.org> | 2021-11-05 23:01:23 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2021-11-06 00:04:36 +0300 |
commit | 97ff37bf54474efbce39653a1387ad55091d4964 (patch) | |
tree | 58ff9592807dbd98d126b179627e5c56f5309956 /intern/cycles/integrator | |
parent | d1a9425a2fde32b6786b333ab55661da507e818b (diff) |
Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion
Adds a bunch of CPU kernel functions to process one row of pixels, and uses those
instead of calling unoptimized implementations.
Fixes T92598
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r-- | intern/cycles/integrator/pass_accessor_cpu.cpp | 106 | ||||
-rw-r--r-- | intern/cycles/integrator/pass_accessor_cpu.h | 32 | ||||
-rw-r--r-- | intern/cycles/integrator/path_trace_work_cpu.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/integrator/shader_eval.cpp | 2 |
4 files changed, 45 insertions, 97 deletions
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp index 820da757be0..77ca332d142 100644 --- a/intern/cycles/integrator/pass_accessor_cpu.cpp +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "device/device.h" + #include "integrator/pass_accessor_cpu.h" #include "session/buffers.h" + #include "util/log.h" #include "util/tbb.h" @@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN * Kernel processing. */ -template<typename Processor> -inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const -{ - KernelFilmConvert kfilm_convert; - init_kernel_film_convert(&kfilm_convert, buffer_params, destination); - - if (destination.pixels) { - /* NOTE: No overlays are applied since they are not used for final renders. - * Can be supported via some sort of specialization to avoid code duplication. */ - - run_get_pass_kernel_processor_float( - &kfilm_convert, render_buffers, buffer_params, destination, processor); - } - - if (destination.pixels_half_rgba) { - /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. 
*/ - - if (destination.num_components == 1) { - run_get_pass_kernel_processor_half_rgba(&kfilm_convert, - render_buffers, - buffer_params, - destination, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float pixel; - processor(kfilm_convert, buffer, &pixel); - - pixel_rgba[0] = pixel; - pixel_rgba[1] = pixel; - pixel_rgba[2] = pixel; - pixel_rgba[3] = 1.0f; - }); - } - else if (destination.num_components == 3) { - run_get_pass_kernel_processor_half_rgba(&kfilm_convert, - render_buffers, - buffer_params, - destination, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); - } - else if (destination.num_components == 4) { - run_get_pass_kernel_processor_half_rgba( - &kfilm_convert, render_buffers, buffer_params, destination, processor); - } - } -} - -template<typename Processor> inline void PassAccessorCPU::run_get_pass_kernel_processor_float( const KernelFilmConvert *kfilm_convert, const RenderBuffers *render_buffers, const BufferParams &buffer_params, const Destination &destination, - const Processor &processor) const + const CPUKernels::FilmConvertFunction func) const { + /* NOTE: No overlays are applied since they are not used for final renders. + * Can be supported via some sort of specialization to avoid code duplication. 
*/ + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; const int64_t pass_stride = buffer_params.pass_stride; @@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float( const float *buffer = window_data + y * buffer_row_stride; float *pixel = destination.pixels + (y * buffer_params.width + destination.offset) * pixel_stride; - - for (int64_t x = 0; x < buffer_params.window_width; - ++x, buffer += pass_stride, pixel += pixel_stride) { - processor(kfilm_convert, buffer, pixel); - } + func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride); }); } -template<typename Processor> inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( const KernelFilmConvert *kfilm_convert, const RenderBuffers *render_buffers, const BufferParams &buffer_params, const Destination &destination, - const Processor &processor) const + const CPUKernels::FilmConvertHalfRGBAFunction func) const { const int64_t pass_stride = buffer_params.pass_stride; const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride; @@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) { const float *buffer = window_data + y * buffer_row_stride; half4 *pixel = dst_start + y * destination_stride; - for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) { - - float pixel_rgba[4]; - processor(kfilm_convert, buffer, pixel_rgba); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); - - *pixel = float4_to_half4_display( - make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); - } + func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride); }); } @@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( const BufferParams &buffer_params, \ const Destination 
&destination) const \ { \ - run_get_pass_kernel_processor( \ - render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \ + const CPUKernels &kernels = Device::get_cpu_kernels(); \ + KernelFilmConvert kfilm_convert; \ + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \ +\ + if (destination.pixels) { \ + run_get_pass_kernel_processor_float(&kfilm_convert, \ + render_buffers, \ + buffer_params, \ + destination, \ + kernels.film_convert_##pass); \ + } \ +\ + if (destination.pixels_half_rgba) { \ + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \ + render_buffers, \ + buffer_params, \ + destination, \ + kernels.film_convert_half_rgba_##pass); \ + } \ } /* Float (scalar) passes. */ diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h index 0313dc5bb0d..9ed38ab256e 100644 --- a/intern/cycles/integrator/pass_accessor_cpu.h +++ b/intern/cycles/integrator/pass_accessor_cpu.h @@ -16,6 +16,8 @@ #pragma once +#include "device/cpu/kernel.h" + #include "integrator/pass_accessor.h" CCL_NAMESPACE_BEGIN @@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor { using PassAccessor::PassAccessor; protected: - template<typename Processor> - inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const; - - template<typename Processor> - inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert, - const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const; + inline void run_get_pass_kernel_processor_float( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const CPUKernels::FilmConvertFunction func) const; - template<typename Processor> - inline 
void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert, - const RenderBuffers *render_buffers, - const BufferParams &buffer_params, - const Destination &destination, - const Processor &processor) const; + inline void run_get_pass_kernel_processor_half_rgba( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const CPUKernels::FilmConvertHalfRGBAFunction func) const; #define DECLARE_PASS_ACCESSOR(pass) \ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index 541a7eca02f..36ce2be9f6d 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device, DeviceScene *device_scene, bool *cancel_requested_flag) : PathTraceWork(device, film, device_scene, cancel_requested_flag), - kernels_(*(device->get_cpu_kernels())) + kernels_(Device::get_cpu_kernels()) { DCHECK_EQ(device->info.type, DEVICE_CPU); } diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp index 42cbf87f254..9ec530c81df 100644 --- a/intern/cycles/integrator/shader_eval.cpp +++ b/intern/cycles/integrator/shader_eval.cpp @@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device, device->get_cpu_kernel_thread_globals(kernel_thread_globals); /* Find required kernel function. */ - const CPUKernels &kernels = *(device->get_cpu_kernels()); + const CPUKernels &kernels = Device::get_cpu_kernels(); /* Simple parallel_for over all work items. */ KernelShaderEvalInput *input_data = input.data(); |