From 97ff37bf54474efbce39653a1387ad55091d4964 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Fri, 5 Nov 2021 21:01:23 +0100 Subject: Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion Adds a bunch of CPU kernel functions to process one row of pixels, and uses those instead of calling unoptimized implementations. Fixes T92598 --- intern/cycles/device/cpu/device_impl.cpp | 8 ++----- intern/cycles/device/cpu/device_impl.h | 3 --- intern/cycles/device/cpu/kernel.cpp | 19 +++++++++++++++- intern/cycles/device/cpu/kernel.h | 37 ++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 10 deletions(-) (limited to 'intern/cycles/device/cpu') diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp index d494b40f71d..68dec7f0af2 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_ { /* Pick any kernel, all of them are supposed to have same level of microarchitecture * optimization. 
*/ - VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels."; + VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name() + << " CPU kernels."; if (info.cpu_threads == 0) { info.cpu_threads = TaskScheduler::num_threads(); @@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) Device::build_bvh(bvh, progress, refit); } -const CPUKernels *CPUDevice::get_cpu_kernels() const -{ - return &kernels; -} - void CPUDevice::get_cpu_kernel_thread_globals( vector &kernel_thread_globals) { diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h index 553728ccc3b..90d217bb624 100644 --- a/intern/cycles/device/cpu/device_impl.h +++ b/intern/cycles/device/cpu/device_impl.h @@ -57,8 +57,6 @@ class CPUDevice : public Device { RTCDevice embree_device; #endif - CPUKernels kernels; - CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); ~CPUDevice(); @@ -90,7 +88,6 @@ class CPUDevice : public Device { void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - virtual const CPUKernels *get_cpu_kernels() const override; virtual void get_cpu_kernel_thread_globals( vector &kernel_thread_globals) override; virtual void *get_cpu_osl_memory() override; diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp index 3b253c094fd..91c472d41e8 100644 --- a/intern/cycles/device/cpu/kernel.cpp +++ b/intern/cycles/device/cpu/kernel.cpp @@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) +#define REGISTER_KERNEL_FILM_CONVERT(name) \ + film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \ + film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name)) CPUKernels::CPUKernels() : /* Integrator. 
*/ @@ -50,11 +53,25 @@ CPUKernels::CPUKernels() REGISTER_KERNEL(adaptive_sampling_filter_x), REGISTER_KERNEL(adaptive_sampling_filter_y), /* Cryptomatte. */ - REGISTER_KERNEL(cryptomatte_postprocess) + REGISTER_KERNEL(cryptomatte_postprocess), + /* Film Convert. */ + REGISTER_KERNEL_FILM_CONVERT(depth), + REGISTER_KERNEL_FILM_CONVERT(mist), + REGISTER_KERNEL_FILM_CONVERT(sample_count), + REGISTER_KERNEL_FILM_CONVERT(float), + REGISTER_KERNEL_FILM_CONVERT(light_path), + REGISTER_KERNEL_FILM_CONVERT(float3), + REGISTER_KERNEL_FILM_CONVERT(motion), + REGISTER_KERNEL_FILM_CONVERT(cryptomatte), + REGISTER_KERNEL_FILM_CONVERT(shadow_catcher), + REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow), + REGISTER_KERNEL_FILM_CONVERT(combined), + REGISTER_KERNEL_FILM_CONVERT(float4) { } #undef REGISTER_KERNEL +#undef REGISTER_KERNEL_FILM_CONVERT #undef KERNEL_FUNCTIONS CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h index 5beeaf148a1..406bd07ab3d 100644 --- a/intern/cycles/device/cpu/kernel.h +++ b/intern/cycles/device/cpu/kernel.h @@ -17,11 +17,13 @@ #pragma once #include "device/cpu/kernel_function.h" +#include "util/half.h" #include "util/types.h" CCL_NAMESPACE_BEGIN struct KernelGlobalsCPU; +struct KernelFilmConvert; struct IntegratorStateCPU; struct TileInfo; @@ -102,6 +104,41 @@ class CPUKernels { CryptomattePostprocessFunction cryptomatte_postprocess; + /* Film Convert. 
*/ + using FilmConvertFunction = CPUKernelFunction; + using FilmConvertHalfRGBAFunction = + CPUKernelFunction; + +#define KERNEL_FILM_CONVERT_FUNCTION(name) \ + FilmConvertFunction film_convert_##name; \ + FilmConvertHalfRGBAFunction film_convert_half_rgba_##name; + + KERNEL_FILM_CONVERT_FUNCTION(depth) + KERNEL_FILM_CONVERT_FUNCTION(mist) + KERNEL_FILM_CONVERT_FUNCTION(sample_count) + KERNEL_FILM_CONVERT_FUNCTION(float) + + KERNEL_FILM_CONVERT_FUNCTION(light_path) + KERNEL_FILM_CONVERT_FUNCTION(float3) + + KERNEL_FILM_CONVERT_FUNCTION(motion) + KERNEL_FILM_CONVERT_FUNCTION(cryptomatte) + KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher) + KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow) + KERNEL_FILM_CONVERT_FUNCTION(combined) + KERNEL_FILM_CONVERT_FUNCTION(float4) + +#undef KERNEL_FILM_CONVERT_FUNCTION + CPUKernels(); }; -- cgit v1.2.3