Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brecht@blender.org>2021-11-05 23:01:23 +0300
committerBrecht Van Lommel <brecht@blender.org>2021-11-06 00:04:36 +0300
commit97ff37bf54474efbce39653a1387ad55091d4964 (patch)
tree58ff9592807dbd98d126b179627e5c56f5309956 /intern/cycles
parentd1a9425a2fde32b6786b333ab55661da507e818b (diff)
Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion
Adds a bunch of CPU kernel functions to process one row of pixels, and uses those instead of calling unoptimized implementations. Fixes T92598
Diffstat (limited to 'intern/cycles')
-rw-r--r--intern/cycles/device/cpu/device_impl.cpp8
-rw-r--r--intern/cycles/device/cpu/device_impl.h3
-rw-r--r--intern/cycles/device/cpu/kernel.cpp19
-rw-r--r--intern/cycles/device/cpu/kernel.h37
-rw-r--r--intern/cycles/device/device.cpp8
-rw-r--r--intern/cycles/device/device.h2
-rw-r--r--intern/cycles/integrator/pass_accessor_cpu.cpp106
-rw-r--r--intern/cycles/integrator/pass_accessor_cpu.h32
-rw-r--r--intern/cycles/integrator/path_trace_work_cpu.cpp2
-rw-r--r--intern/cycles/integrator/shader_eval.cpp2
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.h1
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch.h31
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch_impl.h81
13 files changed, 220 insertions, 112 deletions
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index d494b40f71d..68dec7f0af2 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
{
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
* optimization. */
- VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
+ VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
+ << " CPU kernels.";
if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::num_threads();
@@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
Device::build_bvh(bvh, progress, refit);
}
-const CPUKernels *CPUDevice::get_cpu_kernels() const
-{
- return &kernels;
-}
-
void CPUDevice::get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index 553728ccc3b..90d217bb624 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -57,8 +57,6 @@ class CPUDevice : public Device {
RTCDevice embree_device;
#endif
- CPUKernels kernels;
-
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
~CPUDevice();
@@ -90,7 +88,6 @@ class CPUDevice : public Device {
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- virtual const CPUKernels *get_cpu_kernels() const override;
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
virtual void *get_cpu_osl_memory() override;
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
index 3b253c094fd..91c472d41e8 100644
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+#define REGISTER_KERNEL_FILM_CONVERT(name) \
+ film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
+ film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
CPUKernels::CPUKernels()
: /* Integrator. */
@@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
REGISTER_KERNEL(adaptive_sampling_filter_x),
REGISTER_KERNEL(adaptive_sampling_filter_y),
/* Cryptomatte. */
- REGISTER_KERNEL(cryptomatte_postprocess)
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Film Convert. */
+ REGISTER_KERNEL_FILM_CONVERT(depth),
+ REGISTER_KERNEL_FILM_CONVERT(mist),
+ REGISTER_KERNEL_FILM_CONVERT(sample_count),
+ REGISTER_KERNEL_FILM_CONVERT(float),
+ REGISTER_KERNEL_FILM_CONVERT(light_path),
+ REGISTER_KERNEL_FILM_CONVERT(float3),
+ REGISTER_KERNEL_FILM_CONVERT(motion),
+ REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
+ REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
+ REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
+ REGISTER_KERNEL_FILM_CONVERT(combined),
+ REGISTER_KERNEL_FILM_CONVERT(float4)
{
}
#undef REGISTER_KERNEL
+#undef REGISTER_KERNEL_FILM_CONVERT
#undef KERNEL_FUNCTIONS
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index 5beeaf148a1..406bd07ab3d 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -17,11 +17,13 @@
#pragma once
#include "device/cpu/kernel_function.h"
+#include "util/half.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
struct KernelGlobalsCPU;
+struct KernelFilmConvert;
struct IntegratorStateCPU;
struct TileInfo;
@@ -102,6 +104,41 @@ class CPUKernels {
CryptomattePostprocessFunction cryptomatte_postprocess;
+ /* Film Convert. */
+ using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+ const float *buffer,
+ float *pixel,
+ const int width,
+ const int buffer_stride,
+ const int pixel_stride)>;
+ using FilmConvertHalfRGBAFunction =
+ CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+ const float *buffer,
+ half4 *pixel,
+ const int width,
+ const int buffer_stride)>;
+
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+ FilmConvertFunction film_convert_##name; \
+ FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
+
+ KERNEL_FILM_CONVERT_FUNCTION(depth)
+ KERNEL_FILM_CONVERT_FUNCTION(mist)
+ KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+ KERNEL_FILM_CONVERT_FUNCTION(float)
+
+ KERNEL_FILM_CONVERT_FUNCTION(light_path)
+ KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+ KERNEL_FILM_CONVERT_FUNCTION(motion)
+ KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+ KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+ KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+ KERNEL_FILM_CONVERT_FUNCTION(combined)
+ KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
CPUKernels();
};
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 69e959b6f7b..63d0a49d3eb 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -23,6 +23,7 @@
#include "device/queue.h"
#include "device/cpu/device.h"
+#include "device/cpu/kernel.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
@@ -363,10 +364,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
return nullptr;
}
-const CPUKernels *Device::get_cpu_kernels() const
+const CPUKernels &Device::get_cpu_kernels()
{
- LOG(FATAL) << "Device does not support CPU kernels.";
- return nullptr;
+ /* Initialize CPU kernels once and reuse. */
+ static CPUKernels kernels;
+ return kernels;
}
void Device::get_cpu_kernel_thread_globals(
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 3cb177adde7..65188459c2c 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -180,7 +180,7 @@ class Device {
* These may not be used on GPU or multi-devices. */
/* Get CPU kernel functions for native instruction set. */
- virtual const CPUKernels *get_cpu_kernels() const;
+ static const CPUKernels &get_cpu_kernels();
/* Get kernel globals to pass to kernels. */
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 820da757be0..77ca332d142 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -14,9 +14,12 @@
* limitations under the License.
*/
+#include "device/device.h"
+
#include "integrator/pass_accessor_cpu.h"
#include "session/buffers.h"
+
#include "util/log.h"
#include "util/tbb.h"
@@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
* Kernel processing.
*/
-template<typename Processor>
-inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
- const BufferParams &buffer_params,
- const Destination &destination,
- const Processor &processor) const
-{
- KernelFilmConvert kfilm_convert;
- init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
-
- if (destination.pixels) {
- /* NOTE: No overlays are applied since they are not used for final renders.
- * Can be supported via some sort of specialization to avoid code duplication. */
-
- run_get_pass_kernel_processor_float(
- &kfilm_convert, render_buffers, buffer_params, destination, processor);
- }
-
- if (destination.pixels_half_rgba) {
- /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
-
- if (destination.num_components == 1) {
- run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
- render_buffers,
- buffer_params,
- destination,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float pixel;
- processor(kfilm_convert, buffer, &pixel);
-
- pixel_rgba[0] = pixel;
- pixel_rgba[1] = pixel;
- pixel_rgba[2] = pixel;
- pixel_rgba[3] = 1.0f;
- });
- }
- else if (destination.num_components == 3) {
- run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
- render_buffers,
- buffer_params,
- destination,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- processor(kfilm_convert, buffer, pixel_rgba);
- pixel_rgba[3] = 1.0f;
- });
- }
- else if (destination.num_components == 4) {
- run_get_pass_kernel_processor_half_rgba(
- &kfilm_convert, render_buffers, buffer_params, destination, processor);
- }
- }
-}
-
-template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
- const Processor &processor) const
+ const CPUKernels::FilmConvertFunction func) const
{
+ /* NOTE: No overlays are applied since they are not used for final renders.
+ * Can be supported via some sort of specialization to avoid code duplication. */
+
DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
const int64_t pass_stride = buffer_params.pass_stride;
@@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
(y * buffer_params.width + destination.offset) * pixel_stride;
-
- for (int64_t x = 0; x < buffer_params.window_width;
- ++x, buffer += pass_stride, pixel += pixel_stride) {
- processor(kfilm_convert, buffer, pixel);
- }
+ func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
});
}
-template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
- const Processor &processor) const
+ const CPUKernels::FilmConvertHalfRGBAFunction func) const
{
const int64_t pass_stride = buffer_params.pass_stride;
const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
@@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
const float *buffer = window_data + y * buffer_row_stride;
half4 *pixel = dst_start + y * destination_stride;
- for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {
-
- float pixel_rgba[4];
- processor(kfilm_convert, buffer, pixel_rgba);
-
- film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
-
- *pixel = float4_to_half4_display(
- make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
- }
+ func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
});
}
@@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const BufferParams &buffer_params, \
const Destination &destination) const \
{ \
- run_get_pass_kernel_processor( \
- render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+ const CPUKernels &kernels = Device::get_cpu_kernels(); \
+ KernelFilmConvert kfilm_convert; \
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
+\
+ if (destination.pixels) { \
+ run_get_pass_kernel_processor_float(&kfilm_convert, \
+ render_buffers, \
+ buffer_params, \
+ destination, \
+ kernels.film_convert_##pass); \
+ } \
+\
+ if (destination.pixels_half_rgba) { \
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
+ render_buffers, \
+ buffer_params, \
+ destination, \
+ kernels.film_convert_half_rgba_##pass); \
+ } \
}
/* Float (scalar) passes. */
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
index 0313dc5bb0d..9ed38ab256e 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.h
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -16,6 +16,8 @@
#pragma once
+#include "device/cpu/kernel.h"
+
#include "integrator/pass_accessor.h"
CCL_NAMESPACE_BEGIN
@@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor {
using PassAccessor::PassAccessor;
protected:
- template<typename Processor>
- inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
- const BufferParams &buffer_params,
- const Destination &destination,
- const Processor &processor) const;
-
- template<typename Processor>
- inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
- const RenderBuffers *render_buffers,
- const BufferParams &buffer_params,
- const Destination &destination,
- const Processor &processor) const;
+ inline void run_get_pass_kernel_processor_float(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const CPUKernels::FilmConvertFunction func) const;
- template<typename Processor>
- inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
- const RenderBuffers *render_buffers,
- const BufferParams &buffer_params,
- const Destination &destination,
- const Processor &processor) const;
+ inline void run_get_pass_kernel_processor_half_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const CPUKernels::FilmConvertHalfRGBAFunction func) const;
#define DECLARE_PASS_ACCESSOR(pass) \
virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 541a7eca02f..36ce2be9f6d 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
DeviceScene *device_scene,
bool *cancel_requested_flag)
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
- kernels_(*(device->get_cpu_kernels()))
+ kernels_(Device::get_cpu_kernels())
{
DCHECK_EQ(device->info.type, DEVICE_CPU);
}
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index 42cbf87f254..9ec530c81df 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
device->get_cpu_kernel_thread_globals(kernel_thread_globals);
/* Find required kernel function. */
- const CPUKernels &kernels = *(device->get_cpu_kernels());
+ const CPUKernels &kernels = Device::get_cpu_kernels();
/* Simple parallel_for over all work items. */
KernelShaderEvalInput *input_data = input.data();
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index c49d7ca445a..6af8094b1ea 100644
--- a/intern/cycles/kernel/device/cpu/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -18,6 +18,7 @@
/* CPU Kernel Interface */
+#include "util/half.h"
#include "util/types.h"
#include "kernel/types.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
index 432ac5e15a9..2f9a3f7c59d 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -52,6 +52,37 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ float *pixel, \
+ const int width, \
+ const int buffer_stride, \
+ const int pixel_stride); \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+ const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ half4 *pixel, \
+ const int width, \
+ const int buffer_stride);
+
+KERNEL_FILM_CONVERT_FUNCTION(depth)
+KERNEL_FILM_CONVERT_FUNCTION(mist)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+KERNEL_FILM_CONVERT_FUNCTION(float)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path)
+KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+KERNEL_FILM_CONVERT_FUNCTION(combined)
+KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
/* --------------------------------------------------------------------
* Shader evaluation.
*/
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 6df5d7787fc..1ea5002e300 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -47,8 +47,8 @@
# include "kernel/integrator/megakernel.h"
# include "kernel/film/adaptive_sampling.h"
-# include "kernel/film/read.h"
# include "kernel/film/id_passes.h"
+# include "kernel/film/read.h"
# include "kernel/bake/bake.h"
@@ -232,6 +232,85 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
#endif
}
+/* --------------------------------------------------------------------
+ * Film Convert.
+ */
+
+#ifdef KERNEL_STUB
+
+# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ float *pixel, \
+ const int width, \
+ const int buffer_stride, \
+ const int pixel_stride) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+ } \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+ const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ half4 *pixel, \
+ const int width, \
+ const int buffer_stride) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+ }
+
+#else
+
+# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ float *pixel, \
+ const int width, \
+ const int buffer_stride, \
+ const int pixel_stride) \
+ { \
+ for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
+ film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
+ } \
+ } \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+ const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ half4 *pixel, \
+ const int width, \
+ const int buffer_stride) \
+ { \
+ for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
+ float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
+ film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
+ if (is_float) { \
+ pixel_rgba[1] = pixel_rgba[0]; \
+ pixel_rgba[2] = pixel_rgba[0]; \
+ } \
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
+ *pixel = float4_to_half4_display( \
+ make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
+ } \
+ }
+
+#endif
+
+KERNEL_FILM_CONVERT_FUNCTION(depth, true)
+KERNEL_FILM_CONVERT_FUNCTION(mist, true)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
+KERNEL_FILM_CONVERT_FUNCTION(float, true)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
+KERNEL_FILM_CONVERT_FUNCTION(float3, false)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion, false)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
+KERNEL_FILM_CONVERT_FUNCTION(combined, false)
+KERNEL_FILM_CONVERT_FUNCTION(float4, false)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
#undef KERNEL_INVOKE
#undef DEFINE_INTEGRATOR_KERNEL
#undef DEFINE_INTEGRATOR_SHADE_KERNEL