From 282516e53eba9bb3aaddd67b2b099fea98bd4c1f Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Thu, 21 Oct 2021 19:25:38 +0200
Subject: Cleanup: refactor float/half conversions for clarity

---
 intern/cycles/integrator/pass_accessor.cpp     |   2 +-
 intern/cycles/integrator/pass_accessor_cpu.cpp |   4 +-
 intern/cycles/kernel/device/cpu/image.h        |   4 +-
 intern/cycles/kernel/device/cuda/compat.h      |   7 +
 intern/cycles/kernel/device/gpu/kernel.h       |   2 +-
 intern/cycles/kernel/device/optix/compat.h     |   7 +
 intern/cycles/util/util_half.h                 | 170 ++++++++++++-------------
 intern/cycles/util/util_image.h                |   4 +-
 8 files changed, 102 insertions(+), 98 deletions(-)

diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 4ef9ce7ef42..1308b03b06c 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -115,7 +115,7 @@ static void pad_pixels(const BufferParams &buffer_params,
   }
 
   if (destination.pixels_half_rgba) {
-    const half one = float_to_half(1.0f);
+    const half one = float_to_half_display(1.0f);
     half4 *pixel = destination.pixels_half_rgba + destination.offset;
 
     for (size_t i = 0; i < size; i++, pixel++) {
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 80908271ff6..e3cb81d31b7 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -148,8 +148,8 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
 
     film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
 
-    float4_store_half(&pixel->x,
-                      make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
+    *pixel = float4_to_half4_display(
+        make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
     }
   });
 }
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
index 44c5d7ef065..93f956e354d 100644
--- a/intern/cycles/kernel/device/cpu/image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -72,12 +72,12 @@ template<typename T> struct TextureInterpolator {
 
   static ccl_always_inline float4 read(half4 r)
   {
-    return half4_to_float4(r);
+    return half4_to_float4_image(r);
   }
 
   static ccl_always_inline float4 read(half r)
   {
-    float f = half_to_float(r);
+    float f = half_to_float_image(r);
     return make_float4(f, f, f, 1.0f);
   }
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 685c7a5b753..8a50eb1a3d5 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -128,6 +128,13 @@ __device__ half __float2half(const float f)
   return val;
 }
 
+__device__ float __half2float(const half h)
+{
+  float val;
+  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+  return val;
+}
+
 /* Types */
 
 #include "util/util_half.h"
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index eeac09d4b29..335cb1ec0c0 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -516,7 +516,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
   film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
 
   ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
-  float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+  *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
 }
 
 /* Common implementation for half4 destination and 3-channel input pass. */
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index c9ec9be05df..d27b7d55475 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -120,6 +120,13 @@ __device__ half __float2half(const float f)
   return val;
 }
 
+__device__ float __half2float(const half h)
+{
+  float val;
+  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+  return val;
+}
+
 /* Types */
 
 #include "util/util_half.h"
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 81723abe1e2..0db5acd319a 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -59,99 +59,16 @@ struct half4 {
   half x, y, z, w;
 };
 
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+/* Conversion to/from half float for image textures.
+ *
+ * Simplified float to half for fast sampling on processors without a native
+ * instruction, and eliminating any NaN and inf values. */
 
-ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
+ccl_device_inline half float_to_half_image(float f)
 {
-  h[0] = __float2half(f.x);
-  h[1] = __float2half(f.y);
-  h[2] = __float2half(f.z);
-  h[3] = __float2half(f.w);
-}
-
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+  return __float2half(f);
 #else
-
-ccl_device_inline void float4_store_half(ccl_private half *h, float4 f)
-{
-
-# ifndef __KERNEL_SSE2__
-  for (int i = 0; i < 4; i++) {
-    /* optimized float to half for pixels:
-     * assumes no negative, no nan, no inf, and sets denormal to 0 */
-    union {
-      uint i;
-      float f;
-    } in;
-    in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f;
-    int x = in.i;
-
-    int absolute = x & 0x7FFFFFFF;
-    int Z = absolute + 0xC8000000;
-    int result = (absolute < 0x38800000) ? 0 : Z;
-    int rshift = (result >> 13);
-
-    h[i] = (rshift & 0x7FFF);
-  }
-# else
-  /* same as above with SSE */
-  ssef x = min(max(load4f(f), 0.0f), 65504.0f);
-
-#  ifdef __KERNEL_AVX2__
-  ssei rpack = _mm_cvtps_ph(x, 0);
-#  else
-  ssei absolute = cast(x) & 0x7FFFFFFF;
-  ssei Z = absolute + 0xC8000000;
-  ssei result = andnot(absolute < 0x38800000, Z);
-  ssei rshift = (result >> 13) & 0x7FFF;
-  ssei rpack = _mm_packs_epi32(rshift, rshift);
-#  endif
-
-  _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack));
-# endif
-}
-
-# ifndef __KERNEL_HIP__
-
-ccl_device_inline float half_to_float(half h)
-{
-  float f;
-
-  *((int *)&f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
-
-  return f;
-}
-# else
-
-ccl_device_inline float half_to_float(std::uint32_t a) noexcept
-{
-
-  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
-
-  std::uint32_t v = __float_as_uint(__uint_as_float(u) *
-                                    __uint_as_float(0x77800000U) /*0x1.0p+112f*/) +
-                    0x38000000U;
-
-  u = (a & 0x7fff) != 0 ? v : u;
-
-  return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/;
-}
-
-# endif /* __KERNEL_HIP__ */
-
-ccl_device_inline float4 half4_to_float4(half4 h)
-{
-  float4 f;
-
-  f.x = half_to_float(h.x);
-  f.y = half_to_float(h.y);
-  f.z = half_to_float(h.z);
-  f.w = half_to_float(h.w);
-
-  return f;
-}
-
-ccl_device_inline half float_to_half(float f)
-{
   const uint u = __float_as_uint(f);
   /* Sign bit, shifted to its position. */
   uint sign_bit = u & 0x80000000;
   sign_bit >>= 16;
@@ -170,10 +87,83 @@ ccl_device_inline half float_to_half(float f)
   value_bits = (exponent_bits == 0 ? 0 : value_bits);
   /* Re-insert sign bit and return. */
   return (value_bits | sign_bit);
+#endif
+}
+
+ccl_device_inline float half_to_float_image(half h)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+  return __half2float(h);
+#else
+  const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
+  return __int_as_float(x);
+#endif
+}
+
+ccl_device_inline float4 half4_to_float4_image(const half4 h)
+{
+  /* Unable to use this because it gives different results than half_to_float_image;
+   * can we modify float_to_half_image so the conversion results are identical? */
+#if 0 /* defined(__KERNEL_AVX2__) */
+  /* CPU: AVX. */
+  __m128i x = _mm_castpd_si128(_mm_load_sd((const double *)&h));
+  return float4(_mm_cvtph_ps(x));
+#endif
+
+  const float4 f = make_float4(half_to_float_image(h.x),
+                               half_to_float_image(h.y),
+                               half_to_float_image(h.z),
+                               half_to_float_image(h.w));
+  return f;
+}
+
+/* Conversion to half float texture for display.
+ *
+ * Simplified float to half for fast display texture conversion on processors
+ * without a native instruction. Assumes no negative, no NaN, no inf, and sets
+ * denormal to 0. */
+
+ccl_device_inline half float_to_half_display(const float f)
+{
+#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+  return __float2half(f);
+#else
+  const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
+  const int absolute = x & 0x7FFFFFFF;
+  const int Z = absolute + 0xC8000000;
+  const int result = (absolute < 0x38800000) ? 0 : Z;
+  const int rshift = (result >> 13);
+  return (rshift & 0x7FFF);
+#endif
+}
+
+ccl_device_inline half4 float4_to_half4_display(const float4 f)
+{
+#ifdef __KERNEL_SSE2__
+  /* CPU: SSE and AVX. */
+  ssef x = min(max(load4f(f), 0.0f), 65504.0f);
+# ifdef __KERNEL_AVX2__
+  ssei rpack = _mm_cvtps_ph(x, 0);
+# else
+  ssei absolute = cast(x) & 0x7FFFFFFF;
+  ssei Z = absolute + 0xC8000000;
+  ssei result = andnot(absolute < 0x38800000, Z);
+  ssei rshift = (result >> 13) & 0x7FFF;
+  ssei rpack = _mm_packs_epi32(rshift, rshift);
+# endif
+  half4 h;
+  _mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
+  return h;
+#else
+  /* GPU and scalar fallback. */
+  const half4 h = {float_to_half_display(f.x),
+                   float_to_half_display(f.y),
+                   float_to_half_display(f.z),
+                   float_to_half_display(f.w)};
+  return h;
+#endif
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_HALF_H__ */
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index 27ec7ffb423..b082b971613 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -56,7 +56,7 @@ template<> inline float util_image_cast_to_float(uint16_t value)
 }
 template<> inline float util_image_cast_to_float(half value)
 {
-  return half_to_float(value);
+  return half_to_float_image(value);
 }
 
 /* Cast float value to output pixel type. */
@@ -88,7 +88,7 @@ template<> inline uint16_t util_image_cast_from_float(float value)
 }
 template<> inline half util_image_cast_from_float(float value)
 {
-  return float_to_half_image(value);
 }
 
 CCL_NAMESPACE_END
-- 
cgit v1.2.3
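
Editor's note on the magic constants in the scalar paths above: adding 0xC8000000 is the
same as subtracting (127 - 15) << 23 = 0x38000000, i.e. rebasing the exponent from float's
bias of 127 to half's bias of 15, and 0x38800000 is the bit pattern of 2^-14, the smallest
normal half, below which the display path flushes to zero. The following is a minimal
host-side sketch of the scalar display encode and image decode, not part of the patch: the
function names mirror the patch, but the memcpy-based bit casts stand in for Cycles'
__float_as_int/__int_as_float helpers.

#include <cstdint>
#include <cstdio>
#include <cstring>

/* Portable stand-ins for Cycles' __float_as_uint / __uint_as_float. */
static uint32_t float_as_uint(float f)
{
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return u;
}

static float uint_as_float(uint32_t u)
{
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

/* Display-path encode: clamp to [0, 65504], rebias the exponent by adding
 * 0xC8000000 (= subtracting (127 - 15) << 23), flush values below 2^-14
 * (0x38800000) to zero, and shift into half layout. */
static uint16_t float_to_half_display(float f)
{
  const uint32_t x = float_as_uint((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
  const uint32_t absolute = x & 0x7FFFFFFFu;
  const uint32_t Z = absolute + 0xC8000000u;
  const uint32_t result = (absolute < 0x38800000u) ? 0u : Z;
  return (uint16_t)((result >> 13) & 0x7FFF);
}

/* Image-path decode: expand sign, exponent (rebias by adding 112 << 10 =
 * 0x1C000 before the shift) and mantissa back into float layout. */
static float half_to_float_image(uint16_t h)
{
  const uint32_t x = ((h & 0x8000u) << 16) | (((h & 0x7C00u) + 0x1C000u) << 13) |
                     ((h & 0x03FFu) << 13);
  return uint_as_float(x);
}

int main()
{
  const float samples[] = {0.5f, 1.0f, 3.14159f, 65504.0f, 70000.0f, -2.0f};
  for (const float f : samples) {
    const uint16_t h = float_to_half_display(f);
    std::printf("%12g -> 0x%04X -> %12g\n", f, (unsigned int)h, half_to_float_image(h));
  }
  return 0;
}

Running this shows, for example, 1.0 encoding to 0x3C00 and 70000 clamping to 65504 (the
largest finite half). It also exposes a deliberate shortcut of the fast decode: the half
bit pattern 0x0000 comes back as roughly 3.05e-5 (2^-15) rather than exactly 0, because
zero and denormals are not special-cased.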
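The "#if 0 /* defined(__KERNEL_AVX2__) */" branch in half4_to_float4_image appears to be
disabled because hardware F16C conversion is bit-exact IEEE while the fast decode is not,
which is what the comment's open question is about. A hypothetical host-side check, again
standalone rather than part of the patch (it assumes an F16C-capable x86 CPU and a build
flag such as -mf16c; _cvtsh_ss is the scalar F16C intrinsic), makes the difference
concrete:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

/* Fast image-path decode, same formula as half_to_float_image in the patch. */
static float fast_decode(uint16_t h)
{
  const uint32_t x = ((h & 0x8000u) << 16) | (((h & 0x7C00u) + 0x1C000u) << 13) |
                     ((h & 0x03FFu) << 13);
  float f;
  std::memcpy(&f, &x, sizeof(f));
  return f;
}

int main()
{
  int mismatches = 0;
  for (uint32_t bits = 0; bits <= 0xFFFFu; bits++) {
    const float fast = fast_decode((uint16_t)bits);
    const float hw = _cvtsh_ss((uint16_t)bits); /* exact IEEE half -> float */
    /* Compare bit patterns so signed zeros and NaNs are not conflated. */
    uint32_t a, b;
    std::memcpy(&a, &fast, sizeof(a));
    std::memcpy(&b, &hw, sizeof(b));
    if (a != b) {
      mismatches++;
    }
  }
  std::printf("bit patterns where fast and F16C decode differ: %d / 65536\n", mismatches);
  return 0;
}

All normal half values decode bit-identically under both paths; the disagreements are
confined to signed zeros, denormals, infinities and NaNs, exactly the inputs the patch's
comments declare out of scope for image and display data.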