diff options
Diffstat (limited to 'intern/cycles/util/util_half.h')
-rw-r--r-- | intern/cycles/util/util_half.h | 177 |
1 files changed, 97 insertions, 80 deletions
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index 3868509c21b..9c40f5310c2 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -21,7 +21,7 @@ #include "util/util_math.h" #ifdef __KERNEL_SSE2__ -#include "util/util_simd.h" +# include "util/util_simd.h" #endif CCL_NAMESPACE_BEGIN @@ -30,122 +30,139 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_OPENCL__ -#define float4_store_half(h, f, scale) vstore_half4(f * (scale), 0, h); +# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h); #else /* CUDA has its own half data type, no need to define then */ -#ifndef __KERNEL_CUDA__ +# ifndef __KERNEL_CUDA__ /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from unsigned shorts. */ class half { -public: - half() : v(0) {} - half(const unsigned short& i) : v(i) {} - operator unsigned short() { return v; } - half& operator =(const unsigned short& i) { v = i; return *this; } -private: - unsigned short v; + public: + half() : v(0) + { + } + half(const unsigned short &i) : v(i) + { + } + operator unsigned short() + { + return v; + } + half &operator=(const unsigned short &i) + { + v = i; + return *this; + } + + private: + unsigned short v; }; -#endif +# endif -struct half4 { half x, y, z, w; }; +struct half4 { + half x, y, z, w; +}; -#ifdef __KERNEL_CUDA__ +# ifdef __KERNEL_CUDA__ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) { - h[0] = __float2half(f.x * scale); - h[1] = __float2half(f.y * scale); - h[2] = __float2half(f.z * scale); - h[3] = __float2half(f.w * scale); + h[0] = __float2half(f.x * scale); + h[1] = __float2half(f.y * scale); + h[2] = __float2half(f.z * scale); + h[3] = __float2half(f.w * scale); } -#else +# else ccl_device_inline void float4_store_half(half *h, float4 f, float scale) { -#ifndef __KERNEL_SSE2__ - for(int i = 0; i < 4; i++) { - /* optimized float to half for pixels: - * assumes no negative, no nan, no inf, and sets denormal to 0 */ - union { uint i; float f; } in; - float fscale = f[i] * scale; - in.f = (fscale > 0.0f)? ((fscale < 65504.0f)? fscale: 65504.0f): 0.0f; - int x = in.i; - - int absolute = x & 0x7FFFFFFF; - int Z = absolute + 0xC8000000; - int result = (absolute < 0x38800000)? 0: Z; - int rshift = (result >> 13); - - h[i] = (rshift & 0x7FFF); - } -#else - /* same as above with SSE */ - ssef fscale = load4f(f) * scale; - ssef x = min(max(fscale, 0.0f), 65504.0f); - -#ifdef __KERNEL_AVX2__ - ssei rpack = _mm_cvtps_ph(x, 0); -#else - ssei absolute = cast(x) & 0x7FFFFFFF; - ssei Z = absolute + 0xC8000000; - ssei result = andnot(absolute < 0x38800000, Z); - ssei rshift = (result >> 13) & 0x7FFF; - ssei rpack = _mm_packs_epi32(rshift, rshift); -#endif - - _mm_storel_pi((__m64*)h, _mm_castsi128_ps(rpack)); -#endif +# ifndef __KERNEL_SSE2__ + for (int i = 0; i < 4; i++) { + /* optimized float to half for pixels: + * assumes no negative, no nan, no inf, and sets denormal to 0 */ + union { + uint i; + float f; + } in; + float fscale = f[i] * scale; + in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f; + int x = in.i; + + int absolute = x & 0x7FFFFFFF; + int Z = absolute + 0xC8000000; + int result = (absolute < 0x38800000) ? 0 : Z; + int rshift = (result >> 13); + + h[i] = (rshift & 0x7FFF); + } +# else + /* same as above with SSE */ + ssef fscale = load4f(f) * scale; + ssef x = min(max(fscale, 0.0f), 65504.0f); + +# ifdef __KERNEL_AVX2__ + ssei rpack = _mm_cvtps_ph(x, 0); +# else + ssei absolute = cast(x) & 0x7FFFFFFF; + ssei Z = absolute + 0xC8000000; + ssei result = andnot(absolute < 0x38800000, Z); + ssei rshift = (result >> 13) & 0x7FFF; + ssei rpack = _mm_packs_epi32(rshift, rshift); +# endif + + _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack)); +# endif } ccl_device_inline float half_to_float(half h) { - float f; + float f; - *((int*) &f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13); + *((int *)&f) = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13); - return f; + return f; } ccl_device_inline float4 half4_to_float4(half4 h) { - float4 f; + float4 f; - f.x = half_to_float(h.x); - f.y = half_to_float(h.y); - f.z = half_to_float(h.z); - f.w = half_to_float(h.w); + f.x = half_to_float(h.x); + f.y = half_to_float(h.y); + f.z = half_to_float(h.z); + f.w = half_to_float(h.w); - return f; + return f; } ccl_device_inline half float_to_half(float f) { - const uint u = __float_as_uint(f); - /* Sign bit, shifted to it's position. */ - uint sign_bit = u & 0x80000000; - sign_bit >>= 16; - /* Exponent. */ - uint exponent_bits = u & 0x7f800000; - /* Non-sign bits. */ - uint value_bits = u & 0x7fffffff; - value_bits >>= 13; /* Align mantissa on MSB. */ - value_bits -= 0x1c000; /* Adjust bias. */ - /* Flush-to-zero. */ - value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits; - /* Clamp-to-max. */ - value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits; - /* Denormals-as-zero. */ - value_bits = (exponent_bits == 0 ? 0 : value_bits); - /* Re-insert sign bit and return. */ - return (value_bits | sign_bit); + const uint u = __float_as_uint(f); + /* Sign bit, shifted to it's position. */ + uint sign_bit = u & 0x80000000; + sign_bit >>= 16; + /* Exponent. */ + uint exponent_bits = u & 0x7f800000; + /* Non-sign bits. */ + uint value_bits = u & 0x7fffffff; + value_bits >>= 13; /* Align mantissa on MSB. */ + value_bits -= 0x1c000; /* Adjust bias. */ + /* Flush-to-zero. */ + value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits; + /* Clamp-to-max. */ + value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits; + /* Denormals-as-zero. */ + value_bits = (exponent_bits == 0 ? 0 : value_bits); + /* Re-insert sign bit and return. */ + return (value_bits | sign_bit); } -#endif +# endif #endif CCL_NAMESPACE_END -#endif /* __UTIL_HALF_H__ */ +#endif /* __UTIL_HALF_H__ */ |