diff options
Diffstat (limited to 'intern/cycles/util/util_half.h')
-rw-r--r-- | intern/cycles/util/util_half.h | 46 |
1 files changed, 18 insertions, 28 deletions
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index a8d4ee75e20..d9edfec5da3 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -28,14 +28,8 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ -#ifdef __KERNEL_OPENCL__ - -# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h); - -#else - /* CUDA has its own half data type, no need to define then */ -# ifndef __KERNEL_CUDA__ +#ifndef __KERNEL_CUDA__ /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -59,27 +53,27 @@ class half { private: unsigned short v; }; -# endif +#endif struct half4 { half x, y, z, w; }; -# ifdef __KERNEL_CUDA__ +#ifdef __KERNEL_CUDA__ -ccl_device_inline void float4_store_half(half *h, float4 f, float scale) +ccl_device_inline void float4_store_half(half *h, float4 f) { - h[0] = __float2half(f.x * scale); - h[1] = __float2half(f.y * scale); - h[2] = __float2half(f.z * scale); - h[3] = __float2half(f.w * scale); + h[0] = __float2half(f.x); + h[1] = __float2half(f.y); + h[2] = __float2half(f.z); + h[3] = __float2half(f.w); } -# else +#else -ccl_device_inline void float4_store_half(half *h, float4 f, float scale) +ccl_device_inline void float4_store_half(half *h, float4 f) { -# ifndef __KERNEL_SSE2__ +# ifndef __KERNEL_SSE2__ for (int i = 0; i < 4; i++) { /* optimized float to half for pixels: * assumes no negative, no nan, no inf, and sets denormal to 0 */ @@ -87,8 +81,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) uint i; float f; } in; - float fscale = f[i] * scale; - in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f; + in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f; int x = in.i; int absolute = x & 0x7FFFFFFF; @@ -98,23 +91,22 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) h[i] = (rshift & 0x7FFF); } -# else +# else /* same as above with SSE */ - ssef fscale = load4f(f) * scale; - ssef x = min(max(fscale, 0.0f), 65504.0f); + ssef x = min(max(load4f(f), 0.0f), 65504.0f); -# ifdef __KERNEL_AVX2__ +# ifdef __KERNEL_AVX2__ ssei rpack = _mm_cvtps_ph(x, 0); -# else +# else ssei absolute = cast(x) & 0x7FFFFFFF; ssei Z = absolute + 0xC8000000; ssei result = andnot(absolute < 0x38800000, Z); ssei rshift = (result >> 13) & 0x7FFF; ssei rpack = _mm_packs_epi32(rshift, rshift); -# endif +# endif _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack)); -# endif +# endif } ccl_device_inline float half_to_float(half h) @@ -160,8 +152,6 @@ ccl_device_inline half float_to_half(float f) return (value_bits | sign_bit); } -# endif - #endif CCL_NAMESPACE_END |