diff options
-rw-r--r-- | intern/cycles/CMakeLists.txt | 4 | ||||
-rw-r--r-- | intern/cycles/util/util_half.h | 34 |
2 files changed, 18 insertions, 20 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index c4cadfe0bf7..ed6961f49e0 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -64,7 +64,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
 	endif()
 	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mfpmath=sse")
+		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -80,7 +80,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
 	endif()
 	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
+		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 76941569bd2..9642f8ed523 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -54,12 +54,10 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 	for(int i = 0; i < 4; i++) {
 		/* optimized float to half for pixels:
 		 * assumes no negative, no nan, no inf, and sets denormal to 0 */
-		union { uint i; float f; } in;
 		float fscale = f[i] * scale;
-		in.f = (fscale > 0.0f)? ((fscale < 65500.0f)? fscale: 65500.0f): 0.0f;
-		int x = in.i;
+		float x = min(max(fscale, 0.0f), 65504.0f);
 
-		int absolute = x & 0x7FFFFFFF;
+		int absolute = __float_as_uint(x) & 0x7FFFFFFF;
 		int Z = absolute + 0xC8000000;
 		int result = (absolute < 0x38800000)? 0: Z;
 		int rshift = (result >> 13);
@@ -68,20 +66,20 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 	}
 #else
 	/* same as above with SSE */
-	const ssef mm_scale = ssef(scale);
-	const ssei mm_38800000 = ssei(0x38800000);
-	const ssei mm_7FFF = ssei(0x7FFF);
-	const ssei mm_7FFFFFFF = ssei(0x7FFFFFFF);
-	const ssei mm_C8000000 = ssei(0xC8000000);
-
-	ssef mm_fscale = load4f(f) * mm_scale;
-	ssei x = cast(min(max(mm_fscale, ssef(0.0f)), ssef(65500.0f)));
-	ssei absolute = x & mm_7FFFFFFF;
-	ssei Z = absolute + mm_C8000000;
-	ssei result = andnot(absolute < mm_38800000, Z);
-	ssei rh = (result >> 13) & mm_7FFF;
-
-	_mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh)));
+	ssef fscale = load4f(f) * scale;
+	ssef x = min(max(fscale, 0.0f), 65504.0f);
+
+#ifdef __KERNEL_AVX2__
+	ssei rpack = _mm_cvtps_ph(x, 0);
+#else
+	ssei absolute = cast(x) & 0x7FFFFFFF;
+	ssei Z = absolute + 0xC8000000;
+	ssei result = andnot(absolute < 0x38800000, Z);
+	ssei rshift = (result >> 13) & 0x7FFF;
+	ssei rpack = _mm_packs_epi32(rshift, rshift);
+#endif
+
+	_mm_storel_pi((__m64*)h, _mm_castsi128_ps(rpack));
 #endif
 }
 