Diffstat (limited to 'intern/cycles/util/util_simd.h')
-rw-r--r--  intern/cycles/util/util_simd.h | 613
1 file changed, 360 insertions, 253 deletions
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index c92fc1ae391..8fcaadc5f53 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -20,439 +20,550 @@
 #ifndef __KERNEL_GPU__
 
-#include <limits>
+#  include <limits>
 
-#include "util/util_defines.h"
+#  include "util/util_defines.h"
 
 /* SSE Intrinsics includes
  *
  * We assume __KERNEL_SSEX__ flags to have been defined at this point */
 
 /* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
+#  ifndef FREE_WINDOWS64
 
-#ifdef _MSC_VER
-#  include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-#  include <x86intrin.h>
-#endif
+#    ifdef _MSC_VER
+#      include <intrin.h>
+#    elif (defined(__x86_64__) || defined(__i386__))
+#      include <x86intrin.h>
+#    endif
 
-#else
+#  else
 
 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
-#include "util/util_windows.h"
+#    include "util/util_windows.h"
 
-#endif
+#  endif
 
-#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
-  #define SIMD_SET_FLUSH_TO_ZERO \
-  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
-  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#else
-  #define SIMD_SET_FLUSH_TO_ZERO
-#endif
+#  if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
+#    define SIMD_SET_FLUSH_TO_ZERO \
+      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#  else
+#    define SIMD_SET_FLUSH_TO_ZERO
+#  endif
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __KERNEL_SSE2__
+#  ifdef __KERNEL_SSE2__
 
 extern const __m128 _mm_lookupmask_ps[16];
 
 /* Special Types */
 
 static struct TrueTy {
-__forceinline operator bool( ) const { return true; }
+  __forceinline operator bool() const
+  {
+    return true;
+  }
 } True ccl_maybe_unused;
 
 static struct FalseTy {
-__forceinline operator bool( ) const { return false; }
+  __forceinline operator bool() const
+  {
+    return false;
+  }
 } False ccl_maybe_unused;
 
-static struct NegInfTy
-{
-__forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); }
-__forceinline operator int ( ) const { return std::numeric_limits<int>::min(); }
+static struct NegInfTy {
+  __forceinline operator float() const
+  {
+    return -std::numeric_limits<float>::infinity();
+  }
+  __forceinline operator int() const
+  {
+    return std::numeric_limits<int>::min();
+  }
 } neg_inf ccl_maybe_unused;
 
-static struct PosInfTy
-{
-__forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); }
-__forceinline operator int ( ) const { return std::numeric_limits<int>::max(); }
+static struct PosInfTy {
+  __forceinline operator float() const
+  {
+    return std::numeric_limits<float>::infinity();
+  }
+  __forceinline operator int() const
+  {
+    return std::numeric_limits<int>::max();
+  }
 } inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
 
 /* Intrinsics Functions */
 
-#if defined(__BMI__) && defined(__GNUC__)
-#  ifndef _tzcnt_u32
-#    define _tzcnt_u32 __tzcnt_u32
-#  endif
-#  ifndef _tzcnt_u64
-#    define _tzcnt_u64 __tzcnt_u64
-#  endif
-#endif
+#    if defined(__BMI__) && defined(__GNUC__)
+#      ifndef _tzcnt_u32
+#        define _tzcnt_u32 __tzcnt_u32
+#      endif
+#      ifndef _tzcnt_u64
+#        define _tzcnt_u64 __tzcnt_u64
+#      endif
+#    endif
 
-#if defined(__LZCNT__)
-#define _lzcnt_u32 __lzcnt32
-#define _lzcnt_u64 __lzcnt64
-#endif
+#    if defined(__LZCNT__)
+#      define _lzcnt_u32 __lzcnt32
+#      define _lzcnt_u64 __lzcnt64
+#    endif
 
-#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+#    if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
 
-__forceinline int __popcnt(int in) {
+__forceinline int __popcnt(int in)
+{
   return _mm_popcnt_u32(in);
 }
 
-#if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in) {
+#      if !defined(_MSC_VER)
+__forceinline unsigned int __popcnt(unsigned int in)
+{
   return _mm_popcnt_u32(in);
 }
-#endif
+#      endif
 
-#if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in) {
+#      if defined(__KERNEL_64_BIT__)
+__forceinline long long __popcnt(long long in)
+{
   return _mm_popcnt_u64(in);
 }
-__forceinline size_t __popcnt(size_t in) {
+__forceinline size_t __popcnt(size_t in)
+{
   return _mm_popcnt_u64(in);
 }
-#endif
+#      endif
 
-__forceinline int __bsf(int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline int __bsf(int v)
+{
+#      if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#else
-  unsigned long r = 0; _BitScanForward(&r,v); return r;
-#endif
+#      else
+  unsigned long r = 0;
+  _BitScanForward(&r, v);
+  return r;
+#      endif
 }
 
-__forceinline unsigned int __bsf(unsigned int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline unsigned int __bsf(unsigned int v)
+{
+#      if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#else
-  unsigned long r = 0; _BitScanForward(&r,v); return r;
-#endif
+#      else
+  unsigned long r = 0;
+  _BitScanForward(&r, v);
+  return r;
+#      endif
 }
 
-__forceinline int __bsr(int v) {
-  unsigned long r = 0; _BitScanReverse(&r,v); return r;
+__forceinline int __bsr(int v)
+{
+  unsigned long r = 0;
+  _BitScanReverse(&r, v);
+  return r;
 }
 
-__forceinline int __btc(int v, int i) {
-  long r = v; _bittestandcomplement(&r,i); return r;
+__forceinline int __btc(int v, int i)
+{
+  long r = v;
+  _bittestandcomplement(&r, i);
+  return r;
 }
 
-__forceinline int __bts(int v, int i) {
-  long r = v; _bittestandset(&r,i); return r;
+__forceinline int __bts(int v, int i)
+{
+  long r = v;
+  _bittestandset(&r, i);
+  return r;
 }
 
-__forceinline int __btr(int v, int i) {
-  long r = v; _bittestandreset(&r,i); return r;
+__forceinline int __btr(int v, int i)
+{
+  long r = v;
+  _bittestandreset(&r, i);
+  return r;
 }
 
-__forceinline int bitscan(int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline int bitscan(int v)
+{
+#      if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#else
+#      else
   return __bsf(v);
-#endif
+#      endif
 }
 
 __forceinline int clz(const int x)
 {
-#if defined(__KERNEL_AVX2__)
+#      if defined(__KERNEL_AVX2__)
   return _lzcnt_u32(x);
-#else
-  if(UNLIKELY(x == 0)) return 32;
+#      else
+  if (UNLIKELY(x == 0))
+    return 32;
   return 31 - __bsr(x);
-#endif
+#      endif
 }
 
-__forceinline int __bscf(int& v)
+__forceinline int __bscf(int &v)
 {
   int i = __bsf(v);
-  v &= v-1;
+  v &= v - 1;
   return i;
 }
 
-__forceinline unsigned int __bscf(unsigned int& v)
+__forceinline unsigned int __bscf(unsigned int &v)
 {
   unsigned int i = __bsf(v);
-  v &= v-1;
+  v &= v - 1;
   return i;
 }
 
-#if defined(__KERNEL_64_BIT__)
+#      if defined(__KERNEL_64_BIT__)
 
-__forceinline size_t __bsf(size_t v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline size_t __bsf(size_t v)
+{
+#        if defined(__KERNEL_AVX2__)
   return _tzcnt_u64(v);
-#else
-  unsigned long r = 0; _BitScanForward64(&r,v); return r;
-#endif
+#        else
+  unsigned long r = 0;
+  _BitScanForward64(&r, v);
+  return r;
+#        endif
 }
 
-__forceinline size_t __bsr(size_t v) {
-  unsigned long r = 0; _BitScanReverse64(&r,v); return r;
+__forceinline size_t __bsr(size_t v)
+{
+  unsigned long r = 0;
+  _BitScanReverse64(&r, v);
+  return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i) {
-  size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+__forceinline size_t __btc(size_t v, size_t i)
+{
+  size_t r = v;
+  _bittestandcomplement64((__int64 *)&r, i);
+  return r;
 }
 
-__forceinline size_t __bts(size_t v, size_t i) {
-  __int64 r = v; _bittestandset64(&r,i); return r;
+__forceinline size_t __bts(size_t v, size_t i)
+{
+  __int64 r = v;
+  _bittestandset64(&r, i);
+  return r;
 }
 
-__forceinline size_t __btr(size_t v, size_t i) {
-  __int64 r = v; _bittestandreset64(&r,i); return r;
+__forceinline size_t __btr(size_t v, size_t i)
+{
+  __int64 r = v;
+  _bittestandreset64(&r, i);
+  return r;
 }
 
-__forceinline size_t bitscan(size_t v) {
-#if defined(__KERNEL_AVX2__)
-#if defined(__KERNEL_64_BIT__)
+__forceinline size_t bitscan(size_t v)
+{
+#        if defined(__KERNEL_AVX2__)
+#          if defined(__KERNEL_64_BIT__)
   return _tzcnt_u64(v);
-#else
+#          else
   return _tzcnt_u32(v);
-#endif
-#else
+#          endif
+#        else
   return __bsf(v);
-#endif
+#        endif
 }
 
-__forceinline size_t __bscf(size_t& v)
+__forceinline size_t __bscf(size_t &v)
 {
   size_t i = __bsf(v);
-  v &= v-1;
+  v &= v - 1;
   return i;
 }
 
-#endif /* __KERNEL_64_BIT__ */
+#      endif /* __KERNEL_64_BIT__ */
 
-#else /* _WIN32 */
+#    else /* _WIN32 */
 
-__forceinline unsigned int __popcnt(unsigned int in) {
-  int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r;
+__forceinline unsigned int __popcnt(unsigned int in)
+{
+  int r = 0;
+  asm("popcnt %1,%0" : "=r"(r) : "r"(in));
+  return r;
 }
 
-__forceinline int __bsf(int v) {
-  int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline int __bsf(int v)
+{
+  int r = 0;
+  asm("bsf %1,%0" : "=r"(r) : "r"(v));
+  return r;
 }
 
-__forceinline int __bsr(int v) {
-  int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline int __bsr(int v)
+{
+  int r = 0;
+  asm("bsr %1,%0" : "=r"(r) : "r"(v));
+  return r;
 }
 
-__forceinline int __btc(int v, int i) {
-  int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+__forceinline int __btc(int v, int i)
+{
+  int r = 0;
+  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+  return r;
}
 
-__forceinline int __bts(int v, int i) {
-  int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline int __bts(int v, int i)
+{
+  int r = 0;
+  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+  return r;
 }
 
-__forceinline int __btr(int v, int i) {
-  int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline int __btr(int v, int i)
+{
+  int r = 0;
+  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+  return r;
 }
 
-#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bsf(size_t v) {
-  size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+          !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline size_t __bsf(size_t v)
+{
+  size_t r = 0;
+  asm("bsf %1,%0" : "=r"(r) : "r"(v));
+  return r;
 }
-#endif
+#      endif
 
-__forceinline unsigned int __bsf(unsigned int v) {
-  unsigned int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline unsigned int __bsf(unsigned int v)
+{
+  unsigned int r = 0;
+  asm("bsf %1,%0" : "=r"(r) : "r"(v));
+  return r;
 }
 
-__forceinline size_t __bsr(size_t v) {
-  size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline size_t __bsr(size_t v)
+{
+  size_t r = 0;
+  asm("bsr %1,%0" : "=r"(r) : "r"(v));
+  return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i) {
-  size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+__forceinline size_t __btc(size_t v, size_t i)
+{
+  size_t r = 0;
+  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+  return r;
 }
 
-__forceinline size_t __bts(size_t v, size_t i) {
-  size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline size_t __bts(size_t v, size_t i)
+{
+  size_t r = 0;
+  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+  return r;
 }
 
-__forceinline size_t __btr(size_t v, size_t i) {
-  size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline size_t __btr(size_t v, size_t i)
+{
+  size_t r = 0;
+  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+  return r;
 }
 
-__forceinline int bitscan(int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline int bitscan(int v)
+{
+#      if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#else
+#      else
   return __bsf(v);
-#endif
+#      endif
 }
 
-__forceinline unsigned int bitscan(unsigned int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline unsigned int bitscan(unsigned int v)
+{
+#      if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#else
+#      else
   return __bsf(v);
-#endif
+#      endif
 }
 
-#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t bitscan(size_t v) {
-#if defined(__KERNEL_AVX2__)
-#if defined(__KERNEL_64_BIT__)
+#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+          !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline size_t bitscan(size_t v)
+{
+#        if defined(__KERNEL_AVX2__)
+#          if defined(__KERNEL_64_BIT__)
   return _tzcnt_u64(v);
-#else
+#          else
   return _tzcnt_u32(v);
-#endif
-#else
+#          endif
+#        else
   return __bsf(v);
-#endif
+#        endif
 }
-#endif
+#      endif
 
 __forceinline int clz(const int x)
 {
-#if defined(__KERNEL_AVX2__)
+#      if defined(__KERNEL_AVX2__)
   return _lzcnt_u32(x);
-#else
-  if(UNLIKELY(x == 0)) return 32;
+#      else
+  if (UNLIKELY(x == 0))
+    return 32;
   return 31 - __bsr(x);
-#endif
+#      endif
 }
 
-__forceinline int __bscf(int& v)
+__forceinline int __bscf(int &v)
 {
   int i = bitscan(v);
-#if defined(__KERNEL_AVX2__)
-  v &= v-1;
-#else
-  v = __btc(v,i);
-#endif
+#      if defined(__KERNEL_AVX2__)
+  v &= v - 1;
+#      else
+  v = __btc(v, i);
+#      endif
   return i;
 }
 
-__forceinline unsigned int __bscf(unsigned int& v)
+__forceinline unsigned int __bscf(unsigned int &v)
 {
   unsigned int i = bitscan(v);
-  v &= v-1;
+  v &= v - 1;
   return i;
 }
 
-#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bscf(size_t& v)
+#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+          !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline size_t __bscf(size_t &v)
 {
   size_t i = bitscan(v);
-#if defined(__KERNEL_AVX2__)
-  v &= v-1;
-#else
-  v = __btc(v,i);
-#endif
+#        if defined(__KERNEL_AVX2__)
+  v &= v - 1;
+#        else
+  v = __btc(v, i);
+#        endif
   return i;
 }
-#endif
+#      endif
 
-#endif /* _WIN32 */
+#    endif /* _WIN32 */
 
 /* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
  * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
  * platforms when compiling code outside the kernel. */
-#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
+#    if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
 
 /* Emulation of SSE4 functions with SSE2 */
 
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
+#      define _MM_FROUND_TO_NEAREST_INT 0x00
+#      define _MM_FROUND_TO_NEG_INF 0x01
+#      define _MM_FROUND_TO_POS_INF 0x02
+#      define _MM_FROUND_TO_ZERO 0x03
+#      define _MM_FROUND_CUR_DIRECTION 0x04
 
-#undef _mm_blendv_ps
-#define _mm_blendv_ps _mm_blendv_ps_emu
-__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask)
+#      undef _mm_blendv_ps
+#      define _mm_blendv_ps _mm_blendv_ps_emu
+__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
 {
-  __m128i isignmask = _mm_set1_epi32(0x80000000);
-  __m128 signmask = _mm_castsi128_ps(isignmask);
-  __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
-  __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
-  __m128 cmpmask = _mm_castsi128_ps(icmpmask);
-  return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
+  __m128i isignmask = _mm_set1_epi32(0x80000000);
+  __m128 signmask = _mm_castsi128_ps(isignmask);
+  __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
+  __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
+  __m128 cmpmask = _mm_castsi128_ps(icmpmask);
+  return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
 }
 
-#undef _mm_blend_ps
-#define _mm_blend_ps _mm_blend_ps_emu
-__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask)
+#      undef _mm_blend_ps
+#      define _mm_blend_ps _mm_blend_ps_emu
+__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
 {
-  assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
+  assert(mask < 0x10);
+  return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
 }
 
-#undef _mm_blendv_epi8
-#define _mm_blendv_epi8 _mm_blendv_epi8_emu
-__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask)
+#      undef _mm_blendv_epi8
+#      define _mm_blendv_epi8 _mm_blendv_epi8_emu
+__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
 {
-  return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
+  return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
 }
 
-#undef _mm_min_epi32
-#define _mm_min_epi32 _mm_min_epi32_emu
-__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input)
+#      undef _mm_min_epi32
+#      define _mm_min_epi32 _mm_min_epi32_emu
+__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
 {
-  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
+  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
 }
 
-#undef _mm_max_epi32
-#define _mm_max_epi32 _mm_max_epi32_emu
-__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input)
+#      undef _mm_max_epi32
+#      define _mm_max_epi32 _mm_max_epi32_emu
+__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
 {
-  return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
+  return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
 }
 
-#undef _mm_extract_epi32
-#define _mm_extract_epi32 _mm_extract_epi32_emu
-__forceinline int _mm_extract_epi32_emu( __m128i input, const int index)
+#      undef _mm_extract_epi32
+#      define _mm_extract_epi32 _mm_extract_epi32_emu
+__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
 {
-  switch(index) {
-    case 0: return _mm_cvtsi128_si32(input);
-    case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
-    case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
-    case 3: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
-    default: assert(false); return 0;
+  switch (index) {
+    case 0:
+      return _mm_cvtsi128_si32(input);
+    case 1:
+      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
+    case 2:
+      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
+    case 3:
+      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
+    default:
+      assert(false);
+      return 0;
   }
 }
 
-#undef _mm_insert_epi32
-#define _mm_insert_epi32 _mm_insert_epi32_emu
-__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index)
+#      undef _mm_insert_epi32
+#      define _mm_insert_epi32 _mm_insert_epi32_emu
+__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
 {
-  assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
+  assert(index >= 0 && index < 4);
+  ((int *)&value)[index] = input;
+  return value;
 }
 
-#undef _mm_insert_ps
-#define _mm_insert_ps _mm_insert_ps_emu
-__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index)
+#      undef _mm_insert_ps
+#      define _mm_insert_ps _mm_insert_ps_emu
+__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
 {
-  assert(index < 0x100);
-  ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6];
-  return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value);
+  assert(index < 0x100);
+  ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
+  return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
 }
 
-#undef _mm_round_ps
-#define _mm_round_ps _mm_round_ps_emu
-__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
+#      undef _mm_round_ps
+#      define _mm_round_ps _mm_round_ps_emu
+__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
 {
-  switch(flags)
-  {
-    case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
-    case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
-    case _MM_FROUND_TO_POS_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps( 0.5f))));
-    case _MM_FROUND_TO_ZERO : return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
+  switch (flags) {
+    case _MM_FROUND_TO_NEAREST_INT:
+      return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
+    case _MM_FROUND_TO_NEG_INF:
+      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
+    case _MM_FROUND_TO_POS_INF:
+      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
+    case _MM_FROUND_TO_ZERO:
+      return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
   }
   return value;
 }
 
-#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+#    endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
-#else /* __KERNEL_SSE2__ */
+#  else /* __KERNEL_SSE2__ */
 
 /* This section is for utility functions which operates on non-register data
  * which might be used from a non-vectorized code.
@@ -460,38 +571,34 @@ __forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
 
 ccl_device_inline int bitscan(int value)
 {
-  assert(value != 0);
-  int bit = 0;
-  while((value & (1 << bit)) == 0) {
-    ++bit;
-  }
-  return bit;
+  assert(value != 0);
+  int bit = 0;
+  while ((value & (1 << bit)) == 0) {
+    ++bit;
+  }
+  return bit;
 }
 
 ccl_device_inline int __bsr(int value)
 {
-  assert(value != 0);
-  int bit = 0;
-  while(value >>= 1) {
-    ++bit;
-  }
-  return bit;
+  assert(value != 0);
+  int bit = 0;
+  while (value >>= 1) {
+    ++bit;
+  }
+  return bit;
 }
 
-#endif /* __KERNEL_SSE2__ */
+#  endif /* __KERNEL_SSE2__ */
 
 /* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__) || \
-    defined(__KERNEL_SSE3__) || \
-    defined(__KERNEL_SSSE3__) || \
-    defined(__KERNEL_SSE41__) || \
-    defined(__KERNEL_AVX__) || \
-    defined(__KERNEL_AVX2__)
-  /* do nothing */
-#endif
+#  if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
+      defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+/* do nothing */
+#  endif
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_GPU__ */
 
-#endif /* __UTIL_SIMD_TYPES_H__ */
+#endif /* __UTIL_SIMD_TYPES_H__ */
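
The SIMD_SET_FLUSH_TO_ZERO macro reformatted above enables the FTZ (flush-to-zero) and DAZ (denormals-are-zero) bits of the SSE control register, which avoids the heavy penalty of subnormal float arithmetic in the render kernel. A minimal standalone sketch (not part of the patch; the test values are illustrative) of what the macro expands to and its observable effect:

/* Enable FTZ/DAZ as SIMD_SET_FLUSH_TO_ZERO does, then observe that a
 * subnormal float input is treated as zero in arithmetic. Assumes SSE
 * floating-point code generation (the default on x86-64). */
#include <cstdio>
#include <xmmintrin.h> /* _MM_SET_FLUSH_ZERO_MODE */
#include <pmmintrin.h> /* _MM_SET_DENORMALS_ZERO_MODE */

int main()
{
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

  volatile float tiny = 1e-40f;   /* subnormal: smaller than FLT_MIN */
  volatile float r = tiny * 2.0f; /* DAZ treats the subnormal operand as 0 */
  printf("%g\n", r);              /* prints 0 with DAZ/FTZ enabled */
  return 0;
}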
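The bit-scan helpers in the patch (__bsf, bitscan, __bscf) all reduce to "find the index of the lowest set bit", with __bscf additionally clearing that bit (v &= v - 1), which is how callers typically walk the set bits of a mask. A small sketch of that pattern, using the portable fallbacks from the non-SSE2 section; the driving loop is an assumed typical caller, not code from this file:

#include <cassert>
#include <cstdio>

/* Portable fallback, as in the non-SSE2 section of the patch. */
static inline int bitscan(int value)
{
  assert(value != 0);
  int bit = 0;
  while ((value & (1 << bit)) == 0) {
    ++bit;
  }
  return bit;
}

/* Equivalent of __bscf: return the lowest set bit index and clear it. */
static inline int bscf(int &v)
{
  int i = bitscan(v);
  v &= v - 1;
  return i;
}

int main()
{
  int mask = 0x2c; /* bits 2, 3 and 5 set */
  while (mask != 0) {
    printf("bit %d\n", bscf(mask)); /* prints 2, then 3, then 5 */
  }
  return 0;
}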
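The SSE2 emulation of _mm_blendv_ps reformatted above selects lanes by the sign bit of the mask: the sign bit is isolated, widened to an all-ones/all-zeros lane mask with an integer compare, and the two sources are then combined with and/andnot/or. A standalone sketch of the same technique, with the function body taken from the patch and a hypothetical test harness around it:

#include <cstdio>
#include <emmintrin.h> /* SSE2 only; no SSE4.1 required */

/* Same lane-select technique as _mm_blendv_ps_emu in the patch. */
static inline __m128 blendv_ps_sse2(__m128 value, __m128 input, __m128 mask)
{
  __m128i isignmask = _mm_set1_epi32(0x80000000);
  __m128 signmask = _mm_castsi128_ps(isignmask);
  /* Keep only the sign bit of each lane, then widen to all-ones/all-zeros. */
  __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
  __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
  __m128 cmpmask = _mm_castsi128_ps(icmpmask);
  /* Take 'input' where the mask lane was negative, 'value' elsewhere. */
  return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}

int main()
{
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
  __m128 m = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f); /* negative -> take from b */
  float out[4];
  _mm_storeu_ps(out, blendv_ps_sse2(a, b, m));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 20 3 40 */
  return 0;
}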