git.blender.org/blender.git

Diffstat (limited to 'intern/cycles/util/util_simd.h')
-rw-r--r--  intern/cycles/util/util_simd.h  613
1 file changed, 360 insertions(+), 253 deletions(-)
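The patch below is a pure reformatting pass over util_simd.h; no behavior changes. Two conventions drive almost every hunk: nested preprocessor directives are indented after the `#` (consistent with clang-format's IndentPPDirectives: AfterHash), and one-line function bodies are expanded to one statement per line with the brace on its own line. Since the whole block sits inside `#ifndef __KERNEL_GPU__`, nearly every surviving directive gains at least one level of indentation. A minimal before/after sketch of the preprocessor convention (hypothetical code, not taken from the patch):

/* Before: every directive starts at column 0. */
#ifdef OUTER
#ifdef INNER
#define FOO 1
#endif
#endif

/* After: nesting is shown by spaces after the '#'. */
#ifdef OUTER
#  ifdef INNER
#    define FOO 1
#  endif
#endif
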
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index c92fc1ae391..8fcaadc5f53 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -20,439 +20,550 @@
#ifndef __KERNEL_GPU__
-#include <limits>
+# include <limits>
-#include "util/util_defines.h"
+# include "util/util_defines.h"
/* SSE Intrinsics includes
*
* We assume __KERNEL_SSEX__ flags to have been defined at this point */
/* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
+# ifndef FREE_WINDOWS64
-#ifdef _MSC_VER
-# include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-# include <x86intrin.h>
-#endif
+# ifdef _MSC_VER
+# include <intrin.h>
+# elif (defined(__x86_64__) || defined(__i386__))
+# include <x86intrin.h>
+# endif
-#else
+# else
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
* Since we can't avoid including <windows.h>, better only include that */
-#include "util/util_windows.h"
+# include "util/util_windows.h"
-#endif
+# endif
-#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
- #define SIMD_SET_FLUSH_TO_ZERO \
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#else
- #define SIMD_SET_FLUSH_TO_ZERO
-#endif
+# if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
+# define SIMD_SET_FLUSH_TO_ZERO \
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+# else
+# define SIMD_SET_FLUSH_TO_ZERO
+# endif
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_SSE2__
+# ifdef __KERNEL_SSE2__
extern const __m128 _mm_lookupmask_ps[16];
/* Special Types */
static struct TrueTy {
-__forceinline operator bool( ) const { return true; }
+ __forceinline operator bool() const
+ {
+ return true;
+ }
} True ccl_maybe_unused;
static struct FalseTy {
-__forceinline operator bool( ) const { return false; }
+ __forceinline operator bool() const
+ {
+ return false;
+ }
} False ccl_maybe_unused;
-static struct NegInfTy
-{
-__forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); }
-__forceinline operator int ( ) const { return std::numeric_limits<int>::min(); }
+static struct NegInfTy {
+ __forceinline operator float() const
+ {
+ return -std::numeric_limits<float>::infinity();
+ }
+ __forceinline operator int() const
+ {
+ return std::numeric_limits<int>::min();
+ }
} neg_inf ccl_maybe_unused;
-static struct PosInfTy
-{
-__forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); }
-__forceinline operator int ( ) const { return std::numeric_limits<int>::max(); }
+static struct PosInfTy {
+ __forceinline operator float() const
+ {
+ return std::numeric_limits<float>::infinity();
+ }
+ __forceinline operator int() const
+ {
+ return std::numeric_limits<int>::max();
+ }
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
/* Intrinsics Functions */
-#if defined(__BMI__) && defined(__GNUC__)
-# ifndef _tzcnt_u32
-# define _tzcnt_u32 __tzcnt_u32
-# endif
-# ifndef _tzcnt_u64
-# define _tzcnt_u64 __tzcnt_u64
-# endif
-#endif
+# if defined(__BMI__) && defined(__GNUC__)
+# ifndef _tzcnt_u32
+# define _tzcnt_u32 __tzcnt_u32
+# endif
+# ifndef _tzcnt_u64
+# define _tzcnt_u64 __tzcnt_u64
+# endif
+# endif
-#if defined(__LZCNT__)
-#define _lzcnt_u32 __lzcnt32
-#define _lzcnt_u64 __lzcnt64
-#endif
+# if defined(__LZCNT__)
+# define _lzcnt_u32 __lzcnt32
+# define _lzcnt_u64 __lzcnt64
+# endif
-#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+# if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-__forceinline int __popcnt(int in) {
+__forceinline int __popcnt(int in)
+{
return _mm_popcnt_u32(in);
}
-#if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in) {
+# if !defined(_MSC_VER)
+__forceinline unsigned int __popcnt(unsigned int in)
+{
return _mm_popcnt_u32(in);
}
-#endif
+# endif
-#if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in) {
+# if defined(__KERNEL_64_BIT__)
+__forceinline long long __popcnt(long long in)
+{
return _mm_popcnt_u64(in);
}
-__forceinline size_t __popcnt(size_t in) {
+__forceinline size_t __popcnt(size_t in)
+{
return _mm_popcnt_u64(in);
}
-#endif
+# endif
-__forceinline int __bsf(int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline int __bsf(int v)
+{
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-#else
- unsigned long r = 0; _BitScanForward(&r,v); return r;
-#endif
+# else
+ unsigned long r = 0;
+ _BitScanForward(&r, v);
+ return r;
+# endif
}
-__forceinline unsigned int __bsf(unsigned int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline unsigned int __bsf(unsigned int v)
+{
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-#else
- unsigned long r = 0; _BitScanForward(&r,v); return r;
-#endif
+# else
+ unsigned long r = 0;
+ _BitScanForward(&r, v);
+ return r;
+# endif
}
-__forceinline int __bsr(int v) {
- unsigned long r = 0; _BitScanReverse(&r,v); return r;
+__forceinline int __bsr(int v)
+{
+ unsigned long r = 0;
+ _BitScanReverse(&r, v);
+ return r;
}
-__forceinline int __btc(int v, int i) {
- long r = v; _bittestandcomplement(&r,i); return r;
+__forceinline int __btc(int v, int i)
+{
+ long r = v;
+ _bittestandcomplement(&r, i);
+ return r;
}
-__forceinline int __bts(int v, int i) {
- long r = v; _bittestandset(&r,i); return r;
+__forceinline int __bts(int v, int i)
+{
+ long r = v;
+ _bittestandset(&r, i);
+ return r;
}
-__forceinline int __btr(int v, int i) {
- long r = v; _bittestandreset(&r,i); return r;
+__forceinline int __btr(int v, int i)
+{
+ long r = v;
+ _bittestandreset(&r, i);
+ return r;
}
-__forceinline int bitscan(int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline int bitscan(int v)
+{
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-#else
+# else
return __bsf(v);
-#endif
+# endif
}
__forceinline int clz(const int x)
{
-#if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _lzcnt_u32(x);
-#else
- if(UNLIKELY(x == 0)) return 32;
+# else
+ if (UNLIKELY(x == 0))
+ return 32;
return 31 - __bsr(x);
-#endif
+# endif
}
-__forceinline int __bscf(int& v)
+__forceinline int __bscf(int &v)
{
int i = __bsf(v);
- v &= v-1;
+ v &= v - 1;
return i;
}
-__forceinline unsigned int __bscf(unsigned int& v)
+__forceinline unsigned int __bscf(unsigned int &v)
{
unsigned int i = __bsf(v);
- v &= v-1;
+ v &= v - 1;
return i;
}
-#if defined(__KERNEL_64_BIT__)
+# if defined(__KERNEL_64_BIT__)
-__forceinline size_t __bsf(size_t v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline size_t __bsf(size_t v)
+{
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u64(v);
-#else
- unsigned long r = 0; _BitScanForward64(&r,v); return r;
-#endif
+# else
+ unsigned long r = 0;
+ _BitScanForward64(&r, v);
+ return r;
+# endif
}
-__forceinline size_t __bsr(size_t v) {
- unsigned long r = 0; _BitScanReverse64(&r,v); return r;
+__forceinline size_t __bsr(size_t v)
+{
+ unsigned long r = 0;
+ _BitScanReverse64(&r, v);
+ return r;
}
-__forceinline size_t __btc(size_t v, size_t i) {
- size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+__forceinline size_t __btc(size_t v, size_t i)
+{
+ size_t r = v;
+ _bittestandcomplement64((__int64 *)&r, i);
+ return r;
}
-__forceinline size_t __bts(size_t v, size_t i) {
- __int64 r = v; _bittestandset64(&r,i); return r;
+__forceinline size_t __bts(size_t v, size_t i)
+{
+ __int64 r = v;
+ _bittestandset64(&r, i);
+ return r;
}
-__forceinline size_t __btr(size_t v, size_t i) {
- __int64 r = v; _bittestandreset64(&r,i); return r;
+__forceinline size_t __btr(size_t v, size_t i)
+{
+ __int64 r = v;
+ _bittestandreset64(&r, i);
+ return r;
}
-__forceinline size_t bitscan(size_t v) {
-#if defined(__KERNEL_AVX2__)
-#if defined(__KERNEL_64_BIT__)
+__forceinline size_t bitscan(size_t v)
+{
+# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_64_BIT__)
return _tzcnt_u64(v);
-#else
+# else
return _tzcnt_u32(v);
-#endif
-#else
+# endif
+# else
return __bsf(v);
-#endif
+# endif
}
-__forceinline size_t __bscf(size_t& v)
+__forceinline size_t __bscf(size_t &v)
{
size_t i = __bsf(v);
- v &= v-1;
+ v &= v - 1;
return i;
}
-#endif /* __KERNEL_64_BIT__ */
+# endif /* __KERNEL_64_BIT__ */
-#else /* _WIN32 */
+# else /* _WIN32 */
-__forceinline unsigned int __popcnt(unsigned int in) {
- int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r;
+__forceinline unsigned int __popcnt(unsigned int in)
+{
+ int r = 0;
+ asm("popcnt %1,%0" : "=r"(r) : "r"(in));
+ return r;
}
-__forceinline int __bsf(int v) {
- int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline int __bsf(int v)
+{
+ int r = 0;
+ asm("bsf %1,%0" : "=r"(r) : "r"(v));
+ return r;
}
-__forceinline int __bsr(int v) {
- int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline int __bsr(int v)
+{
+ int r = 0;
+ asm("bsr %1,%0" : "=r"(r) : "r"(v));
+ return r;
}
-__forceinline int __btc(int v, int i) {
- int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+__forceinline int __btc(int v, int i)
+{
+ int r = 0;
+ asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+ return r;
}
-__forceinline int __bts(int v, int i) {
- int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline int __bts(int v, int i)
+{
+ int r = 0;
+ asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+ return r;
}
-__forceinline int __btr(int v, int i) {
- int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline int __btr(int v, int i)
+{
+ int r = 0;
+ asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+ return r;
}
-#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bsf(size_t v) {
- size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+ !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline size_t __bsf(size_t v)
+{
+ size_t r = 0;
+ asm("bsf %1,%0" : "=r"(r) : "r"(v));
+ return r;
}
-#endif
+# endif
-__forceinline unsigned int __bsf(unsigned int v) {
- unsigned int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline unsigned int __bsf(unsigned int v)
+{
+ unsigned int r = 0;
+ asm("bsf %1,%0" : "=r"(r) : "r"(v));
+ return r;
}
-__forceinline size_t __bsr(size_t v) {
- size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+__forceinline size_t __bsr(size_t v)
+{
+ size_t r = 0;
+ asm("bsr %1,%0" : "=r"(r) : "r"(v));
+ return r;
}
-__forceinline size_t __btc(size_t v, size_t i) {
- size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+__forceinline size_t __btc(size_t v, size_t i)
+{
+ size_t r = 0;
+ asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+ return r;
}
-__forceinline size_t __bts(size_t v, size_t i) {
- size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline size_t __bts(size_t v, size_t i)
+{
+ size_t r = 0;
+ asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+ return r;
}
-__forceinline size_t __btr(size_t v, size_t i) {
- size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+__forceinline size_t __btr(size_t v, size_t i)
+{
+ size_t r = 0;
+ asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
+ return r;
}
-__forceinline int bitscan(int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline int bitscan(int v)
+{
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-#else
+# else
return __bsf(v);
-#endif
+# endif
}
-__forceinline unsigned int bitscan(unsigned int v) {
-#if defined(__KERNEL_AVX2__)
+__forceinline unsigned int bitscan(unsigned int v)
+{
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-#else
+# else
return __bsf(v);
-#endif
+# endif
}
-#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t bitscan(size_t v) {
-#if defined(__KERNEL_AVX2__)
-#if defined(__KERNEL_64_BIT__)
+# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+ !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline size_t bitscan(size_t v)
+{
+# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_64_BIT__)
return _tzcnt_u64(v);
-#else
+# else
return _tzcnt_u32(v);
-#endif
-#else
+# endif
+# else
return __bsf(v);
-#endif
+# endif
}
-#endif
+# endif
__forceinline int clz(const int x)
{
-#if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _lzcnt_u32(x);
-#else
- if(UNLIKELY(x == 0)) return 32;
+# else
+ if (UNLIKELY(x == 0))
+ return 32;
return 31 - __bsr(x);
-#endif
+# endif
}
-__forceinline int __bscf(int& v)
+__forceinline int __bscf(int &v)
{
int i = bitscan(v);
-#if defined(__KERNEL_AVX2__)
- v &= v-1;
-#else
- v = __btc(v,i);
-#endif
+# if defined(__KERNEL_AVX2__)
+ v &= v - 1;
+# else
+ v = __btc(v, i);
+# endif
return i;
}
-__forceinline unsigned int __bscf(unsigned int& v)
+__forceinline unsigned int __bscf(unsigned int &v)
{
unsigned int i = bitscan(v);
- v &= v-1;
+ v &= v - 1;
return i;
}
-#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bscf(size_t& v)
+# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+ !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline size_t __bscf(size_t &v)
{
size_t i = bitscan(v);
-#if defined(__KERNEL_AVX2__)
- v &= v-1;
-#else
- v = __btc(v,i);
-#endif
+# if defined(__KERNEL_AVX2__)
+ v &= v - 1;
+# else
+ v = __btc(v, i);
+# endif
return i;
}
-#endif
+# endif
-#endif /* _WIN32 */
+# endif /* _WIN32 */
/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
* __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
* platforms when compiling code outside the kernel. */
-#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
+# if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
/* Emulation of SSE4 functions with SSE2 */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
+# define _MM_FROUND_TO_NEAREST_INT 0x00
+# define _MM_FROUND_TO_NEG_INF 0x01
+# define _MM_FROUND_TO_POS_INF 0x02
+# define _MM_FROUND_TO_ZERO 0x03
+# define _MM_FROUND_CUR_DIRECTION 0x04
-#undef _mm_blendv_ps
-#define _mm_blendv_ps _mm_blendv_ps_emu
-__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask)
+# undef _mm_blendv_ps
+# define _mm_blendv_ps _mm_blendv_ps_emu
+__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
- __m128i isignmask = _mm_set1_epi32(0x80000000);
- __m128 signmask = _mm_castsi128_ps(isignmask);
- __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
- __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
- __m128 cmpmask = _mm_castsi128_ps(icmpmask);
- return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
+ __m128i isignmask = _mm_set1_epi32(0x80000000);
+ __m128 signmask = _mm_castsi128_ps(isignmask);
+ __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
+ __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
+ __m128 cmpmask = _mm_castsi128_ps(icmpmask);
+ return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}
-#undef _mm_blend_ps
-#define _mm_blend_ps _mm_blend_ps_emu
-__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask)
+# undef _mm_blend_ps
+# define _mm_blend_ps _mm_blend_ps_emu
+__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
- assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
+ assert(mask < 0x10);
+ return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}
-#undef _mm_blendv_epi8
-#define _mm_blendv_epi8 _mm_blendv_epi8_emu
-__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask)
+# undef _mm_blendv_epi8
+# define _mm_blendv_epi8 _mm_blendv_epi8_emu
+__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
- return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
+ return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}
-#undef _mm_min_epi32
-#define _mm_min_epi32 _mm_min_epi32_emu
-__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input)
+# undef _mm_min_epi32
+# define _mm_min_epi32 _mm_min_epi32_emu
+__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
- return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
+ return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}
-#undef _mm_max_epi32
-#define _mm_max_epi32 _mm_max_epi32_emu
-__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input)
+# undef _mm_max_epi32
+# define _mm_max_epi32 _mm_max_epi32_emu
+__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
- return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
+ return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
-#undef _mm_extract_epi32
-#define _mm_extract_epi32 _mm_extract_epi32_emu
-__forceinline int _mm_extract_epi32_emu( __m128i input, const int index)
+# undef _mm_extract_epi32
+# define _mm_extract_epi32 _mm_extract_epi32_emu
+__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
- switch(index) {
- case 0: return _mm_cvtsi128_si32(input);
- case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
- case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
- case 3: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
- default: assert(false); return 0;
+ switch (index) {
+ case 0:
+ return _mm_cvtsi128_si32(input);
+ case 1:
+ return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
+ case 2:
+ return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
+ case 3:
+ return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
+ default:
+ assert(false);
+ return 0;
}
}
-#undef _mm_insert_epi32
-#define _mm_insert_epi32 _mm_insert_epi32_emu
-__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index)
+# undef _mm_insert_epi32
+# define _mm_insert_epi32 _mm_insert_epi32_emu
+__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
- assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
+ assert(index >= 0 && index < 4);
+ ((int *)&value)[index] = input;
+ return value;
}
-#undef _mm_insert_ps
-#define _mm_insert_ps _mm_insert_ps_emu
-__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index)
+# undef _mm_insert_ps
+# define _mm_insert_ps _mm_insert_ps_emu
+__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
- assert(index < 0x100);
- ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6];
- return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value);
+ assert(index < 0x100);
+ ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
+ return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}
-#undef _mm_round_ps
-#define _mm_round_ps _mm_round_ps_emu
-__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
+# undef _mm_round_ps
+# define _mm_round_ps _mm_round_ps_emu
+__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
- switch(flags)
- {
- case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
- case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
- case _MM_FROUND_TO_POS_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps( 0.5f))));
- case _MM_FROUND_TO_ZERO : return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
+ switch (flags) {
+ case _MM_FROUND_TO_NEAREST_INT:
+ return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
+ case _MM_FROUND_TO_NEG_INF:
+ return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
+ case _MM_FROUND_TO_POS_INF:
+ return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
+ case _MM_FROUND_TO_ZERO:
+ return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
}
return value;
}
-#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+# endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
-#else /* __KERNEL_SSE2__ */
+# else /* __KERNEL_SSE2__ */
/* This section is for utility functions that operate on non-register data
 * which might be used from non-vectorized code.
@@ -460,38 +571,34 @@ __forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
ccl_device_inline int bitscan(int value)
{
- assert(value != 0);
- int bit = 0;
- while((value & (1 << bit)) == 0) {
- ++bit;
- }
- return bit;
+ assert(value != 0);
+ int bit = 0;
+ while ((value & (1 << bit)) == 0) {
+ ++bit;
+ }
+ return bit;
}
ccl_device_inline int __bsr(int value)
{
- assert(value != 0);
- int bit = 0;
- while(value >>= 1) {
- ++bit;
- }
- return bit;
+ assert(value != 0);
+ int bit = 0;
+ while (value >>= 1) {
+ ++bit;
+ }
+ return bit;
}
-#endif /* __KERNEL_SSE2__ */
+# endif /* __KERNEL_SSE2__ */
/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__) || \
- defined(__KERNEL_SSE3__) || \
- defined(__KERNEL_SSSE3__) || \
- defined(__KERNEL_SSE41__) || \
- defined(__KERNEL_AVX__) || \
- defined(__KERNEL_AVX2__)
- /* do nothing */
-#endif
+# if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
+ defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+/* do nothing */
+# endif
CCL_NAMESPACE_END
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_GPU__ */
-#endif /* __UTIL_SIMD_TYPES_H__ */
+#endif /* __UTIL_SIMD_TYPES_H__ */
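
For context on how the helpers in this header are used: a common kernel pattern is to take the movemask of a SIMD comparison and peel off one set bit per iteration, which is what __bscf() ("bit scan and clear first") is for. Below is a self-contained sketch of that pattern using a portable stand-in for __bscf() built on a GCC/Clang builtin rather than the header itself; the stand-in name bscf() and the sample data are hypothetical.

#include <cstdio>
#include <emmintrin.h> /* SSE/SSE2 intrinsics */

/* Portable stand-in mirroring __bscf() above: return the index of the
 * lowest set bit and clear it. Assumes GCC/Clang for __builtin_ctz(). */
static inline int bscf(int &v)
{
  int i = __builtin_ctz(v); /* undefined for v == 0, like __bsf() */
  v &= v - 1;               /* clear the lowest set bit */
  return i;
}

int main()
{
  const float a[4] = {1.0f, 4.0f, 2.0f, 8.0f};
  __m128 va = _mm_loadu_ps(a);
  __m128 vt = _mm_set1_ps(3.0f);
  /* One mask bit per lane where a[i] < 3.0f: lanes 0 and 2 here. */
  int mask = _mm_movemask_ps(_mm_cmplt_ps(va, vt));
  while (mask) {
    printf("lane %d passed\n", bscf(mask));
  }
  return 0;
}

The `v &= v - 1` step is the standard clear-lowest-set-bit trick; as the diff shows, the AVX2 paths use it directly, while the older MSVC fallback clears the bit through __btc() and _bittestandcomplement().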