author     Brecht Van Lommel <brecht@blender.org>   2021-02-14 17:34:23 +0300
committer  Brecht Van Lommel <brecht@blender.org>   2021-02-17 18:26:24 +0300
commit     8119f0aad21c3ce88e82d68ed20cd5a8edc99703 (patch)
tree       38c117be872788f9858c09b96b63af6c666fe770 /intern/cycles/util
parent     db28411fd90b77035dddc1682bb2786da34f73e9 (diff)
Cycles: refactor intrinsic functions implementation
* Add processor independent fallbacks
* Use uint32_t and uint64_t types
* Remove unused functions
* Better comments and less indentation
Ref D8237, T78710
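
The first bullet is the heart of the refactor: Windows- and GCC-specific
bit-scan wrappers gain a plain-C++ branch that any compiler can build. A
minimal sketch of that fallback pattern (the loop mirrors the new bitscan()
fallback added to util_simd.h below; the name bitscan_fallback and the main()
harness are illustrative only, not part of the patch):

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  /* Index of the lowest set bit; undefined for value == 0, matching the
   * assert in the patch. */
  static inline uint32_t bitscan_fallback(uint32_t value)
  {
    assert(value != 0);
    uint32_t bit = 0;
    while ((value & (1U << bit)) == 0) {
      ++bit;
    }
    return bit;
  }

  int main()
  {
    printf("%u\n", bitscan_fallback(8u)); /* prints 3 */
    return 0;
  }

On x86 the same call still compiles down to tzcnt/bsf through the intrinsic
branches; only processors without those intrinsics take the loop.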
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--  intern/cycles/util/util_avxb.h   |   8
-rw-r--r--  intern/cycles/util/util_avxi.h   |   8
-rw-r--r--  intern/cycles/util/util_color.h  |   2
-rw-r--r--  intern/cycles/util/util_half.h   |   2
-rw-r--r--  intern/cycles/util/util_simd.h   | 524
-rw-r--r--  intern/cycles/util/util_sseb.h   |   8
-rw-r--r--  intern/cycles/util/util_ssef.h   |  10
-rw-r--r--  intern/cycles/util/util_ssei.h   |   8
-rw-r--r--  intern/cycles/util/util_types.h  |  21
9 files changed, 220 insertions, 371 deletions
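
Two patterns recur throughout the diff below: size_t and int return types
become fixed-width uint32_t/uint64_t, and the MSVC-only __popcnt() wrapper is
dropped in favor of the standard _mm_popcnt_u32() intrinsic. A standalone
sketch of that popcnt change (popcnt_mask is a hypothetical name; POPCNT
availability is approximated here by the __SSE4_2__ guard):

  #include <cstdint>
  #if defined(__SSE4_2__)
  #  include <nmmintrin.h> /* _mm_popcnt_u32 */
  #endif

  /* Count set bits of a movemask, as the patched popcnt() overloads do.
   * The hardware path uses the POPCNT instruction; the portable path
   * clears one set bit per loop iteration. */
  uint32_t popcnt_mask(uint32_t mask)
  {
  #if defined(__SSE4_2__)
    return _mm_popcnt_u32(mask);
  #else
    uint32_t n = 0;
    for (; mask != 0; mask &= mask - 1) {
      ++n;
    }
    return n;
  #endif
  }

The pre-SSE4.1 branches in util_sseb.h and util_avxb.h get the same count by
summing the mask lanes directly.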
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 34fafd188de..17d505c077a 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -191,12 +191,12 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
 ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__KERNEL_SSE41__)
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
 {
-  return __popcnt(_mm256_movemask_ps(a));
+  return _mm_popcnt_u32(_mm256_movemask_ps(a));
 }
 #else
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
 {
   return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) +
          bool(a[6]) + bool(a[7]);
@@ -224,7 +224,7 @@ __forceinline bool none(const avxb &b)
   return _mm256_movemask_ps(b) == 0x0;
 }
 
-__forceinline size_t movemask(const avxb &a)
+__forceinline uint32_t movemask(const avxb &a)
 {
   return _mm256_movemask_ps(a);
 }
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
index e658a4f848f..3db646e61f4 100644
--- a/intern/cycles/util/util_avxi.h
+++ b/intern/cycles/util/util_avxi.h
@@ -711,21 +711,21 @@ __forceinline int reduce_add(const avxi &v)
   return extract<0>(extract<0>(vreduce_add(v)));
 }
 
-__forceinline size_t select_min(const avxi &v)
+__forceinline uint32_t select_min(const avxi &v)
 {
   return __bsf(movemask(v == vreduce_min(v)));
 }
-__forceinline size_t select_max(const avxi &v)
+__forceinline uint32_t select_max(const avxi &v)
 {
   return __bsf(movemask(v == vreduce_max(v)));
 }
 
-__forceinline size_t select_min(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
 {
   const avxi a = select(valid, v, avxi(pos_inf));
   return __bsf(movemask(valid & (a == vreduce_min(a))));
 }
-__forceinline size_t select_max(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
 {
   const avxi a = select(valid, v, avxi(neg_inf));
   return __bsf(movemask(valid & (a == vreduce_max(a))));
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index c6937ca78fe..1b493d0ed5e 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -20,7 +20,7 @@
 #include "util/util_math.h"
 #include "util/util_types.h"
 
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
 #  include "util/util_simd.h"
 #endif
 
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 3bac7008905..a8d4ee75e20 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -20,7 +20,7 @@
 #include "util/util_math.h"
 #include "util/util_types.h"
 
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
 #  include "util/util_simd.h"
 #endif
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index de0e3c39f30..3a6761c6a2f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,49 +18,41 @@
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
-#ifndef __KERNEL_GPU__
+#include <limits>
+#include <stdint.h>
 
-#  include <limits>
-
-#  include "util/util_defines.h"
+#include "util/util_defines.h"
 
 /* SSE Intrinsics includes
  *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#  ifndef FREE_WINDOWS64
-
-#    ifdef _MSC_VER
-#      include <intrin.h>
-#    elif (defined(__x86_64__) || defined(__i386__))
-#      include <x86intrin.h>
-#    endif
-
-#  else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point.
+ *
+ * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
-#  include "util/util_windows.h"
-
-#  endif
-
-#  if defined(__x86_64__) || defined(_M_X64)
-#    define SIMD_SET_FLUSH_TO_ZERO \
-      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
-      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#  else
-#    define SIMD_SET_FLUSH_TO_ZERO
-#  endif
+#if defined(FREE_WINDOWS64)
+#  include "util/util_windows.h"
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+/* Floating Point Control, for Embree. */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define SIMD_SET_FLUSH_TO_ZERO \
+    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#else
+#  define SIMD_SET_FLUSH_TO_ZERO
+#endif
 
 CCL_NAMESPACE_BEGIN
 
-#  ifdef __KERNEL_SSE2__
+/* Data structures used by SSE classes. */
+#ifdef __KERNEL_SSE2__
 
 extern const __m128 _mm_lookupmask_ps[16];
 
-/* Special Types */
-
 static struct TrueTy {
   __forceinline operator bool() const
   {
@@ -122,377 +114,281 @@ static struct PosInfTy {
 static struct StepTy {
 } step ccl_maybe_unused;
 
-/* Intrinsics Functions */
+#endif
 
-#  if defined(__BMI__) && defined(__GNUC__)
-#    ifndef _tzcnt_u32
-#      define _tzcnt_u32 __tzcnt_u32
-#    endif
-#    ifndef _tzcnt_u64
-#      define _tzcnt_u64 __tzcnt_u64
-#    endif
-#  endif
-
-#  if defined(__LZCNT__)
-#    define _lzcnt_u32 __lzcnt32
-#    define _lzcnt_u64 __lzcnt64
-#  endif
-
-#  if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-
-__forceinline int __popcnt(int in)
-{
-  return _mm_popcnt_u32(in);
-}
+/* Intrinsics Functions
+ *
+ * For fast bit operations. */
 
-#    if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in)
-{
-  return _mm_popcnt_u32(in);
-}
-#    endif
+#if defined(__BMI__) && defined(__GNUC__)
+#  ifndef _tzcnt_u32
+#    define _tzcnt_u32 __tzcnt_u32
+#  endif
+#  ifndef _tzcnt_u64
+#    define _tzcnt_u64 __tzcnt_u64
+#  endif
+#endif
 
-#    if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in)
-{
-  return _mm_popcnt_u64(in);
-}
-__forceinline size_t __popcnt(size_t in)
-{
-  return _mm_popcnt_u64(in);
-}
-#    endif
+#if defined(__LZCNT__)
+#  define _lzcnt_u32 __lzcnt32
+#  define _lzcnt_u64 __lzcnt64
+#endif
 
-__forceinline int __bsf(int v)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+/* Intrinsic functions on Windows. */
+__forceinline uint32_t __bsf(uint32_t v)
 {
-#    if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#    else
+#  else
   unsigned long r = 0;
   _BitScanForward(&r, v);
   return r;
-#    endif
+#  endif
 }
 
-__forceinline unsigned int __bsf(unsigned int v)
+__forceinline uint32_t __bsf(uint32_t v)
 {
-#    if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#    else
+#  else
   unsigned long r = 0;
   _BitScanForward(&r, v);
   return r;
-#    endif
+#  endif
 }
 
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(uint32_t v)
 {
   unsigned long r = 0;
   _BitScanReverse(&r, v);
   return r;
 }
 
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(uint32_t v, uint32_t i)
 {
   long r = v;
   _bittestandcomplement(&r, i);
   return r;
 }
 
-__forceinline int __bts(int v, int i)
+__forceinline uint32_t bitscan(uint32_t v)
 {
-  long r = v;
-  _bittestandset(&r, i);
-  return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
-  long r = v;
-  _bittestandreset(&r, i);
-  return r;
-}
-
-__forceinline int bitscan(int v)
-{
-#    if defined(__KERNEL_AVX2__)
+#  if defined(__KERNEL_AVX2__)
   return _tzcnt_u32(v);
-#    else
+#  else
   return __bsf(v);
-#    endif
-}
-
-__forceinline int clz(const int x)
-{
-#    if defined(__KERNEL_AVX2__)
-  return _lzcnt_u32(x);
-#    else
-  if (UNLIKELY(x == 0))
-    return 32;
-  return 31 - __bsr(x);
-#    endif
-}
-
-__forceinline int __bscf(int &v)
-{
-  int i = __bsf(v);
-  v &= v - 1;
-  return i;
-}
-
-__forceinline unsigned int __bscf(unsigned int &v)
-{
-  unsigned int i = __bsf(v);
-  v &= v - 1;
-  return i;
+#  endif
 }
 
-#    if defined(__KERNEL_64_BIT__)
+#  if defined(__KERNEL_64_BIT__)
 
-__forceinline size_t __bsf(size_t v)
+__forceinline uint64_t __bsf(uint64_t v)
 {
-#      if defined(__KERNEL_AVX2__)
+#    if defined(__KERNEL_AVX2__)
   return _tzcnt_u64(v);
-#      else
+#    else
   unsigned long r = 0;
   _BitScanForward64(&r, v);
   return r;
-#      endif
+#    endif
 }
 
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(uint64_t v)
 {
   unsigned long r = 0;
   _BitScanReverse64(&r, v);
   return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(uint64_t v, uint64_t i)
 {
-  size_t r = v;
+  uint64_t r = v;
   _bittestandcomplement64((__int64 *)&r, i);
   return r;
 }
 
-__forceinline size_t __bts(size_t v, size_t i)
+__forceinline uint64_t bitscan(uint64_t v)
 {
-  __int64 r = v;
-  _bittestandset64(&r, i);
-  return r;
-}
-
-__forceinline size_t __btr(size_t v, size_t i)
-{
-  __int64 r = v;
-  _bittestandreset64(&r, i);
-  return r;
-}
-
-__forceinline size_t bitscan(size_t v)
-{
-#      if defined(__KERNEL_AVX2__)
-#        if defined(__KERNEL_64_BIT__)
+#    if defined(__KERNEL_AVX2__)
+#      if defined(__KERNEL_64_BIT__)
   return _tzcnt_u64(v);
-#        else
+#      else
   return _tzcnt_u32(v);
-#        endif
-#      else
+#      endif
+#    else
   return __bsf(v);
-#      endif
-}
-
-__forceinline size_t __bscf(size_t &v)
-{
-  size_t i = __bsf(v);
-  v &= v - 1;
-  return i;
+#    endif
 }
 
-#    endif /* __KERNEL_64_BIT__ */
+#  endif /* __KERNEL_64_BIT__ */
 
-#  else /* _WIN32 */
-
-__forceinline unsigned int __popcnt(unsigned int in)
-{
-  int r = 0;
-  asm("popcnt %1,%0" : "=r"(r) : "r"(in));
-  return r;
-}
+#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
+/* Instrinsic functions with x86 SSE. */
 
-__forceinline int __bsf(int v)
+__forceinline uint32_t __bsf(const uint32_t v)
 {
-  int r = 0;
+  uint32_t r = 0;
   asm("bsf %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
 
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(const uint32_t v)
 {
-  int r = 0;
+  uint32_t r = 0;
   asm("bsr %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
 
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(const uint32_t v, uint32_t i)
 {
-  int r = 0;
+  uint32_t r = 0;
   asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
   return r;
 }
 
-__forceinline int __bts(int v, int i)
-{
-  int r = 0;
-  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
-}
-
-__forceinline int __btr(int v, int i)
+#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+      !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline uint64_t __bsf(const uint64_t v)
 {
-  int r = 0;
-  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
-}
-
-#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
-      !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bsf(size_t v)
-{
-  size_t r = 0;
+  uint64_t r = 0;
   asm("bsf %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
-#  endif
+#  endif
 
-__forceinline unsigned int __bsf(unsigned int v)
+__forceinline uint64_t __bsr(const uint64_t v)
 {
-  unsigned int r = 0;
-  asm("bsf %1,%0" : "=r"(r) : "r"(v));
+  uint64_t r = 0;
+  asm("bsr %1,%0" : "=r"(r) : "r"(v));
   return r;
 }
 
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
 {
-  size_t r = 0;
-  asm("bsr %1,%0" : "=r"(r) : "r"(v));
+  uint64_t r = 0;
+  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
   return r;
 }
 
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint32_t bitscan(uint32_t v)
 {
-  size_t r = 0;
-  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
+#  if defined(__KERNEL_AVX2__)
+  return _tzcnt_u32(v);
+#  else
+  return __bsf(v);
+#  endif
 }
 
-__forceinline size_t __bts(size_t v, size_t i)
+#  if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+      !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline uint64_t bitscan(uint64_t v)
 {
-  size_t r = 0;
-  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
+#    if defined(__KERNEL_AVX2__)
+#      if defined(__KERNEL_64_BIT__)
+  return _tzcnt_u64(v);
+#      else
+  return _tzcnt_u32(v);
+#      endif
+#    else
+  return __bsf(v);
+#    endif
 }
+#  endif
 
-__forceinline size_t __btr(size_t v, size_t i)
+#else
+/* Intrinsic functions fallback for arbitrary processor. */
+__forceinline uint32_t __bsf(const uint32_t x)
 {
-  size_t r = 0;
-  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
-  return r;
+  for (int i = 0; i < 32; i++) {
+    if (x & (1U << i))
+      return i;
+  }
+  return 32;
 }
 
-__forceinline int bitscan(int v)
+__forceinline uint32_t __bsr(const uint32_t x)
 {
-#    if defined(__KERNEL_AVX2__)
-  return _tzcnt_u32(v);
-#    else
-  return __bsf(v);
-#    endif
+  for (int i = 0; i < 32; i++) {
+    if (x & (1U << (31 - i)))
+      return (31 - i);
+  }
+  return 32;
 }
 
-__forceinline unsigned int bitscan(unsigned int v)
+__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
 {
-#    if defined(__KERNEL_AVX2__)
-  return _tzcnt_u32(v);
-#    else
-  return __bsf(v);
-#    endif
+  uint32_t mask = 1U << bit;
+  return x & (~mask);
 }
 
-#    if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
-        !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t bitscan(size_t v)
+__forceinline uint32_t __bsf(const uint64_t x)
 {
-#      if defined(__KERNEL_AVX2__)
-#        if defined(__KERNEL_64_BIT__)
-  return _tzcnt_u64(v);
-#        else
-  return _tzcnt_u32(v);
-#        endif
-#      else
-  return __bsf(v);
-#      endif
+  for (int i = 0; i < 64; i++) {
+    if (x & (1UL << i))
+      return i;
+  }
+  return 64;
 }
-#    endif
 
-__forceinline int clz(const int x)
+__forceinline uint32_t __bsr(const uint64_t x)
 {
-#    if defined(__KERNEL_AVX2__)
-  return _lzcnt_u32(x);
-#    else
-  if (UNLIKELY(x == 0))
-    return 32;
-  return 31 - __bsr(x);
-#    endif
+  for (int i = 0; i < 64; i++) {
+    if (x & (1UL << (63 - i)))
+      return (63 - i);
+  }
+  return 64;
 }
 
-__forceinline int __bscf(int &v)
+__forceinline uint64_t __btc(const uint64_t x, const uint32_t bit)
 {
-  int i = bitscan(v);
-#    if defined(__KERNEL_AVX2__)
-  v &= v - 1;
-#    else
-  v = __btc(v, i);
-#    endif
-  return i;
+  uint64_t mask = 1UL << bit;
+  return x & (~mask);
 }
 
-__forceinline unsigned int __bscf(unsigned int &v)
+__forceinline uint32_t bitscan(uint32_t value)
 {
-  unsigned int i = bitscan(v);
-  v &= v - 1;
-  return i;
+  assert(value != 0);
+  uint32_t bit = 0;
+  while ((value & (1 << bit)) == 0) {
+    ++bit;
+  }
+  return bit;
 }
 
-#    if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
-        !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bscf(size_t &v)
+__forceinline uint64_t bitscan(uint64_t value)
 {
-  size_t i = bitscan(v);
-#      if defined(__KERNEL_AVX2__)
-  v &= v - 1;
-#      else
-  v = __btc(v, i);
-#      endif
-  return i;
+  assert(value != 0);
+  uint64_t bit = 0;
+  while ((value & (1 << bit)) == 0) {
+    ++bit;
+  }
+  return bit;
 }
-#    endif
 
-#  endif /* _WIN32 */
+#endif /* Intrinsics */
+
+/* SSE compatibility.
+ *
+ * Various utilities to smooth over differences between SSE versions and
+ * implementations. */
+#ifdef __KERNEL_SSE2__
 
 /* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
  * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
  * platforms when compiling code outside the kernel. */
-#    if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
+#  if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
 
 /* Emulation of SSE4 functions with SSE2 */
 
-#      define _MM_FROUND_TO_NEAREST_INT 0x00
-#      define _MM_FROUND_TO_NEG_INF 0x01
-#      define _MM_FROUND_TO_POS_INF 0x02
-#      define _MM_FROUND_TO_ZERO 0x03
-#      define _MM_FROUND_CUR_DIRECTION 0x04
+#    define _MM_FROUND_TO_NEAREST_INT 0x00
+#    define _MM_FROUND_TO_NEG_INF 0x01
+#    define _MM_FROUND_TO_POS_INF 0x02
+#    define _MM_FROUND_TO_ZERO 0x03
+#    define _MM_FROUND_CUR_DIRECTION 0x04
 
-#      undef _mm_blendv_ps
-#      define _mm_blendv_ps _mm_blendv_ps_emu
+#    undef _mm_blendv_ps
+#    define _mm_blendv_ps _mm_blendv_ps_emu
 __forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
 {
   __m128i isignmask = _mm_set1_epi32(0x80000000);
@@ -503,37 +399,37 @@ __forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
   return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
 }
 
-#      undef _mm_blend_ps
-#      define _mm_blend_ps _mm_blend_ps_emu
+#    undef _mm_blend_ps
+#    define _mm_blend_ps _mm_blend_ps_emu
 __forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
 {
   assert(mask < 0x10);
   return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
 }
 
-#      undef _mm_blendv_epi8
-#      define _mm_blendv_epi8 _mm_blendv_epi8_emu
+#    undef _mm_blendv_epi8
+#    define _mm_blendv_epi8 _mm_blendv_epi8_emu
 __forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
 {
   return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
 }
 
-#      undef _mm_min_epi32
-#      define _mm_min_epi32 _mm_min_epi32_emu
+#    undef _mm_min_epi32
+#    define _mm_min_epi32 _mm_min_epi32_emu
 __forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
 {
   return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
 }
 
-#      undef _mm_max_epi32
-#      define _mm_max_epi32 _mm_max_epi32_emu
+#    undef _mm_max_epi32
+#    define _mm_max_epi32 _mm_max_epi32_emu
 __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
 {
   return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
 }
 
-#      undef _mm_extract_epi32
-#      define _mm_extract_epi32 _mm_extract_epi32_emu
+#    undef _mm_extract_epi32
+#    define _mm_extract_epi32 _mm_extract_epi32_emu
 __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
 {
   switch (index) {
@@ -551,8 +447,8 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
   }
 }
 
-#      undef _mm_insert_epi32
-#      define _mm_insert_epi32 _mm_insert_epi32_emu
+#    undef _mm_insert_epi32
+#    define _mm_insert_epi32 _mm_insert_epi32_emu
 __forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
 {
   assert(index >= 0 && index < 4);
@@ -560,8 +456,8 @@ __forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int i
   return value;
 }
 
-#      undef _mm_insert_ps
-#      define _mm_insert_ps _mm_insert_ps_emu
+#    undef _mm_insert_ps
+#    define _mm_insert_ps _mm_insert_ps_emu
 __forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
 {
   assert(index < 0x100);
@@ -569,8 +465,8 @@ __forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int ind
   return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
 }
 
-#      undef _mm_round_ps
-#      define _mm_round_ps _mm_round_ps_emu
+#    undef _mm_round_ps
+#    define _mm_round_ps _mm_round_ps_emu
 __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
 {
   switch (flags) {
@@ -586,51 +482,23 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
   return value;
 }
 
-#    endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+#  endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 /* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
  * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
-#    if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-#      undef _mm256_cvtss_f32
-#      define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
-#    endif
-
-#  else /* __KERNEL_SSE2__ */
-
-/* This section is for utility functions which operates on non-register data
- * which might be used from a non-vectorized code.
- */
-
-ccl_device_inline int bitscan(int value)
-{
-  assert(value != 0);
-  int bit = 0;
-  while ((value & (1 << bit)) == 0) {
-    ++bit;
-  }
-  return bit;
-}
-
-ccl_device_inline int __bsr(int value)
-{
-  assert(value != 0);
-  int bit = 0;
-  while (value >>= 1) {
-    ++bit;
-  }
-  return bit;
-}
+#  if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+#    undef _mm256_cvtss_f32
+#    define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
+#  endif
 
-#  endif /* __KERNEL_SSE2__ */
+#endif /* __KERNEL_SSE2__ */
 
 /* quiet unused define warnings */
-#  if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
-      defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
+    defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
 /* do nothing */
-#  endif
+#endif
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_GPU__ */
-
 #endif /* __UTIL_SIMD_TYPES_H__ */
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 56f8f676ba1..edf13e0c493 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -258,12 +258,12 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b
 ////////////////////////////////////////////////////////////////////////////////
 
 #  if defined(__KERNEL_SSE41__)
-__forceinline size_t popcnt(const sseb &a)
+__forceinline uint32_t popcnt(const sseb &a)
 {
-  return __popcnt(_mm_movemask_ps(a));
+  return _mm_popcnt_u32(_mm_movemask_ps(a));
 }
 #  else
-__forceinline size_t popcnt(const sseb &a)
+__forceinline uint32_t popcnt(const sseb &a)
 {
   return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
 }
@@ -290,7 +290,7 @@ __forceinline bool none(const sseb &b)
   return _mm_movemask_ps(b) == 0x0;
 }
 
-__forceinline size_t movemask(const sseb &a)
+__forceinline uint32_t movemask(const sseb &a)
 {
   return _mm_movemask_ps(a);
 }
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index e9f0efb4efb..b14640ced40 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -730,27 +730,27 @@ __forceinline float reduce_add(const ssef &v)
   return _mm_cvtss_f32(vreduce_add(v));
 }
 
-__forceinline size_t select_min(const ssef &v)
+__forceinline uint32_t select_min(const ssef &v)
 {
   return __bsf(movemask(v == vreduce_min(v)));
 }
-__forceinline size_t select_max(const ssef &v)
+__forceinline uint32_t select_max(const ssef &v)
 {
   return __bsf(movemask(v == vreduce_max(v)));
 }
 
-__forceinline size_t select_min(const sseb &valid, const ssef &v)
+__forceinline uint32_t select_min(const sseb &valid, const ssef &v)
 {
   const ssef a = select(valid, v, ssef(pos_inf));
   return __bsf(movemask(valid & (a == vreduce_min(a))));
 }
-__forceinline size_t select_max(const sseb &valid, const ssef &v)
+__forceinline uint32_t select_max(const sseb &valid, const ssef &v)
 {
   const ssef a = select(valid, v, ssef(neg_inf));
   return __bsf(movemask(valid & (a == vreduce_max(a))));
 }
 
-__forceinline size_t movemask(const ssef &a)
+__forceinline uint32_t movemask(const ssef &a)
 {
   return _mm_movemask_ps(a);
 }
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index e2bf81310cc..c03ab18a6df 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -516,21 +516,21 @@ __forceinline int reduce_add(const ssei &v)
   return extract<0>(vreduce_add(v));
 }
 
-__forceinline size_t select_min(const ssei &v)
+__forceinline uint32_t select_min(const ssei &v)
 {
   return __bsf(movemask(v == vreduce_min(v)));
 }
-__forceinline size_t select_max(const ssei &v)
+__forceinline uint32_t select_max(const ssei &v)
 {
   return __bsf(movemask(v == vreduce_max(v)));
 }
 
-__forceinline size_t select_min(const sseb &valid, const ssei &v)
+__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
 {
   const ssei a = select(valid, v, ssei((int)pos_inf));
   return __bsf(movemask(valid & (a == vreduce_min(a))));
 }
-__forceinline size_t select_max(const sseb &valid, const ssei &v)
+__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
 {
   const ssei a = select(valid, v, ssei((int)neg_inf));
   return __bsf(movemask(valid & (a == vreduce_max(a))));
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index fc80fa9696c..87358877e3c 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -23,7 +23,7 @@
 
 /* Standard Integer Types */
 
-#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+#if !defined(__KERNEL_GPU__)
 #  include <stdint.h>
 #endif
 
@@ -57,25 +57,6 @@ typedef unsigned long uint64_t;
 #endif
 
 #ifndef __KERNEL_GPU__
-#  ifdef _WIN32
-typedef signed char int8_t;
-typedef unsigned char uint8_t;
-
-typedef signed short int16_t;
-typedef unsigned short uint16_t;
-
-typedef signed int int32_t;
-typedef unsigned int uint32_t;
-
-typedef long long int64_t;
-typedef unsigned long long uint64_t;
-#    ifdef __KERNEL_64_BIT__
-typedef int64_t ssize_t;
-#    else
-typedef int32_t ssize_t;
-#    endif
-#  endif /* _WIN32 */
-
 /* Generic Memory Pointer */
 
 typedef uint64_t device_ptr;