/* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2013 Intel Corporation * Modifications Copyright 2014-2022 Blender Foundation. */ #ifndef __UTIL_SSEI_H__ #define __UTIL_SSEI_H__ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ struct sseb; struct ssef; /*! 4-wide SSE integer type. */ struct ssei { typedef sseb Mask; // mask type typedef ssei Int; // int type typedef ssef Float; // float type enum { size = 4 }; // number of SIMD elements union { __m128i m128; int32_t i[4]; }; // data //////////////////////////////////////////////////////////////////////////////// /// Constructors, Assignment & Cast Operators //////////////////////////////////////////////////////////////////////////////// __forceinline ssei() { } __forceinline ssei(const ssei &a) { m128 = a.m128; } __forceinline ssei &operator=(const ssei &a) { m128 = a.m128; return *this; } __forceinline ssei(const __m128i a) : m128(a) { } __forceinline operator const __m128i &(void) const { return m128; } __forceinline operator __m128i &(void) { return m128; } __forceinline ssei(const int a) : m128(_mm_set1_epi32(a)) { } __forceinline ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d)) { } __forceinline explicit ssei(const __m128 a) : m128(_mm_cvtps_epi32(a)) { } //////////////////////////////////////////////////////////////////////////////// /// Array Access //////////////////////////////////////////////////////////////////////////////// __forceinline const int32_t &operator[](const size_t index) const { assert(index < 4); return i[index]; } __forceinline int32_t &operator[](const size_t index) { assert(index < 4); return i[index]; } }; //////////////////////////////////////////////////////////////////////////////// /// Unary Operators //////////////////////////////////////////////////////////////////////////////// __forceinline const ssei cast(const __m128 &a) { return _mm_castps_si128(a); } __forceinline const ssei operator+(const ssei &a) { return a; } __forceinline const ssei operator-(const ssei &a) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } # if defined(__KERNEL_SSSE3__) __forceinline const ssei abs(const ssei &a) { return _mm_abs_epi32(a.m128); } # endif //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// __forceinline const ssei operator+(const ssei &a, const ssei &b) { return _mm_add_epi32(a.m128, b.m128); } __forceinline const ssei operator+(const ssei &a, const int32_t &b) { return a + ssei(b); } __forceinline const ssei operator+(const int32_t &a, const ssei &b) { return ssei(a) + b; } __forceinline const ssei operator-(const ssei &a, const ssei &b) { return _mm_sub_epi32(a.m128, b.m128); } __forceinline const ssei operator-(const ssei &a, const int32_t &b) { return a - ssei(b); } __forceinline const ssei operator-(const int32_t &a, const ssei &b) { return ssei(a) - b; } # if defined(__KERNEL_SSE41__) __forceinline const ssei operator*(const ssei &a, const ssei &b) { return _mm_mullo_epi32(a.m128, b.m128); } __forceinline const ssei operator*(const ssei &a, const int32_t &b) { return a * ssei(b); } __forceinline const ssei operator*(const int32_t &a, const ssei &b) { return ssei(a) * b; } # endif __forceinline const ssei operator&(const ssei &a, const ssei &b) { return _mm_and_si128(a.m128, b.m128); } __forceinline const ssei operator&(const ssei &a, const int32_t &b) { return a & ssei(b); } __forceinline const ssei operator&(const int32_t &a, const ssei &b) { return ssei(a) & b; } __forceinline const ssei operator|(const ssei &a, const ssei &b) { return _mm_or_si128(a.m128, b.m128); } __forceinline const ssei operator|(const ssei &a, const int32_t &b) { return a | ssei(b); } __forceinline const ssei operator|(const int32_t &a, const ssei &b) { return ssei(a) | b; } __forceinline const ssei operator^(const ssei &a, const ssei &b) { return _mm_xor_si128(a.m128, b.m128); } __forceinline const ssei operator^(const ssei &a, const int32_t &b) { return a ^ ssei(b); } __forceinline const ssei operator^(const int32_t &a, const ssei &b) { return ssei(a) ^ b; } __forceinline const ssei operator<<(const ssei &a, const int32_t &n) { return _mm_slli_epi32(a.m128, n); } __forceinline const ssei operator>>(const ssei &a, const int32_t &n) { return _mm_srai_epi32(a.m128, n); } __forceinline const ssei andnot(const ssei &a, const ssei &b) { return _mm_andnot_si128(a.m128, b.m128); } __forceinline const ssei andnot(const sseb &a, const ssei &b) { return _mm_andnot_si128(cast(a.m128), b.m128); } __forceinline const ssei andnot(const ssei &a, const sseb &b) { return _mm_andnot_si128(a.m128, cast(b.m128)); } __forceinline const ssei sra(const ssei &a, const int32_t &b) { return _mm_srai_epi32(a.m128, b); } __forceinline const ssei srl(const ssei &a, const int32_t &b) { return _mm_srli_epi32(a.m128, b); } # if defined(__KERNEL_SSE41__) __forceinline const ssei min(const ssei &a, const ssei &b) { return _mm_min_epi32(a.m128, b.m128); } __forceinline const ssei min(const ssei &a, const int32_t &b) { return min(a, ssei(b)); } __forceinline const ssei min(const int32_t &a, const ssei &b) { return min(ssei(a), b); } __forceinline const ssei max(const ssei &a, const ssei &b) { return _mm_max_epi32(a.m128, b.m128); } __forceinline const ssei max(const ssei &a, const int32_t &b) { return max(a, ssei(b)); } __forceinline const ssei max(const int32_t &a, const ssei &b) { return max(ssei(a), b); } # endif //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators //////////////////////////////////////////////////////////////////////////////// __forceinline ssei &operator+=(ssei &a, const ssei &b) { return a = a + b; } __forceinline ssei &operator+=(ssei &a, const int32_t &b) { return a = a + b; } __forceinline ssei &operator-=(ssei &a, const ssei &b) { return a = a - b; } __forceinline ssei &operator-=(ssei &a, const int32_t &b) { return a = a - b; } # if defined(__KERNEL_SSE41__) __forceinline ssei &operator*=(ssei &a, const ssei &b) { return a = a * b; } __forceinline ssei &operator*=(ssei &a, const int32_t &b) { return a = a * b; } # endif __forceinline ssei &operator&=(ssei &a, const ssei &b) { return a = a & b; } __forceinline ssei &operator&=(ssei &a, const int32_t &b) { return a = a & b; } __forceinline ssei &operator|=(ssei &a, const ssei &b) { return a = a | b; } __forceinline ssei &operator|=(ssei &a, const int32_t &b) { return a = a | b; } __forceinline ssei &operator^=(ssei &a, const ssei &b) { return a = a ^ b; } __forceinline ssei &operator^=(ssei &a, const int32_t &b) { return a = a ^ b; } __forceinline ssei &operator<<=(ssei &a, const int32_t &b) { return a = a << b; } __forceinline ssei &operator>>=(ssei &a, const int32_t &b) { return a = a >> b; } //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// __forceinline const sseb operator==(const ssei &a, const ssei &b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128)); } __forceinline const sseb operator==(const ssei &a, const int32_t &b) { return a == ssei(b); } __forceinline const sseb operator==(const int32_t &a, const ssei &b) { return ssei(a) == b; } __forceinline const sseb operator!=(const ssei &a, const ssei &b) { return !(a == b); } __forceinline const sseb operator!=(const ssei &a, const int32_t &b) { return a != ssei(b); } __forceinline const sseb operator!=(const int32_t &a, const ssei &b) { return ssei(a) != b; } __forceinline const sseb operator<(const ssei &a, const ssei &b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128)); } __forceinline const sseb operator<(const ssei &a, const int32_t &b) { return a < ssei(b); } __forceinline const sseb operator<(const int32_t &a, const ssei &b) { return ssei(a) < b; } __forceinline const sseb operator>=(const ssei &a, const ssei &b) { return !(a < b); } __forceinline const sseb operator>=(const ssei &a, const int32_t &b) { return a >= ssei(b); } __forceinline const sseb operator>=(const int32_t &a, const ssei &b) { return ssei(a) >= b; } __forceinline const sseb operator>(const ssei &a, const ssei &b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128)); } __forceinline const sseb operator>(const ssei &a, const int32_t &b) { return a > ssei(b); } __forceinline const sseb operator>(const int32_t &a, const ssei &b) { return ssei(a) > b; } __forceinline const sseb operator<=(const ssei &a, const ssei &b) { return !(a > b); } __forceinline const sseb operator<=(const ssei &a, const int32_t &b) { return a <= ssei(b); } __forceinline const sseb operator<=(const int32_t &a, const ssei &b) { return ssei(a) <= b; } __forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f) { # ifdef __KERNEL_SSE41__ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); # else return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); # endif } __forceinline const ssei select(const int mask, const ssei &t, const ssei &f) { # if defined(__KERNEL_SSE41__) && \ ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER)) return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); # else return select(sseb(mask), t, f); # endif } //////////////////////////////////////////////////////////////////////////////// // Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// __forceinline ssei unpacklo(const ssei &a, const ssei &b) { return _mm_unpacklo_epi32(a, b); } __forceinline ssei unpackhi(const ssei &a, const ssei &b) { return _mm_unpackhi_epi32(a, b); } template __forceinline const ssei shuffle(const ssei &a) { # ifdef __KERNEL_NEON__ int32x4_t result = shuffle_neon(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(result); # else return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); # endif } template __forceinline const ssei shuffle(const ssei &a, const ssei &b) { # ifdef __KERNEL_NEON__ int32x4_t result = shuffle_neon(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)); return vreinterpretq_m128i_s32(result); # else return _mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); # endif } template __forceinline const ssei shuffle(const ssei &b) { return shuffle(b); } # if defined(__KERNEL_SSE41__) template __forceinline int extract(const ssei &b) { return _mm_extract_epi32(b, src); } template __forceinline const ssei insert(const ssei &a, const int32_t b) { return _mm_insert_epi32(a, b, dst); } # else template __forceinline int extract(const ssei &b) { return b[src]; } template __forceinline const ssei insert(const ssei &a, const int32_t b) { ssei c = a; c[dst] = b; return c; } # endif //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// # if defined(__KERNEL_SSE41__) __forceinline const ssei vreduce_min(const ssei &v) { ssei h = min(shuffle<1, 0, 3, 2>(v), v); return min(shuffle<2, 3, 0, 1>(h), h); } __forceinline const ssei vreduce_max(const ssei &v) { ssei h = max(shuffle<1, 0, 3, 2>(v), v); return max(shuffle<2, 3, 0, 1>(h), h); } __forceinline const ssei vreduce_add(const ssei &v) { ssei h = shuffle<1, 0, 3, 2>(v) + v; return shuffle<2, 3, 0, 1>(h) + h; } __forceinline int reduce_min(const ssei &v) { # ifdef __KERNEL_NEON__ return vminvq_s32(vreinterpretq_s32_m128i(v)); # else return extract<0>(vreduce_min(v)); # endif } __forceinline int reduce_max(const ssei &v) { # ifdef __KERNEL_NEON__ return vmaxvq_s32(vreinterpretq_s32_m128i(v)); # else return extract<0>(vreduce_max(v)); # endif } __forceinline int reduce_add(const ssei &v) { # ifdef __KERNEL_NEON__ return vaddvq_s32(vreinterpretq_s32_m128i(v)); # else return extract<0>(vreduce_add(v)); # endif } __forceinline uint32_t select_min(const ssei &v) { return __bsf(movemask(v == vreduce_min(v))); } __forceinline uint32_t select_max(const ssei &v) { return __bsf(movemask(v == vreduce_max(v))); } __forceinline uint32_t select_min(const sseb &valid, const ssei &v) { const ssei a = select(valid, v, ssei((int)pos_inf)); return __bsf(movemask(valid & (a == vreduce_min(a)))); } __forceinline uint32_t select_max(const sseb &valid, const ssei &v) { const ssei a = select(valid, v, ssei((int)neg_inf)); return __bsf(movemask(valid & (a == vreduce_max(a)))); } # else __forceinline int ssei_min(int a, int b) { return (a < b) ? a : b; } __forceinline int ssei_max(int a, int b) { return (a > b) ? a : b; } __forceinline int reduce_min(const ssei &v) { return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3])); } __forceinline int reduce_max(const ssei &v) { return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3])); } __forceinline int reduce_add(const ssei &v) { return v[0] + v[1] + v[2] + v[3]; } # endif //////////////////////////////////////////////////////////////////////////////// /// Memory load and store operations //////////////////////////////////////////////////////////////////////////////// __forceinline ssei load4i(const void *const a) { return _mm_load_si128((__m128i *)a); } __forceinline void store4i(void *ptr, const ssei &v) { _mm_store_si128((__m128i *)ptr, v); } __forceinline void storeu4i(void *ptr, const ssei &v) { _mm_storeu_si128((__m128i *)ptr, v); } __forceinline void store4i(const sseb &mask, void *ptr, const ssei &i) { # if defined(__KERNEL_AVX__) _mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i)); # else *(ssei *)ptr = select(mask, i, *(ssei *)ptr); # endif } __forceinline ssei load4i_nt(void *ptr) { # if defined(__KERNEL_SSE41__) return _mm_stream_load_si128((__m128i *)ptr); # else return _mm_load_si128((__m128i *)ptr); # endif } __forceinline void store4i_nt(void *ptr, const ssei &v) { # if defined(__KERNEL_SSE41__) _mm_stream_ps((float *)ptr, _mm_castsi128_ps(v)); # else _mm_store_si128((__m128i *)ptr, v); # endif } //////////////////////////////////////////////////////////////////////////////// /// Debug Functions //////////////////////////////////////////////////////////////////////////////// ccl_device_inline void print_ssei(const char *label, const ssei &a) { printf("%s: %df %df %df %d\n", label, a[0], a[1], a[2], a[3]); } #endif CCL_NAMESPACE_END #endif