From 651a19b2c72d75e3e0e486e5fe62308081370b9b Mon Sep 17 00:00:00 2001 From: XhmikosR Date: Mon, 17 May 2010 12:51:03 +0000 Subject: cleanup git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@1891 10f7b99b-c216-0410-bff0-8a66a9350fd8 --- src/DSUtil/mmintrin64.c | 1254 ++++++++++++++++++++-------------------- src/DSUtil/simd.h | 5 + src/DSUtil/simd_common.h | 14 +- src/DSUtil/simd_instructions.h | 4 + 4 files changed, 646 insertions(+), 631 deletions(-) (limited to 'src/DSUtil') diff --git a/src/DSUtil/mmintrin64.c b/src/DSUtil/mmintrin64.c index 07b70b16a..d64bb4d2c 100644 --- a/src/DSUtil/mmintrin64.c +++ b/src/DSUtil/mmintrin64.c @@ -27,979 +27,979 @@ // MMX __m64 _mm_setzero_si64(void) { - __m64 rv = {0}; - return rv; + __m64 rv = {0}; + return rv; } __m64 _mm_set_pi32(int _I1, int _I0) { - __m64 rv = {0}; - rv.m64_i32[0] = _I0; - rv.m64_i32[1] = _I1; - return rv; + __m64 rv = {0}; + rv.m64_i32[0] = _I0; + rv.m64_i32[1] = _I1; + return rv; } __m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0) { - __m64 rv = {0}; - rv.m64_i16[0] = _S0; - rv.m64_i16[1] = _S1; - rv.m64_i16[2] = _S2; - rv.m64_i16[3] = _S3; - return rv; + __m64 rv = {0}; + rv.m64_i16[0] = _S0; + rv.m64_i16[1] = _S1; + rv.m64_i16[2] = _S2; + rv.m64_i16[3] = _S3; + return rv; } __m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4, char _B3, char _B2, char _B1, char _B0) { - __m64 rv = {0}; - rv.m64_i8[0] = _B0; - rv.m64_i8[1] = _B1; - rv.m64_i8[2] = _B2; - rv.m64_i8[3] = _B3; - rv.m64_i8[4] = _B4; - rv.m64_i8[5] = _B5; - rv.m64_i8[6] = _B6; - rv.m64_i8[7] = _B7; - return rv; + __m64 rv = {0}; + rv.m64_i8[0] = _B0; + rv.m64_i8[1] = _B1; + rv.m64_i8[2] = _B2; + rv.m64_i8[3] = _B3; + rv.m64_i8[4] = _B4; + rv.m64_i8[5] = _B5; + rv.m64_i8[6] = _B6; + rv.m64_i8[7] = _B7; + return rv; } __m64 _mm_set1_pi32(int _I) { - __m64 rv = {0}; - rv.m64_i32[0] = rv.m64_i32[1] = _I; - return rv; + __m64 rv = {0}; + rv.m64_i32[0] = rv.m64_i32[1] = _I; + return rv; } __m64 _mm_set1_pi16(short _S) { - __m64 rv = {0}; - rv.m64_i16[0] = rv.m64_i16[1] = rv.m64_i16[2] = rv.m64_i16[3] = _S; - return rv; + __m64 rv = {0}; + rv.m64_i16[0] = rv.m64_i16[1] = rv.m64_i16[2] = rv.m64_i16[3] = _S; + return rv; } __m64 _mm_set1_pi8(char _B) { - __m64 rv = {0}; - rv.m64_i8[0] = rv.m64_i8[1] = rv.m64_i8[2] = rv.m64_i8[3] = rv.m64_i8[4] = rv.m64_i8[5] = rv.m64_i8[6] = rv.m64_i8[7] = _B; - return rv; + __m64 rv = {0}; + rv.m64_i8[0] = rv.m64_i8[1] = rv.m64_i8[2] = rv.m64_i8[3] = rv.m64_i8[4] = rv.m64_i8[5] = rv.m64_i8[6] = rv.m64_i8[7] = _B; + return rv; } __m64 _m_psubb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_sub_epi8(lhs, rhs); + lhs = _mm_sub_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_paddusb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_adds_epu8(lhs, rhs); + lhs = _mm_adds_epu8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psubsw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_subs_epi16(lhs, rhs); + lhs = _mm_subs_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psubsb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_subs_epi8(lhs, rhs); + lhs = _mm_subs_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_paddw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_add_epi16(lhs, rhs); + lhs = _mm_add_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } void _m_empty(void) { } __m64 _m_packuswb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; - lhs.m128i_i64[1] = _MM2.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; + lhs.m128i_i64[1] = _MM2.m64_i64; - lhs = _mm_packus_epi16(lhs, rhs); + lhs = _mm_packus_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psrlwi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_srli_epi16(lhs, _Count); + lhs = _mm_srli_epi16(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_pmullw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_mullo_epi16(lhs, rhs); + lhs = _mm_mullo_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_from_int(int _I) { - __m64 rv = {0}; - rv.m64_i32[0] = _I; - return rv; + __m64 rv = {0}; + rv.m64_i32[0] = _I; + return rv; } int _m_to_int(__m64 _M) { - return (int)(_M.m64_u64 & 0xffffffff); + return (int)(_M.m64_u64 & 0xffffffff); } __m64 _m_psrlqi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_srli_epi64(lhs, _Count); + lhs = _mm_srli_epi64(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_paddd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_add_epi32(lhs, rhs); + lhs = _mm_add_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_madd_epi16(lhs, rhs); + lhs = _mm_madd_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_punpcklbw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_unpacklo_epi8(lhs, rhs); + lhs = _mm_unpacklo_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_paddb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_add_epi8(lhs, rhs); + lhs = _mm_add_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_por(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_or_si128(lhs, rhs); + lhs = _mm_or_si128(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pand(__m64 _MM1, __m64 _MM2) { - __m128i lhs, rhs; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs, rhs; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_and_si128(lhs, rhs); + lhs = _mm_and_si128(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pandn(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_andnot_si128(lhs, rhs); + lhs = _mm_andnot_si128(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_cmpgt_epi8(lhs, rhs); + lhs = _mm_cmpgt_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psubusb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_subs_epu8(lhs, rhs); + lhs = _mm_subs_epu8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psrawi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_srai_epi16(lhs, _Count); + lhs = _mm_srai_epi16(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_psubw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_sub_epi16(lhs, rhs); + lhs = _mm_sub_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psllwi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_slli_epi16(lhs, _Count); + lhs = _mm_slli_epi16(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_paddusw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_adds_epu16(lhs, rhs); + lhs = _mm_adds_epu16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pxor(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_xor_si128(lhs, rhs); + lhs = _mm_xor_si128(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pslldi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_slli_epi32(lhs, _Count); + lhs = _mm_slli_epi32(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i8[ 8] = _MM1.m64_i8[4]; - lhs.m128i_i8[ 9] = _MM1.m64_i8[5]; - lhs.m128i_i8[10] = _MM1.m64_i8[6]; - lhs.m128i_i8[11] = _MM1.m64_i8[7]; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i8[ 8] = _MM1.m64_i8[4]; + lhs.m128i_i8[ 9] = _MM1.m64_i8[5]; + lhs.m128i_i8[10] = _MM1.m64_i8[6]; + lhs.m128i_i8[11] = _MM1.m64_i8[7]; - rhs.m128i_i8[ 8] = _MM2.m64_i8[4]; - rhs.m128i_i8[ 9] = _MM2.m64_i8[5]; - rhs.m128i_i8[10] = _MM2.m64_i8[6]; - rhs.m128i_i8[11] = _MM2.m64_i8[7]; + rhs.m128i_i8[ 8] = _MM2.m64_i8[4]; + rhs.m128i_i8[ 9] = _MM2.m64_i8[5]; + rhs.m128i_i8[10] = _MM2.m64_i8[6]; + rhs.m128i_i8[11] = _MM2.m64_i8[7]; - lhs = _mm_unpackhi_epi8(lhs, rhs); + lhs = _mm_unpackhi_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_unpacklo_epi16(lhs, rhs); + lhs = _mm_unpacklo_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_punpckldq(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_unpacklo_epi32(lhs, rhs); + lhs = _mm_unpacklo_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_cmpgt_epi16(lhs, rhs); + lhs = _mm_cmpgt_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_cmpgt_epi32(lhs, rhs); + lhs = _mm_cmpgt_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_cmpeq_epi8(lhs, rhs); + lhs = _mm_cmpeq_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_cmpeq_epi32(lhs, rhs); + lhs = _mm_cmpeq_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i16[4] = _MM1.m64_i16[2]; - lhs.m128i_i16[5] = _MM1.m64_i16[3]; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i16[4] = _MM1.m64_i16[2]; + lhs.m128i_i16[5] = _MM1.m64_i16[3]; - rhs.m128i_i16[4] = _MM2.m64_i16[2]; - rhs.m128i_i16[5] = _MM2.m64_i16[3]; + rhs.m128i_i16[4] = _MM2.m64_i16[2]; + rhs.m128i_i16[5] = _MM2.m64_i16[3]; - lhs = _mm_unpackhi_epi16(lhs, rhs); + lhs = _mm_unpackhi_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i32[2] = _MM1.m64_i32[1]; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i32[2] = _MM1.m64_i32[1]; - rhs.m128i_i32[2] = _MM2.m64_i32[1]; + rhs.m128i_i32[2] = _MM2.m64_i32[1]; - lhs = _mm_unpackhi_epi32(lhs, rhs); + lhs = _mm_unpackhi_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psrldi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_srli_epi32(lhs, _Count); + lhs = _mm_srli_epi32(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_psubd(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_sub_epi32(lhs, rhs); + lhs = _mm_sub_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pmulhw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_mulhi_epi16(lhs, rhs); + lhs = _mm_mulhi_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psllqi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_slli_epi64(lhs, _Count); + lhs = _mm_slli_epi64(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_cmpeq_epi16(lhs, rhs); + lhs = _mm_cmpeq_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_paddsb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_adds_epi8(lhs, rhs); + lhs = _mm_adds_epi8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_packsswb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_packs_epi16(lhs, rhs); + lhs = _mm_packs_epi16(lhs, rhs); - _MM1.m64_i32[0] = lhs.m128i_i32[0]; - _MM1.m64_i32[1] = lhs.m128i_i32[2]; - return _MM1; + _MM1.m64_i32[0] = lhs.m128i_i32[0]; + _MM1.m64_i32[1] = lhs.m128i_i32[2]; + return _MM1; } __m64 _m_psradi(__m64 _M, int _Count) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _M.m64_i64; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _M.m64_i64; - lhs = _mm_srai_epi32(lhs, _Count); + lhs = _mm_srai_epi32(lhs, _Count); - _M.m64_i64 = lhs.m128i_i64[0]; - return _M; + _M.m64_i64 = lhs.m128i_i64[0]; + return _M; } __m64 _m_paddsw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_adds_epi16(lhs, rhs); + lhs = _mm_adds_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psubusw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_subs_epu16(lhs, rhs); + lhs = _mm_subs_epu16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_packssdw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i32[0] = _MM1.m64_i32[0]; - lhs.m128i_i32[1] = _MM1.m64_i32[1]; - lhs.m128i_i32[2] = _MM2.m64_i32[0]; - lhs.m128i_i32[3] = _MM2.m64_i32[1]; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i32[0] = _MM1.m64_i32[0]; + lhs.m128i_i32[1] = _MM1.m64_i32[1]; + lhs.m128i_i32[2] = _MM2.m64_i32[0]; + lhs.m128i_i32[3] = _MM2.m64_i32[1]; - lhs = _mm_packs_epi32(lhs, rhs); + lhs = _mm_packs_epi32(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psraw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_sra_epi16(lhs, rhs); + lhs = _mm_sra_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } // SSE __m64 _m_pmaxub(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_max_epu8(lhs, rhs); + lhs = _mm_max_epu8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pminub(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_min_epu8(lhs, rhs); + lhs = _mm_min_epu8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pavgb(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_avg_epu8(lhs, rhs); + lhs = _mm_avg_epu8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pshufw(__m64 _MM1, int _Imm) { - __m128i lhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; - - switch(_Imm) - { - case 0: lhs = _mm_shufflelo_epi16(lhs, 0); break; - case 1: lhs = _mm_shufflelo_epi16(lhs, 1); break; - case 2: lhs = _mm_shufflelo_epi16(lhs, 2); break; - case 3: lhs = _mm_shufflelo_epi16(lhs, 3); break; - case 4: lhs = _mm_shufflelo_epi16(lhs, 4); break; - case 5: lhs = _mm_shufflelo_epi16(lhs, 5); break; - case 6: lhs = _mm_shufflelo_epi16(lhs, 6); break; - case 7: lhs = _mm_shufflelo_epi16(lhs, 7); break; - case 8: lhs = _mm_shufflelo_epi16(lhs, 8); break; - case 9: lhs = _mm_shufflelo_epi16(lhs, 9); break; - case 10: lhs = _mm_shufflelo_epi16(lhs, 10); break; - case 11: lhs = _mm_shufflelo_epi16(lhs, 11); break; - case 12: lhs = _mm_shufflelo_epi16(lhs, 12); break; - case 13: lhs = _mm_shufflelo_epi16(lhs, 13); break; - case 14: lhs = _mm_shufflelo_epi16(lhs, 14); break; - case 15: lhs = _mm_shufflelo_epi16(lhs, 15); break; - case 16: lhs = _mm_shufflelo_epi16(lhs, 16); break; - case 17: lhs = _mm_shufflelo_epi16(lhs, 17); break; - case 18: lhs = _mm_shufflelo_epi16(lhs, 18); break; - case 19: lhs = _mm_shufflelo_epi16(lhs, 19); break; - case 20: lhs = _mm_shufflelo_epi16(lhs, 20); break; - case 21: lhs = _mm_shufflelo_epi16(lhs, 21); break; - case 22: lhs = _mm_shufflelo_epi16(lhs, 22); break; - case 23: lhs = _mm_shufflelo_epi16(lhs, 23); break; - case 24: lhs = _mm_shufflelo_epi16(lhs, 24); break; - case 25: lhs = _mm_shufflelo_epi16(lhs, 25); break; - case 26: lhs = _mm_shufflelo_epi16(lhs, 26); break; - case 27: lhs = _mm_shufflelo_epi16(lhs, 27); break; - case 28: lhs = _mm_shufflelo_epi16(lhs, 28); break; - case 29: lhs = _mm_shufflelo_epi16(lhs, 29); break; - case 30: lhs = _mm_shufflelo_epi16(lhs, 30); break; - case 31: lhs = _mm_shufflelo_epi16(lhs, 31); break; - case 32: lhs = _mm_shufflelo_epi16(lhs, 32); break; - case 33: lhs = _mm_shufflelo_epi16(lhs, 33); break; - case 34: lhs = _mm_shufflelo_epi16(lhs, 34); break; - case 35: lhs = _mm_shufflelo_epi16(lhs, 35); break; - case 36: lhs = _mm_shufflelo_epi16(lhs, 36); break; - case 37: lhs = _mm_shufflelo_epi16(lhs, 37); break; - case 38: lhs = _mm_shufflelo_epi16(lhs, 38); break; - case 39: lhs = _mm_shufflelo_epi16(lhs, 39); break; - case 40: lhs = _mm_shufflelo_epi16(lhs, 40); break; - case 41: lhs = _mm_shufflelo_epi16(lhs, 41); break; - case 42: lhs = _mm_shufflelo_epi16(lhs, 42); break; - case 43: lhs = _mm_shufflelo_epi16(lhs, 43); break; - case 44: lhs = _mm_shufflelo_epi16(lhs, 44); break; - case 45: lhs = _mm_shufflelo_epi16(lhs, 45); break; - case 46: lhs = _mm_shufflelo_epi16(lhs, 46); break; - case 47: lhs = _mm_shufflelo_epi16(lhs, 47); break; - case 48: lhs = _mm_shufflelo_epi16(lhs, 48); break; - case 49: lhs = _mm_shufflelo_epi16(lhs, 49); break; - case 50: lhs = _mm_shufflelo_epi16(lhs, 50); break; - case 51: lhs = _mm_shufflelo_epi16(lhs, 51); break; - case 52: lhs = _mm_shufflelo_epi16(lhs, 52); break; - case 53: lhs = _mm_shufflelo_epi16(lhs, 53); break; - case 54: lhs = _mm_shufflelo_epi16(lhs, 54); break; - case 55: lhs = _mm_shufflelo_epi16(lhs, 55); break; - case 56: lhs = _mm_shufflelo_epi16(lhs, 56); break; - case 57: lhs = _mm_shufflelo_epi16(lhs, 57); break; - case 58: lhs = _mm_shufflelo_epi16(lhs, 58); break; - case 59: lhs = _mm_shufflelo_epi16(lhs, 59); break; - case 60: lhs = _mm_shufflelo_epi16(lhs, 60); break; - case 61: lhs = _mm_shufflelo_epi16(lhs, 61); break; - case 62: lhs = _mm_shufflelo_epi16(lhs, 62); break; - case 63: lhs = _mm_shufflelo_epi16(lhs, 63); break; - case 64: lhs = _mm_shufflelo_epi16(lhs, 64); break; - case 65: lhs = _mm_shufflelo_epi16(lhs, 65); break; - case 66: lhs = _mm_shufflelo_epi16(lhs, 66); break; - case 67: lhs = _mm_shufflelo_epi16(lhs, 67); break; - case 68: lhs = _mm_shufflelo_epi16(lhs, 68); break; - case 69: lhs = _mm_shufflelo_epi16(lhs, 69); break; - case 70: lhs = _mm_shufflelo_epi16(lhs, 70); break; - case 71: lhs = _mm_shufflelo_epi16(lhs, 71); break; - case 72: lhs = _mm_shufflelo_epi16(lhs, 72); break; - case 73: lhs = _mm_shufflelo_epi16(lhs, 73); break; - case 74: lhs = _mm_shufflelo_epi16(lhs, 74); break; - case 75: lhs = _mm_shufflelo_epi16(lhs, 75); break; - case 76: lhs = _mm_shufflelo_epi16(lhs, 76); break; - case 77: lhs = _mm_shufflelo_epi16(lhs, 77); break; - case 78: lhs = _mm_shufflelo_epi16(lhs, 78); break; - case 79: lhs = _mm_shufflelo_epi16(lhs, 79); break; - case 80: lhs = _mm_shufflelo_epi16(lhs, 80); break; - case 81: lhs = _mm_shufflelo_epi16(lhs, 81); break; - case 82: lhs = _mm_shufflelo_epi16(lhs, 82); break; - case 83: lhs = _mm_shufflelo_epi16(lhs, 83); break; - case 84: lhs = _mm_shufflelo_epi16(lhs, 84); break; - case 85: lhs = _mm_shufflelo_epi16(lhs, 85); break; - case 86: lhs = _mm_shufflelo_epi16(lhs, 86); break; - case 87: lhs = _mm_shufflelo_epi16(lhs, 87); break; - case 88: lhs = _mm_shufflelo_epi16(lhs, 88); break; - case 89: lhs = _mm_shufflelo_epi16(lhs, 89); break; - case 90: lhs = _mm_shufflelo_epi16(lhs, 90); break; - case 91: lhs = _mm_shufflelo_epi16(lhs, 91); break; - case 92: lhs = _mm_shufflelo_epi16(lhs, 92); break; - case 93: lhs = _mm_shufflelo_epi16(lhs, 93); break; - case 94: lhs = _mm_shufflelo_epi16(lhs, 94); break; - case 95: lhs = _mm_shufflelo_epi16(lhs, 95); break; - case 96: lhs = _mm_shufflelo_epi16(lhs, 96); break; - case 97: lhs = _mm_shufflelo_epi16(lhs, 97); break; - case 98: lhs = _mm_shufflelo_epi16(lhs, 98); break; - case 99: lhs = _mm_shufflelo_epi16(lhs, 99); break; - case 100: lhs = _mm_shufflelo_epi16(lhs, 100); break; - case 101: lhs = _mm_shufflelo_epi16(lhs, 101); break; - case 102: lhs = _mm_shufflelo_epi16(lhs, 102); break; - case 103: lhs = _mm_shufflelo_epi16(lhs, 103); break; - case 104: lhs = _mm_shufflelo_epi16(lhs, 104); break; - case 105: lhs = _mm_shufflelo_epi16(lhs, 105); break; - case 106: lhs = _mm_shufflelo_epi16(lhs, 106); break; - case 107: lhs = _mm_shufflelo_epi16(lhs, 107); break; - case 108: lhs = _mm_shufflelo_epi16(lhs, 108); break; - case 109: lhs = _mm_shufflelo_epi16(lhs, 109); break; - case 110: lhs = _mm_shufflelo_epi16(lhs, 110); break; - case 111: lhs = _mm_shufflelo_epi16(lhs, 111); break; - case 112: lhs = _mm_shufflelo_epi16(lhs, 112); break; - case 113: lhs = _mm_shufflelo_epi16(lhs, 113); break; - case 114: lhs = _mm_shufflelo_epi16(lhs, 114); break; - case 115: lhs = _mm_shufflelo_epi16(lhs, 115); break; - case 116: lhs = _mm_shufflelo_epi16(lhs, 116); break; - case 117: lhs = _mm_shufflelo_epi16(lhs, 117); break; - case 118: lhs = _mm_shufflelo_epi16(lhs, 118); break; - case 119: lhs = _mm_shufflelo_epi16(lhs, 119); break; - case 120: lhs = _mm_shufflelo_epi16(lhs, 120); break; - case 121: lhs = _mm_shufflelo_epi16(lhs, 121); break; - case 122: lhs = _mm_shufflelo_epi16(lhs, 122); break; - case 123: lhs = _mm_shufflelo_epi16(lhs, 123); break; - case 124: lhs = _mm_shufflelo_epi16(lhs, 124); break; - case 125: lhs = _mm_shufflelo_epi16(lhs, 125); break; - case 126: lhs = _mm_shufflelo_epi16(lhs, 126); break; - case 127: lhs = _mm_shufflelo_epi16(lhs, 127); break; - case 128: lhs = _mm_shufflelo_epi16(lhs, 128); break; - case 129: lhs = _mm_shufflelo_epi16(lhs, 129); break; - case 130: lhs = _mm_shufflelo_epi16(lhs, 130); break; - case 131: lhs = _mm_shufflelo_epi16(lhs, 131); break; - case 132: lhs = _mm_shufflelo_epi16(lhs, 132); break; - case 133: lhs = _mm_shufflelo_epi16(lhs, 133); break; - case 134: lhs = _mm_shufflelo_epi16(lhs, 134); break; - case 135: lhs = _mm_shufflelo_epi16(lhs, 135); break; - case 136: lhs = _mm_shufflelo_epi16(lhs, 136); break; - case 137: lhs = _mm_shufflelo_epi16(lhs, 137); break; - case 138: lhs = _mm_shufflelo_epi16(lhs, 138); break; - case 139: lhs = _mm_shufflelo_epi16(lhs, 139); break; - case 140: lhs = _mm_shufflelo_epi16(lhs, 140); break; - case 141: lhs = _mm_shufflelo_epi16(lhs, 141); break; - case 142: lhs = _mm_shufflelo_epi16(lhs, 142); break; - case 143: lhs = _mm_shufflelo_epi16(lhs, 143); break; - case 144: lhs = _mm_shufflelo_epi16(lhs, 144); break; - case 145: lhs = _mm_shufflelo_epi16(lhs, 145); break; - case 146: lhs = _mm_shufflelo_epi16(lhs, 146); break; - case 147: lhs = _mm_shufflelo_epi16(lhs, 147); break; - case 148: lhs = _mm_shufflelo_epi16(lhs, 148); break; - case 149: lhs = _mm_shufflelo_epi16(lhs, 149); break; - case 150: lhs = _mm_shufflelo_epi16(lhs, 150); break; - case 151: lhs = _mm_shufflelo_epi16(lhs, 151); break; - case 152: lhs = _mm_shufflelo_epi16(lhs, 152); break; - case 153: lhs = _mm_shufflelo_epi16(lhs, 153); break; - case 154: lhs = _mm_shufflelo_epi16(lhs, 154); break; - case 155: lhs = _mm_shufflelo_epi16(lhs, 155); break; - case 156: lhs = _mm_shufflelo_epi16(lhs, 156); break; - case 157: lhs = _mm_shufflelo_epi16(lhs, 157); break; - case 158: lhs = _mm_shufflelo_epi16(lhs, 158); break; - case 159: lhs = _mm_shufflelo_epi16(lhs, 159); break; - case 160: lhs = _mm_shufflelo_epi16(lhs, 160); break; - case 161: lhs = _mm_shufflelo_epi16(lhs, 161); break; - case 162: lhs = _mm_shufflelo_epi16(lhs, 162); break; - case 163: lhs = _mm_shufflelo_epi16(lhs, 163); break; - case 164: lhs = _mm_shufflelo_epi16(lhs, 164); break; - case 165: lhs = _mm_shufflelo_epi16(lhs, 165); break; - case 166: lhs = _mm_shufflelo_epi16(lhs, 166); break; - case 167: lhs = _mm_shufflelo_epi16(lhs, 167); break; - case 168: lhs = _mm_shufflelo_epi16(lhs, 168); break; - case 169: lhs = _mm_shufflelo_epi16(lhs, 169); break; - case 170: lhs = _mm_shufflelo_epi16(lhs, 170); break; - case 171: lhs = _mm_shufflelo_epi16(lhs, 171); break; - case 172: lhs = _mm_shufflelo_epi16(lhs, 172); break; - case 173: lhs = _mm_shufflelo_epi16(lhs, 173); break; - case 174: lhs = _mm_shufflelo_epi16(lhs, 174); break; - case 175: lhs = _mm_shufflelo_epi16(lhs, 175); break; - case 176: lhs = _mm_shufflelo_epi16(lhs, 176); break; - case 177: lhs = _mm_shufflelo_epi16(lhs, 177); break; - case 178: lhs = _mm_shufflelo_epi16(lhs, 178); break; - case 179: lhs = _mm_shufflelo_epi16(lhs, 179); break; - case 180: lhs = _mm_shufflelo_epi16(lhs, 180); break; - case 181: lhs = _mm_shufflelo_epi16(lhs, 181); break; - case 182: lhs = _mm_shufflelo_epi16(lhs, 182); break; - case 183: lhs = _mm_shufflelo_epi16(lhs, 183); break; - case 184: lhs = _mm_shufflelo_epi16(lhs, 184); break; - case 185: lhs = _mm_shufflelo_epi16(lhs, 185); break; - case 186: lhs = _mm_shufflelo_epi16(lhs, 186); break; - case 187: lhs = _mm_shufflelo_epi16(lhs, 187); break; - case 188: lhs = _mm_shufflelo_epi16(lhs, 188); break; - case 189: lhs = _mm_shufflelo_epi16(lhs, 189); break; - case 190: lhs = _mm_shufflelo_epi16(lhs, 190); break; - case 191: lhs = _mm_shufflelo_epi16(lhs, 191); break; - case 192: lhs = _mm_shufflelo_epi16(lhs, 192); break; - case 193: lhs = _mm_shufflelo_epi16(lhs, 193); break; - case 194: lhs = _mm_shufflelo_epi16(lhs, 194); break; - case 195: lhs = _mm_shufflelo_epi16(lhs, 195); break; - case 196: lhs = _mm_shufflelo_epi16(lhs, 196); break; - case 197: lhs = _mm_shufflelo_epi16(lhs, 197); break; - case 198: lhs = _mm_shufflelo_epi16(lhs, 198); break; - case 199: lhs = _mm_shufflelo_epi16(lhs, 199); break; - case 200: lhs = _mm_shufflelo_epi16(lhs, 200); break; - case 201: lhs = _mm_shufflelo_epi16(lhs, 201); break; - case 202: lhs = _mm_shufflelo_epi16(lhs, 202); break; - case 203: lhs = _mm_shufflelo_epi16(lhs, 203); break; - case 204: lhs = _mm_shufflelo_epi16(lhs, 204); break; - case 205: lhs = _mm_shufflelo_epi16(lhs, 205); break; - case 206: lhs = _mm_shufflelo_epi16(lhs, 206); break; - case 207: lhs = _mm_shufflelo_epi16(lhs, 207); break; - case 208: lhs = _mm_shufflelo_epi16(lhs, 208); break; - case 209: lhs = _mm_shufflelo_epi16(lhs, 209); break; - case 210: lhs = _mm_shufflelo_epi16(lhs, 210); break; - case 211: lhs = _mm_shufflelo_epi16(lhs, 211); break; - case 212: lhs = _mm_shufflelo_epi16(lhs, 212); break; - case 213: lhs = _mm_shufflelo_epi16(lhs, 213); break; - case 214: lhs = _mm_shufflelo_epi16(lhs, 214); break; - case 215: lhs = _mm_shufflelo_epi16(lhs, 215); break; - case 216: lhs = _mm_shufflelo_epi16(lhs, 216); break; - case 217: lhs = _mm_shufflelo_epi16(lhs, 217); break; - case 218: lhs = _mm_shufflelo_epi16(lhs, 218); break; - case 219: lhs = _mm_shufflelo_epi16(lhs, 219); break; - case 220: lhs = _mm_shufflelo_epi16(lhs, 220); break; - case 221: lhs = _mm_shufflelo_epi16(lhs, 221); break; - case 222: lhs = _mm_shufflelo_epi16(lhs, 222); break; - case 223: lhs = _mm_shufflelo_epi16(lhs, 223); break; - case 224: lhs = _mm_shufflelo_epi16(lhs, 224); break; - case 225: lhs = _mm_shufflelo_epi16(lhs, 225); break; - case 226: lhs = _mm_shufflelo_epi16(lhs, 226); break; - case 227: lhs = _mm_shufflelo_epi16(lhs, 227); break; - case 228: lhs = _mm_shufflelo_epi16(lhs, 228); break; - case 229: lhs = _mm_shufflelo_epi16(lhs, 229); break; - case 230: lhs = _mm_shufflelo_epi16(lhs, 230); break; - case 231: lhs = _mm_shufflelo_epi16(lhs, 231); break; - case 232: lhs = _mm_shufflelo_epi16(lhs, 232); break; - case 233: lhs = _mm_shufflelo_epi16(lhs, 233); break; - case 234: lhs = _mm_shufflelo_epi16(lhs, 234); break; - case 235: lhs = _mm_shufflelo_epi16(lhs, 235); break; - case 236: lhs = _mm_shufflelo_epi16(lhs, 236); break; - case 237: lhs = _mm_shufflelo_epi16(lhs, 237); break; - case 238: lhs = _mm_shufflelo_epi16(lhs, 238); break; - case 239: lhs = _mm_shufflelo_epi16(lhs, 239); break; - case 240: lhs = _mm_shufflelo_epi16(lhs, 240); break; - case 241: lhs = _mm_shufflelo_epi16(lhs, 241); break; - case 242: lhs = _mm_shufflelo_epi16(lhs, 242); break; - case 243: lhs = _mm_shufflelo_epi16(lhs, 243); break; - case 244: lhs = _mm_shufflelo_epi16(lhs, 244); break; - case 245: lhs = _mm_shufflelo_epi16(lhs, 245); break; - case 246: lhs = _mm_shufflelo_epi16(lhs, 246); break; - case 247: lhs = _mm_shufflelo_epi16(lhs, 247); break; - case 248: lhs = _mm_shufflelo_epi16(lhs, 248); break; - case 249: lhs = _mm_shufflelo_epi16(lhs, 249); break; - case 250: lhs = _mm_shufflelo_epi16(lhs, 250); break; - case 251: lhs = _mm_shufflelo_epi16(lhs, 251); break; - case 252: lhs = _mm_shufflelo_epi16(lhs, 252); break; - case 253: lhs = _mm_shufflelo_epi16(lhs, 253); break; - case 254: lhs = _mm_shufflelo_epi16(lhs, 254); break; - case 255: lhs = _mm_shufflelo_epi16(lhs, 255); break; - } - - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + __m128i lhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; + + switch(_Imm) + { + case 0: lhs = _mm_shufflelo_epi16(lhs, 0); break; + case 1: lhs = _mm_shufflelo_epi16(lhs, 1); break; + case 2: lhs = _mm_shufflelo_epi16(lhs, 2); break; + case 3: lhs = _mm_shufflelo_epi16(lhs, 3); break; + case 4: lhs = _mm_shufflelo_epi16(lhs, 4); break; + case 5: lhs = _mm_shufflelo_epi16(lhs, 5); break; + case 6: lhs = _mm_shufflelo_epi16(lhs, 6); break; + case 7: lhs = _mm_shufflelo_epi16(lhs, 7); break; + case 8: lhs = _mm_shufflelo_epi16(lhs, 8); break; + case 9: lhs = _mm_shufflelo_epi16(lhs, 9); break; + case 10: lhs = _mm_shufflelo_epi16(lhs, 10); break; + case 11: lhs = _mm_shufflelo_epi16(lhs, 11); break; + case 12: lhs = _mm_shufflelo_epi16(lhs, 12); break; + case 13: lhs = _mm_shufflelo_epi16(lhs, 13); break; + case 14: lhs = _mm_shufflelo_epi16(lhs, 14); break; + case 15: lhs = _mm_shufflelo_epi16(lhs, 15); break; + case 16: lhs = _mm_shufflelo_epi16(lhs, 16); break; + case 17: lhs = _mm_shufflelo_epi16(lhs, 17); break; + case 18: lhs = _mm_shufflelo_epi16(lhs, 18); break; + case 19: lhs = _mm_shufflelo_epi16(lhs, 19); break; + case 20: lhs = _mm_shufflelo_epi16(lhs, 20); break; + case 21: lhs = _mm_shufflelo_epi16(lhs, 21); break; + case 22: lhs = _mm_shufflelo_epi16(lhs, 22); break; + case 23: lhs = _mm_shufflelo_epi16(lhs, 23); break; + case 24: lhs = _mm_shufflelo_epi16(lhs, 24); break; + case 25: lhs = _mm_shufflelo_epi16(lhs, 25); break; + case 26: lhs = _mm_shufflelo_epi16(lhs, 26); break; + case 27: lhs = _mm_shufflelo_epi16(lhs, 27); break; + case 28: lhs = _mm_shufflelo_epi16(lhs, 28); break; + case 29: lhs = _mm_shufflelo_epi16(lhs, 29); break; + case 30: lhs = _mm_shufflelo_epi16(lhs, 30); break; + case 31: lhs = _mm_shufflelo_epi16(lhs, 31); break; + case 32: lhs = _mm_shufflelo_epi16(lhs, 32); break; + case 33: lhs = _mm_shufflelo_epi16(lhs, 33); break; + case 34: lhs = _mm_shufflelo_epi16(lhs, 34); break; + case 35: lhs = _mm_shufflelo_epi16(lhs, 35); break; + case 36: lhs = _mm_shufflelo_epi16(lhs, 36); break; + case 37: lhs = _mm_shufflelo_epi16(lhs, 37); break; + case 38: lhs = _mm_shufflelo_epi16(lhs, 38); break; + case 39: lhs = _mm_shufflelo_epi16(lhs, 39); break; + case 40: lhs = _mm_shufflelo_epi16(lhs, 40); break; + case 41: lhs = _mm_shufflelo_epi16(lhs, 41); break; + case 42: lhs = _mm_shufflelo_epi16(lhs, 42); break; + case 43: lhs = _mm_shufflelo_epi16(lhs, 43); break; + case 44: lhs = _mm_shufflelo_epi16(lhs, 44); break; + case 45: lhs = _mm_shufflelo_epi16(lhs, 45); break; + case 46: lhs = _mm_shufflelo_epi16(lhs, 46); break; + case 47: lhs = _mm_shufflelo_epi16(lhs, 47); break; + case 48: lhs = _mm_shufflelo_epi16(lhs, 48); break; + case 49: lhs = _mm_shufflelo_epi16(lhs, 49); break; + case 50: lhs = _mm_shufflelo_epi16(lhs, 50); break; + case 51: lhs = _mm_shufflelo_epi16(lhs, 51); break; + case 52: lhs = _mm_shufflelo_epi16(lhs, 52); break; + case 53: lhs = _mm_shufflelo_epi16(lhs, 53); break; + case 54: lhs = _mm_shufflelo_epi16(lhs, 54); break; + case 55: lhs = _mm_shufflelo_epi16(lhs, 55); break; + case 56: lhs = _mm_shufflelo_epi16(lhs, 56); break; + case 57: lhs = _mm_shufflelo_epi16(lhs, 57); break; + case 58: lhs = _mm_shufflelo_epi16(lhs, 58); break; + case 59: lhs = _mm_shufflelo_epi16(lhs, 59); break; + case 60: lhs = _mm_shufflelo_epi16(lhs, 60); break; + case 61: lhs = _mm_shufflelo_epi16(lhs, 61); break; + case 62: lhs = _mm_shufflelo_epi16(lhs, 62); break; + case 63: lhs = _mm_shufflelo_epi16(lhs, 63); break; + case 64: lhs = _mm_shufflelo_epi16(lhs, 64); break; + case 65: lhs = _mm_shufflelo_epi16(lhs, 65); break; + case 66: lhs = _mm_shufflelo_epi16(lhs, 66); break; + case 67: lhs = _mm_shufflelo_epi16(lhs, 67); break; + case 68: lhs = _mm_shufflelo_epi16(lhs, 68); break; + case 69: lhs = _mm_shufflelo_epi16(lhs, 69); break; + case 70: lhs = _mm_shufflelo_epi16(lhs, 70); break; + case 71: lhs = _mm_shufflelo_epi16(lhs, 71); break; + case 72: lhs = _mm_shufflelo_epi16(lhs, 72); break; + case 73: lhs = _mm_shufflelo_epi16(lhs, 73); break; + case 74: lhs = _mm_shufflelo_epi16(lhs, 74); break; + case 75: lhs = _mm_shufflelo_epi16(lhs, 75); break; + case 76: lhs = _mm_shufflelo_epi16(lhs, 76); break; + case 77: lhs = _mm_shufflelo_epi16(lhs, 77); break; + case 78: lhs = _mm_shufflelo_epi16(lhs, 78); break; + case 79: lhs = _mm_shufflelo_epi16(lhs, 79); break; + case 80: lhs = _mm_shufflelo_epi16(lhs, 80); break; + case 81: lhs = _mm_shufflelo_epi16(lhs, 81); break; + case 82: lhs = _mm_shufflelo_epi16(lhs, 82); break; + case 83: lhs = _mm_shufflelo_epi16(lhs, 83); break; + case 84: lhs = _mm_shufflelo_epi16(lhs, 84); break; + case 85: lhs = _mm_shufflelo_epi16(lhs, 85); break; + case 86: lhs = _mm_shufflelo_epi16(lhs, 86); break; + case 87: lhs = _mm_shufflelo_epi16(lhs, 87); break; + case 88: lhs = _mm_shufflelo_epi16(lhs, 88); break; + case 89: lhs = _mm_shufflelo_epi16(lhs, 89); break; + case 90: lhs = _mm_shufflelo_epi16(lhs, 90); break; + case 91: lhs = _mm_shufflelo_epi16(lhs, 91); break; + case 92: lhs = _mm_shufflelo_epi16(lhs, 92); break; + case 93: lhs = _mm_shufflelo_epi16(lhs, 93); break; + case 94: lhs = _mm_shufflelo_epi16(lhs, 94); break; + case 95: lhs = _mm_shufflelo_epi16(lhs, 95); break; + case 96: lhs = _mm_shufflelo_epi16(lhs, 96); break; + case 97: lhs = _mm_shufflelo_epi16(lhs, 97); break; + case 98: lhs = _mm_shufflelo_epi16(lhs, 98); break; + case 99: lhs = _mm_shufflelo_epi16(lhs, 99); break; + case 100: lhs = _mm_shufflelo_epi16(lhs, 100); break; + case 101: lhs = _mm_shufflelo_epi16(lhs, 101); break; + case 102: lhs = _mm_shufflelo_epi16(lhs, 102); break; + case 103: lhs = _mm_shufflelo_epi16(lhs, 103); break; + case 104: lhs = _mm_shufflelo_epi16(lhs, 104); break; + case 105: lhs = _mm_shufflelo_epi16(lhs, 105); break; + case 106: lhs = _mm_shufflelo_epi16(lhs, 106); break; + case 107: lhs = _mm_shufflelo_epi16(lhs, 107); break; + case 108: lhs = _mm_shufflelo_epi16(lhs, 108); break; + case 109: lhs = _mm_shufflelo_epi16(lhs, 109); break; + case 110: lhs = _mm_shufflelo_epi16(lhs, 110); break; + case 111: lhs = _mm_shufflelo_epi16(lhs, 111); break; + case 112: lhs = _mm_shufflelo_epi16(lhs, 112); break; + case 113: lhs = _mm_shufflelo_epi16(lhs, 113); break; + case 114: lhs = _mm_shufflelo_epi16(lhs, 114); break; + case 115: lhs = _mm_shufflelo_epi16(lhs, 115); break; + case 116: lhs = _mm_shufflelo_epi16(lhs, 116); break; + case 117: lhs = _mm_shufflelo_epi16(lhs, 117); break; + case 118: lhs = _mm_shufflelo_epi16(lhs, 118); break; + case 119: lhs = _mm_shufflelo_epi16(lhs, 119); break; + case 120: lhs = _mm_shufflelo_epi16(lhs, 120); break; + case 121: lhs = _mm_shufflelo_epi16(lhs, 121); break; + case 122: lhs = _mm_shufflelo_epi16(lhs, 122); break; + case 123: lhs = _mm_shufflelo_epi16(lhs, 123); break; + case 124: lhs = _mm_shufflelo_epi16(lhs, 124); break; + case 125: lhs = _mm_shufflelo_epi16(lhs, 125); break; + case 126: lhs = _mm_shufflelo_epi16(lhs, 126); break; + case 127: lhs = _mm_shufflelo_epi16(lhs, 127); break; + case 128: lhs = _mm_shufflelo_epi16(lhs, 128); break; + case 129: lhs = _mm_shufflelo_epi16(lhs, 129); break; + case 130: lhs = _mm_shufflelo_epi16(lhs, 130); break; + case 131: lhs = _mm_shufflelo_epi16(lhs, 131); break; + case 132: lhs = _mm_shufflelo_epi16(lhs, 132); break; + case 133: lhs = _mm_shufflelo_epi16(lhs, 133); break; + case 134: lhs = _mm_shufflelo_epi16(lhs, 134); break; + case 135: lhs = _mm_shufflelo_epi16(lhs, 135); break; + case 136: lhs = _mm_shufflelo_epi16(lhs, 136); break; + case 137: lhs = _mm_shufflelo_epi16(lhs, 137); break; + case 138: lhs = _mm_shufflelo_epi16(lhs, 138); break; + case 139: lhs = _mm_shufflelo_epi16(lhs, 139); break; + case 140: lhs = _mm_shufflelo_epi16(lhs, 140); break; + case 141: lhs = _mm_shufflelo_epi16(lhs, 141); break; + case 142: lhs = _mm_shufflelo_epi16(lhs, 142); break; + case 143: lhs = _mm_shufflelo_epi16(lhs, 143); break; + case 144: lhs = _mm_shufflelo_epi16(lhs, 144); break; + case 145: lhs = _mm_shufflelo_epi16(lhs, 145); break; + case 146: lhs = _mm_shufflelo_epi16(lhs, 146); break; + case 147: lhs = _mm_shufflelo_epi16(lhs, 147); break; + case 148: lhs = _mm_shufflelo_epi16(lhs, 148); break; + case 149: lhs = _mm_shufflelo_epi16(lhs, 149); break; + case 150: lhs = _mm_shufflelo_epi16(lhs, 150); break; + case 151: lhs = _mm_shufflelo_epi16(lhs, 151); break; + case 152: lhs = _mm_shufflelo_epi16(lhs, 152); break; + case 153: lhs = _mm_shufflelo_epi16(lhs, 153); break; + case 154: lhs = _mm_shufflelo_epi16(lhs, 154); break; + case 155: lhs = _mm_shufflelo_epi16(lhs, 155); break; + case 156: lhs = _mm_shufflelo_epi16(lhs, 156); break; + case 157: lhs = _mm_shufflelo_epi16(lhs, 157); break; + case 158: lhs = _mm_shufflelo_epi16(lhs, 158); break; + case 159: lhs = _mm_shufflelo_epi16(lhs, 159); break; + case 160: lhs = _mm_shufflelo_epi16(lhs, 160); break; + case 161: lhs = _mm_shufflelo_epi16(lhs, 161); break; + case 162: lhs = _mm_shufflelo_epi16(lhs, 162); break; + case 163: lhs = _mm_shufflelo_epi16(lhs, 163); break; + case 164: lhs = _mm_shufflelo_epi16(lhs, 164); break; + case 165: lhs = _mm_shufflelo_epi16(lhs, 165); break; + case 166: lhs = _mm_shufflelo_epi16(lhs, 166); break; + case 167: lhs = _mm_shufflelo_epi16(lhs, 167); break; + case 168: lhs = _mm_shufflelo_epi16(lhs, 168); break; + case 169: lhs = _mm_shufflelo_epi16(lhs, 169); break; + case 170: lhs = _mm_shufflelo_epi16(lhs, 170); break; + case 171: lhs = _mm_shufflelo_epi16(lhs, 171); break; + case 172: lhs = _mm_shufflelo_epi16(lhs, 172); break; + case 173: lhs = _mm_shufflelo_epi16(lhs, 173); break; + case 174: lhs = _mm_shufflelo_epi16(lhs, 174); break; + case 175: lhs = _mm_shufflelo_epi16(lhs, 175); break; + case 176: lhs = _mm_shufflelo_epi16(lhs, 176); break; + case 177: lhs = _mm_shufflelo_epi16(lhs, 177); break; + case 178: lhs = _mm_shufflelo_epi16(lhs, 178); break; + case 179: lhs = _mm_shufflelo_epi16(lhs, 179); break; + case 180: lhs = _mm_shufflelo_epi16(lhs, 180); break; + case 181: lhs = _mm_shufflelo_epi16(lhs, 181); break; + case 182: lhs = _mm_shufflelo_epi16(lhs, 182); break; + case 183: lhs = _mm_shufflelo_epi16(lhs, 183); break; + case 184: lhs = _mm_shufflelo_epi16(lhs, 184); break; + case 185: lhs = _mm_shufflelo_epi16(lhs, 185); break; + case 186: lhs = _mm_shufflelo_epi16(lhs, 186); break; + case 187: lhs = _mm_shufflelo_epi16(lhs, 187); break; + case 188: lhs = _mm_shufflelo_epi16(lhs, 188); break; + case 189: lhs = _mm_shufflelo_epi16(lhs, 189); break; + case 190: lhs = _mm_shufflelo_epi16(lhs, 190); break; + case 191: lhs = _mm_shufflelo_epi16(lhs, 191); break; + case 192: lhs = _mm_shufflelo_epi16(lhs, 192); break; + case 193: lhs = _mm_shufflelo_epi16(lhs, 193); break; + case 194: lhs = _mm_shufflelo_epi16(lhs, 194); break; + case 195: lhs = _mm_shufflelo_epi16(lhs, 195); break; + case 196: lhs = _mm_shufflelo_epi16(lhs, 196); break; + case 197: lhs = _mm_shufflelo_epi16(lhs, 197); break; + case 198: lhs = _mm_shufflelo_epi16(lhs, 198); break; + case 199: lhs = _mm_shufflelo_epi16(lhs, 199); break; + case 200: lhs = _mm_shufflelo_epi16(lhs, 200); break; + case 201: lhs = _mm_shufflelo_epi16(lhs, 201); break; + case 202: lhs = _mm_shufflelo_epi16(lhs, 202); break; + case 203: lhs = _mm_shufflelo_epi16(lhs, 203); break; + case 204: lhs = _mm_shufflelo_epi16(lhs, 204); break; + case 205: lhs = _mm_shufflelo_epi16(lhs, 205); break; + case 206: lhs = _mm_shufflelo_epi16(lhs, 206); break; + case 207: lhs = _mm_shufflelo_epi16(lhs, 207); break; + case 208: lhs = _mm_shufflelo_epi16(lhs, 208); break; + case 209: lhs = _mm_shufflelo_epi16(lhs, 209); break; + case 210: lhs = _mm_shufflelo_epi16(lhs, 210); break; + case 211: lhs = _mm_shufflelo_epi16(lhs, 211); break; + case 212: lhs = _mm_shufflelo_epi16(lhs, 212); break; + case 213: lhs = _mm_shufflelo_epi16(lhs, 213); break; + case 214: lhs = _mm_shufflelo_epi16(lhs, 214); break; + case 215: lhs = _mm_shufflelo_epi16(lhs, 215); break; + case 216: lhs = _mm_shufflelo_epi16(lhs, 216); break; + case 217: lhs = _mm_shufflelo_epi16(lhs, 217); break; + case 218: lhs = _mm_shufflelo_epi16(lhs, 218); break; + case 219: lhs = _mm_shufflelo_epi16(lhs, 219); break; + case 220: lhs = _mm_shufflelo_epi16(lhs, 220); break; + case 221: lhs = _mm_shufflelo_epi16(lhs, 221); break; + case 222: lhs = _mm_shufflelo_epi16(lhs, 222); break; + case 223: lhs = _mm_shufflelo_epi16(lhs, 223); break; + case 224: lhs = _mm_shufflelo_epi16(lhs, 224); break; + case 225: lhs = _mm_shufflelo_epi16(lhs, 225); break; + case 226: lhs = _mm_shufflelo_epi16(lhs, 226); break; + case 227: lhs = _mm_shufflelo_epi16(lhs, 227); break; + case 228: lhs = _mm_shufflelo_epi16(lhs, 228); break; + case 229: lhs = _mm_shufflelo_epi16(lhs, 229); break; + case 230: lhs = _mm_shufflelo_epi16(lhs, 230); break; + case 231: lhs = _mm_shufflelo_epi16(lhs, 231); break; + case 232: lhs = _mm_shufflelo_epi16(lhs, 232); break; + case 233: lhs = _mm_shufflelo_epi16(lhs, 233); break; + case 234: lhs = _mm_shufflelo_epi16(lhs, 234); break; + case 235: lhs = _mm_shufflelo_epi16(lhs, 235); break; + case 236: lhs = _mm_shufflelo_epi16(lhs, 236); break; + case 237: lhs = _mm_shufflelo_epi16(lhs, 237); break; + case 238: lhs = _mm_shufflelo_epi16(lhs, 238); break; + case 239: lhs = _mm_shufflelo_epi16(lhs, 239); break; + case 240: lhs = _mm_shufflelo_epi16(lhs, 240); break; + case 241: lhs = _mm_shufflelo_epi16(lhs, 241); break; + case 242: lhs = _mm_shufflelo_epi16(lhs, 242); break; + case 243: lhs = _mm_shufflelo_epi16(lhs, 243); break; + case 244: lhs = _mm_shufflelo_epi16(lhs, 244); break; + case 245: lhs = _mm_shufflelo_epi16(lhs, 245); break; + case 246: lhs = _mm_shufflelo_epi16(lhs, 246); break; + case 247: lhs = _mm_shufflelo_epi16(lhs, 247); break; + case 248: lhs = _mm_shufflelo_epi16(lhs, 248); break; + case 249: lhs = _mm_shufflelo_epi16(lhs, 249); break; + case 250: lhs = _mm_shufflelo_epi16(lhs, 250); break; + case 251: lhs = _mm_shufflelo_epi16(lhs, 251); break; + case 252: lhs = _mm_shufflelo_epi16(lhs, 252); break; + case 253: lhs = _mm_shufflelo_epi16(lhs, 253); break; + case 254: lhs = _mm_shufflelo_epi16(lhs, 254); break; + case 255: lhs = _mm_shufflelo_epi16(lhs, 255); break; + } + + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pmulhuw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_mulhi_epu16(lhs, rhs); + lhs = _mm_mulhi_epu16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_psadbw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_sad_epu8(lhs, rhs); + lhs = _mm_sad_epu8(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pminsw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_min_epi16(lhs, rhs); + lhs = _mm_min_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } __m64 _m_pmaxsw(__m64 _MM1, __m64 _MM2) { - __m128i lhs = {0}, rhs = {0}; - lhs.m128i_i64[0] = _MM1.m64_i64; + __m128i lhs = {0}, rhs = {0}; + lhs.m128i_i64[0] = _MM1.m64_i64; - rhs.m128i_i64[0] = _MM2.m64_i64; + rhs.m128i_i64[0] = _MM2.m64_i64; - lhs = _mm_max_epi16(lhs, rhs); + lhs = _mm_max_epi16(lhs, rhs); - _MM1.m64_i64 = lhs.m128i_i64[0]; - return _MM1; + _MM1.m64_i64 = lhs.m128i_i64[0]; + return _MM1; } void _mm_stream_pi(__m64* _MM1, __m64 _MM2) { - _mm_stream_si32(&(_MM1->m64_i32[0]), _MM2.m64_i32[0]); - _mm_stream_si32(&(_MM1->m64_i32[1]), _MM2.m64_i32[1]); + _mm_stream_si32(&(_MM1->m64_i32[0]), _MM2.m64_i32[0]); + _mm_stream_si32(&(_MM1->m64_i32[1]), _MM2.m64_i32[1]); } __m64 _mm_cvt_ps2pi(__m128 _A) { - __m64 rv = {0}; - rv.m64_i32[0] = (int)_A.m128_f32[0]; - rv.m64_i32[1] = (int)_A.m128_f32[1]; - return rv; + __m64 rv = {0}; + rv.m64_i32[0] = (int)_A.m128_f32[0]; + rv.m64_i32[1] = (int)_A.m128_f32[1]; + return rv; } __m128 _mm_cvt_pi2ps(__m128 _MM1, __m64 _MM2) { - _MM1.m128_f32[0] = (float)_MM2.m64_i32[0]; - _MM1.m128_f32[1] = (float)_MM2.m64_i32[1]; - return _MM1; + _MM1.m128_f32[0] = (float)_MM2.m64_i32[0]; + _MM1.m128_f32[1] = (float)_MM2.m64_i32[1]; + return _MM1; } // SSE2 __m64 _mm_movepi64_pi64(__m128i _Q) { - __m64 rv = {0}; - rv.m64_i64 = _Q.m128i_i64[0]; - return rv; + __m64 rv = {0}; + rv.m64_i64 = _Q.m128i_i64[0]; + return rv; } diff --git a/src/DSUtil/simd.h b/src/DSUtil/simd.h index 0d6d68a29..a7c8a5bd4 100644 --- a/src/DSUtil/simd.h +++ b/src/DSUtil/simd.h @@ -70,6 +70,7 @@ static __forceinline void cvtps2pi(__m64 &dst,const __m128 &src) {dst=_mm_cvtps_ static __forceinline void cmpnltps(__m128 &dst,const __m128 &src) {dst=_mm_cmpnlt_ps(dst,src);} static __forceinline void cvtpi2ps(__m128 &dst,const __m64 &src) {dst=_mm_cvtpi32_ps(dst,src);} +#ifdef __SSE2__ static __forceinline void movq(__m128i &dst,const __m128i &src) {dst=src;} static __forceinline void movq(__m128i &dst,const void *src) {dst=*(__m128i*)src;} static __forceinline void movq(const void *dst,__m128i &src) {*(__m128i*)dst=src;} @@ -132,6 +133,8 @@ static __forceinline void movhpd(void *dst,const __m128d &src) {_mm_storeh_pd((d static __forceinline void movlhps(__m128i &dst,const __m128i &src) {(__m128&)dst=_mm_movelh_ps((__m128&)dst,(const __m128&)src);} #endif +#endif //__SSE2__ + //======================================= MMX ====================================== #define MMX_INSTRUCTIONS \ static __forceinline __m setzero_si64(void) {return _mm_setzero_si64();} \ @@ -495,6 +498,7 @@ static __forceinline void memadd(unsigned char *dst,const unsigned char *src,uns } //====================================== SSE2 ====================================== +#ifdef __SSE2__ struct Tsse2 { typedef __m128i __m; @@ -552,6 +556,7 @@ struct Tsse2 static __forceinline void pavgb(__m &mmr1,const void *mmr2) {mmr1=_mm_avg_epu8(mmr1,*(__m*)mmr2);} static __forceinline void sfence(void) {_mm_sfence();} }; +#endif //__SSE2__ template static __forceinline typename _mm::__m abs_16(const typename _mm::__m &mm0) { diff --git a/src/DSUtil/simd_common.h b/src/DSUtil/simd_common.h index 0b5e307e6..50be07c44 100644 --- a/src/DSUtil/simd_common.h +++ b/src/DSUtil/simd_common.h @@ -1,10 +1,14 @@ #ifndef _SIMD_COMMON_H_ #define _SIMD_COMMON_H_ +#if !defined(__GNUC__) + #define __SSE2__ +#endif + #ifdef __GNUC__ - #define __forceinline __attribute__((__always_inline__)) inline -#else - #define inline __forceinline + #ifndef __forceinline + #define __forceinline __attribute__((__always_inline__)) inline + #endif #endif #ifdef __GNUC__ @@ -13,7 +17,9 @@ #include #include -#include +#ifdef __SSE2__ + #include +#endif #ifdef __GNUC__ #undef __inline diff --git a/src/DSUtil/simd_instructions.h b/src/DSUtil/simd_instructions.h index 6ce9a2b29..e5098c731 100644 --- a/src/DSUtil/simd_instructions.h +++ b/src/DSUtil/simd_instructions.h @@ -52,6 +52,8 @@ MMX_INSTRUCTION(pavgb,_mm_avg_pu8) MMX_INSTRUCTION(psadbw,_mm_sad_pu8) +#ifdef __SSE2__ + SSE2I_INSTRUCTION(pand,_mm_and_si128) SSE2I_INSTRUCTION(por,_mm_or_si128) SSE2I_INSTRUCTION(pxor,_mm_xor_si128) @@ -81,3 +83,5 @@ SSE2I_INSTRUCTION(pcmpeqb,_mm_cmpeq_epi8) SSE2I_INSTRUCTION(pcmpeqw,_mm_cmpeq_epi16) SSE2I_INSTRUCTION(pcmpgtb,_mm_cmpgt_epi8) SSE2I_INSTRUCTION(pcmpgtw,_mm_cmpgt_epi16) + +#endif \ No newline at end of file -- cgit v1.2.3