Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/mpc-hc.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author XhmikosR <xhmikosr@users.sourceforge.net> 2011-07-26 23:10:49 +0400
committer XhmikosR <xhmikosr@users.sourceforge.net> 2011-07-26 23:10:49 +0400
commit cba07a38ede1ecb42add37226540cc8fbea528f2 (patch)
tree 96cef7cd473edf320e775d7f32714a190d52826f /src/DSUtil/simd.h
parent 65beae737d11fb724af746b3928154fb465ac666 (diff)
apply astyle
git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@3483 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/DSUtil/simd.h')
-rw-r--r-- src/DSUtil/simd.h 1244
1 files changed, 772 insertions, 472 deletions
diff --git a/src/DSUtil/simd.h b/src/DSUtil/simd.h
index a7c8a5bd4..6e21c5ea1 100644
--- a/src/DSUtil/simd.h
+++ b/src/DSUtil/simd.h
@@ -22,115 +22,373 @@
#undef MMX_INSTRUCTION
#undef SSE2I_INSTRUCTION
-static __forceinline void movq(__m64 &dst,const __m64 &src) {dst=src;}
-static __forceinline void movq(__m64 &dst,const void *src) {dst=*(__m64*)src;}
-static __forceinline void movq(__m64 &dst,const __int64 &src) {dst=*(__m64*)&src;}
-static __forceinline void movq(void *dst,const __m64 &src) {*(__m64*)dst=src;}
-static __forceinline void movntq(void *dst,const __m64 &src) {_mm_stream_pi((__m64*)dst,src);}
-
-static __forceinline void movdqu(__m64 &dst,const void *src) {dst=*(__m64*)src;}
-
-static __forceinline void movd(__m64 &dst,int src) {dst=_mm_cvtsi32_si64(src);}
-static __forceinline void movd(int &dst,const __m64 &src) {dst=_mm_cvtsi64_si32(src);}
-static __forceinline void movd(__m64 &dst,const void *src) {dst=_mm_cvtsi32_si64(*(const int*)src);}
-static __forceinline void movd(void *dst,const __m64 &src) {*(int*)dst=_mm_cvtsi64_si32(src);}
-
-static __forceinline void psllq(__m64 &dst,int i) {dst=_mm_slli_si64(dst,i);}
-static __forceinline void pslld(__m64 &dst,int i) {dst=_mm_slli_pi32(dst,i);}
-static __forceinline void psllw(__m64 &dst,int i) {dst=_mm_slli_pi16(dst,i);}
-static __forceinline void psrlq(__m64 &dst,int i) {dst=_mm_srli_si64(dst,i);}
-static __forceinline void psrld(__m64 &dst,int i) {dst=_mm_srli_pi32(dst,i);}
-static __forceinline void psrlw(__m64 &dst,int i) {dst=_mm_srli_pi16(dst,i);}
-static __forceinline void psraw(__m64 &dst,int i) {dst=_mm_srai_pi16(dst,i);}
-static __forceinline void psraw(__m64 &dst,const __m64 &src) {dst=_mm_sra_pi16(dst,src);}
-static __forceinline void psrad(__m64 &dst,int i) {dst=_mm_srai_pi32(dst,i);}
-
-static __forceinline void prefetcht0(const void *a) {_mm_prefetch((char*)a,_MM_HINT_T0);}
-
-static __forceinline void movaps(__m128 &dst,const __m128 &src) {dst=src;}
-static __forceinline void movaps(void *dst,const __m128 &src) {*(__m128*)dst=src;}
-static __forceinline void movups(__m128 &dst,const void *src) {dst=_mm_loadu_ps((float*)src);}
-static __forceinline void movups(void *dst,const __m128 &src) {_mm_storeu_ps((float*)dst,src);}
-static __forceinline void movss(__m128 &dst,const void *src) {dst=_mm_load_ss((float*)src);}
-static __forceinline void movss(void *dst,const __m128 &src) {_mm_store_ss((float*)dst,src);}
-static __forceinline void movhlps(__m128 &dst,const __m128 &src) {dst=_mm_movehl_ps(dst,src);}
-static __forceinline void movlhps(__m128 &dst,const __m128 &src) {dst=_mm_movelh_ps(dst,src);}
-static __forceinline void movlps(__m128 &dst,const void *src) {dst=_mm_loadl_pi(dst,(const __m64*)src);}
-static __forceinline void movlps(void *dst,const __m128 &src) {_mm_storel_pi((__m64*)dst,src);}
-static __forceinline void movhps(__m128 &dst,const void *src) {dst=_mm_loadh_pi(dst,(const __m64*)src);}
-static __forceinline void movhps(void *dst,const __m128 &src) {_mm_storeh_pi((__m64*)dst,src);}
-
-static __forceinline void xorps(__m128 &dst,const __m128 &src) {dst=_mm_xor_ps(dst,src);}
-static __forceinline void addps(__m128 &dst,const __m128 &src) {dst=_mm_add_ps(dst,src);}
-static __forceinline void addss(__m128 &dst,const __m128 &src) {dst=_mm_add_ss(dst,src);}
-static __forceinline void mulps(__m128 &dst,const __m128 &src) {dst=_mm_mul_ps(dst,src);}
-static __forceinline void mulss(__m128 &dst,const __m128 &src) {dst=_mm_mul_ss(dst,src);}
-static __forceinline void minps(__m128 &dst,const __m128 &src) {dst=_mm_min_ps(dst,src);}
-static __forceinline void cvtps2pi(__m64 &dst,const __m128 &src) {dst=_mm_cvtps_pi32(src);}
-static __forceinline void cmpnltps(__m128 &dst,const __m128 &src) {dst=_mm_cmpnlt_ps(dst,src);}
-static __forceinline void cvtpi2ps(__m128 &dst,const __m64 &src) {dst=_mm_cvtpi32_ps(dst,src);}
+static __forceinline void movq(__m64 &dst,const __m64 &src)
+{
+ dst=src;
+}
+static __forceinline void movq(__m64 &dst,const void *src)
+{
+ dst=*(__m64*)src;
+}
+static __forceinline void movq(__m64 &dst,const __int64 &src)
+{
+ dst=*(__m64*)&src;
+}
+static __forceinline void movq(void *dst,const __m64 &src)
+{
+ *(__m64*)dst=src;
+}
+static __forceinline void movntq(void *dst,const __m64 &src)
+{
+ _mm_stream_pi((__m64*)dst,src);
+}
+
+static __forceinline void movdqu(__m64 &dst,const void *src)
+{
+ dst=*(__m64*)src;
+}
+
+static __forceinline void movd(__m64 &dst,int src)
+{
+ dst=_mm_cvtsi32_si64(src);
+}
+static __forceinline void movd(int &dst,const __m64 &src)
+{
+ dst=_mm_cvtsi64_si32(src);
+}
+static __forceinline void movd(__m64 &dst,const void *src)
+{
+ dst=_mm_cvtsi32_si64(*(const int*)src);
+}
+static __forceinline void movd(void *dst,const __m64 &src)
+{
+ *(int*)dst=_mm_cvtsi64_si32(src);
+}
+
+static __forceinline void psllq(__m64 &dst,int i)
+{
+ dst=_mm_slli_si64(dst,i);
+}
+static __forceinline void pslld(__m64 &dst,int i)
+{
+ dst=_mm_slli_pi32(dst,i);
+}
+static __forceinline void psllw(__m64 &dst,int i)
+{
+ dst=_mm_slli_pi16(dst,i);
+}
+static __forceinline void psrlq(__m64 &dst,int i)
+{
+ dst=_mm_srli_si64(dst,i);
+}
+static __forceinline void psrld(__m64 &dst,int i)
+{
+ dst=_mm_srli_pi32(dst,i);
+}
+static __forceinline void psrlw(__m64 &dst,int i)
+{
+ dst=_mm_srli_pi16(dst,i);
+}
+static __forceinline void psraw(__m64 &dst,int i)
+{
+ dst=_mm_srai_pi16(dst,i);
+}
+static __forceinline void psraw(__m64 &dst,const __m64 &src)
+{
+ dst=_mm_sra_pi16(dst,src);
+}
+static __forceinline void psrad(__m64 &dst,int i)
+{
+ dst=_mm_srai_pi32(dst,i);
+}
+
+static __forceinline void prefetcht0(const void *a)
+{
+ _mm_prefetch((char*)a,_MM_HINT_T0);
+}
+
+static __forceinline void movaps(__m128 &dst,const __m128 &src)
+{
+ dst=src;
+}
+static __forceinline void movaps(void *dst,const __m128 &src)
+{
+ *(__m128*)dst=src;
+}
+static __forceinline void movups(__m128 &dst,const void *src)
+{
+ dst=_mm_loadu_ps((float*)src);
+}
+static __forceinline void movups(void *dst,const __m128 &src)
+{
+ _mm_storeu_ps((float*)dst,src);
+}
+static __forceinline void movss(__m128 &dst,const void *src)
+{
+ dst=_mm_load_ss((float*)src);
+}
+static __forceinline void movss(void *dst,const __m128 &src)
+{
+ _mm_store_ss((float*)dst,src);
+}
+static __forceinline void movhlps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_movehl_ps(dst,src);
+}
+static __forceinline void movlhps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_movelh_ps(dst,src);
+}
+static __forceinline void movlps(__m128 &dst,const void *src)
+{
+ dst=_mm_loadl_pi(dst,(const __m64*)src);
+}
+static __forceinline void movlps(void *dst,const __m128 &src)
+{
+ _mm_storel_pi((__m64*)dst,src);
+}
+static __forceinline void movhps(__m128 &dst,const void *src)
+{
+ dst=_mm_loadh_pi(dst,(const __m64*)src);
+}
+static __forceinline void movhps(void *dst,const __m128 &src)
+{
+ _mm_storeh_pi((__m64*)dst,src);
+}
+
+static __forceinline void xorps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_xor_ps(dst,src);
+}
+static __forceinline void addps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_add_ps(dst,src);
+}
+static __forceinline void addss(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_add_ss(dst,src);
+}
+static __forceinline void mulps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_mul_ps(dst,src);
+}
+static __forceinline void mulss(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_mul_ss(dst,src);
+}
+static __forceinline void minps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_min_ps(dst,src);
+}
+static __forceinline void cvtps2pi(__m64 &dst,const __m128 &src)
+{
+ dst=_mm_cvtps_pi32(src);
+}
+static __forceinline void cmpnltps(__m128 &dst,const __m128 &src)
+{
+ dst=_mm_cmpnlt_ps(dst,src);
+}
+static __forceinline void cvtpi2ps(__m128 &dst,const __m64 &src)
+{
+ dst=_mm_cvtpi32_ps(dst,src);
+}
#ifdef __SSE2__
-static __forceinline void movq(__m128i &dst,const __m128i &src) {dst=src;}
-static __forceinline void movq(__m128i &dst,const void *src) {dst=*(__m128i*)src;}
-static __forceinline void movq(const void *dst,__m128i &src) {*(__m128i*)dst=src;}
-static __forceinline void movd(__m128i &dst,const void *src) {dst=_mm_loadl_epi64((__m128i*)src);}
-static __forceinline void movd(void *dst,const __m128i &src) {_mm_storel_epi64((__m128i*)dst,src);}
-
-static __forceinline void movdqu(__m128i &dst,const void *src) {dst=_mm_loadu_si128((__m128i*)src);}
-static __forceinline void movdqu(__m128i &dst,const __m128i &src) {dst=_mm_loadu_si128(&src);}
-static __forceinline void movdqa(__m128i &dst,const __m128i &src) {dst=src;}
-static __forceinline void movdqa(__m128i &dst,const void * src) {dst=_mm_load_si128((__m128i*)src);}
-static __forceinline void movdqa(void *dst,const __m128i &src) {_mm_store_si128((__m128i*)dst,src);}
-static __forceinline void movntdq(void *dst,const __m128i &src) {_mm_stream_si128((__m128i*)dst,src);}
-static __forceinline void movdq2q(__m64 &dst,const __m128i &src) {dst=_mm_movepi64_pi64(src);}
-
-static __forceinline void psrlw(__m128i &dst,int i) {dst=_mm_srli_epi16(dst,i);}
-static __forceinline void psrlq(__m128i &dst,int i) {dst=_mm_srli_epi64(dst,i);}
-static __forceinline void psrad(__m128i &dst,int i) {dst=_mm_srai_epi32(dst,i);}
-static __forceinline void psraw(__m128i &dst,int i) {dst=_mm_srai_epi16(dst,i);}
-static __forceinline void psraw(__m128i &dst,const __m128i &src) {dst=_mm_sra_epi16(dst,src);}
-static __forceinline void psllw(__m128i &dst,int i) {dst=_mm_slli_epi16(dst,i);}
-static __forceinline void pslld(__m128i &dst,int i) {dst=_mm_slli_epi32(dst,i);}
-static __forceinline void psllq(__m128i &dst,int i) {dst=_mm_slli_epi64(dst,i);}
+static __forceinline void movq(__m128i &dst,const __m128i &src)
+{
+ dst=src;
+}
+static __forceinline void movq(__m128i &dst,const void *src)
+{
+ dst=*(__m128i*)src;
+}
+static __forceinline void movq(const void *dst,__m128i &src)
+{
+ *(__m128i*)dst=src;
+}
+static __forceinline void movd(__m128i &dst,const void *src)
+{
+ dst=_mm_loadl_epi64((__m128i*)src);
+}
+static __forceinline void movd(void *dst,const __m128i &src)
+{
+ _mm_storel_epi64((__m128i*)dst,src);
+}
+
+static __forceinline void movdqu(__m128i &dst,const void *src)
+{
+ dst=_mm_loadu_si128((__m128i*)src);
+}
+static __forceinline void movdqu(__m128i &dst,const __m128i &src)
+{
+ dst=_mm_loadu_si128(&src);
+}
+static __forceinline void movdqa(__m128i &dst,const __m128i &src)
+{
+ dst=src;
+}
+static __forceinline void movdqa(__m128i &dst,const void * src)
+{
+ dst=_mm_load_si128((__m128i*)src);
+}
+static __forceinline void movdqa(void *dst,const __m128i &src)
+{
+ _mm_store_si128((__m128i*)dst,src);
+}
+static __forceinline void movntdq(void *dst,const __m128i &src)
+{
+ _mm_stream_si128((__m128i*)dst,src);
+}
+static __forceinline void movdq2q(__m64 &dst,const __m128i &src)
+{
+ dst=_mm_movepi64_pi64(src);
+}
+
+static __forceinline void psrlw(__m128i &dst,int i)
+{
+ dst=_mm_srli_epi16(dst,i);
+}
+static __forceinline void psrlq(__m128i &dst,int i)
+{
+ dst=_mm_srli_epi64(dst,i);
+}
+static __forceinline void psrad(__m128i &dst,int i)
+{
+ dst=_mm_srai_epi32(dst,i);
+}
+static __forceinline void psraw(__m128i &dst,int i)
+{
+ dst=_mm_srai_epi16(dst,i);
+}
+static __forceinline void psraw(__m128i &dst,const __m128i &src)
+{
+ dst=_mm_sra_epi16(dst,src);
+}
+static __forceinline void psllw(__m128i &dst,int i)
+{
+ dst=_mm_slli_epi16(dst,i);
+}
+static __forceinline void pslld(__m128i &dst,int i)
+{
+ dst=_mm_slli_epi32(dst,i);
+}
+static __forceinline void psllq(__m128i &dst,int i)
+{
+ dst=_mm_slli_epi64(dst,i);
+}
//static __forceinline void pshufd(__m128i &dst,const __m128i &src,const int i) {dst=_mm_shuffle_epi32(src,i);}
//static __forceinline void pshuflw(__m128i &dst,const __m128i &src,const int i) {dst=_mm_shufflelo_epi16(src,i);}
//static __forceinline void pshufhw(__m128i &dst,const __m128i &src,const int i) {dst=_mm_shufflehi_epi16(src,i);}
-static __forceinline void cvtps2dq(__m128i &dst,const __m128 &src) {dst=_mm_cvtps_epi32(src);}
-static __forceinline void cvtdq2ps(__m128 &dst,const __m128i &src) {dst=_mm_cvtepi32_ps(src);}
+static __forceinline void cvtps2dq(__m128i &dst,const __m128 &src)
+{
+ dst=_mm_cvtps_epi32(src);
+}
+static __forceinline void cvtdq2ps(__m128 &dst,const __m128i &src)
+{
+ dst=_mm_cvtepi32_ps(src);
+}
-static __forceinline void movlpd(__m128d &dst,const void *src) {dst=_mm_loadl_pd(dst,(double*)src);}
-static __forceinline void movhpd(__m128d &dst,const void *src) {dst=_mm_loadh_pd(dst,(double*)src);}
-static __forceinline void movlpd(void *dst,const __m128d &src) {_mm_storel_pd((double*)dst,src);}
-static __forceinline void movhpd(void *dst,const __m128d &src) {_mm_storeh_pd((double*)dst,src);}
+static __forceinline void movlpd(__m128d &dst,const void *src)
+{
+ dst=_mm_loadl_pd(dst,(double*)src);
+}
+static __forceinline void movhpd(__m128d &dst,const void *src)
+{
+ dst=_mm_loadh_pd(dst,(double*)src);
+}
+static __forceinline void movlpd(void *dst,const __m128d &src)
+{
+ _mm_storel_pd((double*)dst,src);
+}
+static __forceinline void movhpd(void *dst,const __m128d &src)
+{
+ _mm_storeh_pd((double*)dst,src);
+}
#if defined(__INTEL_COMPILER) || (defined(__GNUC__) && __GNUC__>=4)
- static __forceinline void movlpd(__m128i &dst,const void *src) {dst=_mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(dst),(double*)src));}
- static __forceinline void movhpd(__m128i &dst,const void *src) {dst=_mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(dst),(double*)src));}
- static __forceinline void movlpd(void *dst,const __m128i &src) {_mm_storel_pd((double*)dst,_mm_castsi128_pd(src));}
- static __forceinline void movhpd(void *dst,const __m128i &src) {_mm_storeh_pd((double*)dst,_mm_castsi128_pd(src));}
+static __forceinline void movlpd(__m128i &dst,const void *src)
+{
+ dst=_mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(dst),(double*)src));
+}
+static __forceinline void movhpd(__m128i &dst,const void *src)
+{
+ dst=_mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(dst),(double*)src));
+}
+static __forceinline void movlpd(void *dst,const __m128i &src)
+{
+ _mm_storel_pd((double*)dst,_mm_castsi128_pd(src));
+}
+static __forceinline void movhpd(void *dst,const __m128i &src)
+{
+ _mm_storeh_pd((double*)dst,_mm_castsi128_pd(src));
+}
- static __forceinline void movlps(__m128i &dst,const void *src) {dst=_mm_castps_si128(_mm_loadl_pi(_mm_castsi128_ps(dst),(const __m64*)src));}
- static __forceinline void movlps(void *dst,const __m128i &src) {_mm_storel_pi((__m64*)dst,_mm_castsi128_ps(src));}
- static __forceinline void movhps(__m128i &dst,const void *src) {dst=_mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(dst),(const __m64*)src));}
- static __forceinline void movhps(void *dst,const __m128i &src) {_mm_storeh_pi((__m64*)dst,_mm_castsi128_ps(src));}
+static __forceinline void movlps(__m128i &dst,const void *src)
+{
+ dst=_mm_castps_si128(_mm_loadl_pi(_mm_castsi128_ps(dst),(const __m64*)src));
+}
+static __forceinline void movlps(void *dst,const __m128i &src)
+{
+ _mm_storel_pi((__m64*)dst,_mm_castsi128_ps(src));
+}
+static __forceinline void movhps(__m128i &dst,const void *src)
+{
+ dst=_mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(dst),(const __m64*)src));
+}
+static __forceinline void movhps(void *dst,const __m128i &src)
+{
+ _mm_storeh_pi((__m64*)dst,_mm_castsi128_ps(src));
+}
- static __forceinline void movlhps(__m128i &dst,const __m128i &src) {dst=_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(dst),_mm_castsi128_ps(src)));}
+static __forceinline void movlhps(__m128i &dst,const __m128i &src)
+{
+ dst=_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(dst),_mm_castsi128_ps(src)));
+}
#else
- static __forceinline __m128i _mm_castps_si128(__m128 &src) {return (__m128i&)src;}
- static __forceinline void movlpd(__m128i &dst,const void *src) {(__m128d&)dst=_mm_loadl_pd((__m128d&)dst,(double*)src);}
- static __forceinline void movhpd(__m128i &dst,const void *src) {(__m128d&)dst=_mm_loadh_pd((__m128d&)dst,(double*)src);}
- static __forceinline void movlpd(void *dst,const __m128i &src) {_mm_storel_pd((double*)dst,(const __m128d&)src);}
- static __forceinline void movhpd(void *dst,const __m128i &src) {_mm_storeh_pd((double*)dst,(const __m128d&)src);}
-
- static __forceinline void movlps(__m128i &dst,const void *src) {(__m128&)dst=_mm_loadl_pi((__m128&)dst,(const __m64*)src);}
- static __forceinline void movlps(void *dst,const __m128i &src) {_mm_storel_pi((__m64*)dst,(const __m128&)src);}
- static __forceinline void movhps(__m128i &dst,const void *src) {(__m128&)dst=_mm_loadh_pi((__m128&)dst,(const __m64*)src);}
- static __forceinline void movhps(void *dst,const __m128i &src) {_mm_storeh_pi((__m64*)dst,(const __m128&)src);}
-
- static __forceinline void movlhps(__m128i &dst,const __m128i &src) {(__m128&)dst=_mm_movelh_ps((__m128&)dst,(const __m128&)src);}
+static __forceinline __m128i _mm_castps_si128(__m128 &src)
+{
+ return (__m128i&)src;
+}
+static __forceinline void movlpd(__m128i &dst,const void *src)
+{
+ (__m128d&)dst=_mm_loadl_pd((__m128d&)dst,(double*)src);
+}
+static __forceinline void movhpd(__m128i &dst,const void *src)
+{
+ (__m128d&)dst=_mm_loadh_pd((__m128d&)dst,(double*)src);
+}
+static __forceinline void movlpd(void *dst,const __m128i &src)
+{
+ _mm_storel_pd((double*)dst,(const __m128d&)src);
+}
+static __forceinline void movhpd(void *dst,const __m128i &src)
+{
+ _mm_storeh_pd((double*)dst,(const __m128d&)src);
+}
+
+static __forceinline void movlps(__m128i &dst,const void *src)
+{
+ (__m128&)dst=_mm_loadl_pi((__m128&)dst,(const __m64*)src);
+}
+static __forceinline void movlps(void *dst,const __m128i &src)
+{
+ _mm_storel_pi((__m64*)dst,(const __m128&)src);
+}
+static __forceinline void movhps(__m128i &dst,const void *src)
+{
+ (__m128&)dst=_mm_loadh_pi((__m128&)dst,(const __m64*)src);
+}
+static __forceinline void movhps(void *dst,const __m128i &src)
+{
+ _mm_storeh_pi((__m64*)dst,(const __m128&)src);
+}
+
+static __forceinline void movlhps(__m128i &dst,const __m128i &src)
+{
+ (__m128&)dst=_mm_movelh_ps((__m128&)dst,(const __m128&)src);
+}
#endif
#endif //__SSE2__
@@ -175,405 +433,447 @@ static __forceinline void movhpd(void *dst,const __m128d &src) {_mm_storeh_pd((d
static __forceinline __m loadU(const void *ptr) {return *(__m*)ptr;} \
static __forceinline void empty(void) {_mm_empty();}
-struct Tmmx
-{
- typedef __m64 __m;
- typedef int32_t int2;
- typedef int32_t integer2_t;
- static const size_t size=sizeof(__m);
- static const int align=0;
- typedef Tmmx T64;
-
- static __forceinline void pmaxub(__m64 &mmr1,const __m64 &mmr2)
- {
- mmr1=_mm_subs_pu8(mmr1,mmr2);
- mmr1=_mm_adds_pu8(mmr1,mmr2);
- }
- static __forceinline void pmaxub(__m64 &mmr1,const void *mmr2)
- {
- pmaxub(mmr1,*(__m64*)mmr2);
- }
- static __forceinline void pminub(__m64 &mmr1,const __m64 &mmr2)
- {
- __m64 mmrw;
- pcmpeqb (mmrw,mmrw );
- psubusb (mmrw,mmr2 );
- paddusb (mmr1, mmrw);
- psubusb (mmr1, mmrw);
- }
- static __forceinline void pminub(__m64 &mmr1,const void *mmr2)
- {
- pminub(mmr1,*(__m64*)mmr2);
- }
- static __forceinline void pmaxsw(__m64 &a,const __m64 &b)
- {
- psubusw(a,b);
- paddw(a,b);
- }
- static __forceinline void pminsw(__m64 &mm4,const __m64 &mm0)
- {
- __m64 mm2;
- movq (mm2,mm4);
- psubusw(mm2,mm0);
- psubw (mm4,mm2);
- }
- static __forceinline void pavgb(__m64 &rega,__m64 regb)
- {
- __m64 regr;
- static const __int64 regfe=0xfefefefefefefefeULL;//_mm_set1_pi8(/*0xfe*/-2);
- movq (regr,rega);
- por (regr,regb);
- pxor (regb,rega);
- pand (regb,regfe);
- psrlq (regb,1);
- psubb (regr,regb);
- rega=regr;
- }
- static __forceinline void pavgb(__m64 &rega,const void *regb)
- {
- pavgb(rega,*(__m64*)regb);
- }
- static __forceinline void v_pavgb(__m64 &mmr1,const __m64 &mmr2,__m64 &mmrw,const __int64 &smask)
- {
- movq( mmrw,mmr2 );
- pand( mmrw, smask );
- psrlw( mmrw,1 );
- pand( mmr1,smask );
- psrlw( mmr1,1 );
- paddusb( mmr1,mmrw );
- }
- static __forceinline void v_pavgb(__m64 &mmr1,const void *mmr2,__m64 &mmrw,const __int64 &smask)
- {
- v_pavgb(mmr1,*(__m64*)mmr2,mmrw,smask);
- }
- static __forceinline void sfence(void)
- {
- }
- static __forceinline void movntq(void *dst,const __m64 &src)
- {
- movq(dst,src);
- }
- static __forceinline void v_pminub(__m64 &mmr1,const __m64 &mmr2,__m64 &mmrw)
- {
- pcmpeqb (mmrw,mmrw );
- psubusb (mmrw,mmr2 );
- paddusb (mmr1, mmrw);
- psubusb (mmr1, mmrw);
- }
- static __forceinline void v_pminub(__m64 &mmr1,const __int64 &mmr2,__m64 &mmrw)
- {
- v_pminub(mmr1,*(const __m64*)&mmr2,mmrw);
- }
- static __forceinline void pmulhuw(__m64 &mm3,const __m64 &mm2)
- {
- __m64 mm5;
- movq ( mm5, mm2);
- psraw ( mm5, 15 );
- pand ( mm5, mm3);
- pmulhw ( mm3, mm2);
- paddw ( mm3, mm5);
- }
- static __forceinline void prefetchnta(const void*)
- {
- }
- static __forceinline void prefetcht0(const void*)
- {
- }
- static __forceinline __m64 _mm_shuffle_pi16_0(__m64 mm3)
- {
- __m64 mm2;
- static const __int64 qwLowWord=0x000000000000FFFF;
- pand (mm3, qwLowWord); // mm3 = same limited to low word
- movq (mm2, mm3); // mm2 = same
- psllq (mm3, 16 ); // mm3 = moved to second word
- por (mm2, mm3); // mm2 = copied to first and second words
- movq (mm3, mm2); // mm3 = same
- psllq (mm3, 32 ); // mm3 = moved to third and fourth words
- por (mm2, mm3); // mm2 = low word copied to all four words
- return mm2;
- }
- static __forceinline __m64 _mm_shuffle_pi16_1(const __m64 &src)
- {
- static const __int64 const1=0x00000000FFFF0000LL;
- static const __int64 const2=0x000000000000FFFFLL;
- __m64 w0=_mm_srli_si64(_mm_and_si64(src,*(__m64*)&const1),16);
- __m64 w1=_mm_and_si64(src,*(__m64*)&const2);
- return _mm_or_si64(_mm_or_si64(_mm_or_si64(_mm_slli_si64(w1,48),_mm_slli_si64(w1,32)),_mm_slli_si64(w1,16)),w0);
- }
- static __forceinline __m64 _mm_shuffle_pi16_14(const __m64 &src)
- {
- static const __int64 const1=0x000000000000FFFFLL;
- static const __int64 const2=0xffffffff00000000ULL;
- __m64 w34=_mm_and_si64(src,*(__m64*)&const1);
- __m64 w12=_mm_srli_si64(_mm_and_si64(src,*(__m64*)&const2),32);
- return _mm_or_si64(w12,_mm_or_si64(_mm_slli_si64(w34,32),_mm_slli_si64(w34,48)));
- }
- static __forceinline __m64 _mm_shuffle_pi16_x50(const __m64 &src)
- {
- static const __int64 const1=0x00000000ffff0000LL;
- static const __int64 const2=0x000000000000ffffLL;
- __m64 w3=_mm_and_si64(src,*(__m64*)&const1);
- __m64 w4=_mm_and_si64(src,*(__m64*)&const2);
- return _mm_or_si64( _mm_or_si64(_mm_slli_si64(w3,32),_mm_slli_si64(w3,16)) , _mm_or_si64(_mm_slli_si64(w4,16),w4));
- }
- static __forceinline void psadbw(__m64 &mm0,const __m64 &SourceMM)
- {
- __m64 mm1;
- movq (mm1, SourceMM);
- __m64 mm4;
- movq (mm4, mm0);
- psubusb (mm0, mm1);
- psubusb (mm1, mm4);
- por (mm0, mm1);
- __m64 mm7=_mm_setzero_si64();
- movq (mm1,mm0);
- punpcklbw (mm0,mm7);
- punpckhbw (mm1,mm7);
- paddusw (mm0,mm1);
- static const __int64 mmx_one=0x0001000100010001LL;
- pmaddwd (mm0, mmx_one);
- movq (mm7, mm0);
- psrlq (mm7, 32);
- paddd (mm0, mm7);
- static const __int64 mmx_ffff=0x00000000000fffffLL;
- pand (mm0, mmx_ffff);
- }
- static __forceinline __m64 min_pu8(const __m64 &mm1,const __m64 &mm2)
- {
- __m64 mm0=mm1;
- pminub(mm0,mm2);
- return mm0;
- }
- static __forceinline __m64 max_pu8(const __m64 &mm1,const __m64 &mm2)
- {
- __m64 mm0=mm1;
- pmaxub(mm0,mm2);
- return mm0;
- }
- static __forceinline __m64 min_pi16(const __m64 &mm1,const __m64 &mm2)
- {
- __m64 mm0=mm1;
- pminsw(mm0,mm2);
- return mm0;
- }
- static __forceinline __m64 max_pi16(const __m64 &mm1,const __m64 &mm2)
- {
- __m64 mm0=mm1;
- pmaxsw(mm0,mm2);
- return mm0;
- }
- MMX_INSTRUCTIONS
+struct Tmmx {
+ typedef __m64 __m;
+ typedef int32_t int2;
+ typedef int32_t integer2_t;
+ static const size_t size=sizeof(__m);
+ static const int align=0;
+ typedef Tmmx T64;
+
+ static __forceinline void pmaxub(__m64 &mmr1,const __m64 &mmr2) {
+ mmr1=_mm_subs_pu8(mmr1,mmr2);
+ mmr1=_mm_adds_pu8(mmr1,mmr2);
+ }
+ static __forceinline void pmaxub(__m64 &mmr1,const void *mmr2) {
+ pmaxub(mmr1,*(__m64*)mmr2);
+ }
+ static __forceinline void pminub(__m64 &mmr1,const __m64 &mmr2) {
+ __m64 mmrw;
+ pcmpeqb (mmrw,mmrw );
+ psubusb (mmrw,mmr2 );
+ paddusb (mmr1, mmrw);
+ psubusb (mmr1, mmrw);
+ }
+ static __forceinline void pminub(__m64 &mmr1,const void *mmr2) {
+ pminub(mmr1,*(__m64*)mmr2);
+ }
+ static __forceinline void pmaxsw(__m64 &a,const __m64 &b) {
+ psubusw(a,b);
+ paddw(a,b);
+ }
+ static __forceinline void pminsw(__m64 &mm4,const __m64 &mm0) {
+ __m64 mm2;
+ movq (mm2,mm4);
+ psubusw(mm2,mm0);
+ psubw (mm4,mm2);
+ }
+ static __forceinline void pavgb(__m64 &rega,__m64 regb) {
+ __m64 regr;
+ static const __int64 regfe=0xfefefefefefefefeULL;//_mm_set1_pi8(/*0xfe*/-2);
+ movq (regr,rega);
+ por (regr,regb);
+ pxor (regb,rega);
+ pand (regb,regfe);
+ psrlq (regb,1);
+ psubb (regr,regb);
+ rega=regr;
+ }
+ static __forceinline void pavgb(__m64 &rega,const void *regb) {
+ pavgb(rega,*(__m64*)regb);
+ }
+ static __forceinline void v_pavgb(__m64 &mmr1,const __m64 &mmr2,__m64 &mmrw,const __int64 &smask) {
+ movq( mmrw,mmr2 );
+ pand( mmrw, smask );
+ psrlw( mmrw,1 );
+ pand( mmr1,smask );
+ psrlw( mmr1,1 );
+ paddusb( mmr1,mmrw );
+ }
+ static __forceinline void v_pavgb(__m64 &mmr1,const void *mmr2,__m64 &mmrw,const __int64 &smask) {
+ v_pavgb(mmr1,*(__m64*)mmr2,mmrw,smask);
+ }
+ static __forceinline void sfence(void) {
+ }
+ static __forceinline void movntq(void *dst,const __m64 &src) {
+ movq(dst,src);
+ }
+ static __forceinline void v_pminub(__m64 &mmr1,const __m64 &mmr2,__m64 &mmrw) {
+ pcmpeqb (mmrw,mmrw );
+ psubusb (mmrw,mmr2 );
+ paddusb (mmr1, mmrw);
+ psubusb (mmr1, mmrw);
+ }
+ static __forceinline void v_pminub(__m64 &mmr1,const __int64 &mmr2,__m64 &mmrw) {
+ v_pminub(mmr1,*(const __m64*)&mmr2,mmrw);
+ }
+ static __forceinline void pmulhuw(__m64 &mm3,const __m64 &mm2) {
+ __m64 mm5;
+ movq ( mm5, mm2);
+ psraw ( mm5, 15 );
+ pand ( mm5, mm3);
+ pmulhw ( mm3, mm2);
+ paddw ( mm3, mm5);
+ }
+ static __forceinline void prefetchnta(const void*) {
+ }
+ static __forceinline void prefetcht0(const void*) {
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_0(__m64 mm3) {
+ __m64 mm2;
+ static const __int64 qwLowWord=0x000000000000FFFF;
+ pand (mm3, qwLowWord); // mm3 = same limited to low word
+ movq (mm2, mm3); // mm2 = same
+ psllq (mm3, 16 ); // mm3 = moved to second word
+ por (mm2, mm3); // mm2 = copied to first and second words
+ movq (mm3, mm2); // mm3 = same
+ psllq (mm3, 32 ); // mm3 = moved to third and fourth words
+ por (mm2, mm3); // mm2 = low word copied to all four words
+ return mm2;
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_1(const __m64 &src) {
+ static const __int64 const1=0x00000000FFFF0000LL;
+ static const __int64 const2=0x000000000000FFFFLL;
+ __m64 w0=_mm_srli_si64(_mm_and_si64(src,*(__m64*)&const1),16);
+ __m64 w1=_mm_and_si64(src,*(__m64*)&const2);
+ return _mm_or_si64(_mm_or_si64(_mm_or_si64(_mm_slli_si64(w1,48),_mm_slli_si64(w1,32)),_mm_slli_si64(w1,16)),w0);
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_14(const __m64 &src) {
+ static const __int64 const1=0x000000000000FFFFLL;
+ static const __int64 const2=0xffffffff00000000ULL;
+ __m64 w34=_mm_and_si64(src,*(__m64*)&const1);
+ __m64 w12=_mm_srli_si64(_mm_and_si64(src,*(__m64*)&const2),32);
+ return _mm_or_si64(w12,_mm_or_si64(_mm_slli_si64(w34,32),_mm_slli_si64(w34,48)));
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_x50(const __m64 &src) {
+ static const __int64 const1=0x00000000ffff0000LL;
+ static const __int64 const2=0x000000000000ffffLL;
+ __m64 w3=_mm_and_si64(src,*(__m64*)&const1);
+ __m64 w4=_mm_and_si64(src,*(__m64*)&const2);
+ return _mm_or_si64( _mm_or_si64(_mm_slli_si64(w3,32),_mm_slli_si64(w3,16)) , _mm_or_si64(_mm_slli_si64(w4,16),w4));
+ }
+ static __forceinline void psadbw(__m64 &mm0,const __m64 &SourceMM) {
+ __m64 mm1;
+ movq (mm1, SourceMM);
+ __m64 mm4;
+ movq (mm4, mm0);
+ psubusb (mm0, mm1);
+ psubusb (mm1, mm4);
+ por (mm0, mm1);
+ __m64 mm7=_mm_setzero_si64();
+ movq (mm1,mm0);
+ punpcklbw (mm0,mm7);
+ punpckhbw (mm1,mm7);
+ paddusw (mm0,mm1);
+ static const __int64 mmx_one=0x0001000100010001LL;
+ pmaddwd (mm0, mmx_one);
+ movq (mm7, mm0);
+ psrlq (mm7, 32);
+ paddd (mm0, mm7);
+ static const __int64 mmx_ffff=0x00000000000fffffLL;
+ pand (mm0, mmx_ffff);
+ }
+ static __forceinline __m64 min_pu8(const __m64 &mm1,const __m64 &mm2) {
+ __m64 mm0=mm1;
+ pminub(mm0,mm2);
+ return mm0;
+ }
+ static __forceinline __m64 max_pu8(const __m64 &mm1,const __m64 &mm2) {
+ __m64 mm0=mm1;
+ pmaxub(mm0,mm2);
+ return mm0;
+ }
+ static __forceinline __m64 min_pi16(const __m64 &mm1,const __m64 &mm2) {
+ __m64 mm0=mm1;
+ pminsw(mm0,mm2);
+ return mm0;
+ }
+ static __forceinline __m64 max_pi16(const __m64 &mm1,const __m64 &mm2) {
+ __m64 mm0=mm1;
+ pmaxsw(mm0,mm2);
+ return mm0;
+ }
+ MMX_INSTRUCTIONS
};
//===================================== MMXEXT =====================================
-struct Tmmxext
-{
- typedef Tmmx::__m __m;
- typedef Tmmx::int2 int2;
- static const size_t size=Tmmx::size;
- static const int align=Tmmx::align;
- typedef Tmmxext T64;
-
- static __forceinline void pmaxub(__m64 &mmr1,const __m64 &mmr2)
- {
- mmr1=_mm_max_pu8(mmr1,mmr2);
- }
- static __forceinline void pmaxub(__m64 &mmr1,const void *mmr2)
- {
- pmaxub(mmr1,*(__m64*)mmr2);
- }
- static __forceinline void pminub(__m64 &mmr1,const __m64 &mmr2)
- {
- mmr1=_mm_min_pu8(mmr1,mmr2);
- }
- static __forceinline void pminub(__m64 &mmr1,const void *mmr2)
- {
- pminub(mmr1,*(__m64*)mmr2);
- }
- static __forceinline void pminsw(__m64 &mmr1,const __m64 &mmr2)
- {
- mmr1=_mm_min_pi16(mmr1,mmr2);
- }
- static __forceinline void pavgb(__m64 &mmr1,const __m64 &mmr2)
- {
- mmr1=_mm_avg_pu8(mmr1,mmr2);
- }
- static __forceinline void pavgb(__m64 &mmr1,const void *mmr2)
- {
- mmr1=_mm_avg_pu8(mmr1,*(__m64*)mmr2);
- }
- static __forceinline void v_pavgb(__m64 &mmr1,const __m64 &mmr2,__m64,__int64)
- {
- mmr1=_mm_avg_pu8(mmr1,mmr2);
- }
- static __forceinline void v_pavgb(__m64 &mmr1,const void *mmr2,__m64,__int64)
- {
- mmr1=_mm_avg_pu8(mmr1,*(__m64*)mmr2);
- }
- static __forceinline void sfence(void)
- {
- _mm_sfence();
- }
- static __forceinline void movntq(void *dst,const __m64 &src)
- {
- _mm_stream_pi((__m64*)dst,src);
- }
- static __forceinline void v_pminub(__m64 &mmr1,const __m64 &mmr2,__m64)
- {
- mmr1=_mm_min_pu8(mmr1,mmr2);
- }
- static __forceinline void v_pminub(__m64 &mmr1,const __int64 &mmr2,__m64 &mmrw)
- {
- v_pminub(mmr1,*(const __m64*)&mmr2,mmrw);
- }
- static __forceinline void pmulhuw(__m64 &mmr1,const __m64 &mmr2)
- {
- mmr1=_mm_mulhi_pu16(mmr1,mmr2);
- }
- static __forceinline void prefetchnta(const void *ptr)
- {
- _mm_prefetch((const char*)ptr,_MM_HINT_NTA);
- }
- static __forceinline void prefetcht0(const void *ptr)
- {
- _mm_prefetch((const char*)ptr,_MM_HINT_T0);
- }
- static __forceinline __m64 _mm_shuffle_pi16_0(const __m64 &src)
- {
- return _mm_shuffle_pi16(src,0);
- }
- static __forceinline __m64 _mm_shuffle_pi16_1(const __m64 &src)
- {
- return _mm_shuffle_pi16(src,1);
- }
- static __forceinline __m64 _mm_shuffle_pi16_14(const __m64 &src)
- {
- return _mm_shuffle_pi16(src,(3 << 2) + 2);
- }
- static __forceinline __m64 _mm_shuffle_pi16_x50(const __m64 &src)
- {
- return _mm_shuffle_pi16(src,0x50);
- }
- static __forceinline void psadbw(__m64 &mm3,const __m64 &mm2)
- {
- mm3=_mm_sad_pu8(mm3,mm2);
- }
- static __forceinline __m64 min_pu8(const __m64 &mm1,const __m64 &mm2)
- {
- return _mm_min_pu8(mm1,mm2);
- }
- static __forceinline __m64 max_pu8(const __m64 &mm1,const __m64 &mm2)
- {
- return _mm_max_pu8(mm1,mm2);
- }
- static __forceinline __m64 min_pi16(const __m64 &mm1,const __m64 &mm2)
- {
- return _mm_min_pi16(mm1,mm2);
- }
- static __forceinline __m64 max_pi16(const __m64 &mm1,const __m64 &mm2)
- {
- return _mm_max_pi16(mm1,mm2);
- }
- static __forceinline void pmaxsw(__m64 &dst,const __m64 &src)
- {
- dst=_mm_max_pi16(dst,src);
- }
-
- MMX_INSTRUCTIONS
+struct Tmmxext {
+ typedef Tmmx::__m __m;
+ typedef Tmmx::int2 int2;
+ static const size_t size=Tmmx::size;
+ static const int align=Tmmx::align;
+ typedef Tmmxext T64;
+
+ static __forceinline void pmaxub(__m64 &mmr1,const __m64 &mmr2) {
+ mmr1=_mm_max_pu8(mmr1,mmr2);
+ }
+ static __forceinline void pmaxub(__m64 &mmr1,const void *mmr2) {
+ pmaxub(mmr1,*(__m64*)mmr2);
+ }
+ static __forceinline void pminub(__m64 &mmr1,const __m64 &mmr2) {
+ mmr1=_mm_min_pu8(mmr1,mmr2);
+ }
+ static __forceinline void pminub(__m64 &mmr1,const void *mmr2) {
+ pminub(mmr1,*(__m64*)mmr2);
+ }
+ static __forceinline void pminsw(__m64 &mmr1,const __m64 &mmr2) {
+ mmr1=_mm_min_pi16(mmr1,mmr2);
+ }
+ static __forceinline void pavgb(__m64 &mmr1,const __m64 &mmr2) {
+ mmr1=_mm_avg_pu8(mmr1,mmr2);
+ }
+ static __forceinline void pavgb(__m64 &mmr1,const void *mmr2) {
+ mmr1=_mm_avg_pu8(mmr1,*(__m64*)mmr2);
+ }
+ static __forceinline void v_pavgb(__m64 &mmr1,const __m64 &mmr2,__m64,__int64) {
+ mmr1=_mm_avg_pu8(mmr1,mmr2);
+ }
+ static __forceinline void v_pavgb(__m64 &mmr1,const void *mmr2,__m64,__int64) {
+ mmr1=_mm_avg_pu8(mmr1,*(__m64*)mmr2);
+ }
+ static __forceinline void sfence(void) {
+ _mm_sfence();
+ }
+ static __forceinline void movntq(void *dst,const __m64 &src) {
+ _mm_stream_pi((__m64*)dst,src);
+ }
+ static __forceinline void v_pminub(__m64 &mmr1,const __m64 &mmr2,__m64) {
+ mmr1=_mm_min_pu8(mmr1,mmr2);
+ }
+ static __forceinline void v_pminub(__m64 &mmr1,const __int64 &mmr2,__m64 &mmrw) {
+ v_pminub(mmr1,*(const __m64*)&mmr2,mmrw);
+ }
+ static __forceinline void pmulhuw(__m64 &mmr1,const __m64 &mmr2) {
+ mmr1=_mm_mulhi_pu16(mmr1,mmr2);
+ }
+ static __forceinline void prefetchnta(const void *ptr) {
+ _mm_prefetch((const char*)ptr,_MM_HINT_NTA);
+ }
+ static __forceinline void prefetcht0(const void *ptr) {
+ _mm_prefetch((const char*)ptr,_MM_HINT_T0);
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_0(const __m64 &src) {
+ return _mm_shuffle_pi16(src,0);
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_1(const __m64 &src) {
+ return _mm_shuffle_pi16(src,1);
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_14(const __m64 &src) {
+ return _mm_shuffle_pi16(src,(3 << 2) + 2);
+ }
+ static __forceinline __m64 _mm_shuffle_pi16_x50(const __m64 &src) {
+ return _mm_shuffle_pi16(src,0x50);
+ }
+ static __forceinline void psadbw(__m64 &mm3,const __m64 &mm2) {
+ mm3=_mm_sad_pu8(mm3,mm2);
+ }
+ static __forceinline __m64 min_pu8(const __m64 &mm1,const __m64 &mm2) {
+ return _mm_min_pu8(mm1,mm2);
+ }
+ static __forceinline __m64 max_pu8(const __m64 &mm1,const __m64 &mm2) {
+ return _mm_max_pu8(mm1,mm2);
+ }
+ static __forceinline __m64 min_pi16(const __m64 &mm1,const __m64 &mm2) {
+ return _mm_min_pi16(mm1,mm2);
+ }
+ static __forceinline __m64 max_pi16(const __m64 &mm1,const __m64 &mm2) {
+ return _mm_max_pi16(mm1,mm2);
+ }
+ static __forceinline void pmaxsw(__m64 &dst,const __m64 &src) {
+ dst=_mm_max_pi16(dst,src);
+ }
+
+ MMX_INSTRUCTIONS
};
static __forceinline __m64 _mm_absdif_u8(__m64 mm1,__m64 mm2)
{
- __m64 mm7=mm1;
- mm1=_mm_subs_pu8(mm1,mm2);
- mm2=_mm_subs_pu8(mm2,mm7);
- return _mm_or_si64(mm2,mm1);
+ __m64 mm7=mm1;
+ mm1=_mm_subs_pu8(mm1,mm2);
+ mm2=_mm_subs_pu8(mm2,mm7);
+ return _mm_or_si64(mm2,mm1);
}
static __forceinline void memadd(unsigned char *dst,const unsigned char *src,unsigned int len)
{
- __m64 *dst8=(__m64*)dst;const __m64 *src8=(__m64*)src;
- for (unsigned int i=0;i<len/8;i++,src8++,dst8++)
- *dst8=_mm_adds_pu8(*src8,*dst8);
+ __m64 *dst8=(__m64*)dst;
+ const __m64 *src8=(__m64*)src;
+ for (unsigned int i=0; i<len/8; i++,src8++,dst8++) {
+ *dst8=_mm_adds_pu8(*src8,*dst8);
+ }
}
//====================================== SSE2 ======================================
#ifdef __SSE2__
-struct Tsse2
-{
- typedef __m128i __m;
- typedef __m64 int2;
- typedef int64_t integer2_t;
- static const size_t size=sizeof(__m);
- static const int align=16;
- typedef Tmmxext T64;
- static __forceinline __m setzero_si64(void) {return _mm_setzero_si128();}
- static __forceinline __m set_pi8(char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0) {return _mm_set_epi8(b7,b6,b5,b4,b3,b2,b1,b0,b7,b6,b5,b4,b3,b2,b1,b0);}
- static __forceinline __m set_pi32(int i1,int i0) {return _mm_set_epi32(i1,i0,i1,i0);}
- static __forceinline __m set1_pi8(char b) {return _mm_set1_epi8(b);}
- static __forceinline __m set1_pi16(short s) {return _mm_set1_epi16(s);}
- static __forceinline __m set1_pi64(int64_t s) {__align16(int64_t,x[])={s,s};return *(__m*)x;}//__m128i _mm_set1_epi64(*(__m64*)&s); TODO: _mm_set1_epi64x
- static __forceinline __m packs_pu16(const __m &m1,const __m &m2) {return _mm_packus_epi16(m1,m2);}
- static __forceinline __m slli_pi16(const __m &m,int count) {return _mm_slli_epi16(m,count);}
- static __forceinline __m srli_pi16(const __m &m,int count) {return _mm_srli_epi16(m,count);}
- static __forceinline __m srli_si64(const __m &m,int count) {return _mm_srli_epi64(m,count);}
- static __forceinline __m srai_pi16(const __m &m,int count) {return _mm_srai_epi16(m,count);}
- static __forceinline __m madd_pi16(const __m &m1,const __m &m2) {return _mm_madd_epi16(m1,m2);}
- static __forceinline __m add_pi16(const __m &m1,const __m &m2) {return _mm_add_epi16(m1,m2);}
- static __forceinline __m adds_pi16(const __m &m1,const __m &m2) {return _mm_adds_epi16(m1,m2);}
- static __forceinline __m adds_pu16(const __m &m1,const __m &m2) {return _mm_adds_epu16(m1,m2);}
- static __forceinline __m adds_pu8(const __m &m1,const __m &m2) {return _mm_adds_epu8(m1,m2);}
- static __forceinline __m sub_pi16(const __m &m1,const __m &m2) {return _mm_sub_epi16(m1,m2);}
- static __forceinline __m subs_pi16(const __m &m1,const __m &m2) {return _mm_subs_epi16(m1,m2);}
- static __forceinline __m subs_pu16(const __m &m1,const __m &m2) {return _mm_subs_epu16(m1,m2);}
- static __forceinline __m subs_pu8(const __m &m1,const __m &m2) {return _mm_subs_epu8(m1,m2);}
- static __forceinline __m or_si64(const __m &m1,const __m &m2) {return _mm_or_si128(m1,m2);}
- static __forceinline __m xor_si64(const __m &m1,const __m &m2) {return _mm_xor_si128(m1,m2);}
- static __forceinline __m and_si64(const __m &m1,const __m &m2) {return _mm_and_si128(m1,m2);}
- static __forceinline __m andnot_si64(const __m &m1,const __m &m2) {return _mm_andnot_si128(m1,m2);}
- static __forceinline __m mullo_pi16(const __m &m1,const __m &m2) {return _mm_mullo_epi16(m1,m2);}
- static __forceinline __m mulhi_pi16(const __m &m1,const __m &m2) {return _mm_mulhi_epi16(m1,m2);}
- static __forceinline __m unpacklo_pi8(const __m &m1,const __m &m2) {return _mm_unpacklo_epi8(m1,m2);}
- static __forceinline __m unpackhi_pi8(const __m &m1,const __m &m2) {return _mm_unpackhi_epi8(m1,m2);}
- static __forceinline __m cmpgt_pi16(const __m &m1,const __m &m2) {return _mm_cmpgt_epi16(m1,m2);}
- static __forceinline __m cmpeq_pi16(const __m &m1,const __m &m2) {return _mm_cmpeq_epi16(m1,m2);}
- static __forceinline __m cmpeq_pi8(const __m &m1,const __m &m2) {return _mm_cmpeq_epi8(m1,m2);}
- static __forceinline __m min_pi16(const __m &mm1,const __m &mm2) {return _mm_min_epi16(mm1,mm2);}
- static __forceinline __m max_pi16(const __m &mm1,const __m &mm2) {return _mm_max_epi16(mm1,mm2);}
- static __forceinline __m load2(const void *ptr) {return _mm_loadl_epi64((const __m128i*)ptr);}
- static __forceinline void store2(void *ptr,const __m &m) {_mm_storel_epi64((__m128i*)ptr,m);}
- static __forceinline void storeU(void *ptr,const __m &m) { _mm_storeu_si128((__m*)ptr,m);}
- static __forceinline __m loadU(const void *ptr) {return _mm_loadu_si128((const __m*)ptr);}
- static __forceinline void empty(void) {/*_mm_empty();*/}
-
- static __forceinline void psadbw(__m &mm3,const __m &mm2) {mm3=_mm_sad_epu8(mm3,mm2);}
- static __forceinline void prefetchnta(const void *ptr) {_mm_prefetch((const char*)ptr,_MM_HINT_NTA);}
- static __forceinline __m shuffle_pi16_0(const __m &mm0) {return _mm_shufflehi_epi16(_mm_shufflelo_epi16(mm0,0),0);}
- static __forceinline void pmaxub(__m &mmr1,const __m &mmr2) {mmr1=_mm_max_epu8(mmr1,mmr2);}
- static __forceinline void pmulhuw(__m &mmr1,const __m &mmr2) {mmr1=_mm_mulhi_epu16(mmr1,mmr2);}
- static __forceinline void movntq(void *dst,const __m &src) {_mm_stream_si128((__m128i*)dst,src);}
- static __forceinline void pavgb(__m &mmr1,const __m &mmr2) {mmr1=_mm_avg_epu8(mmr1,mmr2);}
- static __forceinline void pavgb(__m &mmr1,const void *mmr2) {mmr1=_mm_avg_epu8(mmr1,*(__m*)mmr2);}
- static __forceinline void sfence(void) {_mm_sfence();}
+struct Tsse2 {
+ typedef __m128i __m;
+ typedef __m64 int2;
+ typedef int64_t integer2_t;
+ static const size_t size=sizeof(__m);
+ static const int align=16;
+ typedef Tmmxext T64;
+ static __forceinline __m setzero_si64(void) {
+ return _mm_setzero_si128();
+ }
+ static __forceinline __m set_pi8(char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0) {
+ return _mm_set_epi8(b7,b6,b5,b4,b3,b2,b1,b0,b7,b6,b5,b4,b3,b2,b1,b0);
+ }
+ static __forceinline __m set_pi32(int i1,int i0) {
+ return _mm_set_epi32(i1,i0,i1,i0);
+ }
+ static __forceinline __m set1_pi8(char b) {
+ return _mm_set1_epi8(b);
+ }
+ static __forceinline __m set1_pi16(short s) {
+ return _mm_set1_epi16(s);
+ }
+ static __forceinline __m set1_pi64(int64_t s) {
+ __align16(int64_t,x[])= {s,s}; //__m128i _mm_set1_epi64(*(__m64*)&s); TODO: _mm_set1_epi64x
+ return *(__m*)x;
+ }
+ static __forceinline __m packs_pu16(const __m &m1,const __m &m2) {
+ return _mm_packus_epi16(m1,m2);
+ }
+ static __forceinline __m slli_pi16(const __m &m,int count) {
+ return _mm_slli_epi16(m,count);
+ }
+ static __forceinline __m srli_pi16(const __m &m,int count) {
+ return _mm_srli_epi16(m,count);
+ }
+ static __forceinline __m srli_si64(const __m &m,int count) {
+ return _mm_srli_epi64(m,count);
+ }
+ static __forceinline __m srai_pi16(const __m &m,int count) {
+ return _mm_srai_epi16(m,count);
+ }
+ static __forceinline __m madd_pi16(const __m &m1,const __m &m2) {
+ return _mm_madd_epi16(m1,m2);
+ }
+ static __forceinline __m add_pi16(const __m &m1,const __m &m2) {
+ return _mm_add_epi16(m1,m2);
+ }
+ static __forceinline __m adds_pi16(const __m &m1,const __m &m2) {
+ return _mm_adds_epi16(m1,m2);
+ }
+ static __forceinline __m adds_pu16(const __m &m1,const __m &m2) {
+ return _mm_adds_epu16(m1,m2);
+ }
+ static __forceinline __m adds_pu8(const __m &m1,const __m &m2) {
+ return _mm_adds_epu8(m1,m2);
+ }
+ static __forceinline __m sub_pi16(const __m &m1,const __m &m2) {
+ return _mm_sub_epi16(m1,m2);
+ }
+ static __forceinline __m subs_pi16(const __m &m1,const __m &m2) {
+ return _mm_subs_epi16(m1,m2);
+ }
+ static __forceinline __m subs_pu16(const __m &m1,const __m &m2) {
+ return _mm_subs_epu16(m1,m2);
+ }
+ static __forceinline __m subs_pu8(const __m &m1,const __m &m2) {
+ return _mm_subs_epu8(m1,m2);
+ }
+ static __forceinline __m or_si64(const __m &m1,const __m &m2) {
+ return _mm_or_si128(m1,m2);
+ }
+ static __forceinline __m xor_si64(const __m &m1,const __m &m2) {
+ return _mm_xor_si128(m1,m2);
+ }
+ static __forceinline __m and_si64(const __m &m1,const __m &m2) {
+ return _mm_and_si128(m1,m2);
+ }
+ static __forceinline __m andnot_si64(const __m &m1,const __m &m2) {
+ return _mm_andnot_si128(m1,m2);
+ }
+ static __forceinline __m mullo_pi16(const __m &m1,const __m &m2) {
+ return _mm_mullo_epi16(m1,m2);
+ }
+ static __forceinline __m mulhi_pi16(const __m &m1,const __m &m2) {
+ return _mm_mulhi_epi16(m1,m2);
+ }
+ static __forceinline __m unpacklo_pi8(const __m &m1,const __m &m2) {
+ return _mm_unpacklo_epi8(m1,m2);
+ }
+ static __forceinline __m unpackhi_pi8(const __m &m1,const __m &m2) {
+ return _mm_unpackhi_epi8(m1,m2);
+ }
+ static __forceinline __m cmpgt_pi16(const __m &m1,const __m &m2) {
+ return _mm_cmpgt_epi16(m1,m2);
+ }
+ static __forceinline __m cmpeq_pi16(const __m &m1,const __m &m2) {
+ return _mm_cmpeq_epi16(m1,m2);
+ }
+ static __forceinline __m cmpeq_pi8(const __m &m1,const __m &m2) {
+ return _mm_cmpeq_epi8(m1,m2);
+ }
+ static __forceinline __m min_pi16(const __m &mm1,const __m &mm2) {
+ return _mm_min_epi16(mm1,mm2);
+ }
+ static __forceinline __m max_pi16(const __m &mm1,const __m &mm2) {
+ return _mm_max_epi16(mm1,mm2);
+ }
+ static __forceinline __m load2(const void *ptr) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ static __forceinline void store2(void *ptr,const __m &m) {
+ _mm_storel_epi64((__m128i*)ptr,m);
+ }
+ static __forceinline void storeU(void *ptr,const __m &m) {
+ _mm_storeu_si128((__m*)ptr,m);
+ }
+ static __forceinline __m loadU(const void *ptr) {
+ return _mm_loadu_si128((const __m*)ptr);
+ }
+ static __forceinline void empty(void) {
+ /*_mm_empty();*/
+ }
+
+ static __forceinline void psadbw(__m &mm3,const __m &mm2) {
+ mm3=_mm_sad_epu8(mm3,mm2);
+ }
+ static __forceinline void prefetchnta(const void *ptr) {
+ _mm_prefetch((const char*)ptr,_MM_HINT_NTA);
+ }
+ static __forceinline __m shuffle_pi16_0(const __m &mm0) {
+ return _mm_shufflehi_epi16(_mm_shufflelo_epi16(mm0,0),0);
+ }
+ static __forceinline void pmaxub(__m &mmr1,const __m &mmr2) {
+ mmr1=_mm_max_epu8(mmr1,mmr2);
+ }
+ static __forceinline void pmulhuw(__m &mmr1,const __m &mmr2) {
+ mmr1=_mm_mulhi_epu16(mmr1,mmr2);
+ }
+ static __forceinline void movntq(void *dst,const __m &src) {
+ _mm_stream_si128((__m128i*)dst,src);
+ }
+ static __forceinline void pavgb(__m &mmr1,const __m &mmr2) {
+ mmr1=_mm_avg_epu8(mmr1,mmr2);
+ }
+ static __forceinline void pavgb(__m &mmr1,const void *mmr2) {
+ mmr1=_mm_avg_epu8(mmr1,*(__m*)mmr2);
+ }
+ static __forceinline void sfence(void) {
+ _mm_sfence();
+ }
};
#endif //__SSE2__
template<class _mm> static __forceinline typename _mm::__m abs_16(const typename _mm::__m &mm0)
{
- typename _mm::__m mm6=_mm::srai_pi16(mm0,15);
- return _mm::sub_pi16(_mm::xor_si64(mm0,mm6),mm6);
+ typename _mm::__m mm6=_mm::srai_pi16(mm0,15);
+ return _mm::sub_pi16(_mm::xor_si64(mm0,mm6),mm6);
}
template<class _mm> static __forceinline typename _mm::__m absdif_s16(typename _mm::__m mm0,typename _mm::__m mm1)
{
- typename _mm::__m mm2=mm0;
- mm0=_mm::cmpgt_pi16(mm0,mm1);
- typename _mm::__m mm4=mm2;
- mm2=_mm::xor_si64(mm2,mm1);
- mm2=_mm::and_si64(mm2,mm0);
- typename _mm::__m mm3=mm2;
- mm4=_mm::xor_si64(mm4,mm2);
- mm1=_mm::xor_si64(mm1,mm3);
- return _mm::sub_pi16(mm1,mm4);
+ typename _mm::__m mm2=mm0;
+ mm0=_mm::cmpgt_pi16(mm0,mm1);
+ typename _mm::__m mm4=mm2;
+ mm2=_mm::xor_si64(mm2,mm1);
+ mm2=_mm::and_si64(mm2,mm0);
+ typename _mm::__m mm3=mm2;
+ mm4=_mm::xor_si64(mm4,mm2);
+ mm1=_mm::xor_si64(mm1,mm3);
+ return _mm::sub_pi16(mm1,mm4);
}
#pragma warning(pop)