/*
 * Copyright (C) 2003-2006 Gabest
 * http://www.gabest.org
 *
 * This Program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This Program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * Based on Intel's AP-942
 *
 */

#include "stdafx.h"
#include <emmintrin.h>          // SSE2 intrinsics
#include "libmpeg2.h"
#include "attributes.h"
#include "../../../DSUtil/simd.h"

// Intel's SSE2 implementation of iDCT
// AP-945
// http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf

static const int BITS_INV_ACC  = 4;                     // accuracy bits kept between the two passes
static const int SHIFT_INV_ROW = 16 - BITS_INV_ACC;     // = 12
static const int SHIFT_INV_COL = 1 + BITS_INV_ACC;      // = 5
static const int RND_INV_ROW   = 1024 * (6 - BITS_INV_ACC);
static const int RND_INV_COL   = 16 * (BITS_INV_ACC - 3);
static const int RND_INV_CORR  = RND_INV_COL - 1;

static __align16(const short, M128_round_inv_row[8])  = {RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0};
static __align16(const short, M128_one_corr[8])       = {1, 1, 1, 1, 1, 1, 1, 1};
static __align16(const short, M128_round_inv_col[8])  = {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};
static __align16(const short, M128_round_inv_corr[8]) = {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};

static __align16(const short, M128_tg_1_16[8])  = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};         // tg * (2<<16) + 0.5
static __align16(const short, M128_tg_2_16[8])  = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};         // tg * (2<<16) + 0.5
static __align16(const short, M128_tg_3_16[8])  = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
static __align16(const short, M128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // cos * (2<<16) + 0.5

// Row-pass coefficient tables; rows 0/4, 1/7, 2/6 and 3/5 share a table,
// as the names indicate.
static __align16(const int16_t, M128_tab_i_04[]) = {
    16384, 21407, 16384, 8867, 16384, -8867, 16384, -21407,
    16384, 8867, -16384, -21407, -16384, 21407, 16384, -8867,
    22725, 19266, 19266, -4520, 12873, -22725, 4520, -12873,
    12873, 4520, -22725, -12873, 4520, 19266, 19266, -22725
};

static __align16(const int16_t, M128_tab_i_17[]) = {
    22725, 29692, 22725, 12299, 22725, -12299, 22725, -29692,
    22725, 12299, -22725, -29692, -22725, 29692, 22725, -12299,
    31521, 26722, 26722, -6270, 17855, -31521, 6270, -17855,
    17855, 6270, -31521, -17855, 6270, 26722, 26722, -31521
};

static __align16(const int16_t, M128_tab_i_26[]) = {
    21407, 27969, 21407, 11585, 21407, -11585, 21407, -27969,
    21407, 11585, -21407, -27969, -21407, 27969, 21407, -11585,
    29692, 25172, 25172, -5906, 16819, -29692, 5906, -16819,
    16819, 5906, -29692, -16819, 5906, 25172, 25172, -29692
};

static __align16(const int16_t, M128_tab_i_35[]) = {
    19266, 25172, 19266, 10426, 19266, -10426, 19266, -25172,
    19266, 10426, -19266, -25172, -19266, 25172, 19266, -10426,
    26722, 22654, 22654, -5315, 15137, -26722, 5315, -15137,
    15137, 5315, -26722, -15137, 5315, 22654, 22654, -26722
};
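// The derivation of the fixed-point values above is not spelled out here, but it
// follows from BITS_INV_ACC = 4: the row pass keeps four extra bits of precision
// (SHIFT_INV_ROW = 12, RND_INV_ROW = 2048) and the column pass removes them again
// (SHIFT_INV_COL = 5). M128_tg_1_16 = 13036 ~= tan(1*pi/16) * 65536 and
// M128_tg_2_16 = 27146 ~= tan(2*pi/16) * 65536. M128_tg_3_16 and M128_cos_4_16 use
// the same scaling but overflow the signed 16-bit range, so they are stored reduced
// modulo 2^16 (43790 - 65536 = -21746 and 46341 - 65536 = -19195); the column pass
// compensates by adding the other multiplicand back after each pmulhw with those
// two constants (see Intel AP-945 for the full derivation).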
// Row pass: transforms two rows per call. xmm0 holds the row matched with the
// coefficient table at esi, xmm4 the row matched with the table at ecx. The 32-bit
// pmaddwd sums are rounded, shifted right by SHIFT_INV_ROW (the literal 12 below)
// and packed back to 16 bits into xmm0 and xmm4.
static __forceinline void DCT_8_INV_ROW(const uint8_t* const ecx, const uint8_t* const esi,
                                        __m128i& xmm0, __m128i& xmm1, __m128i& xmm2, __m128i& xmm3,
                                        __m128i& xmm4, __m128i& xmm5, __m128i& xmm6, __m128i& xmm7)
{
    xmm0 = _mm_shufflelo_epi16(xmm0, 0xD8);
    xmm1 = _mm_shuffle_epi32(xmm0, 0);
    pmaddwd(xmm1, esi);
    xmm3 = _mm_shuffle_epi32(xmm0, 0x55);
    xmm0 = _mm_shufflehi_epi16(xmm0, 0xD8);
    pmaddwd(xmm3, esi + 32);
    xmm2 = _mm_shuffle_epi32(xmm0, 0xAA);
    xmm0 = _mm_shuffle_epi32(xmm0, 0xFF);
    pmaddwd(xmm2, esi + 16);
    xmm4 = _mm_shufflehi_epi16(xmm4, 0xD8);
    paddd(xmm1, M128_round_inv_row);
    xmm4 = _mm_shufflelo_epi16(xmm4, 0xD8);
    pmaddwd(xmm0, esi + 48);
    xmm5 = _mm_shuffle_epi32(xmm4, 0);
    xmm6 = _mm_shuffle_epi32(xmm4, 0xAA);
    pmaddwd(xmm5, ecx);
    paddd(xmm1, xmm2);
    movdqa(xmm2, xmm1);
    xmm7 = _mm_shuffle_epi32(xmm4, 0x55);
    pmaddwd(xmm6, ecx + 16);
    paddd(xmm0, xmm3);
    xmm4 = _mm_shuffle_epi32(xmm4, 0xFF);
    psubd(xmm2, xmm0);
    pmaddwd(xmm7, ecx + 32);
    paddd(xmm0, xmm1);
    psrad(xmm2, 12);
    paddd(xmm5, M128_round_inv_row);
    pmaddwd(xmm4, ecx + 48);
    paddd(xmm5, xmm6);
    movdqa(xmm6, xmm5);
    psrad(xmm0, 12);
    xmm2 = _mm_shuffle_epi32(xmm2, 0x1B);
    packssdw(xmm0, xmm2);
    paddd(xmm4, xmm7);
    psubd(xmm6, xmm4);
    paddd(xmm4, xmm5);
    psrad(xmm6, 12);
    psrad(xmm4, 12);
    xmm6 = _mm_shuffle_epi32(xmm6, 0x1B);
    packssdw(xmm4, xmm6);
}

// Column pass: transforms all eight columns at once in saturating 16-bit
// arithmetic, multiplying by the tangent/cosine constants with pmulhw. The
// results are shifted right by SHIFT_INV_COL and written back to src0..src7.
// xmm0 and xmm4 must already hold the row-pass output of rows 5 and 7.
static __forceinline void DCT_8_INV_COL_8(__m128i& src0, __m128i& src1, __m128i& src2, __m128i& src3,
                                          __m128i& src4, __m128i& src5, __m128i& src6, __m128i& src7,
                                          __m128i& xmm0, __m128i& xmm1, __m128i& xmm2, __m128i& xmm3,
                                          __m128i& xmm4, __m128i& xmm5, __m128i& xmm6, __m128i& xmm7)
{
    movdqa(xmm1, M128_tg_3_16);
    movdqa(xmm2, xmm0);
    movdqa(xmm3, src3);
    pmulhw(xmm0, xmm1);
    pmulhw(xmm1, xmm3);
    movdqa(xmm5, M128_tg_1_16);
    movdqa(xmm6, xmm4);
    pmulhw(xmm4, xmm5);
    paddsw(xmm0, xmm2);
    pmulhw(xmm5, src1);
    paddsw(xmm1, xmm3);
    movdqa(xmm7, src6);
    paddsw(xmm0, xmm3);
    movdqa(xmm3, M128_tg_2_16);
    psubsw(xmm2, xmm1);
    pmulhw(xmm7, xmm3);
    movdqa(xmm1, xmm0);
    pmulhw(xmm3, src2);
    psubsw(xmm5, xmm6);
    paddsw(xmm4, src1);
    paddsw(xmm0, xmm4);
    paddsw(xmm0, M128_one_corr);
    psubsw(xmm4, xmm1);
    movdqa(xmm6, xmm5);
    psubsw(xmm5, xmm2);
    paddsw(xmm5, M128_one_corr);
    paddsw(xmm6, xmm2);
    movdqa(src7, xmm0);
    movdqa(xmm1, xmm4);
    movdqa(xmm0, M128_cos_4_16);
    paddsw(xmm4, xmm5);
    movdqa(xmm2, M128_cos_4_16);
    pmulhw(xmm2, xmm4);
    movdqa(src3, xmm6);
    psubsw(xmm1, xmm5);
    paddsw(xmm7, src2);
    psubsw(xmm3, src6);
    movdqa(xmm6, src0);
    pmulhw(xmm0, xmm1);
    movdqa(xmm5, src4);
    paddsw(xmm5, xmm6);
    psubsw(xmm6, src4);
    paddsw(xmm4, xmm2);
    por(xmm4, M128_one_corr);
    paddsw(xmm0, xmm1);
    por(xmm0, M128_one_corr);
    movdqa(xmm2, xmm5);
    paddsw(xmm5, xmm7);
    movdqa(xmm1, xmm6);
    paddsw(xmm5, M128_round_inv_col);
    psubsw(xmm2, xmm7);
    movdqa(xmm7, src7);
    paddsw(xmm6, xmm3);
    paddsw(xmm6, M128_round_inv_col);
    paddsw(xmm7, xmm5);
    psraw(xmm7, SHIFT_INV_COL);
    psubsw(xmm1, xmm3);
    paddsw(xmm1, M128_round_inv_corr);
    movdqa(xmm3, xmm6);
    paddsw(xmm2, M128_round_inv_corr);
    paddsw(xmm6, xmm4);
    movdqa(src0, xmm7);
    psraw(xmm6, SHIFT_INV_COL);
    movdqa(xmm7, xmm1);
    paddsw(xmm1, xmm0);
    movdqa(src1, xmm6);
    psraw(xmm1, SHIFT_INV_COL);
    movdqa(xmm6, src3);
    psubsw(xmm7, xmm0);
    psraw(xmm7, SHIFT_INV_COL);
    movdqa(src2, xmm1);
    psubsw(xmm5, src7);
    psraw(xmm5, SHIFT_INV_COL);
    movdqa(src7, xmm5);
    psubsw(xmm3, xmm4);
    paddsw(xmm6, xmm2);
    psubsw(xmm2, src3);
    psraw(xmm6, SHIFT_INV_COL);
    psraw(xmm2, SHIFT_INV_COL);
    movdqa(src3, xmm6);
    psraw(xmm3, SHIFT_INV_COL);
    movdqa(src4, xmm2);
    movdqa(src5, xmm7);
    movdqa(src6, xmm3);
}
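// idct_M128ASM below chains the two passes: four DCT_8_INV_ROW calls handle two
// rows each (0/2, 4/6, 3/1, then 5/7, each row with its matching table), and a
// single DCT_8_INV_COL_8 call then transforms all eight columns. The output of the
// last row pass (rows 5 and 7) is left in xmm0/xmm4 and consumed directly by the
// column pass rather than being stored back to memory first.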
// Full 8x8 two-dimensional inverse DCT of the block held in src0..src7, one
// __m128i (eight int16 coefficients) per row; the result replaces the input.
static __forceinline void idct_M128ASM(__m128i& src0, __m128i& src1, __m128i& src2, __m128i& src3,
                                       __m128i& src4, __m128i& src5, __m128i& src6, __m128i& src7)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    movdqa(xmm0, src0);                     // rows 0 and 2
    uint8_t* esi = (uint8_t*)M128_tab_i_04;
    movdqa(xmm4, src2);
    uint8_t* ecx = (uint8_t*)M128_tab_i_26;
    DCT_8_INV_ROW(ecx, esi, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
    movdqa(src0, xmm0);
    movdqa(src2, xmm4);

    movdqa(xmm0, src4);                     // rows 4 and 6 (same tables)
    movdqa(xmm4, src6);
    DCT_8_INV_ROW(ecx, esi, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
    movdqa(src4, xmm0);
    movdqa(src6, xmm4);

    movdqa(xmm0, src3);                     // rows 3 and 1
    esi = (uint8_t*)M128_tab_i_35;
    movdqa(xmm4, src1);
    ecx = (uint8_t*)M128_tab_i_17;
    DCT_8_INV_ROW(ecx, esi, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
    movdqa(src3, xmm0);
    movdqa(src1, xmm4);

    movdqa(xmm0, src5);                     // rows 5 and 7; results stay in xmm0/xmm4
    movdqa(xmm4, src7);
    DCT_8_INV_ROW(ecx, esi, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);

    DCT_8_INV_COL_8(src0, src1, src2, src3, src4, src5, src6, src7,
                    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
}

// iDCT the 64-coefficient block, store the result clamped to 0..255 into the
// 8x8 destination (stride bytes per line) and zero the coefficient block.
void mpeg2_idct_copy_sse2(int16_t* block, uint8_t* dest, const int stride)
{
    __m128i& src0 = *(__m128i*)(block + 0*16/2);
    __m128i& src1 = *(__m128i*)(block + 1*16/2);
    __m128i& src2 = *(__m128i*)(block + 2*16/2);
    __m128i& src3 = *(__m128i*)(block + 3*16/2);
    __m128i& src4 = *(__m128i*)(block + 4*16/2);
    __m128i& src5 = *(__m128i*)(block + 5*16/2);
    __m128i& src6 = *(__m128i*)(block + 6*16/2);
    __m128i& src7 = *(__m128i*)(block + 7*16/2);

    idct_M128ASM(src0, src1, src2, src3, src4, src5, src6, src7);

    __m128i zero = _mm_setzero_si128();

    // Pack pairs of rows to unsigned bytes and write them out 8 bytes at a time.
    __m128i r0 = _mm_packus_epi16(_mm_load_si128(&src0), _mm_load_si128(&src1));
    __m128i r1 = _mm_packus_epi16(_mm_load_si128(&src2), _mm_load_si128(&src3));
    __m128i r2 = _mm_packus_epi16(_mm_load_si128(&src4), _mm_load_si128(&src5));
    __m128i r3 = _mm_packus_epi16(_mm_load_si128(&src6), _mm_load_si128(&src7));

    _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0);
    _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0);
    _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1);
    _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1);
    _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2);
    _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2);
    _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3);
    _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3);

    _mm_store_si128(&src0, zero);
    _mm_store_si128(&src1, zero);
    _mm_store_si128(&src2, zero);
    _mm_store_si128(&src3, zero);
    _mm_store_si128(&src4, zero);
    _mm_store_si128(&src5, zero);
    _mm_store_si128(&src6, zero);
    _mm_store_si128(&src7, zero);
}
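// mpeg2_idct_add_sse2 differs from the copy variant only in how the result leaves
// the registers: each destination row is loaded, widened to 16 bits, the iDCT
// output is added with signed saturation, and the sum is packed back to unsigned
// bytes. The unused int parameter is presumably the "last coefficient" argument of
// the other libmpeg2 idct_add implementations, kept here only so the function
// matches the common signature.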
void mpeg2_idct_add_sse2(int, int16_t* block, uint8_t* dest, const int stride)
{
    __m128i& src0 = *(__m128i*)(block + 0*16/2);
    __m128i& src1 = *(__m128i*)(block + 1*16/2);
    __m128i& src2 = *(__m128i*)(block + 2*16/2);
    __m128i& src3 = *(__m128i*)(block + 3*16/2);
    __m128i& src4 = *(__m128i*)(block + 4*16/2);
    __m128i& src5 = *(__m128i*)(block + 5*16/2);
    __m128i& src6 = *(__m128i*)(block + 6*16/2);
    __m128i& src7 = *(__m128i*)(block + 7*16/2);

    idct_M128ASM(src0, src1, src2, src3, src4, src5, src6, src7);

    __m128i zero = _mm_setzero_si128();

    __m128i r0 = _mm_load_si128(&src0);
    __m128i r1 = _mm_load_si128(&src1);
    __m128i r2 = _mm_load_si128(&src2);
    __m128i r3 = _mm_load_si128(&src3);
    __m128i r4 = _mm_load_si128(&src4);
    __m128i r5 = _mm_load_si128(&src5);
    __m128i r6 = _mm_load_si128(&src6);
    __m128i r7 = _mm_load_si128(&src7);

    // Load the eight 8-byte prediction rows into the low halves of xmm registers.
    __m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]);
    __m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]);
    __m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]);
    __m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]);
    __m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]);
    __m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]);
    __m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]);
    __m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]);

    // Widen the prediction to 16 bits and add the iDCT output with saturation.
    r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero));
    r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero));
    r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero));
    r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero));
    r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero));
    r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero));
    r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero));
    r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero));

    // Pack back to unsigned bytes and store two rows per register.
    r0 = _mm_packus_epi16(r0, r1);
    r1 = _mm_packus_epi16(r2, r3);
    r2 = _mm_packus_epi16(r4, r5);
    r3 = _mm_packus_epi16(r6, r7);

    _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0);
    _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0);
    _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1);
    _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1);
    _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2);
    _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2);
    _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3);
    _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3);

    _mm_store_si128(&src0, zero);
    _mm_store_si128(&src1, zero);
    _mm_store_si128(&src2, zero);
    _mm_store_si128(&src3, zero);
    _mm_store_si128(&src4, zero);
    _mm_store_si128(&src5, zero);
    _mm_store_si128(&src6, zero);
    _mm_store_si128(&src7, zero);
}

void mpeg2_idct_init_sse2()
{
    // Nothing to initialize for the SSE2 implementation.
}
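// Usage sketch (not from the original file; PICTURE_PITCH and the surrounding
// buffers are placeholder names). Both entry points require `block` to be 16-byte
// aligned, since it is accessed through __m128i references, and both zero the
// coefficient block after the transform:
//
//   __align16(int16_t, coeffs[64]);                       // one dequantized 8x8 block
//   uint8_t* dst = picture + 8 * x + 8 * y * PICTURE_PITCH;
//   mpeg2_idct_copy_sse2(coeffs, dst, PICTURE_PITCH);     // intra block: overwrite dest
//   mpeg2_idct_add_sse2(0, coeffs, dst, PICTURE_PITCH);   // non-intra: add to the prediction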