/* * $Id$ * * (C) 2003-2006 Gabest * (C) 2006-2012 see Authors.txt * * This file is part of MPC-HC. * * MPC-HC is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * MPC-HC is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * */ // Based on Intel's AP-942 #include "stdafx.h" #include #include "libmpeg2.h" #include "attributes.h" #include "../../../DSUtil/simd.h" static const __m128i const_1_16_bytes=_mm_set1_epi16(1); static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi) { const int edi= eax+eax; const int ebx= edi+eax; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { __m128i xmm0,xmm1,xmm2,xmm3; movdqu (xmm0, edx ); movdqu (xmm1, edx+eax); movdqu (xmm2, edx+edi); movdqu (xmm3, edx+ebx); movdqa (ecx, xmm0 ); movdqa (ecx+eax, xmm1 ); movdqa (ecx+edi, xmm2 ); movdqa (ecx+ebx, xmm3 ); } } static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi) { const int edi= eax+eax; const int ebx= edi+eax; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { __m128d xmm0,xmm1,xmm2,xmm3; movlpd (xmm0, edx); movlpd (xmm1, edx+eax); movlpd (xmm2, edx+edi); movlpd (xmm3, edx+ebx); movlpd (ecx, xmm0); movlpd (ecx+eax, xmm1 ); movlpd (ecx+edi, xmm2); movlpd (ecx+ebx, xmm3 ); } } static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi) { const int edi= eax+eax; const int ebx= edi+eax; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; movdqu (xmm0, edx); movdqu (xmm1, edx+1); movdqu (xmm2, edx+eax); movdqu (xmm3, edx+eax+1); movdqu (xmm4, edx+edi); movdqu( xmm5, edx+edi+1); movdqu( xmm6, edx+ebx ); movdqu( xmm7, edx+ebx+1 ); pavgb (xmm0, xmm1); pavgb (xmm2, xmm3); pavgb (xmm4, xmm5); pavgb (xmm6, xmm7); movdqa (ecx, xmm0); movdqa (ecx+eax, xmm2); movdqa (ecx+edi, xmm4); movdqa (ecx+ebx, xmm6); } } static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi) { const int edi= eax+eax; const int ebx= edi+eax; __m128i xmm0,xmm1,xmm2,xmm3; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { movlpd (xmm0, edx); movlpd (xmm1, edx+1); movhpd (xmm0, edx+eax); movhpd (xmm1, edx+eax+1); movlpd (xmm2, edx+edi); movlpd (xmm3, edx+edi+1); movhpd (xmm2, edx+ebx); movhpd (xmm3, edx+ebx+1); pavgb (xmm0, xmm1); pavgb (xmm2, xmm3); movlpd (ecx, xmm0); movhpd (ecx+eax, xmm0); movlpd (ecx+edi, xmm2); movhpd (ecx+ebx, xmm2); } } static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi) { const int edi= eax+eax; const int ebx= edi+eax; __m128i xmm0; movdqu (xmm0, edx); for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { __m128i xmm1,xmm2,xmm3,xmm4; movdqu (xmm1, edx+eax); movdqu (xmm2, edx+edi ); movdqu (xmm3, edx+ebx ); movdqu (xmm4, edx+edi*2 ); pavgb (xmm0, xmm1 ); pavgb (xmm1, xmm2 ); pavgb (xmm2, xmm3 ); pavgb (xmm3, xmm4 ); movdqa (ecx, xmm0 ); movdqa (ecx+eax, xmm1 ); movdqa (ecx+edi, xmm2 ); movdqa (ecx+ebx, xmm3 ); movdqa (xmm0, xmm4 ); } } static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi) { const int edi= eax+eax; const int ebx= edi+eax; __m128i xmm0; movlpd (xmm0, edx); for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { __m128i xmm1,xmm2,xmm3,xmm4; movlpd (xmm1, edx+eax ); movlpd (xmm2, edx+edi ); movlpd (xmm3, edx+ebx ); movlpd (xmm4, edx+edi*2 ); pavgb (xmm0, xmm1 ); pavgb (xmm1, xmm2); pavgb (xmm2, xmm3 ); pavgb (xmm3, xmm4 ); movlpd (ecx, xmm0 ); movlpd (ecx+eax, xmm1 ); movlpd (ecx+edi, xmm2 ); movlpd (ecx+ebx, xmm3 ); movdqa (xmm0, xmm4 ); } } static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref ; uint8_t *ecx= dest; int eax= stride; int esi= height; int edi= eax+eax; __m128i xmm7,xmm0,xmm1,xmm4,xmm5,xmm2,xmm3; movdqa (xmm7, const_1_16_bytes ); movdqu (xmm0, edx ); movdqu (xmm1, edx+1 ); for (; esi; edx+= edi,ecx+= edi,esi-= 2) { movdqu (xmm2, edx+eax ); movdqu (xmm3, edx+eax+1 ); movdqu (xmm4, edx+edi ); movdqu (xmm5, edx+edi+1 ); pavgb (xmm0, xmm1 ); pavgb (xmm2, xmm3 ); movdqa( xmm1, xmm5 ); pavgb (xmm5, xmm4 ); psubusb( xmm2, xmm7 ); pavgb (xmm0, xmm2 ); pavgb (xmm2, xmm5); movdqa (ecx, xmm0); movdqa (xmm0, xmm4); movdqa (ecx+eax, xmm2); } } static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int eax= stride; int esi= height; int edi= eax+eax; __m128i xmm7,xmm0,xmm2,xmm1,xmm3,xmm4,xmm5; movdqa (xmm7, const_1_16_bytes); movlpd (xmm0, edx); movlpd (xmm1, edx+1); for (; esi; edx+= edi,ecx+= edi,esi-= 2) { movlpd (xmm2, edx+eax); movlpd (xmm3, edx+eax+1); movlpd (xmm4, edx+edi); movlpd (xmm5, edx+edi+1); pavgb (xmm0, xmm1 ); pavgb (xmm2, xmm3 ); movdqa( xmm1, xmm5 ); pavgb (xmm5, xmm4 ); psubusb( xmm2, xmm7 ); pavgb (xmm0, xmm2 ); pavgb (xmm2, xmm5); movlpd (ecx, xmm0); movdqa (xmm0, xmm4); movlpd (ecx+eax, xmm2); } } static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; int ebx= edi+eax; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { __m128i xmm0,xmm1,xmm2,xmm3; movdqu (xmm0, edx); movdqu (xmm1, edx+eax ); movdqu (xmm2, edx+edi); movdqu (xmm3, edx+ebx ); pavgb (xmm0, ecx); pavgb (xmm1, ecx+eax); pavgb (xmm2, ecx+edi); pavgb (xmm3, ecx+ebx); movdqa (ecx, xmm0); movdqa (ecx+eax, xmm1 ); movdqa (ecx+edi, xmm2); movdqa (ecx+ebx, xmm3 ); } } static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; int ebx= edi+eax; __m128i xmm0,xmm1,xmm2,xmm3; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { movlpd (xmm0, edx); movhpd (xmm0, edx+eax ); movlpd (xmm2, edx+edi); movhpd (xmm2, edx+ebx ); movlpd (xmm1, ecx); movhpd (xmm1, ecx+eax); movlpd (xmm3, ecx+edi); movhpd (xmm3, ecx+ebx); pavgb (xmm0, xmm1); pavgb (xmm2, xmm3); movlpd (ecx, xmm0); movhpd (ecx+eax, xmm0); movlpd (ecx+edi, xmm2); movhpd (ecx+ebx, xmm2); } } static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; int ebx= edi+eax; __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { movdqu (xmm0, edx); movdqu (xmm1, edx+1); movdqu (xmm2, edx+eax); movdqu (xmm3, edx+eax+1); movdqu (xmm4, edx+edi); movdqu (xmm5, edx+edi+1); movdqu (xmm6, edx+ebx); movdqu (xmm7, edx+ebx+1); pavgb (xmm0, xmm1); pavgb (xmm2, xmm3); pavgb (xmm4, xmm5); pavgb (xmm6, xmm7); pavgb (xmm0, ecx); pavgb (xmm2, ecx+eax); pavgb (xmm4, ecx+edi); pavgb (xmm6, ecx+ebx); movdqa (ecx, xmm0); movdqa (ecx+eax, xmm2); movdqa (ecx+edi, xmm4); movdqa (ecx+ebx, xmm6); } } static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; int ebx= edi+eax; __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5; for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { movlpd (xmm0, edx); movlpd (xmm1, edx+1); movhpd (xmm0, edx+eax); movhpd (xmm1, edx+eax+1); movlpd (xmm2, edx+edi); movlpd (xmm3, edx+edi+1); movhpd (xmm2, edx+ebx); movhpd (xmm3, edx+ebx+1); movlpd (xmm4, ecx); movhpd (xmm4, ecx+eax); movlpd (xmm5, ecx+edi); movhpd (xmm5, ecx+ebx); pavgb (xmm0, xmm1); pavgb (xmm2, xmm3); pavgb (xmm0, xmm4); pavgb (xmm2, xmm5); movlpd (ecx, xmm0); movhpd (ecx+eax, xmm0); movlpd (ecx+edi, xmm2); movhpd (ecx+ebx, xmm2); } } static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; int ebx= edi+eax; __m128i xmm0,xmm1,xmm2,xmm3,xmm4; movdqu (xmm0,edx); for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { movdqu (xmm1, edx+eax ); movdqu (xmm2, edx+edi ); movdqu (xmm3, edx+ebx ); movdqu (xmm4, edx+edi*2 ); pavgb (xmm0, xmm1 ); pavgb (xmm1, xmm2 ); pavgb (xmm2, xmm3 ); pavgb (xmm3, xmm4 ); pavgb (xmm0, ecx); pavgb (xmm1, ecx+eax ); pavgb (xmm2, ecx+edi); pavgb (xmm3, ecx+ebx ); movdqa (ecx, xmm0 ); movdqa (ecx+eax, xmm1 ); movdqa (ecx+edi, xmm2 ); movdqa (ecx+ebx, xmm3 ); movdqa (xmm0, xmm4 ); } } static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; int ebx= edi+eax; __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5; movhpd (xmm0, edx ); movlpd (xmm0, edx+eax ); for (; esi; edx+=edi*2,ecx+=edi*2,esi-=4) { movlhps (xmm1, xmm0); movlpd (xmm1, edx+edi ); movlhps (xmm2, xmm1); movlpd (xmm2, edx+ebx ); movlhps (xmm3, xmm2); movlpd (xmm3, edx+edi*2 ); movhpd (xmm4, ecx ); movlpd (xmm4, ecx+eax ); movhpd (xmm5, ecx+edi ); movlpd (xmm5, ecx+ebx ); pavgb (xmm0, xmm1 ); pavgb (xmm2, xmm3); pavgb (xmm0, xmm4); pavgb (xmm2, xmm5); movhpd (ecx, xmm0 ); movlpd (ecx+eax, xmm0 ); movhpd (ecx+edi, xmm2 ); movlpd (ecx+ebx, xmm2); movdqa (xmm0, xmm3 ); } } static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; __m128i xmm7,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5; movdqa (xmm7, const_1_16_bytes ); movdqu (xmm0, edx ); movdqu (xmm1, edx+1 ); for (; esi; edx+=edi,ecx+=edi,esi-=2) { movdqu (xmm2, edx+eax ); movdqu (xmm3, edx+eax+1 ); movdqu (xmm4, edx+edi ); movdqu (xmm5, edx+edi+1 ); pavgb (xmm0, xmm1 ); pavgb (xmm2, xmm3 ); movdqa (xmm1, xmm5 ); pavgb (xmm5, xmm4 ); psubusb (xmm2, xmm7 ); pavgb (xmm0, xmm2 ); pavgb (xmm2, xmm5); pavgb (xmm0, ecx ); pavgb (xmm2, ecx+eax); movdqa (ecx, xmm0); movdqa (xmm0, xmm4); movdqa (ecx+eax, xmm2); } } static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height) { const uint8_t *edx= ref; uint8_t *ecx= dest; int esi= height; int eax= stride; int edi= eax+eax; __m128i xmm7,xmm0,xmm2,xmm1,xmm3,xmm4; movdqa (xmm7, const_1_16_bytes ); movhpd (xmm0, edx ); movlpd (xmm0, edx+eax ); movhpd (xmm2, edx+1 ); movlpd (xmm2, edx+eax+1 ); for (; esi; edx+=edi,ecx+=edi,esi-=2) { movhpd (xmm1, edx+eax ); movlpd (xmm1, edx+edi ); movhpd (xmm3, edx+eax+1 ); movlpd (xmm3, edx+edi+1 ); pavgb (xmm0, xmm1 ); pavgb (xmm2, xmm3 ); psubusb( xmm0, xmm7 ); pavgb (xmm0, xmm2 ); movhpd( xmm4, ecx); movlpd( xmm4, ecx+eax); pavgb (xmm0, xmm4 ); movhpd (ecx, xmm0 ); movlpd (ecx+eax, xmm0 ); movdqa (xmm0, xmm1 ); movdqa (xmm2, xmm3 ); } } mpeg2_mc_t mpeg2_mc_sse2 = { { MC_put_o_16_sse2, MC_put_x_16_sse2, MC_put_y_16_sse2, MC_put_xy_16_sse2, MC_put_o_8_sse2, MC_put_x_8_sse2, MC_put_y_8_sse2, MC_put_xy_8_sse2 }, { MC_avg_o_16_sse2, MC_avg_x_16_sse2, MC_avg_y_16_sse2, MC_avg_xy_16_sse2, MC_avg_o_8_sse2, MC_avg_x_8_sse2, MC_avg_y_8_sse2, MC_avg_xy_8_sse2 } };