From 610e00b3594bf0f2a75713f20e9c4edf0d03a818 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Sat, 13 Oct 2012 10:04:50 -0500 Subject: x86: h264: Convert 8-bit QPEL inline assembly to YASM Signed-off-by: Diego Biurrun --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/dsputil.asm | 231 +++++++ libavcodec/x86/dsputil_avg_template.c | 130 ---- libavcodec/x86/dsputil_mmx.c | 107 +--- libavcodec/x86/h264_qpel.c | 1141 +++++---------------------------- libavcodec/x86/h264_qpel_8bit.asm | 862 +++++++++++++++++++++++++ 6 files changed, 1276 insertions(+), 1198 deletions(-) create mode 100644 libavcodec/x86/h264_qpel_8bit.asm (limited to 'libavcodec') diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 053290b464..1e70d535f5 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -51,7 +51,8 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ x86/h264_weight_10bit.o YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ x86/h264_intrapred_10bit.o -YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_10bit.o +YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ + x86/h264_qpel_10bit.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 726d4c6362..dd0c795235 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -1354,3 +1354,234 @@ BSWAP32_BUF INIT_XMM ssse3 BSWAP32_BUF + +%macro op_avgh 3 + movh %3, %2 + pavgb %1, %3 + movh %2, %1 +%endmacro + +%macro op_avg 2 + pavgb %1, %2 + mova %2, %1 +%endmacro + +%macro op_puth 2-3 + movh %2, %1 +%endmacro + +%macro op_put 2 + mova %2, %1 +%endmacro + +; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +%macro PIXELS4_L2 1 +%define OP op_%1h +cglobal %1_pixels4_l2, 6,6 + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + test r5d, 1 + je .loop + movd m0, [r1] + movd m1, [r2] + add r1, r4 + add r2, 4 + pavgb m0, m1 + OP m0, [r0], m3 + add r0, r3 + dec r5d +.loop: + mova m0, [r1] + mova m1, [r1+r4] + lea r1, [r1+2*r4] + pavgb m0, [r2] + pavgb m1, [r2+4] + OP m0, [r0], m3 + OP m1, [r0+r3], m3 + lea r0, [r0+2*r3] + mova m0, [r1] + mova m1, [r1+r4] + lea r1, [r1+2*r4] + pavgb m0, [r2+8] + pavgb m1, [r2+12] + OP m0, [r0], m3 + OP m1, [r0+r3], m3 + lea r0, [r0+2*r3] + add r2, 16 + sub r5d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PIXELS4_L2 put +PIXELS4_L2 avg + +; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +%macro PIXELS8_L2 1 +%define OP op_%1 +cglobal %1_pixels8_l2, 6,6 + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + test r5d, 1 + je .loop + mova m0, [r1] + mova m1, [r2] + add r1, r4 + add r2, 8 + pavgb m0, m1 + OP m0, [r0] + add r0, r3 + dec r5d +.loop: + mova m0, [r1] + mova m1, [r1+r4] + lea r1, [r1+2*r4] + pavgb m0, [r2] + pavgb m1, [r2+8] + OP m0, [r0] + OP m1, [r0+r3] + lea r0, [r0+2*r3] + mova m0, [r1] + mova m1, [r1+r4] + lea r1, [r1+2*r4] + pavgb m0, [r2+16] + pavgb m1, [r2+24] + OP m0, [r0] + OP m1, [r0+r3] + lea r0, [r0+2*r3] + add r2, 32 + sub r5d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PIXELS8_L2 put +PIXELS8_L2 avg + +; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +%macro PIXELS16_L2 1 +%define OP op_%1 +cglobal %1_pixels16_l2, 6,6 + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + test r5d, 1 + je .loop + mova m0, [r1] + mova m1, [r1+8] + pavgb m0, [r2] + pavgb m1, [r2+8] + add r1, r4 + add r2, 16 + OP m0, [r0] + OP m1, [r0+8] + add r0, r3 + dec r5d +.loop: + mova m0, [r1] + mova m1, [r1+8] + add r1, r4 + pavgb m0, [r2] + pavgb m1, [r2+8] + OP m0, [r0] + OP m1, [r0+8] + add r0, r3 + mova m0, [r1] + mova m1, [r1+8] + add r1, r4 + pavgb m0, [r2+16] + pavgb m1, [r2+24] + OP m0, [r0] + OP m1, [r0+8] + add r0, r3 + add r2, 32 + sub r5d, 2 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PIXELS16_L2 put +PIXELS16_L2 avg + +INIT_MMX mmxext +; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PIXELS48 2 +%if %2 == 4 +%define OP movh +%else +%define OP mova +%endif +cglobal %1_pixels%2, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*3] +.loop: + OP m0, [r1] + OP m1, [r1+r2] + OP m2, [r1+r2*2] + OP m3, [r1+r4] + lea r1, [r1+r2*4] +%ifidn %1, avg + pavgb m0, [r0] + pavgb m1, [r0+r2] + pavgb m2, [r0+r2*2] + pavgb m3, [r0+r4] +%endif + OP [r0], m0 + OP [r0+r2], m1 + OP [r0+r2*2], m2 + OP [r0+r4], m3 + sub r3d, 4 + lea r0, [r0+r2*4] + jne .loop + RET +%endmacro + +PIXELS48 put, 4 +PIXELS48 avg, 4 +PIXELS48 put, 8 +PIXELS48 avg, 8 + +INIT_XMM sse2 +; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +cglobal put_pixels16, 4,5,4 + movsxdifnidn r2, r2d + lea r4, [r2*3] +.loop: + movu m0, [r1] + movu m1, [r1+r2] + movu m2, [r1+r2*2] + movu m3, [r1+r4] + lea r1, [r1+r2*4] + mova [r0], m0 + mova [r0+r2], m1 + mova [r0+r2*2], m2 + mova [r0+r4], m3 + sub r3d, 4 + lea r0, [r0+r2*4] + jnz .loop + REP_RET + +; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +cglobal avg_pixels16, 4,5,4 + movsxdifnidn r2, r2d + lea r4, [r2*3] +.loop: + movu m0, [r1] + movu m1, [r1+r2] + movu m2, [r1+r2*2] + movu m3, [r1+r4] + lea r1, [r1+r2*4] + pavgb m0, [r0] + pavgb m1, [r0+r2] + pavgb m2, [r0+r2*2] + pavgb m3, [r0+r4] + mova [r0], m0 + mova [r0+r2], m1 + mova [r0+r2*2], m2 + mova [r0+r4], m3 + sub r3d, 4 + lea r0, [r0+r2*4] + jnz .loop + REP_RET diff --git a/libavcodec/x86/dsputil_avg_template.c b/libavcodec/x86/dsputil_avg_template.c index cffcd4e284..1a4343d76e 100644 --- a/libavcodec/x86/dsputil_avg_template.c +++ b/libavcodec/x86/dsputil_avg_template.c @@ -56,57 +56,6 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_ } #ifndef SKIP_FOR_3DNOW -static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $4, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%2), %%mm2 \n\t" - "movd 4(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "movd 8(%2), %%mm2 \n\t" - "movd 12(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $16, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - - static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm__ volatile( @@ -227,58 +176,6 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src :"memory");*/ } -static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $4, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 4(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 8(%2), %%mm0 \n\t" - PAVGB" 12(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $16, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - - static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm__ volatile( @@ -876,33 +773,6 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line :"%"REG_a, "memory"); } -#ifndef SKIP_FOR_3DNOW -static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - do { - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "movd (%1, %2), %%mm1 \n\t" - "movd (%1, %2, 2), %%mm2 \n\t" - "movd (%1, %3), %%mm3 \n\t" - PAVGB" (%0), %%mm0 \n\t" - PAVGB" (%0, %2), %%mm1 \n\t" - PAVGB" (%0, %2, 2), %%mm2 \n\t" - PAVGB" (%0, %3), %%mm3 \n\t" - "movd %%mm0, (%1) \n\t" - "movd %%mm1, (%1, %2) \n\t" - "movd %%mm2, (%1, %2, 2) \n\t" - "movd %%mm3, (%1, %3) \n\t" - ::"S"(pixels), "D"(block), - "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) - :"memory"); - block += 4*line_size; - pixels += 4*line_size; - h -= 4; - } while(h > 0); -} -#endif /* SKIP_FOR_3DNOW */ - //FIXME the following could be optimized too ... static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index ed6cff3e67..d403a14e61 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -366,33 +366,6 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, } while (--i); } -static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movd (%1 ), %%mm0 \n\t" - "movd (%1, %3), %%mm1 \n\t" - "movd %%mm0, (%2) \n\t" - "movd %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movd (%1 ), %%mm0 \n\t" - "movd (%1, %3), %%mm1 \n\t" - "movd %%mm0, (%2) \n\t" - "movd %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { @@ -455,56 +428,6 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, ); } -static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - __asm__ volatile ( - "1: \n\t" - "movdqu (%1 ), %%xmm0 \n\t" - "movdqu (%1, %3 ), %%xmm1 \n\t" - "movdqu (%1, %3, 2), %%xmm2 \n\t" - "movdqu (%1, %4 ), %%xmm3 \n\t" - "lea (%1, %3, 4), %1 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2, %3) \n\t" - "movdqa %%xmm2, (%2, %3, 2) \n\t" - "movdqa %%xmm3, (%2, %4) \n\t" - "subl $4, %0 \n\t" - "lea (%2, %3, 4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size) - : "memory" - ); -} - -static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, - int line_size, int h) -{ - __asm__ volatile ( - "1: \n\t" - "movdqu (%1 ), %%xmm0 \n\t" - "movdqu (%1, %3 ), %%xmm1 \n\t" - "movdqu (%1, %3, 2), %%xmm2 \n\t" - "movdqu (%1, %4 ), %%xmm3 \n\t" - "lea (%1, %3, 4), %1 \n\t" - "pavgb (%2 ), %%xmm0 \n\t" - "pavgb (%2, %3 ), %%xmm1 \n\t" - "pavgb (%2, %3, 2), %%xmm2 \n\t" - "pavgb (%2, %4), %%xmm3 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2, %3) \n\t" - "movdqa %%xmm2, (%2, %3, 2) \n\t" - "movdqa %%xmm3, (%2, %4) \n\t" - "subl $4, %0 \n\t" - "lea (%2, %3, 4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size) - : "memory" - ); -} - #define CLEAR_BLOCKS(name, n) \ static void name(DCTELEM *blocks) \ { \ @@ -2381,27 +2304,23 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, } #endif /* HAVE_INLINE_ASM */ +#if HAVE_MMXEXT_EXTERNAL if (CONFIG_H264QPEL) { -#if HAVE_INLINE_ASM SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); -#endif /* HAVE_INLINE_ASM */ if (!high_bit_depth) { -#if HAVE_INLINE_ASM SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, ); SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, ); -#endif /* HAVE_INLINE_ASM */ } else if (bit_depth == 10) { -#if HAVE_YASM #if !ARCH_X86_64 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); @@ -2410,18 +2329,14 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, #endif SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); -#endif /* HAVE_YASM */ } -#if HAVE_INLINE_ASM SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, ); SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, ); -#endif /* HAVE_INLINE_ASM */ } -#if HAVE_YASM if (!high_bit_depth && CONFIG_H264CHROMA) { c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext; @@ -2447,7 +2362,7 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, } else { c->apply_window_int16 = ff_apply_window_int16_round_mmxext; } -#endif /* HAVE_YASM */ +#endif /* HAVE_MMXEXT_EXTERNAL */ } static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, @@ -2546,17 +2461,16 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags) static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, int mm_flags) { +#if HAVE_SSE2_EXTERNAL const int bit_depth = avctx->bits_per_raw_sample; - -#if HAVE_INLINE_ASM const int high_bit_depth = bit_depth > 8; if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { // these functions are slower than mmx on AMD, but faster on Intel if (!high_bit_depth) { - c->put_pixels_tab[0][0] = put_pixels16_sse2; - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2; - c->avg_pixels_tab[0][0] = avg_pixels16_sse2; + c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; + c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; if (CONFIG_H264QPEL) H264_QPEL_FUNCS(0, 0, sse2); } @@ -2583,9 +2497,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, c->idct = ff_idct_xvid_sse2; c->idct_permutation_type = FF_SSE2_IDCT_PERM; } -#endif /* HAVE_INLINE_ASM */ -#if HAVE_YASM if (bit_depth == 10) { if (CONFIG_H264QPEL) { SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); @@ -2615,16 +2527,16 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, c->apply_window_int16 = ff_apply_window_int16_round_sse2; } c->bswap_buf = ff_bswap32_buf_sse2; -#endif /* HAVE_YASM */ +#endif /* HAVE_SSE2_EXTERNAL */ } static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, int mm_flags) { +#if HAVE_SSSE3_EXTERNAL const int high_bit_depth = avctx->bits_per_raw_sample > 8; const int bit_depth = avctx->bits_per_raw_sample; -#if HAVE_SSSE3_INLINE if (!high_bit_depth && CONFIG_H264QPEL) { H264_QPEL_FUNCS(1, 0, ssse3); H264_QPEL_FUNCS(1, 1, ssse3); @@ -2639,9 +2551,6 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, H264_QPEL_FUNCS(3, 2, ssse3); H264_QPEL_FUNCS(3, 3, ssse3); } -#endif /* HAVE_SSSE3_INLINE */ - -#if HAVE_SSSE3_EXTERNAL if (bit_depth == 10 && CONFIG_H264QPEL) { H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 075506f25b..bc56d091e1 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -19,1019 +19,229 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" #include "dsputil_mmx.h" -#if HAVE_INLINE_ASM - -/***********************************/ -/* motion compensation */ - -#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\ - "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\ - "add %2, %0 \n\t"\ - "paddw "#F", "#A" \n\t"\ - "paddw "#A", "#T" \n\t"\ - "psraw $5, "#T" \n\t"\ - "packuswb "#T", "#T" \n\t"\ - OP(T, (%1), A, d)\ - "add %3, %1 \n\t" - -#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\ - "paddw "#F", "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#A", "#T" \n\t"\ - "mov"#q" "#T", "#OF"(%1) \n\t" - -#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) -#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) -#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) -#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) - +#if HAVE_YASM +void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); +static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h) +{ + ff_put_pixels8_mmxext(block, pixels, line_size, h); + ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h); +} +static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h) +{ + ff_avg_pixels8_mmxext(block, pixels, line_size, h); + ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h); +} +void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext +#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext +#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext +#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext + +#define DEF_QPEL(OPNAME)\ +void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ +void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ +void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ +void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ +void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ +void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\ +void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ +void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ +void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\ +void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h); + +DEF_QPEL(avg) +DEF_QPEL(put) #define QPEL_H264(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=4;\ -\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "1: \n\t"\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=4;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm4 \n\t"\ - "movq %1, %%mm5 \n\t"\ - :: "m"(ff_pw_5), "m"(ff_pw_16)\ - );\ - do{\ - __asm__ volatile(\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "movd (%2), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - PAVGB" %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }while(--h);\ -}\ -static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - src -= 2*srcStride;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - int h=4;\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ int w=3;\ src -= 2*srcStride+2;\ while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ - \ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride)\ - : "memory"\ - );\ + ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\ tmp += 4;\ - src += 4 - 9*srcStride;\ + src += 4;\ }\ tmp -= 3*4;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "paddw 10(%0), %%mm0 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "paddw 8(%0), %%mm1 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ - "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ - "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ - "paddsw %%mm2, %%mm0 \n\t"\ - "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ - "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ - "psraw $6, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, d)\ - "add $24, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ + ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ }\ \ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "movq (%2), %%mm4 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - PAVGB" %%mm4, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - int w= 2;\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - "cmpl $16, %4 \n\t"\ - "jne 2f \n\t"\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - "2: \n\t"\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\ - : "memory"\ - );\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - }\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ + src += 4;\ + dst += 4;\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ }\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ int w = (size+8)>>2;\ src -= 2*srcStride+2;\ while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ - "cmpl $16, %3 \n\t"\ - "jne 2f \n\t"\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ - "2: \n\t"\ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\ - : "memory"\ - );\ + ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\ tmp += 4;\ - src += 4 - (size+5)*srcStride;\ + src += 4;\ }\ }\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ int w = size>>4;\ do{\ - int h = size;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm3 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "movq 10(%0), %%mm4 \n\t"\ - "paddw %%mm4, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw 18(%0), %%mm3 \n\t"\ - "paddw 16(%0), %%mm4 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "movq 12(%0), %%mm5 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "paddw 14(%0), %%mm5 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "paddsw %%mm2, %%mm0 \n\t"\ - "paddsw %%mm5, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm5, %%mm3 \n\t"\ - "psraw $6, %%mm0 \n\t"\ - "psraw $6, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - tmp += 8 - size*24;\ - dst += 8 - size*dstStride;\ + ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\ + tmp += 8;\ + dst += 8;\ }while(w--);\ }\ \ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ }\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ }\ \ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ src += 8*srcStride;\ dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ }\ \ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ src += 8*dstStride;\ dst += 8*dstStride;\ src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ }\ \ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ + ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ }\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ }\ \ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ }\ \ -static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 24(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - "lea (%0,%3,2), %0 \n\t"\ - "lea (%2,%4,2), %2 \n\t"\ - "movq 48(%1), %%mm0 \n\t"\ - "movq 72(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - :"+a"(src8), "+c"(src16), "+d"(dst)\ - :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ - :"memory");\ -}\ -static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ {\ - do{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 8(%1), %%mm1 \n\t"\ - "movq 48(%1), %%mm2 \n\t"\ - "movq 8+48(%1), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "psraw $5, %%mm2 \n\t"\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - "packuswb %%mm3, %%mm2 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm2 \n\t"\ - OP(%%mm0, (%2), %%mm5, q)\ - OP(%%mm2, (%2,%4), %%mm5, q)\ - ::"a"(src8), "c"(src16), "d"(dst),\ - "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ - :"memory");\ - src8 += 2L*src8Stride;\ - src16 += 48;\ - dst += 2L*dstStride;\ - }while(h-=2);\ -}\ -static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ + ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ + ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ }\ #if ARCH_X86_64 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=16;\ - __asm__ volatile(\ - "pxor %%xmm15, %%xmm15 \n\t"\ - "movdqa %6, %%xmm14 \n\t"\ - "movdqa %7, %%xmm13 \n\t"\ - "1: \n\t"\ - "lddqu 6(%0), %%xmm1 \n\t"\ - "lddqu -2(%0), %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm15, %%xmm1 \n\t"\ - "punpcklbw %%xmm15, %%xmm0 \n\t"\ - "punpcklbw %%xmm15, %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm0, %%xmm6 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm0, %%xmm8 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm0, %%xmm9 \n\t"\ - "movdqa %%xmm0, %%xmm12 \n\t"\ - "movdqa %%xmm1, %%xmm11 \n\t"\ - "palignr $10,%%xmm0, %%xmm11\n\t"\ - "palignr $10,%%xmm7, %%xmm12\n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm9 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm7, %%xmm8 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $6, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm0 ,%%xmm11 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $8, %%xmm7, %%xmm0 \n\t"\ - "paddw %%xmm12,%%xmm7 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm8, %%xmm6 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm9, %%xmm0 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psllw $2, %%xmm6 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "psubw %%xmm0, %%xmm6 \n\t"\ - "paddw %%xmm13,%%xmm11 \n\t"\ - "paddw %%xmm13,%%xmm7 \n\t"\ - "pmullw %%xmm14,%%xmm2 \n\t"\ - "pmullw %%xmm14,%%xmm6 \n\t"\ - "lddqu (%2), %%xmm3 \n\t"\ - "paddw %%xmm11,%%xmm2 \n\t"\ - "paddw %%xmm7, %%xmm6 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "psraw $5, %%xmm6 \n\t"\ - "packuswb %%xmm2,%%xmm6 \n\t"\ - "pavgb %%xmm3, %%xmm6 \n\t"\ - OP(%%xmm6, (%1), %%xmm4, dqa)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \ - "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \ - "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \ - "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\ - "memory"\ - );\ -} + +void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); +void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); + #else // ARCH_X86_64 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ src += 8*dstStride;\ dst += 8*dstStride;\ src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ } #endif // ARCH_X86_64 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ - "1: \n\t"\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "movq (%2), %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - "pavgb %%xmm3, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ -}\ QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ - "1: \n\t"\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ src += 8*srcStride;\ dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ }\ #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - src -= 2*srcStride;\ - \ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movq (%0), %%xmm0 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm1 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm2 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm3 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm2 \n\t"\ - "punpcklbw %%xmm7, %%xmm3 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - "cmpl $16, %4 \n\t"\ - "jne 2f \n\t"\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - "2: \n\t"\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ }\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ } -static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ +static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ int w = (size+8)>>3; src -= 2*srcStride+2; while(w--){ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm2 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm3 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm4 \n\t" - "add %2, %0 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm4 \n\t" - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) - "cmpl $16, %3 \n\t" - "jne 2f \n\t" - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) - "2: \n\t" - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size) - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7",) - "memory" - ); + ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size); tmp += 8; - src += 8 - (size+5)*srcStride; + src += 8; } } -#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int h = size;\ - if(size == 16){\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 32(%0), %%xmm4 \n\t"\ - "movdqa 16(%0), %%xmm5 \n\t"\ - "movdqa (%0), %%xmm7 \n\t"\ - "movdqa %%xmm4, %%xmm3 \n\t"\ - "movdqa %%xmm4, %%xmm2 \n\t"\ - "movdqa %%xmm4, %%xmm1 \n\t"\ - "movdqa %%xmm4, %%xmm0 \n\t"\ - "palignr $10, %%xmm5, %%xmm0 \n\t"\ - "palignr $8, %%xmm5, %%xmm1 \n\t"\ - "palignr $6, %%xmm5, %%xmm2 \n\t"\ - "palignr $4, %%xmm5, %%xmm3 \n\t"\ - "palignr $2, %%xmm5, %%xmm4 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "movdqa %%xmm5, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm3 \n\t"\ - "palignr $8, %%xmm7, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm6 \n\t"\ - "palignr $10, %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "palignr $6, %%xmm7, %%xmm5 \n\t"\ - "palignr $4, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm5 \n\t"\ - \ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "psraw $6, %%xmm3 \n\t"\ - "packuswb %%xmm0, %%xmm3 \n\t"\ - OP(%%xmm3, (%1), %%xmm7, dqa)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ - }else{\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 16(%0), %%xmm1 \n\t"\ - "movdqa (%0), %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $10, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $6, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm0, %%xmm2 \n\t"\ - "palignr $2, %%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "packuswb %%xmm0, %%xmm0 \n\t"\ - OP(%%xmm0, (%1), %%xmm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ - }\ -} - #define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ + ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ }\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ }\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ +static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ }\ -#define put_pixels8_l2_sse2 put_pixels8_l2_mmxext -#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmxext -#define put_pixels16_l2_sse2 put_pixels16_l2_mmxext -#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmxext -#define put_pixels8_l2_ssse3 put_pixels8_l2_mmxext -#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmxext -#define put_pixels16_l2_ssse3 put_pixels16_l2_mmxext -#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmxext +#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext +#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext +#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext +#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext -#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmxext -#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmxext -#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmxext -#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmxext -#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmxext -#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmxext -#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmxext -#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmxext +#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 +#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 +#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2 +#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2 -#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmxext -#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmxext -#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmxext -#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmxext - -#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 -#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 -#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 -#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 - -#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmxext -#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmxext +#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext +#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext #define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ @@ -1040,77 +250,77 @@ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - put_pixels16_sse2(dst, src, stride, 16); + ff_put_pixels16_sse2(dst, src, stride, 16); } static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - avg_pixels16_sse2(dst, src, stride, 16); + ff_avg_pixels16_sse2(dst, src, stride, 16); } #define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext #define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext #define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ + ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ }\ #define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ }\ #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ }\ #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ @@ -1118,8 +328,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t * uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ @@ -1127,8 +337,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t * uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ @@ -1136,8 +346,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t * uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ @@ -1145,8 +355,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t * uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ + ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ }\ #define H264_MC_4816(MMX)\ @@ -1171,25 +381,18 @@ QPEL_H264_V_XMM(put_, PUT_OP, sse2) QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2) QPEL_H264_HV_XMM(put_, PUT_OP, sse2) QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2) -#if HAVE_SSSE3_INLINE QPEL_H264_H_XMM(put_, PUT_OP, ssse3) QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) -QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV2_XMM(avg_,AVG_MMXEXT_OP, ssse3) QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) -#endif #undef PAVGB H264_MC_4816(mmxext) H264_MC_816(H264_MC_V, sse2) H264_MC_816(H264_MC_HV, sse2) -#if HAVE_SSSE3_INLINE H264_MC_816(H264_MC_H, ssse3) H264_MC_816(H264_MC_HV, ssse3) -#endif -#endif /* HAVE_INLINE_ASM */ //10bit #define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ @@ -1285,3 +488,5 @@ QPEL16_OP(mc33, MMX) #if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+ QPEL16(mmxext) #endif + +#endif /* HAVE_YASM */ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm new file mode 100644 index 0000000000..eef07157c9 --- /dev/null +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -0,0 +1,862 @@ +;***************************************************************************** +;* MMX/SSE2/SSSE3-optimized H.264 QPEL code +;***************************************************************************** +;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2012 Daniel Kang +;* +;* Authors: Daniel Kang +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +cextern pw_16 +cextern pw_5 +cextern pb_0 + +SECTION .text + + +%macro op_avgh 3 + movh %3, %2 + pavgb %1, %3 + movh %2, %1 +%endmacro + +%macro op_avg 2-3 + pavgb %1, %2 + mova %2, %1 +%endmacro + +%macro op_puth 2-3 + movh %2, %1 +%endmacro + +%macro op_put 2-3 + mova %2, %1 +%endmacro + +%macro QPEL4_H_LOWPASS_OP 1 +cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + pxor m7, m7 + mova m4, [pw_5] + mova m5, [pw_16] + mov r4d, 4 +.loop: + movh m1, [r1-1] + movh m2, [r1+0] + movh m3, [r1+1] + movh m0, [r1+2] + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m0, m7 + paddw m1, m0 + paddw m2, m3 + movh m0, [r1-2] + movh m3, [r1+3] + punpcklbw m0, m7 + punpcklbw m3, m7 + paddw m0, m3 + psllw m2, 2 + psubw m2, m1 + pmullw m2, m4 + paddw m0, m5 + paddw m0, m2 + psraw m0, 5 + packuswb m0, m0 + op_%1h m0, [r0], m6 + add r0, r2 + add r1, r3 + dec r4d + jg .loop + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL4_H_LOWPASS_OP put +QPEL4_H_LOWPASS_OP avg + +%macro QPEL8_H_LOWPASS_OP 1 +cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + mov r4d, 8 + pxor m7, m7 + mova m6, [pw_5] +.loop: + mova m0, [r1] + mova m2, [r1+1] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m1, m3 + psllw m0, 2 + psllw m1, 2 + mova m2, [r1-1] + mova m4, [r1+2] + mova m3, m2 + mova m5, m4 + punpcklbw m2, m7 + punpckhbw m3, m7 + punpcklbw m4, m7 + punpckhbw m5, m7 + paddw m2, m4 + paddw m5, m3 + psubw m0, m2 + psubw m1, m5 + pmullw m0, m6 + pmullw m1, m6 + movd m2, [r1-2] + movd m5, [r1+7] + punpcklbw m2, m7 + punpcklbw m5, m7 + paddw m2, m3 + paddw m4, m5 + mova m5, [pw_16] + paddw m2, m5 + paddw m4, m5 + paddw m0, m2 + paddw m1, m4 + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + op_%1 m0, [r0], m4 + add r0, r2 + add r1, r3 + dec r4d + jg .loop + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL8_H_LOWPASS_OP put +QPEL8_H_LOWPASS_OP avg + +%macro QPEL8_H_LOWPASS_OP_XMM 1 +cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + mov r4d, 8 + pxor m7, m7 + mova m6, [pw_5] +.loop: + movu m1, [r1-2] + mova m0, m1 + punpckhbw m1, m7 + punpcklbw m0, m7 + mova m2, m1 + mova m3, m1 + mova m4, m1 + mova m5, m1 + palignr m4, m0, 2 + palignr m3, m0, 4 + palignr m2, m0, 6 + palignr m1, m0, 8 + palignr m5, m0, 10 + paddw m0, m5 + paddw m2, m3 + paddw m1, m4 + psllw m2, 2 + psubw m2, m1 + paddw m0, [pw_16] + pmullw m2, m6 + paddw m2, m0 + psraw m2, 5 + packuswb m2, m2 + op_%1h m2, [r0], m4 + add r1, r3 + add r0, r2 + dec r4d + jne .loop + REP_RET +%endmacro + +INIT_XMM ssse3 +QPEL8_H_LOWPASS_OP_XMM put +QPEL8_H_LOWPASS_OP_XMM avg + + +%macro QPEL4_H_LOWPASS_L2_OP 1 +cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + pxor m7, m7 + mova m4, [pw_5] + mova m5, [pw_16] + mov r5d, 4 +.loop: + movh m1, [r1-1] + movh m2, [r1+0] + movh m3, [r1+1] + movh m0, [r1+2] + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m0, m7 + paddw m1, m0 + paddw m2, m3 + movh m0, [r1-2] + movh m3, [r1+3] + punpcklbw m0, m7 + punpcklbw m3, m7 + paddw m0, m3 + psllw m2, 2 + psubw m2, m1 + pmullw m2, m4 + paddw m0, m5 + paddw m0, m2 + movh m3, [r2] + psraw m0, 5 + packuswb m0, m0 + pavgb m0, m3 + op_%1h m0, [r0], m6 + add r0, r3 + add r1, r3 + add r2, r4 + dec r5d + jg .loop + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL4_H_LOWPASS_L2_OP put +QPEL4_H_LOWPASS_L2_OP avg + + +%macro QPEL8_H_LOWPASS_L2_OP 1 +cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + mov r5d, 8 + pxor m7, m7 + mova m6, [pw_5] +.loop: + mova m0, [r1] + mova m2, [r1+1] + mova m1, m0 + mova m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m1, m3 + psllw m0, 2 + psllw m1, 2 + mova m2, [r1-1] + mova m4, [r1+2] + mova m3, m2 + mova m5, m4 + punpcklbw m2, m7 + punpckhbw m3, m7 + punpcklbw m4, m7 + punpckhbw m5, m7 + paddw m2, m4 + paddw m5, m3 + psubw m0, m2 + psubw m1, m5 + pmullw m0, m6 + pmullw m1, m6 + movd m2, [r1-2] + movd m5, [r1+7] + punpcklbw m2, m7 + punpcklbw m5, m7 + paddw m2, m3 + paddw m4, m5 + mova m5, [pw_16] + paddw m2, m5 + paddw m4, m5 + paddw m0, m2 + paddw m1, m4 + psraw m0, 5 + psraw m1, 5 + mova m4, [r2] + packuswb m0, m1 + pavgb m0, m4 + op_%1 m0, [r0], m4 + add r0, r3 + add r1, r3 + add r2, r4 + dec r5d + jg .loop + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL8_H_LOWPASS_L2_OP put +QPEL8_H_LOWPASS_L2_OP avg + + +%macro QPEL8_H_LOWPASS_L2_OP_XMM 1 +cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,7 ; dst, src, src2, dstStride, src2Stride + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + mov r5d, 8 + pxor m7, m7 + mova m6, [pw_5] +.loop: + lddqu m1, [r1-2] + mova m0, m1 + punpckhbw m1, m7 + punpcklbw m0, m7 + mova m2, m1 + mova m3, m1 + mova m4, m1 + mova m5, m1 + palignr m4, m0, 2 + palignr m3, m0, 4 + palignr m2, m0, 6 + palignr m1, m0, 8 + palignr m5, m0, 10 + paddw m0, m5 + paddw m2, m3 + paddw m1, m4 + psllw m2, 2 + movh m3, [r2] + psubw m2, m1 + paddw m0, [pw_16] + pmullw m2, m6 + paddw m2, m0 + psraw m2, 5 + packuswb m2, m2 + pavgb m2, m3 + op_%1h m2, [r0], m4 + add r1, r3 + add r0, r3 + add r2, r4 + dec r5d + jg .loop + REP_RET +%endmacro + +INIT_XMM ssse3 +QPEL8_H_LOWPASS_L2_OP_XMM put +QPEL8_H_LOWPASS_L2_OP_XMM avg + + +; All functions that call this are required to have function arguments of +; dst, src, dstStride, srcStride +%macro FILT_V 1 + mova m6, m2 + movh m5, [r1] + paddw m6, m3 + psllw m6, 2 + psubw m6, m1 + psubw m6, m4 + punpcklbw m5, m7 + pmullw m6, [pw_5] + paddw m0, [pw_16] + add r1, r3 + paddw m0, m5 + paddw m6, m0 + psraw m6, 5 + packuswb m6, m6 + op_%1h m6, [r0], m0 ; 1 + add r0, r2 + SWAP 0, 1, 2, 3, 4, 5 +%endmacro + +%macro QPEL4_V_LOWPASS_OP 1 +cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + sub r1, r3 + sub r1, r3 + pxor m7, m7 + movh m0, [r1] + movh m1, [r1+r3] + lea r1, [r1+2*r3] + movh m2, [r1] + movh m3, [r1+r3] + lea r1, [r1+2*r3] + movh m4, [r1] + add r1, r3 + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + RET +%endmacro + +INIT_MMX mmxext +QPEL4_V_LOWPASS_OP put +QPEL4_V_LOWPASS_OP avg + + + +%macro QPEL8OR16_V_LOWPASS_OP 1 +%if cpuflag(sse2) +cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride, srcStride, h + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + sub r1, r3 + sub r1, r3 +%else +cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,7 ; dst, src, dstStride, srcStride, h + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d +%endif + pxor m7, m7 + movh m0, [r1] + movh m1, [r1+r3] + lea r1, [r1+2*r3] + movh m2, [r1] + movh m3, [r1+r3] + lea r1, [r1+2*r3] + movh m4, [r1] + add r1, r3 + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + cmp r4d, 16 + jne .end + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 + FILT_V %1 +.end: + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL8OR16_V_LOWPASS_OP put +QPEL8OR16_V_LOWPASS_OP avg + +INIT_XMM sse2 +QPEL8OR16_V_LOWPASS_OP put +QPEL8OR16_V_LOWPASS_OP avg + + +; All functions that use this are required to have args: +; src, tmp, srcSize +%macro FILT_HV 1 ; offset + mova m6, m2 + movh m5, [r0] + paddw m6, m3 + psllw m6, 2 + paddw m0, [pw_16] + psubw m6, m1 + psubw m6, m4 + punpcklbw m5, m7 + pmullw m6, [pw_5] + paddw m0, m5 + add r0, r2 + paddw m6, m0 + mova [r1+%1], m6 + SWAP 0, 1, 2, 3, 4, 5 +%endmacro + +%macro QPEL4_HV1_LOWPASS_OP 1 +cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride + movsxdifnidn r2, r2d + pxor m7, m7 + movh m0, [r0] + movh m1, [r0+r2] + lea r0, [r0+2*r2] + movh m2, [r0] + movh m3, [r0+r2] + lea r0, [r0+2*r2] + movh m4, [r0] + add r0, r2 + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + FILT_HV 0*24 + FILT_HV 1*24 + FILT_HV 2*24 + FILT_HV 3*24 + RET + +cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride + movsxdifnidn r2, r2d + mov r3d, 4 +.loop: + mova m0, [r0] + paddw m0, [r0+10] + mova m1, [r0+2] + paddw m1, [r0+8] + mova m2, [r0+4] + paddw m2, [r0+6] + psubw m0, m1 + psraw m0, 2 + psubw m0, m1 + paddsw m0, m2 + psraw m0, 2 + paddw m0, m2 + psraw m0, 6 + packuswb m0, m0 + op_%1h m0, [r1], m7 + add r0, 24 + add r1, r2 + dec r3d + jnz .loop + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL4_HV1_LOWPASS_OP put +QPEL4_HV1_LOWPASS_OP avg + +%macro QPEL8OR16_HV1_LOWPASS_OP 1 +cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,7 ; src, tmp, srcStride, size + movsxdifnidn r2, r2d + pxor m7, m7 + movh m0, [r0] + movh m1, [r0+r2] + lea r0, [r0+2*r2] + movh m2, [r0] + movh m3, [r0+r2] + lea r0, [r0+2*r2] + movh m4, [r0] + add r0, r2 + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + FILT_HV 0*48 + FILT_HV 1*48 + FILT_HV 2*48 + FILT_HV 3*48 + FILT_HV 4*48 + FILT_HV 5*48 + FILT_HV 6*48 + FILT_HV 7*48 + cmp r3d, 16 + jne .end + FILT_HV 8*48 + FILT_HV 9*48 + FILT_HV 10*48 + FILT_HV 11*48 + FILT_HV 12*48 + FILT_HV 13*48 + FILT_HV 14*48 + FILT_HV 15*48 +.end: + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL8OR16_HV1_LOWPASS_OP put +QPEL8OR16_HV1_LOWPASS_OP avg + +INIT_XMM sse2 +QPEL8OR16_HV1_LOWPASS_OP put + + + +%macro QPEL8OR16_HV2_LOWPASS_OP 1 +; unused is to match ssse3 and mmxext args +cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h + movsxdifnidn r2, r2d +.loop: + mova m0, [r1] + mova m3, [r1+8] + mova m1, [r1+2] + mova m4, [r1+10] + paddw m0, m4 + paddw m1, m3 + paddw m3, [r1+18] + paddw m4, [r1+16] + mova m2, [r1+4] + mova m5, [r1+12] + paddw m2, [r1+6] + paddw m5, [r1+14] + psubw m0, m1 + psubw m3, m4 + psraw m0, 2 + psraw m3, 2 + psubw m0, m1 + psubw m3, m4 + paddsw m0, m2 + paddsw m3, m5 + psraw m0, 2 + psraw m3, 2 + paddw m0, m2 + paddw m3, m5 + psraw m0, 6 + psraw m3, 6 + packuswb m0, m3 + op_%1 m0, [r0], m7 + add r1, 48 + add r0, r2 + dec r4d + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +QPEL8OR16_HV2_LOWPASS_OP put +QPEL8OR16_HV2_LOWPASS_OP avg + +%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 +cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,7 ; dst, tmp, dstStride, tmpStride, size + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + cmp r4d, 16 + je .op16 +.loop8: + mova m1, [r1+16] + mova m0, [r1] + mova m2, m1 + mova m3, m1 + mova m4, m1 + mova m5, m1 + palignr m5, m0, 10 + palignr m4, m0, 8 + palignr m3, m0, 6 + palignr m2, m0, 4 + palignr m1, m0, 2 + paddw m0, m5 + paddw m1, m4 + paddw m2, m3 + psubw m0, m1 + psraw m0, 2 + psubw m0, m1 + paddw m0, m2 + psraw m0, 2 + paddw m0, m2 + psraw m0, 6 + packuswb m0, m0 + op_%1h m0, [r0], m7 + add r1, 48 + add r0, r2 + dec r4d + jne .loop8 + jmp .done +.op16: + mova m4, [r1+32] + mova m5, [r1+16] + mova m7, [r1] + mova m3, m4 + mova m2, m4 + mova m1, m4 + mova m0, m4 + palignr m0, m5, 10 + palignr m1, m5, 8 + palignr m2, m5, 6 + palignr m3, m5, 4 + palignr m4, m5, 2 + paddw m0, m5 + paddw m1, m4 + paddw m2, m3 + mova m6, m5 + mova m4, m5 + mova m3, m5 + palignr m4, m7, 8 + palignr m6, m7, 2 + palignr m3, m7, 10 + paddw m4, m6 + mova m6, m5 + palignr m5, m7, 6 + palignr m6, m7, 4 + paddw m3, m7 + paddw m5, m6 + psubw m0, m1 + psubw m3, m4 + psraw m0, 2 + psraw m3, 2 + psubw m0, m1 + psubw m3, m4 + paddw m0, m2 + paddw m3, m5 + psraw m0, 2 + psraw m3, 2 + paddw m0, m2 + paddw m3, m5 + psraw m0, 6 + psraw m3, 6 + packuswb m3, m0 + op_%1 m3, [r0], m7 + add r1, 48 + add r0, r2 + dec r4d + jne .op16 +.done: + REP_RET +%endmacro + +INIT_XMM ssse3 +QPEL8OR16_HV2_LOWPASS_OP_XMM put +QPEL8OR16_HV2_LOWPASS_OP_XMM avg + + +%macro PIXELS4_L2_SHIFT5 1 +cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + mova m0, [r1] + mova m1, [r1+24] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m0 + packuswb m1, m1 + pavgb m0, [r2] + pavgb m1, [r2+r4] + op_%1h m0, [r0], m4 + op_%1h m1, [r0+r3], m5 + lea r2, [r2+r4*2] + lea r0, [r0+r3*2] + mova m0, [r1+48] + mova m1, [r1+72] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m0 + packuswb m1, m1 + pavgb m0, [r2] + pavgb m1, [r2+r4] + op_%1h m0, [r0], m4 + op_%1h m1, [r0+r3], m5 + RET +%endmacro + +INIT_MMX mmxext +PIXELS4_L2_SHIFT5 put +PIXELS4_L2_SHIFT5 avg + + +%macro PIXELS8_L2_SHIFT5 1 +cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d +.loop: + mova m0, [r1] + mova m1, [r1+8] + mova m2, [r1+48] + mova m3, [r1+48+8] + psraw m0, 5 + psraw m1, 5 + psraw m2, 5 + psraw m3, 5 + packuswb m0, m1 + packuswb m2, m3 + pavgb m0, [r2] + pavgb m2, [r2+r4] + op_%1 m0, [r0], m4 + op_%1 m2, [r0+r3], m5 + lea r2, [r2+2*r4] + add r1, 48*2 + lea r0, [r0+2*r3] + sub r5d, 2 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PIXELS8_L2_SHIFT5 put +PIXELS8_L2_SHIFT5 avg + + +%if ARCH_X86_64 +%macro QPEL16_H_LOWPASS_L2_OP 1 +cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride + movsxdifnidn r3, r3d + movsxdifnidn r4, r4d + mov r5d, 16 + pxor m15, m15 + mova m14, [pw_5] + mova m13, [pw_16] +.loop: + lddqu m1, [r1+6] + lddqu m7, [r1-2] + mova m0, m1 + punpckhbw m1, m15 + punpcklbw m0, m15 + punpcklbw m7, m15 + mova m2, m1 + mova m6, m0 + mova m3, m1 + mova m8, m0 + mova m4, m1 + mova m9, m0 + mova m12, m0 + mova m11, m1 + palignr m11, m0, 10 + palignr m12, m7, 10 + palignr m4, m0, 2 + palignr m9, m7, 2 + palignr m3, m0, 4 + palignr m8, m7, 4 + palignr m2, m0, 6 + palignr m6, m7, 6 + paddw m11, m0 + palignr m1, m0, 8 + palignr m0, m7, 8 + paddw m7, m12 + paddw m2, m3 + paddw m6, m8 + paddw m1, m4 + paddw m0, m9 + psllw m2, 2 + psllw m6, 2 + psubw m2, m1 + psubw m6, m0 + paddw m11, m13 + paddw m7, m13 + pmullw m2, m14 + pmullw m6, m14 + lddqu m3, [r2] + paddw m2, m11 + paddw m6, m7 + psraw m2, 5 + psraw m6, 5 + packuswb m6, m2 + pavgb m6, m3 + op_%1 m6, [r0], m11 + add r1, r3 + add r0, r3 + add r2, r4 + dec r5d + jg .loop + REP_RET +%endmacro + +INIT_XMM ssse3 +QPEL16_H_LOWPASS_L2_OP put +QPEL16_H_LOWPASS_L2_OP avg +%endif -- cgit v1.2.3