diff options
author    Timothy Gu <timothygu99@gmail.com>      2014-05-30 07:56:04 +0400
committer Michael Niedermayer <michaelni@gmx.at>  2014-06-04 01:59:43 +0400
commit    108dec3055053e2da9f2d5695b65350c5daaba57 (patch)
tree      3820dbbfa7d786ab5f896cf9309c8b53a13dfc8b
parent    d63d964fb3c92631eaa21d58e2cbc7efdd173246 (diff)
x86: dsputilenc: convert hf_noise*_mmx to yasm
Signed-off-by: Timothy Gu <timothygu99@gmail.com>
Several bugfixes by: Christophe Gisquet <christophe.gisquet@gmail.com>
See: [FFmpeg-devel] [WIP] [PATCH 4/4] x86: dsputilenc: convert hf_noise*_mmx to yasm
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
 libavcodec/x86/dsputilenc.asm   |  77 +
 libavcodec/x86/dsputilenc_mmx.c | 265 +-
 2 files changed, 87 insertions(+), 255 deletions(-)
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index 0628550145..84cb7b363b 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -608,3 +608,80 @@ INIT_XMM sse2 SUM_ABS_DCTELEM 7, 2 INIT_XMM ssse3 SUM_ABS_DCTELEM 6, 2 + +;------------------------------------------------------------------------------ +; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h) +;------------------------------------------------------------------------------ +; %1 = 8/16. %2-5=m# +%macro HF_NOISE_PART1 5 + mova m%2, [pix1q] +%if %1 == 8 + mova m%3, m%2 + psllq m%2, 8 + psrlq m%3, 8 + psrlq m%2, 8 +%else + mova m%3, [pix1q+1] +%endif + mova m%4, m%2 + mova m%5, m%3 + punpcklbw m%2, m7 + punpcklbw m%3, m7 + punpckhbw m%4, m7 + punpckhbw m%5, m7 + psubw m%2, m%3 + psubw m%4, m%5 +%endmacro + +; %1-2 = m# +%macro HF_NOISE_PART2 4 + psubw m%1, m%3 + psubw m%2, m%4 + pxor m3, m3 + pxor m1, m1 + pcmpgtw m3, m%1 + pcmpgtw m1, m%2 + pxor m%1, m3 + pxor m%2, m1 + psubw m%1, m3 + psubw m%2, m1 + paddw m%2, m%1 + paddw m6, m%2 +%endmacro + +; %1 = 8/16 +%macro HF_NOISE 1 +cglobal hf_noise%1, 3,3,0, pix1, lsize, h + movsxdifnidn lsizeq, lsized + sub hd, 2 + pxor m7, m7 + pxor m6, m6 + HF_NOISE_PART1 %1, 0, 1, 2, 3 + add pix1q, lsizeq + HF_NOISE_PART1 %1, 4, 1, 5, 3 + HF_NOISE_PART2 0, 2, 4, 5 + add pix1q, lsizeq +.loop: + HF_NOISE_PART1 %1, 0, 1, 2, 3 + HF_NOISE_PART2 4, 5, 0, 2 + add pix1q, lsizeq + HF_NOISE_PART1 %1, 4, 1, 5, 3 + HF_NOISE_PART2 0, 2, 4, 5 + add pix1q, lsizeq + sub hd, 2 + jne .loop + + mova m0, m6 + punpcklwd m0, m7 + punpckhwd m6, m7 + paddd m6, m0 + mova m0, m6 + psrlq m6, 32 + paddd m0, m6 + movd eax, m0 ; eax = result of hf_noise8; + REP_RET ; return eax; +%endmacro + +INIT_MMX mmx +HF_NOISE 8 +HF_NOISE 16 diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index 13128d29ad..e18048681f 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -52,6 +52,8 @@ int 
ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); +int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h); +int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h); #define hadamard_func(cpu) \ int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ @@ -64,255 +66,8 @@ hadamard_func(mmxext) hadamard_func(sse2) hadamard_func(ssse3) -#if HAVE_INLINE_ASM - #if HAVE_YASM -static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm7, %%mm7\n" - "pxor %%mm6, %%mm6\n" - - "movq (%0), %%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "1:\n" - - "movq (%0), %%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw 
%%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7, %%mm0\n" - "punpckhwd %%mm7, %%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix1), "=r" (tmp) - : "r" ((x86_reg) line_size), "g" (h - 2) - : "%ecx"); - - return tmp; -} - -static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h) -{ - int tmp; - uint8_t *pix = pix1; - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm7, %%mm7\n" - "pxor %%mm6, %%mm6\n" - - "movq (%0), %%mm0\n" - "movq 1(%0), %%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq 1(%0), %%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - 
"punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "1:\n" - - "movq (%0), %%mm0\n" - "movq 1(%0), %%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq 1(%0), %%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7, %%mm0\n" - "punpckhwd %%mm7, %%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix1), "=r" (tmp) - : "r" ((x86_reg) line_size), "g" (h - 2) - : "%ecx"); - - return tmp + hf_noise8_mmx(pix + 8, line_size, h); -} - static int 
nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { @@ -322,8 +77,8 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); else score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h); - score2 = hf_noise16_mmx(pix1, line_size, h) - - hf_noise16_mmx(pix2, line_size, h); + score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h) + - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h); if (c) return score1 + FFABS(score2) * c->avctx->nsse_weight; @@ -335,8 +90,8 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h); - int score2 = hf_noise8_mmx(pix1, line_size, h) - - hf_noise8_mmx(pix2, line_size, h); + int score2 = ff_hf_noise8_mmx(pix1, line_size, h) - + ff_hf_noise8_mmx(pix2, line_size, h); if (c) return score1 + FFABS(score2) * c->avctx->nsse_weight; @@ -346,6 +101,8 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, #endif /* HAVE_YASM */ +#if HAVE_INLINE_ASM + static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, int line_size, int h) { @@ -689,10 +446,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, c->vsad[4] = vsad_intra16_mmx; -#if HAVE_YASM - c->nsse[0] = nsse16_mmx; - c->nsse[1] = nsse8_mmx; -#endif /* HAVE_YASM */ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { c->vsad[0] = vsad16_mmx; c->try_8x8basis = try_8x8basis_mmx; @@ -741,6 +494,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx; c->sse[0] = ff_sse16_mmx; c->sse[1] = ff_sse8_mmx; + c->nsse[0] = nsse16_mmx; + c->nsse[1] = nsse8_mmx; } if (EXTERNAL_MMXEXT(cpu_flags)) { |