Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d'

* commit '0d439fbede03854eac8a978cccf21a3425a3c82d':
  dsputil: Split off HuffYUV decoding bits into their own context

Conflicts:
    configure
    libavcodec/dsputil.c
    libavcodec/dsputil.h
    libavcodec/huffyuv.h
    libavcodec/huffyuvdec.c
    libavcodec/lagarith.c
    libavcodec/vble.c
    libavcodec/x86/Makefile
    libavcodec/x86/dsputil.asm
    libavcodec/x86/dsputil_init.c
    libavcodec/x86/dsputil_mmx.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
author: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 01:07:36 +0400
committer: Michael Niedermayer <michaelni@gmx.at> 2014-05-28 01:16:06 +0400
commit: e2abc0d5cacc22aa900de8ac26160ea1b786a7b5 (patch)
tree: 536e4539c573752b2b446d2c8ef17d9d62927815 /libavcodec/x86
parent: 43c57dbe14545d13dbfd8aae341b45514e8bcfbb (diff)
parent: 0d439fbede03854eac8a978cccf21a3425a3c82d (diff)
9 files changed, 292 insertions, 202 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 9c39265f36..80441a6116 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,8 +3,7 @@ OBJS                                   += x86/constants.o               \
 
 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
-OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o            \
-                                          x86/dsputil_x86.o
+OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
 OBJS-$(CONFIG_ENCODERS)                += x86/dsputilenc_mmx.o          \
                                           x86/fdct.o                    \
                                           x86/motion_est.o
@@ -19,6 +18,7 @@ OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
 OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
+OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o
@@ -54,6 +54,7 @@ MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
                                           x86/idct_sse2_xvid.o          \
                                           x86/simple_idct.o
 MMX-OBJS-$(CONFIG_DIRAC_DECODER)       += x86/dirac_dwt.o
+MMX-OBJS-$(CONFIG_HUFFYUVDSP)          += x86/huffyuvdsp_mmx.o
 
 MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
 MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
@@ -93,6 +94,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
                                           x86/hevc_deblock.o
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                           x86/hpeldsp.o
+YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
 YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 4804682fb7..eae71ab28c 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -24,11 +24,6 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
 cextern pb_80
@@ -193,142 +188,6 @@ SCALARPRODUCT_LOOP 0
     RET
 
 
-; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
-;                                           const uint8_t *diff, int w,
-;                                           int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
-    movq    mm0, [topq]
-    movq    mm2, mm0
-    movd    mm4, [left_topq]
-    psllq   mm2, 8
-    movq    mm1, mm0
-    por     mm4, mm2
-    movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
-    add    dstq, wq
-    add    topq, wq
-    add   diffq, wq
-    neg      wq
-    jmp .skip
-.loop:
-    movq    mm4, [topq+wq]
-    movq    mm0, mm4
-    psllq   mm4, 8
-    por     mm4, mm1
-    movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
-.skip:
-    movq    mm2, [diffq+wq]
-%assign i 0
-%rep 8
-    movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
-    movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
-%if i==0
-    movq    mm7, mm3
-    psllq   mm7, 56
-%else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
-%endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
-%endif
-%assign i i+1
-%endrep
-    movq [dstq+wq], mm7
-    add      wq, 8
-    jl .loop
-    movzx   r2d, byte [dstq-1]
-    mov [leftq], r2d
-    movzx   r2d, byte [topq-1]
-    mov [left_topq], r2d
-    RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
-    add     srcq, wq
-    add     dstq, wq
-    neg     wq
-%%.loop:
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
-    mova    m2, m1
-    psllw   m1, 8
-    paddb   m1, m2
-    mova    m2, m1
-    pshufb  m1, m3
-    paddb   m1, m2
-    pshufb  m0, m5
-    mova    m2, m1
-    pshufb  m1, m4
-    paddb   m1, m2
-%if mmsize == 16
-    mova    m2, m1
-    pshufb  m1, m6
-    paddb   m1, m2
-%endif
-    paddb   m0, m1
-%if %1
-    mova    [dstq+wq], m0
-%else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
-%endif
-    add     wq, mmsize
-    jl %%.loop
-    mov     eax, mmsize-1
-    sub     eax, wd
-    movd    m1, eax
-    pshufb  m0, m1
-    movd    eax, m0
-    RET
-%endmacro
-
-; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src,
-;                                 int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
-.skip_prologue:
-    mova    m5, [pb_7]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    psllq   m0, 56
-    ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM sse4
-cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
-    mova    m5, [pb_f]
-    mova    m6, [pb_zzzzzzzz77777777]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    pslldq  m0, 15
-    test    srcq, 15
-    jnz .src_unaligned
-    test    dstq, 15
-    jnz .dst_unaligned
-    ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 0
-
-
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 30829ae5f5..9fc92fe799 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -23,7 +23,6 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/internal.h"
-#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
@@ -93,14 +92,6 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
 
-void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
-                                          const uint8_t *diff, int w,
-                                          int *left, int *left_top);
-int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
-                                      int w, int left);
-int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
-                                     int w, int left);
-
 void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
@@ -536,8 +527,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
     c->gmc = ff_gmc_mmx;
 #endif
-
-    c->add_bytes = ff_add_bytes_mmx;
 #endif /* HAVE_MMX_INLINE */
 
 #if HAVE_MMX_EXTERNAL
@@ -570,10 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
 
-    /* slower than cmov version on AMD */
-    if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
-        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
-
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
@@ -630,10 +615,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
-    if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
-        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
-
     if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
@@ -653,11 +634,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_7REGS && HAVE_INLINE_ASM
-    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV)
-        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
-#endif
-
     if (X86_MMX(cpu_flags)) {
 #if HAVE_INLINE_ASM
         const int idct_algo = avctx->idct_algo;
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index aaf1480b67..15b0dd13af 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -134,31 +134,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
     } while (--i);
 }
 
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
 
 /* Draw the edges of width 'w' of an image of size width, height
  * this MMX version can only handle w == 8 || w == 16. */
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 1f4711dd2d..4aab4b0a12 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -45,12 +45,6 @@ void ff_clear_block_sse(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
-
-void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
-                                        const uint8_t *diff, int w,
-                                        int *left, int *left_top);
-
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                        int w, int h, int sides);
 
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
new file mode 100644
index 0000000000..f183ebee54
--- /dev/null
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -0,0 +1,165 @@
+;******************************************************************************
+;* SIMD-optimized HuffYUV functions
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pb_f: times 16 db 15 ; pshufb mask: every lane selects byte 15
+pb_zzzzzzzz77777777: times 8 db -1 ; low 8 bytes only: the 16-byte load of this mask runs on into pb_7, so pb_7 must stay directly after it
+pb_7: times 8 db 7 ; pshufb mask: every lane selects byte 7; doubles as the high half of the mask above
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 ; a -1 index makes pshufb write zero to that lane
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+
+SECTION_TEXT
+
+; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+;                                     const uint8_t *diff, int w,
+;                                     int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
+    movq    mm0, [topq] ; mm0 = top[0..7]
+    movq    mm2, mm0
+    movd    mm4, [left_topq] ; low dword = *left_top
+    psllq   mm2, 8 ; shift up one byte: lane i now holds top[i-1]
+    movq    mm1, mm0 ; mm1 = t, kept for the min/max below
+    por     mm4, mm2 ; mm4 = tl per lane. NOTE(review): bytes 1-3 of *left_top are ORed in too; presumably always zero - confirm
+    movd    mm3, [leftq] ; low byte of mm3 = l, the running left pixel
+    psubb   mm0, mm4 ; t-tl
+    add    dstq, wq ; point all three buffers at their ends ...
+    add    topq, wq
+    add   diffq, wq
+    neg      wq ; ... and index them with wq running from -w up towards 0
+    jmp .skip ; t and t-tl for the first 8 bytes are already set up
+.loop:
+    movq    mm4, [topq+wq] ; next 8 top bytes
+    movq    mm0, mm4
+    psllq   mm4, 8
+    por     mm4, mm1 ; lane 0 tl = last top byte of the previous 8 (mm1 was shifted down 7 times, so only that byte is left)
+    movq    mm1, mm0 ; t
+    psubb   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq] ; 8 residual bytes
+%assign i 0
+%rep 8
+    movq    mm4, mm0
+    paddb   mm4, mm3 ; t-tl+l
+    movq    mm5, mm3
+    pmaxub  mm3, mm1 ; max(l, t)
+    pminub  mm5, mm1 ; min(l, t)
+    pminub  mm3, mm4 ; min(max(l, t), t-tl+l)
+    pmaxub  mm3, mm5 ; median of l, t and t-tl+l (only the low byte is used)
+    paddb   mm3, mm2 ; +residual; this is also l for the next pixel
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 56 ; keep only the low byte (this pixel), parked in the top lane
+%else
+    movq    mm6, mm3
+    psrlq   mm7, 8 ; slide the earlier pixels down one lane
+    psllq   mm6, 56
+    por     mm7, mm6 ; insert the new pixel in the top lane
+%endif
+%if i<7
+    psrlq   mm0, 8 ; bring the next pixel's t-tl, t and residual into the low byte
+    psrlq   mm1, 8
+    psrlq   mm2, 8
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7 ; after 8 steps the pixels sit in order in lanes 0..7
+    add      wq, 8
+    jl .loop ; whole 8-byte groups only: a w that is not a multiple of 8 is effectively rounded up
+    movzx   r2d, byte [dstq-1] ; dst[w-1] becomes the new *left
+    mov [leftq], r2d
+    movzx   r2d, byte [topq-1] ; top[w-1] becomes the new *left_top
+    mov [left_topq], r2d
+    RET
+
+
+%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+    add     srcq, wq ; point src/dst at their ends, then index with wq running from -w up towards 0
+    add     dstq, wq
+    neg     wq
+%%.loop:
+%if %2
+    mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
+    mova    m2, m1
+    psllw   m1, 8 ; low byte of each word moves to the high byte
+    paddb   m1, m2 ; running sums within each byte pair
+    mova    m2, m1
+    pshufb  m1, m3 ; in each 4-byte group, copy the first pair's total onto the second pair
+    paddb   m1, m2 ; running sums within each 4-byte group
+    pshufb  m0, m5 ; broadcast the carry-in (top byte of m0) to every lane
+    mova    m2, m1
+    pshufb  m1, m4 ; in each 8-byte group, copy the first 4-byte total onto the last 4 bytes
+    paddb   m1, m2 ; running sums within each 8-byte group
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m6 ; copy the low 8-byte total onto the high 8 bytes
+    paddb   m1, m2 ; running sums across all 16 bytes
+%endif
+    paddb   m0, m1 ; add the carry-in; the top byte of m0 is the carry for the next pass
+%if %1
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0 ; unaligned dst: store as two 8-byte halves
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop ; whole vectors only: may read/write up to mmsize-1 bytes past w
+    mov     eax, mmsize-1
+    sub     eax, wd ; wq is now the overshoot, so eax = lane of dst[w-1] within the last vector
+    movd    m1, eax
+    pshufb  m0, m1 ; move that lane to byte 0
+    movd    eax, m0 ; return value: only the low byte is the result (bytes 1-3 are copies of lane 0, not zero)
+    RET
+%endmacro
+
+; int ff_add_hfyu_left_pred_{ssse3,sse4}(uint8_t *dst, const uint8_t *src, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
+.skip_prologue:
+    mova    m5, [pb_7] ; carry broadcast mask (lane 7 is the top lane of an 8-byte register)
+    mova    m4, [pb_zzzz3333zzzzbbbb] ; spreads 4-byte totals
+    mova    m3, [pb_zz11zz55zz99zzdd] ; spreads pair totals
+    movd    m0, leftm ; left is fetched via leftm; NOTE(review): per x86inc, cglobal "3,..." loads only the first 3 args - confirm
+    psllq   m0, 56 ; park the low byte of left in lane 7 so the loop's first broadcast picks it up
+    ADD_HFYU_LEFT_LOOP 1, 1
+
+INIT_XMM sse4
+cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
+    mova    m5, [pb_f] ; carry broadcast mask (lane 15 is the top lane of a 16-byte register)
+    mova    m6, [pb_zzzzzzzz77777777] ; 16-byte load: the label defines 8 bytes, the other 8 are pb_7, which follows it in rodata
+    mova    m4, [pb_zzzz3333zzzzbbbb]
+    mova    m3, [pb_zz11zz55zz99zzdd]
+    movd    m0, leftm
+    pslldq  m0, 15 ; park the low byte of left in lane 15 so the loop's first broadcast picks it up
+    test    srcq, 15
+    jnz .src_unaligned ; unaligned src takes the fully unaligned loop; dst alignment is not checked on that path
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP 1, 1
+.dst_unaligned: ; reached only by the jump above: every macro expansion ends in RET
+    ADD_HFYU_LEFT_LOOP 0, 1
+.src_unaligned: ; reached only by the jump above
+    ADD_HFYU_LEFT_LOOP 0, 0
diff --git a/libavcodec/x86/huffyuvdsp.h b/libavcodec/x86/huffyuvdsp.h
new file mode 100644
index 0000000000..269126a6a6
--- /dev/null
+++ b/libavcodec/x86/huffyuvdsp.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HUFFYUVDSP_H
+#define AVCODEC_X86_HUFFYUVDSP_H
+
+#include <stdint.h>
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+
+void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, int w,
+                                  int *left, int *left_top);
+
+#endif /* AVCODEC_X86_HUFFYUVDSP_H */
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
new file mode 100644
index 0000000000..5b8497a5a9
--- /dev/null
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvdsp.h"
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+
+void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, int w,
+                                  int *left, int *left_top);
+void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+                                    const uint8_t *diff, int w,
+                                    int *left, int *left_top);
+
+int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
+                                 int w, int left);
+int  ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src,
+                                int w, int left);
+
+av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c) /* pick x86 implementations for c's function pointers from the CPU flags */
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_7REGS && HAVE_INLINE_ASM
+    if (cpu_flags & AV_CPU_FLAG_CMOV) /* first choice; replaced below when MMXEXT is usable and the 3DNow! flag is clear */
+        c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
+#endif
+
+    if (INLINE_MMX(cpu_flags))
+        c->add_bytes = ff_add_bytes_mmx;
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        /* slower than cmov version on AMD */
+        if (!(cpu_flags & AV_CPU_FLAG_3DNOW)) /* presumably the 3DNow! flag stands in for "AMD" here */
+            c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
+        if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
+            c->add_hfyu_left_pred = ff_add_hfyu_left_pred_sse4;
+    }
+}
diff --git a/libavcodec/x86/dsputil_x86.c b/libavcodec/x86/huffyuvdsp_mmx.c
index f43b9d782d..59422107d3 100644
--- a/libavcodec/x86/dsputil_x86.c
+++ b/libavcodec/x86/huffyuvdsp_mmx.c
@@ -20,14 +20,14 @@
 
 #include "config.h"
 #include "libavutil/x86/asm.h"
-#include "dsputil_x86.h"
+#include "huffyuvdsp.h"
 
 #if HAVE_INLINE_ASM
 
 #if HAVE_7REGS
-void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
-                                        const uint8_t *diff, int w,
-                                        int *left, int *left_top)
+void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, int w,
+                                  int *left, int *left_top)
 {
     x86_reg w2 = -w;
     x86_reg x;
@@ -62,4 +62,30 @@ void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
 }
 #endif
 
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) /* dst[i] += src[i] (wrapping per byte) for 0 <= i < w */
+{
+    x86_reg i = 0;
+
+    __asm__ volatile (
+        "jmp          2f                \n\t" /* test the bound before the first pass */
+        "1:                             \n\t"
+        "movq   (%1, %0), %%mm0         \n\t" /* src[i..i+7] */
+        "movq   (%2, %0), %%mm1         \n\t" /* dst[i..i+7] */
+        "paddb     %%mm0, %%mm1         \n\t"
+        "movq      %%mm1, (%2, %0)      \n\t"
+        "movq  8(%1, %0), %%mm0         \n\t" /* same again for bytes i+8..i+15 */
+        "movq  8(%2, %0), %%mm1         \n\t"
+        "paddb     %%mm0, %%mm1         \n\t"
+        "movq      %%mm1, 8(%2, %0)     \n\t"
+        "add         $16, %0            \n\t"
+        "2:                             \n\t"
+        "cmp          %3, %0            \n\t"
+        "js           1b                \n\t" /* loop while i - (w - 15) is negative, i.e. 16 more bytes fit */
+        : "+r" (i)
+        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15) : "memory"); /* the asm loads src[] and stores dst[] through plain pointer inputs */
+
+    for (; i < w; i++) /* scalar tail for whatever the 16-byte loop left over */
+        dst[i + 0] += src[i + 0];
+}
+
 #endif /* HAVE_INLINE_ASM */
author	Michael Niedermayer <michaelni@gmx.at>	2014-05-28 01:07:36 +0400
committer	Michael Niedermayer <michaelni@gmx.at>	2014-05-28 01:16:06 +0400
commit	e2abc0d5cacc22aa900de8ac26160ea1b786a7b5 (patch)
tree	536e4539c573752b2b446d2c8ef17d9d62927815 /libavcodec/x86
parent	43c57dbe14545d13dbfd8aae341b45514e8bcfbb (diff)
parent	0d439fbede03854eac8a978cccf21a3425a3c82d (diff)