github.com/FFmpeg/FFmpeg.git
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/Makefile | 23
-rw-r--r--  libavcodec/x86/ac3dsp.asm | 8
-rw-r--r--  libavcodec/x86/ac3dsp_init.c | 13
-rw-r--r--  libavcodec/x86/cabac.h | 48
-rw-r--r--  libavcodec/x86/cavsdsp.c | 8
-rw-r--r--  libavcodec/x86/constants.c | 8
-rw-r--r--  libavcodec/x86/constants.h | 10
-rw-r--r--  libavcodec/x86/dca.h | 8
-rw-r--r--  libavcodec/x86/dcadsp.asm | 119
-rw-r--r--  libavcodec/x86/dcadsp_init.c | 61
-rw-r--r--  libavcodec/x86/dct32.asm | 10
-rw-r--r--  libavcodec/x86/dct_init.c | 8
-rw-r--r--  libavcodec/x86/deinterlace.asm | 8
-rw-r--r--  libavcodec/x86/dirac_dwt.c | 202
-rw-r--r--  libavcodec/x86/dirac_dwt.h | 30
-rw-r--r--  libavcodec/x86/diracdsp_mmx.c | 104
-rw-r--r--  libavcodec/x86/diracdsp_mmx.h | 47
-rw-r--r--  libavcodec/x86/diracdsp_yasm.asm | 264
-rw-r--r--  libavcodec/x86/dnxhdenc.c | 8
-rw-r--r--  libavcodec/x86/dsputil.asm | 15
-rw-r--r--  libavcodec/x86/dsputil_init.c | 66
-rw-r--r--  libavcodec/x86/dsputil_mmx.c | 165
-rw-r--r--  libavcodec/x86/dsputil_qns_template.c | 10
-rw-r--r--  libavcodec/x86/dsputil_x86.c | 8
-rw-r--r--  libavcodec/x86/dsputil_x86.h | 17
-rw-r--r--  libavcodec/x86/dsputilenc.asm | 8
-rw-r--r--  libavcodec/x86/dsputilenc_mmx.c | 31
-rw-r--r--  libavcodec/x86/dwt_yasm.asm | 306
-rw-r--r--  libavcodec/x86/fdct.c | 12
-rw-r--r--  libavcodec/x86/fft.asm | 23
-rw-r--r--  libavcodec/x86/fft.h | 8
-rw-r--r--  libavcodec/x86/fft_init.c | 8
-rw-r--r--  libavcodec/x86/flacdsp.asm | 74
-rw-r--r--  libavcodec/x86/flacdsp_init.c | 45
-rw-r--r--  libavcodec/x86/fmtconvert.asm | 8
-rw-r--r--  libavcodec/x86/fmtconvert_init.c | 8
-rw-r--r--  libavcodec/x86/fpel.asm | 8
-rw-r--r--  libavcodec/x86/fpel_mmx.c | 8
-rw-r--r--  libavcodec/x86/h263_loopfilter.asm | 10
-rw-r--r--  libavcodec/x86/h263dsp_init.c | 8
-rw-r--r--  libavcodec/x86/h264_chromamc.asm | 8
-rw-r--r--  libavcodec/x86/h264_chromamc_10bit.asm | 12
-rw-r--r--  libavcodec/x86/h264_deblock.asm | 20
-rw-r--r--  libavcodec/x86/h264_deblock_10bit.asm | 16
-rw-r--r--  libavcodec/x86/h264_i386.h | 8
-rw-r--r--  libavcodec/x86/h264_idct.asm | 8
-rw-r--r--  libavcodec/x86/h264_idct_10bit.asm | 24
-rw-r--r--  libavcodec/x86/h264_intrapred.asm | 13
-rw-r--r--  libavcodec/x86/h264_intrapred_10bit.asm | 34
-rw-r--r--  libavcodec/x86/h264_intrapred_init.c | 8
-rw-r--r--  libavcodec/x86/h264_qpel.c | 16
-rw-r--r--  libavcodec/x86/h264_qpel_10bit.asm | 8
-rw-r--r--  libavcodec/x86/h264_qpel_8bit.asm | 8
-rw-r--r--  libavcodec/x86/h264_weight.asm | 15
-rw-r--r--  libavcodec/x86/h264_weight_10bit.asm | 8
-rw-r--r--  libavcodec/x86/h264chroma_init.c | 8
-rw-r--r--  libavcodec/x86/h264dsp_init.c | 10
-rw-r--r--  libavcodec/x86/hpeldsp.asm | 15
-rw-r--r--  libavcodec/x86/hpeldsp_init.c | 8
-rw-r--r--  libavcodec/x86/hpeldsp_mmx.c | 8
-rw-r--r--  libavcodec/x86/hpeldsp_rnd_template.c | 8
-rw-r--r--  libavcodec/x86/idct_mmx_xvid.c | 8
-rw-r--r--  libavcodec/x86/idct_sse2_xvid.c | 25
-rw-r--r--  libavcodec/x86/idct_xvid.h | 8
-rw-r--r--  libavcodec/x86/imdct36.asm | 35
-rw-r--r--  libavcodec/x86/lossless_videodsp.asm | 292
-rw-r--r--  libavcodec/x86/lossless_videodsp_init.c | 62
-rw-r--r--  libavcodec/x86/lpc.c | 11
-rw-r--r--  libavcodec/x86/mathops.h | 8
-rw-r--r--  libavcodec/x86/mlpdsp.c | 9
-rw-r--r--  libavcodec/x86/motion_est.c | 26
-rw-r--r--  libavcodec/x86/mpeg4qpel.asm | 10
-rw-r--r--  libavcodec/x86/mpegaudiodsp.c | 33
-rw-r--r--  libavcodec/x86/mpegvideo.c | 28
-rw-r--r--  libavcodec/x86/mpegvideoenc.c | 10
-rw-r--r--  libavcodec/x86/mpegvideoenc_template.c | 69
-rw-r--r--  libavcodec/x86/pngdsp.asm | 8
-rw-r--r--  libavcodec/x86/pngdsp_init.c | 8
-rw-r--r--  libavcodec/x86/proresdsp.asm | 132
-rw-r--r--  libavcodec/x86/proresdsp_init.c | 10
-rw-r--r--  libavcodec/x86/qpel.asm | 8
-rw-r--r--  libavcodec/x86/rnd_mmx.c | 8
-rw-r--r--  libavcodec/x86/rnd_template.c | 8
-rw-r--r--  libavcodec/x86/rv34dsp.asm | 8
-rw-r--r--  libavcodec/x86/rv34dsp_init.c | 8
-rw-r--r--  libavcodec/x86/rv40dsp.asm | 8
-rw-r--r--  libavcodec/x86/rv40dsp_init.c | 8
-rw-r--r--  libavcodec/x86/sbrdsp.asm | 192
-rw-r--r--  libavcodec/x86/sbrdsp_init.c | 27
-rw-r--r--  libavcodec/x86/simple_idct.c | 11
-rw-r--r--  libavcodec/x86/snowdsp.c | 902
-rw-r--r--  libavcodec/x86/ttadsp.asm | 119
-rw-r--r--  libavcodec/x86/ttadsp_init.c | 42
-rw-r--r--  libavcodec/x86/v210-init.c | 48
-rw-r--r--  libavcodec/x86/v210.asm | 88
-rw-r--r--  libavcodec/x86/vc1dsp.asm | 8
-rw-r--r--  libavcodec/x86/vc1dsp.h | 8
-rw-r--r--  libavcodec/x86/vc1dsp_mmx.c | 1
-rw-r--r--  libavcodec/x86/videodsp.asm | 60
-rw-r--r--  libavcodec/x86/videodsp_init.c | 45
-rw-r--r--  libavcodec/x86/vorbisdsp.asm | 8
-rw-r--r--  libavcodec/x86/vorbisdsp_init.c | 8
-rw-r--r--  libavcodec/x86/vp3dsp.asm | 8
-rw-r--r--  libavcodec/x86/vp3dsp_init.c | 69
-rw-r--r--  libavcodec/x86/vp56_arith.h | 8
-rw-r--r--  libavcodec/x86/vp6dsp.asm | 8
-rw-r--r--  libavcodec/x86/vp6dsp_init.c | 8
-rw-r--r--  libavcodec/x86/vp8dsp.asm | 8
-rw-r--r--  libavcodec/x86/vp8dsp_init.c | 30
-rw-r--r--  libavcodec/x86/vp8dsp_loopfilter.asm | 8
-rw-r--r--  libavcodec/x86/vp9dsp_init.c | 389
-rw-r--r--  libavcodec/x86/vp9intrapred.asm | 1405
-rw-r--r--  libavcodec/x86/vp9itxfm.asm | 1669
-rw-r--r--  libavcodec/x86/vp9lpf.asm | 823
-rw-r--r--  libavcodec/x86/vp9mc.asm (renamed from libavcodec/x86/vp9dsp.asm) | 155
-rw-r--r--  libavcodec/x86/w64xmmtest.c | 15
116 files changed, 8235 insertions, 879 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index f985525061..0d3594f7c4 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -13,12 +13,15 @@ OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
x86/fdct.o \
x86/motion_est.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
+OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp_init.o
+OBJS-$(CONFIG_FLAC_ENCODER) += x86/flacdsp_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
@@ -26,9 +29,12 @@ OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
x86/rv40dsp_init.o
+OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
+OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
@@ -37,6 +43,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
+OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
@@ -45,9 +52,12 @@ MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
x86/idct_sse2_xvid.o \
x86/rnd_mmx.o \
x86/simple_idct.o
+MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
x86/hpeldsp_mmx.o \
x86/rnd_mmx.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
YASM-OBJS += x86/deinterlace.o \
@@ -57,12 +67,15 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
+ x86/dwt_yasm.o
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
x86/fpel.o \
x86/mpeg4qpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
@@ -80,12 +93,16 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
@@ -93,4 +110,8 @@ YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o \
x86/vp8dsp_loopfilter.o
-YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o
+YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
+ x86/vp9itxfm.o \
+ x86/vp9lpf.o \
+ x86/vp9mc.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index d0ab3e7359..5cc8392fe6 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index a82c249400..6936ace87b 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
* x86-optimized AC-3 DSP functions
* Copyright (c) 2011 Justin Ruggles
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -65,6 +65,11 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
+#if ARCH_X86_32 && defined(__INTEL_COMPILER)
+# undef HAVE_7REGS
+# define HAVE_7REGS 0
+#endif
+
#if HAVE_SSE_INLINE && HAVE_7REGS
#define IF1(x) x
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index d1701bf071..3a82c1e4a4 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,8 +27,27 @@
#include "libavutil/x86/asm.h"
#include "config.h"
+#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+ || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
+# define BROKEN_COMPILER 1
+#else
+# define BROKEN_COMPILER 0
+#endif
+
#if HAVE_INLINE_ASM
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+ "cmp "end" , %%"REG_c" \n\t"\
+ "jge 1f \n\t"
+#endif
+
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
@@ -73,8 +92,7 @@
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
- "cmp "end" , %%"REG_c" \n\t"\
- "jge 1f \n\t"\
+ END_CHECK(end)\
"add"OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
@@ -93,6 +111,7 @@
#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG
+#define RIP_ARG
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +153,7 @@
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
- "cmp "end" , %%"REG_c" \n\t"\
- "jge 1f \n\t"\
+ END_CHECK(end)\
"add"OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
@@ -154,8 +172,7 @@
#endif /* BROKEN_RELOCATIONS */
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
@@ -178,17 +195,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
- : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+ : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
+ ,"1"(c->low), "2"(c->range)
: "%"REG_c, "memory"
);
return bit & 1;
}
#endif /* HAVE_7REGS */
+#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
@@ -211,10 +230,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "add $2, %1 \n\t"
+ "addl %%edx, %%eax \n\t"
+ "mov %1, %c4(%2) \n\t"
+#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"OPSIZE" $2, %c4(%2) \n\t"
+#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
@@ -268,6 +293,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
);
return res;
}
+#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index bc9cbf7411..aaa09d1784 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 5b8d1b224f..3bba80bd87 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index f38fbe3425..4bf74ff76c 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -31,6 +31,7 @@ extern const xmm_reg ff_pw_3;
extern const xmm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
+extern const xmm_reg ff_pw_9;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
@@ -45,6 +46,7 @@ extern const uint64_t ff_pw_255;
extern const xmm_reg ff_pb_1;
extern const xmm_reg ff_pb_3;
+extern const xmm_reg ff_pb_80;
extern const xmm_reg ff_pb_F8;
extern const uint64_t ff_pb_FC;
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
index 11d45ae61c..c9be50d065 100644
--- a/libavcodec/x86/dca.h
+++ b/libavcodec/x86/dca.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 56039baa6f..4746784795 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -2,20 +2,20 @@
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -199,66 +199,85 @@ INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
-INIT_XMM sse2
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+ pxor %1, %1
+%else
+ xorps %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+ mova %3, [%2 - 16]
+ vperm2f128 %1, %3, %3, 1
+ vshufps %1, %1, %1, q0123
+%elif cpuflag(sse2)
+ pshufd %1, [%2], q0123
+%else
+ mova %1, [%2]
+ shufps %1, %1, q0123
+%endif
+%endmacro
+
%macro INNER_LOOP 1
; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
;~ a += window[i + j] * (-synth_buf[15 - i + j])
;~ b += window[i + j + 16] * (synth_buf[i + j])
- pshufd m5, [ptr2 + j + (15 - 3) * 4], q0123
+ SHUF m5, ptr2 + j + (15 - 3) * 4, m6
mova m6, [ptr1 + j]
%if ARCH_X86_64
- pshufd m11, [ptr2 + j + (15 - 3) * 4 - mmsize], q0123
+ SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
mova m12, [ptr1 + j + mmsize]
%endif
- mulps m6, [win + %1 + j + 16 * 4]
- mulps m5, [win + %1 + j]
+ FMULADD_PS m2, m6, [win + %1 + j + 16 * 4], m2, m6
+ mulps m5, m5, [win + %1 + j]
+ subps m1, m1, m5
%if ARCH_X86_64
- mulps m12, [win + %1 + j + mmsize + 16 * 4]
- mulps m11, [win + %1 + j + mmsize]
-%endif
- addps m2, m6
- subps m1, m5
-%if ARCH_X86_64
- addps m8, m12
- subps m7, m11
+ FMULADD_PS m8, m12, [win + %1 + j + mmsize + 16 * 4], m8, m12
+ mulps m11, m11, [win + %1 + j + mmsize]
+ subps m7, m7, m11
%endif
;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
- pshufd m6, [ptr2 + j + (31 - 3) * 4], q0123
+ SHUF m6, ptr2 + j + (31 - 3) * 4, m5
mova m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
- pshufd m12, [ptr2 + j + (31 - 3) * 4 - mmsize], q0123
+ SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
mova m11, [ptr1 + j + mmsize + 16 * 4]
%endif
- mulps m5, [win + %1 + j + 32 * 4]
- mulps m6, [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
- mulps m11, [win + %1 + j + mmsize + 32 * 4]
- mulps m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
- addps m3, m5
- addps m4, m6
+ FMULADD_PS m3, m5, [win + %1 + j + 32 * 4], m3, m5
+ FMULADD_PS m4, m6, [win + %1 + j + 48 * 4], m4, m6
%if ARCH_X86_64
- addps m9, m11
- addps m10, m12
+ FMULADD_PS m9, m11, [win + %1 + j + mmsize + 32 * 4], m9, m11
+ FMULADD_PS m10, m12, [win + %1 + j + mmsize + 48 * 4], m10, m12
%endif
sub j, 64 * 4
%endmacro
-; void ff_synth_filter_inner_sse2(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+; const float window[512], float out[32],
+; intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
- movd scale, scalem
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd m0, scalem
+ SPLATD m0
+%else
+ VBROADCASTSS m0, scalem
+%endif
; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
+ SPLATD xmm0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xmm0, 1
+%endif
%define OFFQ offq
%endif
- pshufd m0, m0, 0
; prepare inner counter limit 1
mov r5q, 480
sub r5q, offmp
@@ -274,8 +293,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%endif
.mainloop
; m1 = a m2 = b m3 = c m4 = d
- pxor m3, m3
- pxor m4, m4
+ SETZERO m3
+ SETZERO m4
mova m1, [buf2 + i]
mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
@@ -292,8 +311,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%define ptr2 r7q ; must be loaded
%define win r8q
%define j r9q
- pxor m9, m9
- pxor m10, m10
+ SETZERO m9
+ SETZERO m10
mova m7, [buf2 + i + mmsize]
mova m8, [buf2 + i + mmsize + 16 * 4]
lea win, [windowq + i]
@@ -325,11 +344,11 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%endif
;~ out[i] = a * scale;
;~ out[i + 16] = b * scale;
- mulps m1, scale
- mulps m2, scale
+ mulps m1, m1, scale
+ mulps m2, m2, scale
%if ARCH_X86_64
- mulps m7, scale
- mulps m8, scale
+ mulps m7, m7, scale
+ mulps m8, m8, scale
%endif
;~ synth_buf2[i] = c;
;~ synth_buf2[i + 16] = d;
@@ -350,3 +369,19 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
sub i, (ARCH_X86_64 + 1) * mmsize
jge .mainloop
RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+SYNTH_FILTER
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+SYNTH_FILTER
+%endif
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 65e3db5c6c..48880d628a 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -56,25 +56,33 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
}
}
-void ff_synth_filter_inner_sse2(float *synth_buf_ptr, float synth_buf2[32],
- const float window[512],
- float out[32], intptr_t offset, float scale);
-#if HAVE_YASM
-static void synth_filter_sse2(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32], float scale)
-{
- float *synth_buf= synth_buf_ptr + *synth_buf_offset;
-
- imdct->imdct_half(imdct, synth_buf, in);
-
- ff_synth_filter_inner_sse2(synth_buf, synth_buf2, window,
- out, *synth_buf_offset, scale);
+#define SYNTH_FILTER_FUNC(opt) \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
+ const float window[512], \
+ float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct, \
+ float *synth_buf_ptr, int *synth_buf_offset, \
+ float synth_buf2[32], const float window[512], \
+ float out[32], const float in[32], float scale) \
+{ \
+ float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
+ \
+ imdct->imdct_half(imdct, synth_buf, in); \
+ \
+ ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
+ out, *synth_buf_offset, scale); \
+ \
+ *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
+} \
- *synth_buf_offset = (*synth_buf_offset - 32) & 511;
-}
+#if HAVE_YASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
#endif /* HAVE_YASM */
av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
@@ -82,8 +90,19 @@ av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
+ if (EXTERNAL_SSE(cpu_flags)) {
+ s->synth_filter_float = synth_filter_sse;
+ }
+#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->synth_filter_float = synth_filter_sse2;
}
+ if (EXTERNAL_AVX(cpu_flags)) {
+ s->synth_filter_float = synth_filter_avx;
+ }
+ if (EXTERNAL_FMA3(cpu_flags)) {
+ s->synth_filter_float = synth_filter_fma3;
+ }
#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 9c147b9c00..6fd5ba350d 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
INIT_YMM avx
SECTION_TEXT
+%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
; pass 1
@@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
INIT_XMM
PASS6_AND_PERMUTE
RET
+%endif
%if ARCH_X86_64
%define SPILL SWAP
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index 7bda5e81b6..85e2d0c3e6 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm
index 70d000e0db..baa9249db2 100644
--- a/libavcodec/x86/deinterlace.asm
+++ b/libavcodec/x86/deinterlace.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Vitor Sessak
;* Copyright (c) 2002 Michael Niedermayer
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/dirac_dwt.c b/libavcodec/x86/dirac_dwt.c
new file mode 100644
index 0000000000..04c514f4fd
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.c
@@ -0,0 +1,202 @@
+/*
+ * MMX optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "dsputil_x86.h"
+#include "dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+ ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+ IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+ IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) \
+ b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+ ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
+{ \
+ int i, width_align = width&~(align-1); \
+\
+ for(i=width_align; i<width; i++) { \
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+ } \
+\
+ ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = tmp[x];\
+ b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+ }\
+}\
+static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+ int w2= w>>1;\
+ int x= w2 - (w2&(align-1));\
+ ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+ for (; x < w2; x++) {\
+ b[2*x ] = (tmp[x] + 1)>>1;\
+ b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+ }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+ int w2= w>>1;
+ int x= w2 - (w2&7);
+ ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+ for (; x < w2; x++) {
+ b[2*x ] = (tmp[x] + 1)>>1;
+ b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+ }
+}
+#endif
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+ int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+ if (!(mm_flags & AV_CPU_FLAG_MMX))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar0i_mmx;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_mmx;
+ d->horizontal_compose = horizontal_compose_haar1i_mmx;
+ break;
+ }
+#endif
+
+ if (!(mm_flags & AV_CPU_FLAG_SSE2))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_LEGALL5_3:
+ d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+ break;
+ case DWT_DIRAC_DD13_7:
+ d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+ d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+ break;
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar0i_sse2;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ d->horizontal_compose = horizontal_compose_haar1i_sse2;
+ break;
+ }
+
+ if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+ return;
+
+ switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+ break;
+ }
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/dirac_dwt.h b/libavcodec/x86/dirac_dwt.h
new file mode 100644
index 0000000000..126b29029f
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRAC_DWT_H
+#define AVCODEC_X86_DIRAC_DWT_H
+
+#include "libavcodec/dirac_dwt.h"
+
+void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c
new file mode 100644
index 0000000000..a28bb82060
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_x86.h"
+#include "diracdsp_mmx.h"
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+
+#define HPEL_FILTER(MMSIZE, EXT) \
+ void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
+ void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
+ \
+ static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
+ const uint8_t *src, int stride, int width, int height) \
+ { \
+ while( height-- ) \
+ { \
+ ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+ ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
+ ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
+ \
+ dsth += stride; \
+ dstv += stride; \
+ dstc += stride; \
+ src += stride; \
+ } \
+ }
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+#define PIXFUNC(PFX, IDX, EXT) \
+ /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
+ c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+ c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c)
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (!(mm_flags & AV_CPU_FLAG_MMX))
+ return;
+
+#if HAVE_YASM
+ c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+ c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+ c->add_rect_clamped = ff_add_rect_clamped_mmx;
+ c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
+#endif
+#endif
+
+#if HAVE_MMX_INLINE
+ PIXFUNC(put, 0, mmx);
+ PIXFUNC(avg, 0, mmx);
+#endif
+
+#if HAVE_MMXEXT_INLINE
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
+ PIXFUNC(avg, 0, mmxext);
+ }
+#endif
+
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+#if HAVE_YASM
+ c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+ c->add_rect_clamped = ff_add_rect_clamped_sse2;
+ c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
+
+ c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+ c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+#endif
+#if HAVE_SSE2_INLINE
+ c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+ c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+ c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+ c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+#endif
+ }
+}
diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h
new file mode 100644
index 0000000000..89858544f3
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRACDSP_H
+#define AVCODEC_X86_DIRACDSP_H
+
+#include "libavcodec/diracdsp.h"
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c);
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm
new file mode 100644
index 0000000000..3e9765b42d
--- /dev/null
+++ b/libavcodec/x86/diracdsp_yasm.asm
@@ -0,0 +1,264 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_3: times 8 dw 3
+pw_7: times 8 dw 7
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+pb_128: times 16 db 128
+
+section .text
+
+%macro UNPACK_ADD 6
+ mov%5 %1, %3
+ mov%6 m5, %4
+ mova m4, %1
+ mova %2, m5
+ punpcklbw %1, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m7
+ punpckhbw %2, m7
+ paddw %1, m5
+ paddw %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+ mov src0q, srcq
+ lea stridex3q, [3*strideq]
+ sub src0q, stridex3q
+ pxor m7, m7
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, mmsize
+ add srcq, mmsize
+ add src0q, mmsize
+ sub widthd, mmsize
+ jg .loop
+ RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+ dec widthd
+ pxor m7, m7
+ and widthd, ~(mmsize-1)
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq + widthq], m0
+ sub widthd, mmsize
+ jge .loop
+ RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+ mova m0, [pb_128]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd dst_strideq, dst_strided
+ movsxd src_strideq, src_strided
+ mov r7d, r5m
+ mov r8d, wd
+ %define wspill r8d
+ %define hd r7d
+%else
+ mov r4m, wd
+ %define wspill r4m
+ %define hd r5mp
+%endif
+
+.loopy
+ lea src2q, [srcq+src_strideq*2]
+ lea dst2q, [dstq+dst_strideq]
+.loopx:
+ sub wd, mmsize
+ mova m1, [srcq +2*wq]
+ mova m2, [src2q+2*wq]
+ packsswb m1, [srcq +2*wq+mmsize]
+ packsswb m2, [src2q+2*wq+mmsize]
+ paddb m1, m0
+ paddb m2, m0
+ mova [dstq +wq], m1
+ mova [dst2q+wq], m2
+ jg .loopx
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*2]
+ sub hd, 2
+ mov wd, wspill
+ jg .loopy
+ RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+ mova m0, [pw_32]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd strideq, strided
+ movsxd idwt_strideq, idwt_strided
+ mov r8d, wd
+ %define wspill r8d
+%else
+ mov r5m, wd
+ %define wspill r5m
+%endif
+
+.loop:
+ sub wd, mmsize
+ movu m1, [srcq +2*wq] ; FIXME: ensure alignment
+ paddw m1, m0
+ psraw m1, 6
+ movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+ paddw m2, m0
+ psraw m2, 6
+ paddw m1, [idwtq+2*wq]
+ paddw m2, [idwtq+2*wq+mmsize]
+ packuswb m1, m2
+ mova [dstq +wq], m1
+ jg .loop
+
+ lea srcq, [srcq + 2*strideq]
+ add dstq, strideq
+ lea idwtq, [idwtq+ 2*idwt_strideq]
+ sub hd, 1
+ mov wd, wspill
+ jg .loop
+ RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+ pxor m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+ mova m0, [srcq+i]
+ mova m1, m0
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ mova m2, [obmcq+i]
+ mova m3, m2
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+ pmullw m0, m2
+ pmullw m1, m3
+ movu m2, [dstq+2*i]
+ movu m3, [dstq+2*i+mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ movu [dstq+2*i], m0
+ movu [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+ lea srcq, [srcq+strideq]
+ lea dstq, [dstq+2*strideq]
+ add obmcq, 32
+ sub yblend, 1
+ jg .loop
+ RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
diff --git a/libavcodec/x86/dnxhdenc.c b/libavcodec/x86/dnxhdenc.c
index 0bab69f67d..c7e776a4c1 100644
--- a/libavcodec/x86/dnxhdenc.c
+++ b/libavcodec/x86/dnxhdenc.c
@@ -4,20 +4,20 @@
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 414d2124db..b7893875d0 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -1,21 +1,23 @@
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -59,6 +61,9 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
%endif
paddd m2, m0
movd eax, m2
+%if mmsize == 8
+ emms
+%endif
RET
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
@@ -468,6 +473,7 @@ cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
+
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
@@ -610,6 +616,7 @@ cglobal bswap32_buf, 3,4,3
cglobal bswap32_buf, 3,4,5
mov r3, r1
%endif
+ or r3, r0
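+    ; take the alignment of both buffers into account before choosing the aligned loop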
and r3, 15
jz .start_align
BSWAP_LOOPS u
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 00f89b2525..4e518ad56d 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -1,18 +1,21 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -531,24 +534,11 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
c->draw_edges = ff_draw_edges_mmx;
-
- switch (avctx->idct_algo) {
- case FF_IDCT_AUTO:
- case FF_IDCT_SIMPLEMMX:
- c->idct_put = ff_simple_idct_put_mmx;
- c->idct_add = ff_simple_idct_add_mmx;
- c->idct = ff_simple_idct_mmx;
- c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
- break;
- case FF_IDCT_XVIDMMX:
- c->idct_put = ff_idct_xvid_mmx_put;
- c->idct_add = ff_idct_xvid_mmx_add;
- c->idct = ff_idct_xvid_mmx;
- break;
- }
}
+#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
c->gmc = ff_gmc_mmx;
+#endif
c->add_bytes = ff_add_bytes_mmx;
#endif /* HAVE_MMX_INLINE */
@@ -564,7 +554,7 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
#if HAVE_MMXEXT_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
- if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+ if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
c->idct_put = ff_idct_xvid_mmxext_put;
c->idct_add = ff_idct_xvid_mmxext_add;
c->idct = ff_idct_xvid_mmxext;
@@ -597,19 +587,21 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
c->vector_clipf = ff_vector_clipf_sse;
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
- if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
+ if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
if (!high_bit_depth) {
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
#endif /* HAVE_SSE_INLINE */
+
+#if HAVE_YASM
+#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
+ c->gmc = ff_gmc_sse;
+#endif
+#endif /* HAVE_YASM */
}
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
@@ -618,7 +610,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
#if HAVE_SSE2_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
- if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+ if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
c->idct_put = ff_idct_xvid_sse2_put;
c->idct_add = ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
@@ -665,12 +657,30 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
int cpu_flags = av_get_cpu_flags();
#if HAVE_7REGS && HAVE_INLINE_ASM
- if (cpu_flags & AV_CPU_FLAG_CMOV)
+ if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV)
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
#endif
- if (X86_MMX(cpu_flags))
+ if (X86_MMX(cpu_flags)) {
+#if HAVE_INLINE_ASM
+ const int idct_algo = avctx->idct_algo;
+
+ if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
+ if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
+ c->idct_put = ff_simple_idct_put_mmx;
+ c->idct_add = ff_simple_idct_add_mmx;
+ c->idct = ff_simple_idct_mmx;
+ c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+ } else if (idct_algo == FF_IDCT_XVIDMMX) {
+ c->idct_put = ff_idct_xvid_mmx_put;
+ c->idct_add = ff_idct_xvid_mmx_add;
+ c->idct = ff_idct_xvid_mmx;
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+
dsputil_init_mmx(c, avctx, cpu_flags);
+ }
if (X86_MMXEXT(cpu_flags))
dsputil_init_mmxext(c, avctx, cpu_flags);
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index d6136f65fe..47835acf2b 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -3,30 +3,33 @@
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "config.h"
+#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
+#include "libavcodec/videodsp.h"
#include "constants.h"
#include "dsputil_x86.h"
+#include "diracdsp_mmx.h"
#if HAVE_INLINE_ASM
@@ -277,7 +280,7 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
- } else {
+ } else if (w == 16) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
@@ -295,6 +298,25 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
+ : "+r"(ptr)
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+ );
+ } else {
+ av_assert1(w == 4);
+ __asm__ volatile (
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "movd %%mm0, -4(%0) \n\t"
+ "movd -4(%0, %2), %%mm1 \n\t"
+ "punpcklbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ "jb 1b \n\t"
: "+r" (ptr)
: "r" ((x86_reg) wrap), "r" ((x86_reg) width),
"r" (ptr + wrap * height));
@@ -342,10 +364,17 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
}
}
-void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
- int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy,
- int shift, int r, int width, int height)
+typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride,
+ ptrdiff_t src_linesize,
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h);
+
+static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height,
+ emulated_edge_mc_func *emu_edge_fn)
{
const int w = 8;
const int ix = ox >> (16 + shift);
@@ -360,20 +389,24 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
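+    /* Scratch buffer filled by the edge-emulation call below when the block
+     * reads outside the picture; blocks with h > MAX_H or stride > MAX_STRIDE
+     * fall back to ff_gmc_c() instead. */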
+ uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
+ int need_emu = (unsigned) ix >= width - w ||
+ (unsigned) iy >= height - h;
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
// uses more than 16 bits of subpel mv (only at huge resolution)
(dxx | dxy | dyx | dyy) & 15 ||
- (unsigned) ix >= width - w ||
- (unsigned) iy >= height - h) {
+ (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
@@ -381,6 +414,10 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
}
src += ix + iy * stride;
+ if (need_emu) {
+ emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+ src = edge_buf;
+ }
__asm__ volatile (
"movd %0, %%mm6 \n\t"
@@ -456,6 +493,108 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
}
}
+#if CONFIG_VIDEODSP
+#if HAVE_YASM
+#if ARCH_X86_32
+void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+ width, height, &ff_emulated_edge_mc_8);
+}
+#endif
+void ff_gmc_sse(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+ width, height, &ff_emulated_edge_mc_8);
+}
+#else
+void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+ width, height, &ff_emulated_edge_mc_8);
+}
+#endif
+#endif
+
+#if CONFIG_DIRAC_DECODER
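+/* The Dirac put/avg wrappers below only use the SIMD pixel functions when
+ * the height is a multiple of 4; other heights fall back to the C versions. */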
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3)\
+ ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+ else\
+ OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+ if (h&3) {\
+ ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+ } else {\
+ OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
+ OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+ }\
+}
+
+#if HAVE_MMX_INLINE
+PIXELS16(static, ff_avg, , , _mmxext)
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+#endif
+
+#if HAVE_YASM
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_put_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3)
+ ff_avg_dirac_pixels16_c(dst, src, stride, h);
+ else
+ ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_put_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_put_pixels16_sse2(dst , src[0] , stride, h);
+ ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+ if (h&3) {
+ ff_avg_dirac_pixels32_c(dst, src, stride, h);
+ } else {
+ ff_avg_pixels16_sse2(dst , src[0] , stride, h);
+ ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+ }
+}
+#endif
+#endif
+
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len)
{
diff --git a/libavcodec/x86/dsputil_qns_template.c b/libavcodec/x86/dsputil_qns_template.c
index 20a40a175e..bde6b0a606 100644
--- a/libavcodec/x86/dsputil_qns_template.c
+++ b/libavcodec/x86/dsputil_qns_template.c
@@ -5,20 +5,20 @@
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,7 +28,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
{
x86_reg i=0;
- assert(FFABS(scale) < MAX_ABS);
+ av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
diff --git a/libavcodec/x86/dsputil_x86.c b/libavcodec/x86/dsputil_x86.c
index 144339be64..f43b9d782d 100644
--- a/libavcodec/x86/dsputil_x86.c
+++ b/libavcodec/x86/dsputil_x86.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 8f1fc17474..b45d8cffcd 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -2,20 +2,20 @@
* MMX optimized DSP utils
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -133,6 +133,11 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
+void ff_gmc_sse(uint8_t *dst, uint8_t *src,
+ int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy,
+ int shift, int r, int width, int height);
+
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len);
@@ -166,6 +171,10 @@ void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+
+void ff_mmx_idct(int16_t *block);
+void ff_mmxext_idct(int16_t *block);
+
void ff_deinterlace_line_mmx(uint8_t *dst,
const uint8_t *lum_m4, const uint8_t *lum_m3,
const uint8_t *lum_m2, const uint8_t *lum_m1,
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 2790a88036..a6a3aba61a 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 2c320371b0..5038d946a8 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -5,20 +5,20 @@
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -452,8 +452,8 @@ static int vsad_intra16_mmx(void *v, uint8_t *pix, uint8_t *dummy,
{
int tmp;
- assert((((int) pix) & 7) == 0);
- assert((line_size & 7) == 0);
+ av_assert2((((int) pix) & 7) == 0);
+ av_assert2((line_size & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
@@ -516,8 +516,8 @@ static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
{
int tmp;
- assert((((int) pix) & 7) == 0);
- assert((line_size & 7) == 0);
+ av_assert2((((int) pix) & 7) == 0);
+ av_assert2((line_size & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n" \
@@ -559,9 +559,9 @@ static int vsad16_mmx(void *v, uint8_t *pix1, uint8_t *pix2,
{
int tmp;
- assert((((int) pix1) & 7) == 0);
- assert((((int) pix2) & 7) == 0);
- assert((line_size & 7) == 0);
+ av_assert2((((int) pix1) & 7) == 0);
+ av_assert2((((int) pix2) & 7) == 0);
+ av_assert2((line_size & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
@@ -640,9 +640,9 @@ static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
{
int tmp;
- assert((((int) pix1) & 7) == 0);
- assert((((int) pix2) & 7) == 0);
- assert((line_size & 7) == 0);
+ av_assert2((((int) pix1) & 7) == 0);
+ av_assert2((((int) pix2) & 7) == 0);
+ av_assert2((line_size & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n" \
@@ -695,10 +695,11 @@ static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
}
#undef SUM
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
+static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
{
x86_reg i = 0;
+ if (w >= 16)
__asm__ volatile (
"1: \n\t"
"movq (%2, %0), %%mm0 \n\t"
diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm
new file mode 100644
index 0000000000..5253abc6c8
--- /dev/null
+++ b/libavcodec/x86/dwt_yasm.asm
@@ -0,0 +1,306 @@
+;******************************************************************************
+;* MMX optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_1991: times 4 dw 9,-1
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
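+; (used by the 5/3 vertical compose and the dd97i horizontal lowpass loop below)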
+%macro COMPOSE_53iL0 4
+ paddw %2, %3
+ paddw %2, %4
+ psraw %2, 2
+ psubw %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered m3: pw_8 m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+ paddw m0, %3
+ paddw m1, %2
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+%if %0 > 3
+ movu %1, %4
+%endif
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ paddw m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+ mova m2, [pw_2]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b0q+2*widthq]
+ mova m0, [b1q+2*widthq]
+ COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+ mova m1, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ paddw m0, [b2q+2*widthq]
+ paddw m0, m1
+ psraw m0, 1
+ paddw m0, [b1q+2*widthq]
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+ mova [b2q+2*widthq], m1
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+ mova m3, [pw_16]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ mova m5, [b2q+2*widthq]
+ paddw m0, [b4q+2*widthq]
+ paddw m1, [b3q+2*widthq]
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ psubw m5, m1
+ mova [b2q+2*widthq], m5
+ jg .loop
+ REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+ mova m3, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b1q+2*widthq]
+ mova m0, [b0q+2*widthq]
+ mova m2, m1
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [b0q+2*widthq], m0
+ paddw m2, m0
+ mova [b1q+2*widthq], m2
+ jg .loop
+ REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+ mov %3, [tmpq]
+%assign %%i 1
+%rep %1
+ mov [tmpq-2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+ mov %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+ mov [tmpq+2*w2q+2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
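+; The first half of b holds the low-pass band and the second half the
+; high-pass band; the lifted samples are interleaved back into b via tmp.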
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xq, xq
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ mova m3, [pw_1]
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [tmpq + 2*xq], m0
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .lowpass_loop
+
+ xor xq, xq
+ and w2q, ~(mmsize/2 - 1)
+ cmp w2q, mmsize/2
+ jl .end
+
+.highpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [tmpq + 2*xq]
+ paddw m1, m0
+
+ ; shift and interleave
+%if %2 == 1
+ paddw m0, m3
+ paddw m1, m3
+ psraw m0, 1
+ psraw m1, 1
+%endif
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m0
+ mova [bq+4*xq+mmsize], m2
+
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .highpass_loop
+.end:
+ REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xd, xd
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ movu m4, [bq+wq]
+ mova m7, [pw_2]
+ pslldq m4, 14
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ mova m2, m1
+ palignr m1, m4, 14
+ mova m4, m2
+ COMPOSE_53iL0 m0, m1, m2, m7
+ mova [tmpq + 2*xq], m0
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .lowpass_loop
+
+ EDGE_EXTENSION 1, 2, xw
+ ; leave the last up to 7 (sse) or 3 (mmx) values for C
+ xor xd, xd
+ and w2d, ~(mmsize/2 - 1)
+ cmp w2d, mmsize/2
+ jl .end
+
+ mova m7, [tmpq-mmsize]
+ mova m0, [tmpq]
+ mova m5, [pw_1]
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+.highpass_loop:
+ mova m6, m0
+ palignr m0, m7, 14
+ mova m7, [tmpq + 2*xq + 16]
+ mova m1, m7
+ mova m2, m7
+ palignr m1, m6, 2
+ palignr m2, m6, 4
+ COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+ mova m0, m7
+ mova m7, m6
+
+ ; shift and interleave
+ paddw m6, m5
+ paddw m1, m5
+ psraw m6, 1
+ psraw m1, 1
+ mova m2, m6
+ punpcklwd m6, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m6
+ mova [bq+4*xq+mmsize], m2
+
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .highpass_loop
+.end:
+ REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index 6c95439e10..f0cd471d36 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
-static struct
+static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct
29692, -12299, 26722, -31521,
};
-static struct
+static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index e4744a3b60..cae404c1a2 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -36,6 +36,8 @@
%define pointer resd
%endif
+SECTION_RODATA 32
+
struc FFTContext
.nbits: resd 1
.reverse: resd 1
@@ -51,13 +53,10 @@ struc FFTContext
.imdcthalf:pointer 1
endstruc
-SECTION_RODATA
-
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
-align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
@@ -305,6 +304,7 @@ IF%1 mova Z(1), m5
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
mova m0, Z(0)
@@ -394,6 +394,8 @@ fft32_interleave_avx:
jg .deint_loop
ret
+%endif
+
INIT_XMM sse
align 16
@@ -537,6 +539,7 @@ DEFINE_ARGS zc, w, n, o1, o3
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
@@ -558,6 +561,7 @@ cglobal fft_calc, 2,5,8
FFT_DISPATCH _interleave %+ SUFFIX, r1
REP_RET
+%endif
INIT_XMM sse
@@ -776,9 +780,11 @@ align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
+%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
@@ -1080,4 +1086,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW
%endif
INIT_YMM avx
+
+%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index a604956836..398091eb1f 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index 7ca72c54a4..5682230c8e 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000000..37ee87b163
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,74 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
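+; 32-bit FLAC LPC: each output sample gets decoded[i] += prediction >> qlevel,
+; where prediction = sum of coeffs[j] * decoded[i - pred_order + j]; two
+; samples are computed per loop iteration using the PMACSDQL helper macro.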
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+ sub lend, pred_orderd
+ jle .ret
+ lea decodedq, [decodedq+pred_orderq*4-8]
+ lea coeffsq, [coeffsq+pred_orderq*4]
+ neg pred_orderq
+ movd m4, qlevelm
+ALIGN 16
+.loop_sample:
+ movd m0, [decodedq+pred_orderq*4+8]
+ add decodedq, 8
+ movd m1, [coeffsq+pred_orderq*4]
+ pxor m2, m2
+ pxor m3, m3
+ lea jq, [pred_orderq+1]
+ test jq, jq
+ jz .end_order
+.loop_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ movd m0, [decodedq+jq*4]
+ PMACSDQL m3, m1, m0, m3, m1
+ movd m1, [coeffsq+jq*4]
+ inc jq
+ jl .loop_order
+.end_order:
+ PMACSDQL m2, m0, m1, m2, m0
+ psrlq m2, m4
+ movd m0, [decodedq]
+ paddd m0, m2
+ movd [decodedq], m0
+ sub lend, 2
+ jl .ret
+ PMACSDQL m3, m1, m0, m3, m1
+ psrlq m3, m4
+ movd m1, [decodedq+4]
+ paddd m1, m3
+ movd [decodedq+4], m1
+ jg .loop_sample
+.ret:
+ REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000000..a071b3d3b7
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
+ int bps)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ if (bps > 16 && CONFIG_FLAC_DECODER)
+ c->lpc = ff_flac_lpc_32_sse4;
+ }
+ if (EXTERNAL_XOP(cpu_flags)) {
+ if (bps > 16 && CONFIG_FLAC_DECODER)
+ c->lpc = ff_flac_lpc_32_xop;
+ }
+#endif
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 818437672f..0d3f821c9f 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 3d75df92bd..d300dfd864 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b581471296..de6b1d6c1b 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
index 1ae8f86466..384ab89d9c 100644
--- a/libavcodec/x86/fpel_mmx.c
+++ b/libavcodec/x86/fpel_mmx.c
@@ -4,20 +4,20 @@
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index 673f795daa..2fcd1a26e5 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab981bf..ab81063233 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index cc41f00461..107ae51cbc 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index 7b003515cc..c358482092 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -252,8 +252,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
+%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
@@ -261,8 +263,10 @@ CHROMA_MC2 put
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
+%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 8a9fdf6c35..e5cc442943 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -384,8 +384,10 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
INIT_XMM sse2
DEBLOCK_LUMA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
+%endif
%else
@@ -499,8 +501,10 @@ INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
+%endif
%endif ; ARCH
@@ -772,8 +776,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
+%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
@@ -836,7 +842,11 @@ cglobal deblock_h_chroma_8, 5,7
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
- call ff_chroma_inter_body_mmxext
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 7242cb613e..fc4f4d7770 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,20 +7,20 @@
;* Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -418,9 +418,11 @@ cglobal deblock_h_luma_10, 5,7,15
INIT_XMM sse2
DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
+%endif
%macro SWAPMOVA 2
%ifid %1
@@ -715,8 +717,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
+%endif
%endif
@@ -802,10 +806,12 @@ DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
+%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
@@ -918,5 +924,7 @@ DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index bb881c35df..0dc0a7cb0f 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 313791a5d9..7fafe195f1 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index b7d51051d3..5c3acb1d38 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -83,8 +83,10 @@ cglobal h264_idct_add_10, 3,3
INIT_XMM sse2
IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -117,9 +119,11 @@ add4x4_idct %+ SUFFIX:
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
+%endif
%macro ADD16_OP 2
cmp byte [r4+%2], 0
@@ -155,8 +159,10 @@ cglobal h264_idct_add16_10, 5,6
INIT_XMM sse2
IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -220,8 +226,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
INIT_XMM sse2
IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -293,8 +301,10 @@ cglobal h264_idct_add16intra_10,5,7,8
INIT_XMM sse2
IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
+%endif
%assign last_block 36
;-----------------------------------------------------------------------------
@@ -330,8 +340,10 @@ cglobal h264_idct_add8_10,5,8,7
INIT_XMM sse2
IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -537,8 +549,10 @@ h264_idct8_add1_10 %+ SUFFIX:
INIT_XMM sse2
IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
+%endif
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -577,5 +591,7 @@ cglobal h264_idct8_add4_10, 0,7,16
INIT_XMM sse2
IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 394dcf4532..3e13ccb05d 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -2497,10 +2497,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
- psubw mm2, mm7
- psubw mm3, mm7
- psubw mm4, mm7
- psubw mm5, mm7
+ psubw mm0, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 55790a956e..40f1c9f053 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -82,8 +82,10 @@ INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
+%endif
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
@@ -119,8 +121,10 @@ INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
+%endif
;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
@@ -159,8 +163,10 @@ INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
@@ -228,8 +234,10 @@ cglobal pred4x4_down_left_10, 3, 3
INIT_XMM sse2
PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
@@ -255,8 +263,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
INIT_XMM sse2
PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
@@ -565,8 +575,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
+%endif
;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
@@ -622,8 +634,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
INIT_XMM sse2
PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
@@ -656,8 +670,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
@@ -711,8 +727,10 @@ INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
@@ -778,8 +796,10 @@ INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
@@ -851,8 +871,10 @@ INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
@@ -920,8 +942,10 @@ INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
+%endif
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
@@ -980,8 +1004,10 @@ INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
+%endif
;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index f934256706..1724153932 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Jason Garrett-Glaser
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 90857cec88..fd6068ffb1 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -338,7 +338,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
@@ -348,7 +348,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
@@ -358,7 +358,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
@@ -368,7 +368,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
+ av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index f92c4aab2b..e7ce1b8b44 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c72541b..2d287ba443 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index d1873af5b4..b4fb9db309 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -135,6 +135,13 @@ WEIGHT_FUNC_HALF_MM 8, 8
add off_regd, 1
or off_regd, 1
add r4, 1
+ cmp r5, 128
+ jne .normal
+ sar r5, 1
+ sar r6, 1
+ sar off_regd, 1
+ sub r4, 1
+.normal
%if cpuflag(ssse3)
movd m4, r5d
movd m0, r6d
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8ca45..5d9496224a 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index eec1653d3f..3d8d5b0fe1 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index e9d93e0af9..ae449e7e57 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -211,6 +211,7 @@ H264_BIWEIGHT_10_SSE(4, 10)
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (chroma_format_idc <= 1 && EXTERNAL_MMXEXT(cpu_flags))
@@ -366,4 +367,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
#endif /* HAVE_ALIGNED_STACK */
}
}
+#endif
}
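
The hunk above wraps the body of ff_h264dsp_init_x86() in #if HAVE_YASM, so a build without an external assembler still compiles and the function simply becomes a no-op, leaving the C fallbacks installed elsewhere in place. A minimal sketch of that pattern follows; HAVE_EXTERNAL_ASM stands in for HAVE_YASM and every other name is illustrative, not FFmpeg API.

/* Sketch: guarding an x86 init routine behind a build-time asm switch.
 * HAVE_EXTERNAL_ASM stands in for HAVE_YASM; all names are made up. */
#include <stdint.h>

#define HAVE_EXTERNAL_ASM 0

typedef struct DSPContext {
    void (*do_work)(uint8_t *dst, int len);
} DSPContext;

#if HAVE_EXTERNAL_ASM
void do_work_asm(uint8_t *dst, int len);   /* implemented in a .asm file */
#endif

void dsp_init_x86(DSPContext *c)
{
#if HAVE_EXTERNAL_ASM
    c->do_work = do_work_asm;              /* override the C default */
#else
    (void)c;                               /* no-op when asm is disabled */
#endif
}
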
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 073f7f908e..171c77b928 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
;* SIMD-optimized halfpel functions
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 83d53dee5f..aee42cd003 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
index fece265da8..50db36dc1b 100644
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ b/libavcodec/x86/hpeldsp_mmx.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2001 Fabrice Bellard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index b514a8fd4e..24e04a8f22 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 27723393bf..4cd6de101c 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -22,20 +22,20 @@
*
* conversion to gcc syntax by Michael Niedermayer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
+ * along with FFmpeg; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index 50655d6bc0..d9bc841825 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -9,7 +9,7 @@
*
* Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
* Vertical pass is an implementation of the scheme:
* Loeffler C., Ligtenberg A., and Moschytz C.S.:
@@ -23,22 +23,21 @@
*
* More details at http://skal.planet-d.net/coding/dct.html
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
+ * along with FFmpeg; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "idct_xvid.h"
@@ -148,7 +147,7 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
#endif
-#define ROUND(x) "paddd "MANGLE(x)
+#define ROUND(x) "paddd "x
#define JZ(reg, to) \
"testl "reg","reg" \n\t" \
@@ -348,13 +347,13 @@ inline void ff_idct_xvid_sse2(short *block)
{
__asm__ volatile(
"movq "MANGLE(m127)", %%mm0 \n\t"
- iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
- iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
- iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
+ iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(MANGLE(walkenIdctRounders)), PUT_EVEN(ROW0))
+ iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND("1*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW1))
+ iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND("2*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW2))
TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
JZ("%%eax", "1f")
- iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
+ iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND("3*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW3))
TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
@@ -369,13 +368,13 @@ inline void ff_idct_xvid_sse2(short *block)
"2: \n\t"
iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
"3: \n\t"
- iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
+ iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND("4*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW5))
JZ("%%edx", "1f")
"4: \n\t"
- iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
+ iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW6))
JZ("%%esi", "1f")
"5: \n\t"
- iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
+ iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW7))
#if ARCH_X86_32
iLLM_HEAD
#endif
diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h
index aea28bab96..7a2847b864 100644
--- a/libavcodec/x86/idct_xvid.h
+++ b/libavcodec/x86/idct_xvid.h
@@ -1,20 +1,20 @@
/*
* XVID MPEG-4 VIDEO CODEC
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index 633fcd9d59..ce30b42103 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,20 +2,20 @@
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -50,7 +50,7 @@ ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
- dd 1.0, 0.70710678118654752439, 0.0, 0.0
+ dd 1.0, -0.70710678118654752439, 0.0, 0.0
costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
@@ -129,6 +129,19 @@ SECTION_TEXT
%endif
%endmacro
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+ mulps %1, %1, [ps_cosh_sse3 + %3]
+ PSHUFD %2, %1, 0xe1
+ addsubps %1, %1, %2
+%else
+ mulps %1, [ps_cosh + %3]
+ PSHUFD %2, %1, 0xe1
+ xorps %1, [ps_p1m1p1m1]
+ addps %1, %2
+%endif
+%endmacro
+
%macro STORE 4
movhlps %2, %1
movss [%3 ], %1
@@ -279,11 +292,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48
-
- mulps m5, m5, [ps_cosh + 64]
- PSHUFD m1, m5, 0xe1
- xorps m5, m5, [ps_p1m1p1m1]
- addps m5, m5, m1
+ BUTTERF2 m5, m1, 64
; permutates:
; m0 0 1 2 3 => 2 6 10 14 m1
@@ -358,8 +367,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
RET
%endmacro
+%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
+%endif
INIT_XMM sse2
DEFINE_IMDCT
@@ -370,8 +381,10 @@ DEFINE_IMDCT
INIT_XMM ssse3
DEFINE_IMDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
+%endif
INIT_XMM sse
@@ -716,5 +729,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
INIT_XMM sse
DEFINE_FOUR_IMDCT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
+%endif
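
The BUTTERF2 macro introduced above picks between an SSE3 path (mulps + pshufd + addsubps) and a pre-SSE3 path (xorps with a sign mask + addps); note the patch also keeps separate ps_cosh / ps_cosh_sse3 tables, so the two paths are free to use different sign conventions. The sketch below only demonstrates the generic identity being leaned on, namely that addsubps(a, b) equals an ordinary add once the signs of alternating lanes of b are flipped; it does not reproduce the macro's exact operand order, and all names in it are illustrative.

/* addsubps vs. sign-flip + add (a demo sketch, not FFmpeg code).
 * Build with: cc -msse3 addsub_demo.c */
#include <pmmintrin.h>   /* SSE3: _mm_addsub_ps */
#include <stdio.h>

int main(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lanes 3..0 */
    __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);

    /* SSE3: lanes 0 and 2 are subtracted, lanes 1 and 3 are added. */
    __m128 r_sse3 = _mm_addsub_ps(a, b);

    /* Pre-SSE3 emulation: flip the sign of b in lanes 0 and 2, then add. */
    __m128 mask   = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
    __m128 r_emul = _mm_add_ps(a, _mm_xor_ps(b, mask));

    float x[4], y[4];
    _mm_storeu_ps(x, r_sse3);
    _mm_storeu_ps(y, r_emul);
    printf("addsubps:     %g %g %g %g\n", x[0], x[1], x[2], x[3]);
    printf("xor+add emul: %g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}
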
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000000..bacc1e927a
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,292 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_ef: times 8 db 14,15
+pb_67: times 8 db 6, 7
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION_TEXT
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+ movd m4, maskd
+ SPLATW m4, m4
+ add wd, wd
+ test wq, 2*mmsize - 1
+ jz %%.tomainloop
+%%.wordloop:
+ sub wq, 2
+%ifidn %2, add
+ mov ax, [srcq+wq]
+ add ax, [dstq+wq]
+%else
+ mov ax, [src1q+wq]
+ sub ax, [src2q+wq]
+%endif
+ and ax, maskw
+ mov [dstq+wq], ax
+ test wq, 2*mmsize - 1
+ jnz %%.wordloop
+%%.tomainloop:
+%ifidn %2, add
+ add srcq, wq
+%else
+ add src1q, wq
+ add src2q, wq
+%endif
+ add dstq, wq
+ neg wq
+ jz %%.end
+%%.loop:
+%ifidn %2, add
+ mov%1 m0, [srcq+wq]
+ mov%1 m1, [dstq+wq]
+ mov%1 m2, [srcq+wq+mmsize]
+ mov%1 m3, [dstq+wq+mmsize]
+%else
+ mov%1 m0, [src1q+wq]
+ mov%1 m1, [src2q+wq]
+ mov%1 m2, [src1q+wq+mmsize]
+ mov%1 m3, [src2q+wq+mmsize]
+%endif
+ p%2w m0, m1
+ p%2w m2, m3
+ pand m0, m4
+ pand m2, m4
+ mov%1 [dstq+wq] , m0
+ mov%1 [dstq+wq+mmsize], m2
+ add wq, 2*mmsize
+ jl %%.loop
+%%.end:
+ RET
+%endmacro
+
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w
+ INT16_LOOP a, add
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w
+ test srcq, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, add
+.unaligned:
+ INT16_LOOP u, add
+
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w
+ INT16_LOOP a, sub
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w
+ test src1q, mmsize-1
+ jnz .unaligned
+ test src2q, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, sub
+.unaligned:
+ INT16_LOOP u, sub
+
+
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+ add wd, wd
+ add srcq, wq
+ add dstq, wq
+ neg wq
+%%.loop:
+ mov%2 m1, [srcq+wq]
+ mova m2, m1
+ pslld m1, 16
+ paddw m1, m2
+ mova m2, m1
+
+ pshufb m1, m3
+ paddw m1, m2
+ pshufb m0, m5
+%if mmsize == 16
+ mova m2, m1
+ pshufb m1, m4
+ paddw m1, m2
+%endif
+ paddw m0, m1
+ pand m0, m7
+%ifidn %1, a
+ mova [dstq+wq], m0
+%else
+ movq [dstq+wq], m0
+ movhps [dstq+wq+8], m0
+%endif
+ add wq, mmsize
+ jl %%.loop
+ mov eax, mmsize-1
+ sub eax, wd
+ mov wd, eax
+ shl wd, 8
+ lea eax, [wd+eax-1]
+ movd m1, eax
+ pshufb m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+; int add_hfyu_left_prediction_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_prediction_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+ mova m5, [pb_67]
+ mova m3, [pb_zzzz2323zzzzabab]
+ movd m0, leftm
+ psllq m0, 48
+ movd m7, maskm
+ SPLATW m7 ,m7
+ ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM sse4
+cglobal add_hfyu_left_prediction_int16, 4,4,8, dst, src, mask, w, left
+ mova m5, [pb_ef]
+ mova m4, [pb_zzzzzzzz67676767]
+ mova m3, [pb_zzzz2323zzzzabab]
+ movd m0, leftm
+ pslldq m0, 14
+ movd m7, maskm
+ SPLATW m7 ,m7
+ test srcq, 15
+ jnz .src_unaligned
+ test dstq, 15
+ jnz .dst_unaligned
+ ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+ ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+ ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_prediction_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+ add wd, wd
+ movd mm6, maskd
+ SPLATW mm6, mm6
+ movq mm0, [topq]
+ movq mm2, mm0
+ movd mm4, [left_topq]
+ psllq mm2, 16
+ movq mm1, mm0
+ por mm4, mm2
+ movd mm3, [leftq]
+ psubw mm0, mm4 ; t-tl
+ add dstq, wq
+ add topq, wq
+ add diffq, wq
+ neg wq
+ jmp .skip
+.loop:
+ movq mm4, [topq+wq]
+ movq mm0, mm4
+ psllq mm4, 16
+ por mm4, mm1
+ movq mm1, mm0 ; t
+ psubw mm0, mm4 ; t-tl
+.skip:
+ movq mm2, [diffq+wq]
+%assign i 0
+%rep 4
+ movq mm4, mm0
+ paddw mm4, mm3 ; t-tl+l
+ pand mm4, mm6
+ movq mm5, mm3
+ pmaxsw mm3, mm1
+ pminsw mm5, mm1
+ pminsw mm3, mm4
+ pmaxsw mm3, mm5 ; median
+ paddw mm3, mm2 ; +residual
+ pand mm3, mm6
+%if i==0
+ movq mm7, mm3
+ psllq mm7, 48
+%else
+ movq mm4, mm3
+ psrlq mm7, 16
+ psllq mm4, 48
+ por mm7, mm4
+%endif
+%if i<3
+ psrlq mm0, 16
+ psrlq mm1, 16
+ psrlq mm2, 16
+%endif
+%assign i i+1
+%endrep
+ movq [dstq+wq], mm7
+ add wq, 8
+ jl .loop
+ movzx r2d, word [dstq-2]
+ mov [leftq], r2d
+ movzx r2d, word [topq-2]
+ mov [left_topq], r2d
+ RET
+
+cglobal sub_hfyu_median_prediction_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+ add wd, wd
+ movd mm7, maskd
+ SPLATW mm7, mm7
+ movq mm0, [src1q]
+ movq mm2, [src2q]
+ psllq mm0, 16
+ psllq mm2, 16
+ movd mm6, [left_topq]
+ por mm0, mm6
+ movd mm6, [leftq]
+ por mm2, mm6
+ xor maskq, maskq
+.loop:
+ movq mm1, [src1q + maskq]
+ movq mm3, [src2q + maskq]
+ movq mm4, mm2
+ psubw mm2, mm0
+ paddw mm2, mm1
+ pand mm2, mm7
+ movq mm5, mm4
+ pmaxsw mm4, mm1
+ pminsw mm1, mm5
+ pminsw mm4, mm2
+ pmaxsw mm4, mm1
+ psubw mm3, mm4
+ pand mm3, mm7
+ movq [dstq + maskq], mm3
+ add maskq, 8
+ movq mm0, [src1q + maskq - 2]
+ movq mm2, [src2q + maskq - 2]
+ cmp maskq, wq
+ jb .loop
+ mov maskd, [src1q + wq - 2]
+ mov [left_topq], maskd
+ mov maskd, [src2q + wq - 2]
+ mov [leftq], maskd
+ RET
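
The INT16_LOOP macro above handles one word at a time until the remaining width is a multiple of 2*mmsize, then switches to a paddw/psubw + pand vector loop. A scalar C reference of the semantics it implements, matching the prototypes declared in lossless_videodsp_init.c further down in this patch; the bodies are a reading of the asm above, not code from the tree.

/* Masked 16-bit add / difference, scalar reference (sketch only). */
#include <stdint.h>

static void add_int16_c(uint16_t *dst, const uint16_t *src,
                        unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (dst[i] + src[i]) & mask;    /* paddw + pand */
}

static void diff_int16_c(uint16_t *dst, const uint16_t *src1,
                         const uint16_t *src2, unsigned mask, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = (src1[i] - src2[i]) & mask;  /* psubw + pand */
}
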
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000000..03ea2dff26
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,62 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../lossless_videodsp.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+int ff_add_hfyu_left_prediction_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc);
+int ff_add_hfyu_left_prediction_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int acc);
+void ff_add_hfyu_median_prediction_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_prediction_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+ const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ c->add_int16 = ff_add_int16_mmx;
+ c->diff_int16 = ff_diff_int16_mmx;
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc->comp[0].depth_minus1<15) {
+ c->add_hfyu_median_prediction_int16 = ff_add_hfyu_median_prediction_int16_mmxext;
+ c->sub_hfyu_median_prediction_int16 = ff_sub_hfyu_median_prediction_int16_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_int16 = ff_add_int16_sse2;
+ c->diff_int16 = ff_diff_int16_sse2;
+ }
+
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->add_hfyu_left_prediction_int16 = ff_add_hfyu_left_prediction_int16_ssse3;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ c->add_hfyu_left_prediction_int16 = ff_add_hfyu_left_prediction_int16_sse4;
+ }
+}
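
ff_llviddsp_init_x86() above installs progressively better variants as the CPU flags allow (MMX, MMXEXT, SSE2, SSSE3, SSE4) and only wires the median-prediction routines when the pixel format's bit depth is below 16. For orientation, a scalar sketch of what the left-prediction entry point computes, following the prototype declared above; the body is a reading of the asm, not code from the tree.

/* HuffYUV-style left prediction on 16-bit samples (sketch only). */
#include <stdint.h>

static int add_hfyu_left_prediction_int16_c(uint16_t *dst, const uint16_t *src,
                                            unsigned mask, int w, int acc)
{
    for (int i = 0; i < w; i++) {
        acc    = (acc + src[i]) & mask;  /* running sum, clipped to bit depth */
        dst[i] = acc;
    }
    return acc;                          /* becomes 'left' for the next row */
}
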
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index ea5d2eab56..9682733096 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
* SIMD-optimized LPC functions
* Copyright (c) 2007 Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -152,7 +151,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
- if (INLINE_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index a62094ee97..9c48afeb20 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp.c
index 72fc637764..94849b7e79 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp.c
@@ -2,25 +2,24 @@
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
diff --git a/libavcodec/x86/motion_est.c b/libavcodec/x86/motion_est.c
index d126012e39..2b11cf95d3 100644
--- a/libavcodec/x86/motion_est.c
+++ b/libavcodec/x86/motion_est.c
@@ -5,25 +5,25 @@
*
* mostly by Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
+#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -41,7 +41,7 @@ DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -(x86_reg)stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
@@ -199,7 +199,7 @@ static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
int stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -(x86_reg)stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
@@ -237,7 +237,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
- x86_reg len = -(stride * h);
+ x86_reg len = -(x86_reg)stride * h;
__asm__ volatile (
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
@@ -326,7 +326,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
static int sad8_ ## suf(void *v, uint8_t *blk2, \
uint8_t *blk1, int stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -340,7 +340,7 @@ static int sad8_ ## suf(void *v, uint8_t *blk2, \
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, \
uint8_t *blk1, int stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -355,7 +355,7 @@ static int sad8_x2_ ## suf(void *v, uint8_t *blk2, \
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, \
uint8_t *blk1, int stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -370,7 +370,7 @@ static int sad8_y2_ ## suf(void *v, uint8_t *blk2, \
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, \
uint8_t *blk1, int stride, int h) \
{ \
- assert(h == 8); \
+ av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
@@ -478,7 +478,7 @@ av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
c->pix_abs[1][3] = sad8_xy2_mmxext;
}
}
- if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
+ if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = sad16_sse2;
}
#endif /* HAVE_INLINE_ASM */
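The sad8 loops above walk the block backwards from its end, so len has to be the full negative byte offset. The new `-(x86_reg)stride * h` form promotes the multiplication to pointer width before negating, so the intermediate product can never overflow a 32-bit int. A minimal sketch of the difference, assuming a 64-bit build where x86_reg is intptr_t (values exaggerated purely to make the 32-bit overflow visible; the real SAD callers use h == 8 or 16):

```c
#include <stdint.h>
#include <stdio.h>

typedef intptr_t x86_reg;                 /* as on 64-bit builds in libavutil/x86/asm.h */

int main(void)
{
    int stride = 1 << 20, h = 4096;       /* stride * h no longer fits in a 32-bit int */
    x86_reg bad  = -(stride * h);         /* multiply happens in int: signed overflow (UB) */
    x86_reg good = -(x86_reg)stride * h;  /* multiply happens at pointer width */
    printf("bad=%lld good=%lld\n", (long long)bad, (long long)good);
    return 0;
}
```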
diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm
index 2cdd84b76d..dc0f900c5b 100644
--- a/libavcodec/x86/mpeg4qpel.asm
+++ b/libavcodec/x86/mpeg4qpel.asm
@@ -1,21 +1,23 @@
;******************************************************************************
;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 533b4a7c3f..5dc7bc6362 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
* SIMD-optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,11 +26,18 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -217,11 +224,17 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
} \
}
+#if HAVE_SSE
+#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
+#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
+#endif
#endif /* HAVE_YASM */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
@@ -249,9 +262,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
#endif /* HAVE_SSE2_INLINE */
#if HAVE_YASM
+#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
+#endif
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
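For reference, the DECL() wrapper introduced above just pairs each external imdct36 kernel with a forward declaration of its block-level driver, and the ARCH_X86_32 guards keep the plain-SSE variant out of 64-bit builds (presumably because x86-64 always has SSE2, so the SSE-only path is never selected there). One expansion, written out by hand from the macro in the hunk:

```c
/* DECL(sse2) expands to roughly: */
static void imdct36_blocks_sse2(float *out, float *buf, float *in,
                                int count, int switch_point, int block_type);
void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
```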
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 25b44e6768..b2ce68062c 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,7 +27,7 @@
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
@@ -36,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
qmul = qscale << 1;
- assert(s->block_last_index[n]>=0 || s->h263_aic);
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
if (!s->h263_aic) {
if (n < 4)
@@ -112,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
- assert(s->block_last_index[n]>=0 || s->h263_aic);
+ av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
@@ -172,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
const uint16_t *quant_matrix;
int block0;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
@@ -240,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
x86_reg nCoeffs;
const uint16_t *quant_matrix;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
@@ -307,7 +307,7 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
const uint16_t *quant_matrix;
int block0;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -372,7 +372,7 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
x86_reg nCoeffs;
const uint16_t *quant_matrix;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -443,11 +443,11 @@ __asm__ volatile(
);
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
{
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
@@ -459,5 +459,5 @@ av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
}
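The assert() to av_assert2() swaps here and in motion_est.c move these per-block sanity checks onto the levelled assert macros from libavutil/avassert.h, so they vanish from default builds but can be re-enabled for debugging. A small sketch of the convention (the helper function is hypothetical; only the av_assert* macros are real API, and the exact ASSERT_LEVEL default depends on the build configuration):

```c
#include "libavutil/avassert.h"

/* av_assert0(): always compiled in.
 * av_assert1(): active only when ASSERT_LEVEL >= 1.
 * av_assert2(): active only when ASSERT_LEVEL >= 2 (heavy-debug builds),
 *               which is why it is effectively free on the dequantizer hot path. */
static int check_block_index(int last_index, int h263_aic)
{
    av_assert2(last_index >= 0 || h263_aic);   /* compiled out by default */
    return last_index;
}
```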
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 19ab83a7fa..7dd9959087 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -194,7 +194,7 @@ static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
}
#endif /* HAVE_INLINE_ASM */
-av_cold void ff_MPV_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
int i;
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index ed00e8eea7..10012e4484 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -100,7 +100,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
- assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+ av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
RENAMEl(ff_fdct) (block); //cannot be anything else ...
@@ -110,10 +110,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
if (s->mb_intra) {
int dummy;
- if (n < 4)
+ if (n < 4){
q = s->y_dc_scale;
- else
+ bias = s->q_intra_matrix16[qscale][1];
+ qmat = s->q_intra_matrix16[qscale][0];
+ }else{
q = s->c_dc_scale;
+ bias = s->q_chroma_intra_matrix16[qscale][1];
+ qmat = s->q_chroma_intra_matrix16[qscale][0];
+ }
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
@@ -128,8 +133,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
- bias = s->q_intra_matrix16[qscale][1];
- qmat = s->q_intra_matrix16[qscale][0];
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
@@ -165,7 +168,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
- "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
+ "movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -199,7 +202,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
- "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
+ "movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -267,6 +270,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x04] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+ block[0x05] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1C] = temp_block[0x19];
+ block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+ block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+ block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+ block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+ block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+ block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+ block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+ block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+ block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+ block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else{
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
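The dct_quantize changes above make the intra bias/qmat lookup follow the same luma/chroma split as the DC scale, so chroma blocks can pick up a separate quantization matrix (s->q_chroma_intra_matrix16) instead of always reusing the luma one; the FF_LIBMPEG2_IDCT_PERM branch then adds the matching scan-order writeback for that IDCT permutation. Stripped of the surrounding inline asm, the selection logic now reads roughly like this fragment (field names follow the hunk; not a standalone function):

```c
/* Sketch of the intra-path matrix selection after the change above. */
if (s->mb_intra) {
    if (n < 4) {                                      /* luma blocks 0..3 */
        q    = s->y_dc_scale;
        bias = s->q_intra_matrix16[qscale][1];
        qmat = s->q_intra_matrix16[qscale][0];
    } else {                                          /* chroma blocks    */
        q    = s->c_dc_scale;
        bias = s->q_chroma_intra_matrix16[qscale][1];
        qmat = s->q_chroma_intra_matrix16[qscale][0];
    }
} else {
    bias = s->q_inter_matrix16[qscale][1];
    qmat = s->q_inter_matrix16[qscale][0];
}
```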
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index c05f3da017..8e23ccfbc6 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da36d7..7dca62c675 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
* x86 PNG optimizations.
 * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index a0e97b3951..255eb24897 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,23 +1,24 @@
;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -47,10 +48,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1: times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5: times 4 dw W7sh2, -W5sh2
-row_round: times 8 dw (1<<14)
+pw_88: times 8 dw 0x2008
+cextern pw_1
cextern pw_4
-cextern pw_8
cextern pw_512
cextern pw_1019
@@ -91,14 +92,12 @@ section .text align=16
; a2 -= W6 * row[2];
; a3 -= W2 * row[2];
%ifidn %1, col
- paddw m10,[pw_8]
+ paddw m10,[pw_88]
%endif
- SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
%ifidn %1, row
- psubw m10,[row_round]
+ paddw m10,[pw_1]
%endif
- SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7]
- SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7]
+ SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
pmaddwd m2, m0, [w4_plus_w6]
pmaddwd m3, m1, [w4_plus_w6]
pmaddwd m4, m0, [w4_min_w6]
@@ -107,75 +106,33 @@ section .text align=16
pmaddwd m7, m1, [w4_min_w2]
pmaddwd m0, [w4_plus_w2]
pmaddwd m1, [w4_plus_w2]
- pslld m2, 2
- pslld m3, 2
- pslld m4, 2
- pslld m5, 2
- pslld m6, 2
- pslld m7, 2
- pslld m0, 2
- pslld m1, 2
; a0: -1*row[0]-1*row[2]
; a1: -1*row[0]
; a2: -1*row[0]
; a3: -1*row[0]+1*row[2]
- psubd m2, m10 ; a1[0-3]
- psubd m3, m11 ; a1[4-7]
- psubd m4, m10 ; a2[0-3]
- psubd m5, m11 ; a2[4-7]
- psubd m0, m10
- psubd m1, m11
- psubd m6, m10
- psubd m7, m11
- psubd m0, m8 ; a0[0-3]
- psubd m1, m9 ; a0[4-7]
- paddd m6, m8 ; a3[0-3]
- paddd m7, m9 ; a3[4-7]
; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
- SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7]
pmaddwd m10, m8, [w4_plus_w6]
pmaddwd m11, m9, [w4_plus_w6]
- pslld m10, 2
- pslld m11, 2
- psubd m10, m13
- psubd m11, m14
paddd m0, m10 ; a0[0-3]
paddd m1, m11 ; a0[4-7]
pmaddwd m10, m8, [w4_min_w6]
pmaddwd m11, m9, [w4_min_w6]
- pslld m10, 2
- pslld m11, 2
- psubd m10, m13
- psubd m11, m14
paddd m6, m10 ; a3[0-3]
paddd m7, m11 ; a3[4-7]
pmaddwd m10, m8, [w4_min_w2]
pmaddwd m11, m9, [w4_min_w2]
pmaddwd m8, [w4_plus_w2]
pmaddwd m9, [w4_plus_w2]
- pslld m10, 2
- pslld m11, 2
- pslld m8, 2
- pslld m9, 2
- psubd m10, m13
- psubd m11, m14
- psubd m8, m13
- psubd m9, m14
psubd m4, m10 ; a2[0-3] intermediate
psubd m5, m11 ; a2[4-7] intermediate
psubd m2, m8 ; a1[0-3] intermediate
psubd m3, m9 ; a1[4-7] intermediate
- SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7]
- psubd m4, m12 ; a2[0-3]
- psubd m5, m13 ; a2[4-7]
- paddd m2, m12 ; a1[0-3]
- paddd m3, m13 ; a1[4-7]
; load/store
mova [r2+ 0], m0
@@ -206,8 +163,6 @@ section .text align=16
; b3 = MUL(W7, row[1]);
; MAC(b3, -W5, row[3]);
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
- SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7]
- SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7]
pmaddwd m2, m0, [w3_min_w7]
pmaddwd m3, m1, [w3_min_w7]
pmaddwd m4, m0, [w5_min_w1]
@@ -216,35 +171,11 @@ section .text align=16
pmaddwd m7, m1, [w7_min_w5]
pmaddwd m0, [w1_plus_w3]
pmaddwd m1, [w1_plus_w3]
- pslld m2, 2
- pslld m3, 2
- pslld m4, 2
- pslld m5, 2
- pslld m6, 2
- pslld m7, 2
- pslld m0, 2
- pslld m1, 2
; b0: +1*row[1]+2*row[3]
; b1: +2*row[1]-1*row[3]
; b2: -1*row[1]-1*row[3]
; b3: +1*row[1]+1*row[3]
- psubd m2, m8
- psubd m3, m9
- paddd m0, m8
- paddd m1, m9
- paddd m8, m10 ; { row[1] + row[3] }[0-3]
- paddd m9, m11 ; { row[1] + row[3] }[4-7]
- paddd m10, m10
- paddd m11, m11
- paddd m0, m8 ; b0[0-3]
- paddd m1, m9 ; b0[4-7]
- paddd m2, m10 ; b1[0-3]
- paddd m3, m11 ; b2[4-7]
- psubd m4, m8 ; b2[0-3]
- psubd m5, m9 ; b2[4-7]
- paddd m6, m8 ; b3[0-3]
- paddd m7, m9 ; b3[4-7]
; MAC(b0, W5, row[5]);
; MAC(b0, W7, row[7]);
@@ -255,38 +186,16 @@ section .text align=16
; MAC(b3, W3, row[5]);
; MAC(b3, -W1, row[7]);
SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
- SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7]
- SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7]
; b0: -1*row[5]+1*row[7]
; b1: -1*row[5]+1*row[7]
; b2: +1*row[5]+2*row[7]
; b3: +2*row[5]-1*row[7]
- paddd m4, m13
- paddd m5, m12
- paddd m6, m13
- paddd m7, m12
- psubd m13, m14 ; { row[5] - row[7] }[0-3]
- psubd m12, m11 ; { row[5] - row[7] }[4-7]
- paddd m14, m14
- paddd m11, m11
- psubd m0, m13
- psubd m1, m12
- psubd m2, m13
- psubd m3, m12
- paddd m4, m14
- paddd m5, m11
- paddd m6, m13
- paddd m7, m12
pmaddwd m10, m8, [w1_plus_w5]
pmaddwd m11, m9, [w1_plus_w5]
pmaddwd m12, m8, [w5_plus_w7]
pmaddwd m13, m9, [w5_plus_w7]
- pslld m10, 2
- pslld m11, 2
- pslld m12, 2
- pslld m13, 2
psubd m2, m10 ; b1[0-3]
psubd m3, m11 ; b1[4-7]
paddd m0, m12 ; b0[0-3]
@@ -295,10 +204,6 @@ section .text align=16
pmaddwd m13, m9, [w7_plus_w3]
pmaddwd m8, [w3_min_w1]
pmaddwd m9, [w3_min_w1]
- pslld m12, 2
- pslld m13, 2
- pslld m8, 2
- pslld m9, 2
paddd m4, m12 ; b2[0-3]
paddd m5, m13 ; b2[4-7]
paddd m6, m8 ; b3[0-3]
@@ -345,7 +250,7 @@ cglobal prores_idct_put_10, 4, 4, %1
pmullw m13,[r3+64]
pmullw m12,[r3+96]
- IDCT_1D row, 17
+ IDCT_1D row, 15
; transpose for second part of IDCT
TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
@@ -360,20 +265,11 @@ cglobal prores_idct_put_10, 4, 4, %1
; for (i = 0; i < 8; i++)
; idctSparseColAdd(dest + i, line_size, block + i);
- IDCT_1D col, 20
+ IDCT_1D col, 18
; clip/store
- mova m6, [pw_512]
mova m3, [pw_4]
mova m5, [pw_1019]
- paddw m8, m6
- paddw m0, m6
- paddw m1, m6
- paddw m2, m6
- paddw m4, m6
- paddw m11, m6
- paddw m9, m6
- paddw m10, m6
pmaxsw m8, m3
pmaxsw m0, m3
pmaxsw m1, m3
@@ -422,7 +318,9 @@ INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
+%endif
%endif
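The IDCT changes above drop the explicit `pslld ..., 2` scaling of every pmaddwd product and instead shorten the final IDCT_1D shifts by two bits (17 to 15 on the row pass, 20 to 18 on the column pass), with the rounding constants adjusted to match (pw_88 = 0x2008 on the column pass, pw_1 on the row pass, replacing pw_8 and row_round). A one-line sanity check of the shift bookkeeping, assuming no intermediate overflow and arithmetic right shifts:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Removing a <<2 on the dword products and shortening the final shift
     * by 2 keeps the overall scale: (x * 4) >> 17 == x >> 15. */
    for (int32_t x = -100000; x <= 100000; x += 7)
        assert(((x * 4) >> 17) == (x >> 15));
    return 0;
}
```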
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index 68ad929067..394025b68e 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -32,7 +32,7 @@ void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63b8a..4e72d5084f 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/rnd_mmx.c b/libavcodec/x86/rnd_mmx.c
index db4515a9c5..326e2f395b 100644
--- a/libavcodec/x86/rnd_mmx.c
+++ b/libavcodec/x86/rnd_mmx.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index 144f111368..ca6ad3ea23 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b600..7732d65b2a 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 586e4e9a6d..99c56f9d09 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 1a33aa0ba7..24a803865f 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,20 +4,20 @@
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 781f467490..75ba8ba12c 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index d7164b6496..af774e1a5c 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -26,6 +26,12 @@ SECTION_RODATA
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_neg times 4 dd 1<<31
+ps_noise0 times 2 dd 1.0, 0.0,
+ps_noise2 times 2 dd -1.0, 0.0
+ps_noise13 dd 0.0, 1.0, 0.0, -1.0
+ dd 0.0, -1.0, 0.0, 1.0
+ dd 0.0, 1.0, 0.0, -1.0
+cextern sbr_noise_table
SECTION_TEXT
@@ -136,7 +142,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
- mova m7, [ps_mask]
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
@@ -156,30 +161,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
shl start, 3 ; offset from num loops
mova m0, [X_lowq + start]
- movlhps m1, m1 ; (a2 a3 a2 a3)
- movlhps m2, m2 ; (a0 a1 a0 a1)
- shufps m3, m3, q0101 ; (a3 a2 a3 a2)
- shufps m4, m4, q0101 ; (a1 a0 a1 a0)
- xorps m3, m7 ; (-a3 a2 -a3 a2)
- xorps m4, m7 ; (-a1 a0 -a1 a0)
+ shufps m3, m3, q1111
+ shufps m4, m4, q1111
+ xorps m3, [ps_mask]
+ shufps m1, m1, q0000
+ shufps m2, m2, q0000
+ xorps m4, [ps_mask]
.loop2:
- mova m5, m0
+ movu m7, [X_lowq + start + 8] ; BbCc
mova m6, m0
- shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
- shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
- mulps m0, m2
- mulps m5, m4
- mova m7, m6
- addps m5, m0
- mova m0, [X_lowq + start + 2*2*4]
- shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
- shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"}
- mulps m6, m1
+ mova m5, m7
+ shufps m0, m0, q2301 ; aAbB
+ shufps m7, m7, q2301 ; bBcC
+ mulps m0, m4
mulps m7, m3
- addps m5, m6
+ mulps m6, m2
+ mulps m5, m1
+ addps m7, m0
+ mova m0, [X_lowq + start +16] ; CcDd
addps m7, m0
- addps m5, m7
- mova [X_highq + start], m5
+ addps m6, m5
+ addps m7, m6
+ mova [X_highq + start], m7
add start, 16
jnz .loop2
RET
@@ -246,33 +249,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
jne .loop
REP_RET
-INIT_XMM sse2
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
- mova m2, [src0q+cq+mmsize]
- mova m3, [src1q+mmsize]
- pshufd m4, m0, q0123
- pshufd m5, m1, q0123
- pshufd m6, m2, q0123
- pshufd m7, m3, q0123
- addps m3, m4
+ mova m4, [src0q+cq+mmsize]
+ mova m5, [src1q+mmsize]
+%if cpuflag(sse2)
+ pshufd m2, m0, q0123
+ pshufd m3, m1, q0123
+ pshufd m6, m4, q0123
+ pshufd m7, m5, q0123
+%else
+ shufps m2, m0, m0, q0123
+ shufps m3, m1, m1, q0123
+ shufps m6, m4, m4, q0123
+ shufps m7, m5, m5, q0123
+%endif
+ addps m5, m2
subps m0, m7
addps m1, m6
- subps m2, m5
+ subps m4, m3
mova [vrevq], m1
- mova [vrevq+mmsize], m3
+ mova [vrevq+mmsize], m5
mova [vq+cq], m0
- mova [vq+cq+mmsize], m2
+ mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +320,106 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
movq m2, [zq]
movq [r2q], m2
REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST 1
+%ifdef PIC
+ lea NOISE_TABLE, [%1]
+ mova m0, [kxq + NOISE_TABLE]
+%else
+ mova m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise0]
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1
+ shl kxq, 4
+ LOAD_NST ps_noise13
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise2]
+ jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1
+ shl kxq, 4
+ LOAD_NST ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+ mov kxd, m_maxm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+ dec noiseq
+ shl count, 2
+%ifdef PIC
+ lea NOISE_TABLE, [sbr_noise_table]
+%endif
+ lea Yq, [Yq + 2*count]
+ add s_mq, count
+ add q_filtq, count
+ shl noiseq, 3
+ pxor m5, m5
+ neg count
+.loop:
+ mova m1, [q_filtq + count]
+ movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
+ movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
+ add noiseq, 2*mmsize
+ and noiseq, 0x1ff<<3
+ punpckhdq m2, m1, m1
+ punpckldq m1, m1
+    mulps       m1, m3            ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
+    mulps       m2, m4            ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+ mova m3, [s_mq + count]
+ ; TODO: replace by a vpermd in AVX2
+ punpckhdq m4, m3, m3
+ punpckldq m3, m3
+ pcmpeqd m6, m3, m5 ; m6 == 0
+ pcmpeqd m7, m4, m5 ; m7 == 0
+ mulps m3, m0 ; s_m[m] * phi_sign
+ mulps m4, m0 ; s_m[m] * phi_sign
+ pand m1, m6
+ pand m2, m7
+ movu m6, [Yq + 2*count]
+ movu m7, [Yq + 2*count + mmsize]
+ addps m3, m1
+ addps m4, m2
+ addps m6, m3
+ addps m7, m4
+ movu [Yq + 2*count], m6
+ movu [Yq + 2*count + mmsize], m7
+ add count, mmsize
+ jl .loop
+ RET
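The four sbr_hf_apply_noise_* entry points above differ only in the phase-sign constants they load (ps_noise0, ps_noise13, ps_noise2) before falling through to apply_noise_main, which handles two m values per iteration and selects between the sinusoid and the noise contribution with pcmpeqd masks. As a reminder of the operation being vectorized, here is a scalar C sketch in the spirit of the generic SBR DSP code (an approximation for illustration, not the exact reference implementation):

```c
extern const float ff_sbr_noise_table[512][2];   /* 512-entry complex noise table */

/* Scalar sketch of one sbr_hf_apply_noise_* variant; phi_sign0/phi_sign1
 * stand for the per-variant sign pattern the asm loads into m0. */
static void sbr_hf_apply_noise_sketch(float (*Y)[2], const float *s_m,
                                      const float *q_filt, int noise,
                                      float phi_sign0, float phi_sign1,
                                      int m_max)
{
    for (int m = 0; m < m_max; m++) {
        noise = (noise + 1) & 0x1ff;             /* wrap within the table */
        if (s_m[m]) {
            Y[m][0] += s_m[m] * phi_sign0;
            Y[m][1] += s_m[m] * phi_sign1;
        } else {
            Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
            Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
        }
    }
}
```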
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852163..2b912d0e9e 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -34,9 +34,23 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -48,10 +62,15 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+ s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
}
}
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index a342110cd3..e10dc7ba09 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -3,24 +3,23 @@
*
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/simple_idct.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "dsputil_x86.h"
@@ -82,7 +81,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
static inline void idct(int16_t *block)
{
- DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
int16_t * const temp= (int16_t*)align_tmp;
__asm__ volatile(
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000000..735e7905a0
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,902 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+#include "dsputil_x86.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+ IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+ // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+ // The savings in code and time are well worth having to store this value and
+ // calculate b[0] correctly afterwards.
+
+ i = 0;
+ __asm__ volatile(
+ "pcmpeqd %%xmm7, %%xmm7 \n\t"
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"
+ "psllw $1, %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "psllw $13, %%xmm3 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm2 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ "pmulhw %%xmm3, %%xmm2 \n\t"
+ "pmulhw %%xmm3, %%xmm6 \n\t"
+ "paddw (%0), %%xmm2 \n\t"
+ "paddw 16(%0), %%xmm6 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm6, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+ dst[i] = dst[i] - (b[i] + b[i + 1]);
+ }
+ for(; i<w_r-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubw %%xmm2, %%xmm0 \n\t"
+ "psubw %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+ IDWTELEM b_0 = b[0];
+
+ i = 0;
+ __asm__ volatile(
+ "psllw $15, %%xmm7 \n\t"
+ "pcmpeqw %%xmm6, %%xmm6 \n\t"
+ "psrlw $13, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu 16(%1), %%xmm4 \n\t"
+ "movdqu 2(%1), %%xmm1 \n\t"
+ "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm5, %%xmm4 \n\t"
+ "psubw %%xmm7, %%xmm0 \n\t"
+ "psubw %%xmm7, %%xmm4 \n\t"
+ "psraw $1, %%xmm0 \n\t"
+ "psraw $1, %%xmm4 \n\t"
+ "movdqa (%0), %%xmm1 \n\t"
+ "movdqa 16(%0), %%xmm5 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "psraw $2, %%xmm0 \n\t"
+ "psraw $2, %%xmm4 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+ temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+ }
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw (%1), %%xmm2 \n\t"
+ "paddw 16(%1), %%xmm6 \n\t"
+ "movdqu (%0), %%xmm0 \n\t"
+ "movdqu 16(%0), %%xmm4 \n\t"
+ "paddw %%xmm2, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "psraw $1, %%xmm2 \n\t"
+ "psraw $1, %%xmm6 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "paddw %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm2, (%2) \n\t"
+ "movdqa %%xmm6, 16(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x3E) != 0x3E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=62; i>=0; i-=64){
+ __asm__ volatile(
+ "movdqa (%1), %%xmm0 \n\t"
+ "movdqa 16(%1), %%xmm2 \n\t"
+ "movdqa 32(%1), %%xmm4 \n\t"
+ "movdqa 48(%1), %%xmm6 \n\t"
+ "movdqa (%1), %%xmm1 \n\t"
+ "movdqa 16(%1), %%xmm3 \n\t"
+ "movdqa 32(%1), %%xmm5 \n\t"
+ "movdqa 48(%1), %%xmm7 \n\t"
+ "punpcklwd (%2), %%xmm0 \n\t"
+ "punpcklwd 16(%2), %%xmm2 \n\t"
+ "punpcklwd 32(%2), %%xmm4 \n\t"
+ "punpcklwd 48(%2), %%xmm6 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm2, 32(%0) \n\t"
+ "movdqa %%xmm4, 64(%0) \n\t"
+ "movdqa %%xmm6, 96(%0) \n\t"
+ "punpckhwd (%2), %%xmm1 \n\t"
+ "punpckhwd 16(%2), %%xmm3 \n\t"
+ "punpckhwd 32(%2), %%xmm5 \n\t"
+ "punpckhwd 48(%2), %%xmm7 \n\t"
+ "movdqa %%xmm1, 16(%0) \n\t"
+ "movdqa %%xmm3, 48(%0) \n\t"
+ "movdqa %%xmm5, 80(%0) \n\t"
+ "movdqa %%xmm7, 112(%0) \n\t"
+ :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+ const int w2= (width+1)>>1;
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm3, %%mm3 \n\t"
+ "psllw $1, %%mm3 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "psllw $13, %%mm3 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "paddw %%mm7, %%mm2 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ "pmulhw %%mm3, %%mm2 \n\t"
+ "pmulhw %%mm3, %%mm6 \n\t"
+ "paddw (%0), %%mm2 \n\t"
+ "paddw 8(%0), %%mm6 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm6, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+
+ i = 1;
+ b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+ __asm__ volatile(
+ "psllw $15, %%mm7 \n\t"
+ "pcmpeqw %%mm6, %%mm6 \n\t"
+ "psrlw $13, %%mm6 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq 2(%1), %%mm1 \n\t"
+ "movq 10(%1), %%mm5 \n\t"
+ "paddw %%mm6, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm5, %%mm4 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm4 \n\t"
+ "psraw $1, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "movq (%0), %%mm1 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psraw $2, %%mm4 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+ i = 0;
+
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq 2(%1), %%mm2 \n\t"
+ "movq 10(%1), %%mm6 \n\t"
+ "paddw (%1), %%mm2 \n\t"
+ "paddw 8(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "psraw $1, %%mm2 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm6, 8(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x1E) != 0x1E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=30; i>=0; i-=32){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm6 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm5 \n\t"
+ "movq 24(%1), %%mm7 \n\t"
+ "punpcklwd (%2), %%mm0 \n\t"
+ "punpcklwd 8(%2), %%mm2 \n\t"
+ "punpcklwd 16(%2), %%mm4 \n\t"
+ "punpcklwd 24(%2), %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, 16(%0) \n\t"
+ "movq %%mm4, 32(%0) \n\t"
+ "movq %%mm6, 48(%0) \n\t"
+ "punpckhwd (%2), %%mm1 \n\t"
+ "punpckhwd 8(%2), %%mm3 \n\t"
+ "punpckhwd 16(%2), %%mm5 \n\t"
+ "punpckhwd 24(%2), %%mm7 \n\t"
+ "movq %%mm1, 8(%0) \n\t"
+ "movq %%mm3, 24(%0) \n\t"
+ "movq %%mm5, 40(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
+ ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
+ ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
+ ""op" 48("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "psubw %%"s0", %%"t0" \n\t"\
+ "psubw %%"s1", %%"t1" \n\t"\
+ "psubw %%"s2", %%"t2" \n\t"\
+ "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+ "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
+ "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
+ "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
+ "movdqa %%"s3", 48("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+ "psraw $"n", %%"t0" \n\t"\
+ "psraw $"n", %%"t1" \n\t"\
+ "psraw $"n", %%"t2" \n\t"\
+ "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "paddw %%"s0", %%"t0" \n\t"\
+ "paddw %%"s1", %%"t1" \n\t"\
+ "paddw %%"s2", %%"t2" \n\t"\
+ "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "pmulhw %%"s0", %%"t0" \n\t"\
+ "pmulhw %%"s1", %%"t1" \n\t"\
+ "pmulhw %%"s2", %%"t2" \n\t"\
+ "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movdqa %%"s0", %%"t0" \n\t"\
+ "movdqa %%"s1", %%"t1" \n\t"\
+ "movdqa %%"s2", %%"t2" \n\t"\
+ "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+
+ while(i & 0x1F)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+
+ __asm__ volatile (
+ "jmp 2f \n\t"
+ "1: \n\t"
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+ "pcmpeqw %%xmm0, %%xmm0 \n\t"
+ "pcmpeqw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "psllw $13, %%xmm2 \n\t"
+ snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "pcmpeqw %%xmm5, %%xmm5 \n\t"
+ "psllw $15, %%xmm7 \n\t"
+ "psrlw $13, %%xmm5 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+ "movq (%2,%%"REG_d"), %%xmm1 \n\t"
+ "movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm3, %%xmm2 \n\t"
+ "movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
+ "movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm4 \n\t"
+ "pavgw %%xmm3, %%xmm6 \n\t"
+ snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+ snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+ "2: \n\t"
+ "sub $64, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
+ ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
+ ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
+ ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+ "movq %%"s0", ("w",%%"REG_d") \n\t"\
+ "movq %%"s1", 8("w",%%"REG_d") \n\t"\
+ "movq %%"s2", 16("w",%%"REG_d") \n\t"\
+ "movq %%"s3", 24("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movq %%"s0", %%"t0" \n\t"\
+ "movq %%"s1", %%"t1" \n\t"\
+ "movq %%"s2", %%"t2" \n\t"\
+ "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+ while(i & 15)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+ "pcmpeqw %%mm0, %%mm0 \n\t"
+ "pcmpeqw %%mm2, %%mm2 \n\t"
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "psllw $13, %%mm2 \n\t"
+ snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm5, %%mm5 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $13, %%mm5 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+ "movq (%2,%%"REG_d"), %%mm1 \n\t"
+ "movq 8(%2,%%"REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm3, %%mm2 \n\t"
+ "movq 16(%2,%%"REG_d"), %%mm1 \n\t"
+ "movq 24(%2,%%"REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm4 \n\t"
+ "pavgw %%mm3, %%mm6 \n\t"
+ snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+ snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+ "2: \n\t"
+ "sub $32, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#define snow_inner_add_yblock_sse2_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"REG_S" \n\t"\
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"\
+ "psllw $15, %%xmm3 \n\t"\
+ "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"REG_D" \n\t"\
+ "mov (%%"REG_D"), %%"REG_D" \n\t"\
+ "add %3, %%"REG_D" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+ "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+ "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+ "add $32, %%"REG_S" \n\t"\
+ "add %%"REG_c", %0 \n\t"\
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+ "add %%"REG_c", (%%"REG_a") \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+ "sal $1, %%"REG_c" \n\t"\
+ "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "sar $1, %%"REG_c" \n\t"\
+ "sub $2, %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+ "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "dec %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+ "mov %0, %%"REG_d" \n\t"
+ "movdqa (%%"REG_D"), %%xmm0 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+
+ "punpckhwd %%xmm7, %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm0 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm3, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+
+ "mov %1, %%"REG_D" \n\t"
+ "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+ "add %3, %%"REG_D" \n\t"
+
+ "movdqa (%%"REG_D"), %%xmm4 \n\t"
+ "movdqa %%xmm5, %%xmm6 \n\t"
+ "punpckhwd %%xmm7, %%xmm5 \n\t"
+ "punpcklwd %%xmm7, %%xmm6 \n\t"
+ "paddd %%xmm6, %%xmm4 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "paddd %%xmm3, %%xmm4 \n\t"
+ "paddd %%xmm3, %%xmm6 \n\t"
+
+ "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm2, %%xmm0 \n\t"
+ "packuswb %%xmm7, %%xmm0 \n\t"
+ "movq %%xmm0, (%%"REG_d") \n\t"
+
+ "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm6, %%xmm4 \n\t"
+ "packuswb %%xmm7, %%xmm4 \n\t"
+ "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+ "mov %0, %%"REG_d" \n\t"
+ "psrlw $4, %%xmm1 \n\t"
+ "psrlw $4, %%xmm5 \n\t"
+ "paddw (%%"REG_D"), %%xmm1 \n\t"
+ "paddw 16(%%"REG_D"), %%xmm5 \n\t"
+ "paddw %%xmm3, %%xmm1 \n\t"
+ "paddw %%xmm3, %%xmm5 \n\t"
+ "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
+ "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
+ "packuswb %%xmm5, %%xmm1 \n\t"
+
+ "movdqu %%xmm1, (%%"REG_d") \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"REG_S" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */\
+ "pcmpeqd %%mm3, %%mm3 \n\t"\
+ "psllw $15, %%mm3 \n\t"\
+ "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"REG_D" \n\t"\
+ "mov (%%"REG_D"), %%"REG_D" \n\t"\
+ "add %3, %%"REG_D" \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
+ "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+ "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
+ "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "pmullw %%mm0, %%"out_reg1" \n\t"\
+ "pmullw %%mm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+ snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+ "paddusw %%mm2, %%mm1 \n\t"\
+ "paddusw %%mm6, %%mm5 \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+ "mov %0, %%"REG_d" \n\t"\
+ "psrlw $4, %%mm1 \n\t"\
+ "psrlw $4, %%mm5 \n\t"\
+ "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
+ "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psraw $4, %%mm1 \n\t"\
+ "psraw $4, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm1 \n\t"\
+ "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+ "add $"s_step", %%"REG_S" \n\t"\
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+ "add %%"REG_c", (%%"REG_a") \n\t"\
+ "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
+ "add %%"REG_c", %0 \n\t"\
+ "dec %2 \n\t"\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16) {
+ if (!(b_h & 1))
+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ } else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16)
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+ }
+ else{
+ if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+ }
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+ }
+ }
+#endif /* HAVE_INLINE_ASM */
+}
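For orientation, the per-sample 9/7 lifting update that the SSE2/MMX bodies above vectorize is the same arithmetic spelled out in the scalar alignment-tail loops of the vertical compose functions. A minimal scalar sketch (assumption: illustrative only, reusing the W_* lifting multiplier/offset/shift constants and the IDWTELEM type already referenced by this file):

static void vertical_compose97i_scalar(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
                                       IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
                                       int width)
{
    /* Each column i is independent; within a column the four lifting steps
     * must run in this order, exactly as in the tail loops above. */
    for (int i = 0; i < width; i++) {
        b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
        b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
        b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
    }
}

The SIMD versions process 16 (SSE2) or 8 (MMX) samples per iteration and fold the multiplications into pmulhw/pavgw tricks, but the data flow is exactly this loop.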
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000000..8346cabac2
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224: dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round
+ mova m2, [qmq ]
+ mova m3, [qmq + 0x10]
+ mova m4, [dxq ]
+ mova m5, [dxq + 0x10]
+
+ movd m6, [errorq] ; if (filter->error < 0) {
+ SPLATD m6 ; for (int i = 0; i < 8; i++)
+ psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
+ psignd m1, m5, m6 ; } else if (filter->error > 0) {
+ paddd m2, m0 ; for (int i = 0; i < 8; i++)
+ paddd m3, m1 ; filter->qm[i] += filter->dx[i];
+ mova [qmq ], m2 ; }
+ mova [qmq + 0x10], m3 ;
+
+ mova m0, [dlq ]
+ mova m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+ pmulld m2, m0
+ pmulld m3, m1
+%else
+ pshufd m6, m0, 0xb1
+ pshufd m7, m2, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m2, m0
+ pshufd m2, m2, 0xd8
+ punpckldq m2, m6
+
+ pshufd m6, m1, 0xb1
+ pshufd m7, m3, 0xb1
+ pmuludq m6, m7
+ pshufd m6, m6, 0xd8
+ pmuludq m3, m1
+ pshufd m3, m3, 0xd8
+ punpckldq m3, m6
+%endif
+ ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+ paddd m2, m3 ; int sum = filter->round +
+ ; filter->dl[0] * filter->qm[0] +
+ punpckhqdq m3, m2, m2 ; filter->dl[1] * filter->qm[1] +
+ paddd m2, m3 ; filter->dl[2] * filter->qm[2] +
+ ; filter->dl[3] * filter->qm[3] +
+ movd m6, roundm ; filter->dl[4] * filter->qm[4] +
+ paddd m6, m2 ; filter->dl[5] * filter->qm[5] +
+ pshufd m2, m2, 0x1 ; filter->dl[6] * filter->qm[6] +
+ paddd m6, m2 ; filter->dl[7] * filter->qm[7];
+
+ palignr m5, m4, 4 ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+ ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+ palignr m2, m1, m0, 4 ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+ ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+ psrad m4, m1, 30 ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+ por m4, [pd_1224 ] ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+ pand m4, [pd_n0113] ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+ ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+ mova [dlq ], m2
+ mova [dxq ], m5
+ mova [dxq + 0x10], m4
+ movd m0, [inq] ; filter->error = *in;
+ movd [errorq], m0 ;
+
+ movd m2, shiftm ; *in += (sum >> filter->shift);
+ psrad m6, m2 ;
+ paddd m0, m6 ;
+ movd [inq], m0 ;
+
+ psrldq m1, 4 ;
+ pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
+ pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
+ psubd m0, m1 ; filter->dl[6] = *in - filter->dl[7];
+ psrldq m1, m0, 4 ; filter->dl[7] = *in;
+ pshufd m1, m1, 0xf4 ; filter->dl[5] += filter->dl[6];
+ paddd m0, m1 ; filter->dl[4] += filter->dl[5];
+ psrldq m1, 4 ;
+ paddd m0, m1 ;
+ mova [dlq + 0x10], m0 ;
+ RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4, 7
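The C pseudo-code carried in the comments above can be collected into a scalar sketch with the same pointer-based signature that ttadsp_init.c declares for the SIMD entry points. This is a reading of the comments for reference, not the canonical C implementation in libavcodec/ttadsp.c; the 32-bit wraparound of sum mirrors the packed paddd/pmulld arithmetic:

#include <stdint.h>

static void ttafilter_process_dec_scalar(int32_t *qm, int32_t *dx, int32_t *dl,
                                         int32_t *error, int32_t *in,
                                         int32_t shift, int32_t round)
{
    int32_t sum = round;
    int i;

    /* Adapt the quantizer weights by the sign of the previous error. */
    if (*error < 0)
        for (i = 0; i < 8; i++) qm[i] -= dx[i];
    else if (*error > 0)
        for (i = 0; i < 8; i++) qm[i] += dx[i];

    /* Prediction: dot product of the delay line with the weights. */
    for (i = 0; i < 8; i++)
        sum += dl[i] * qm[i];

    /* Shift dx and derive its new top entries from the sign bits of dl. */
    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
    dx[4] =  (dl[4] >> 30) | 1;
    dx[5] = ((dl[5] >> 30) | 2) & ~1;
    dx[6] = ((dl[6] >> 30) | 2) & ~1;
    dx[7] = ((dl[7] >> 30) | 4) & ~3;

    /* Shift the delay line. */
    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];

    *error = *in;
    *in   += sum >> shift;

    /* Rebuild dl[4..7] from the decoded sample, statement by statement. */
    dl[4] = -dl[5];
    dl[5] = -dl[6];
    dl[6] = *in - dl[7];
    dl[7] = *in;
    dl[5] += dl[6];
    dl[4] += dl[5];
}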
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000000..47dc87f6af
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttafilter_process_dec_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+void ff_ttafilter_process_dec_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+ int32_t *error, int32_t *in, int32_t shift,
+ int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSSE3(cpu_flags))
+ c->ttafilter_process_dec = ff_ttafilter_process_dec_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->ttafilter_process_dec = ff_ttafilter_process_dec_sse4;
+#endif
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..02c5eaa2c2
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void v210_x86_init(V210DecContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+ if (s->aligned_input) {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+ }
+ else {
+ if (cpu_flags & AV_CPU_FLAG_SSSE3)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+ if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+ s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+ }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..6554a43de1
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,88 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 2
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1_%2, 5, 5, 7
+ movsxdifnidn r4, r4d
+ lea r1, [r1+2*r4]
+ add r2, r4
+ add r3, r4
+ neg r4
+
+ mova m3, [v210_mult]
+ mova m4, [v210_mask]
+ mova m5, [v210_luma_shuf]
+ mova m6, [v210_chroma_shuf]
+.loop
+%ifidn %1, unaligned
+ movu m0, [r0]
+%else
+ mova m0, [r0]
+%endif
+
+ pmullw m1, m0, m3
+ psrld m0, 10
+ psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
+ pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+ shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+ pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ movu [r1+2*r4], m2
+
+ shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+ pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+ movq [r2+r4], m1
+ movhps [r3+r4], m1
+
+ add r0, mmsize
+ add r4, 6
+ jl .loop
+
+ REP_RET
+%endmacro
+
+INIT_XMM
+v210_planar_unpack unaligned, ssse3
+%if HAVE_AVX_EXTERNAL
+INIT_AVX
+v210_planar_unpack unaligned, avx
+%endif
+
+INIT_XMM
+v210_planar_unpack aligned, ssse3
+%if HAVE_AVX_EXTERNAL
+INIT_AVX
+v210_planar_unpack aligned, avx
+%endif
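The mask, multiplier and shuffle constants above implement the v210 layout described in the loop comments: each little-endian 32-bit word packs three 10-bit components, so four words carry six luma samples and three Cb/Cr pairs. A scalar sketch of one such 16-byte group (assumption: illustrative reference only, not the decoder's actual C path):

#include <stdint.h>

static void v210_unpack_group_c(const uint32_t *src, uint16_t *y,
                                uint16_t *u, uint16_t *v)
{
    uint32_t w0 = src[0], w1 = src[1], w2 = src[2], w3 = src[3];

    /* word 0: Cb0 | Y0 | Cr0,  word 1: Y1 | Cb1 | Y2,
     * word 2: Cr1 | Y3 | Cb2,  word 3: Y4 | Cr2 | Y5 */
    u[0] =  w0        & 0x3ff;  y[0] = (w0 >> 10) & 0x3ff;  v[0] = (w0 >> 20) & 0x3ff;
    y[1] =  w1        & 0x3ff;  u[1] = (w1 >> 10) & 0x3ff;  y[2] = (w1 >> 20) & 0x3ff;
    v[1] =  w2        & 0x3ff;  y[3] = (w2 >> 10) & 0x3ff;  u[2] = (w2 >> 20) & 0x3ff;
    y[4] =  w3        & 0x3ff;  v[2] = (w3 >> 10) & 0x3ff;  y[5] = (w3 >> 20) & 0x3ff;
}

In the SIMD loop the pmullw/psrlw pair extracts the low and high components of each word, psrld/pand isolates the middle one, and the pshufb constants put the results back into Y/U/V order.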
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index adf08d7d84..546688cf9d 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -2,20 +2,20 @@
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ada26..fdd4de1813 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 15fd2c830c..5ceacd348e 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
*/
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 53b9e8292c..1ac02574d6 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -54,13 +54,13 @@ SECTION .text
; | | <- bottom is copied from last line in body of source
; '----' <- bh
%if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
-%define dst_strideq r2mp
- mov srcq, r1mp
+%define dst_strideq r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
@@ -262,30 +262,30 @@ hvar_fn
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
start_y, end_y, val, bh
mov bhq, r6mp ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
mov dstq, r0mp
- mov srcq, r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
- mov srcq, r1mp
+ mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
@@ -344,10 +344,6 @@ VERTICAL_EXTEND 16, 22
; obviously not the same on both sides.
%macro READ_V_PIXEL 2
-%if %1 == 2
- movzx valw, byte %2
- imul valw, 0x0101
-%else
movzx vald, byte %2
imul vald, 0x01010101
%if %1 >= 8
@@ -356,13 +352,15 @@ VERTICAL_EXTEND 16, 22
pshufd m0, m0, q0000
%else
punpckldq m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
%endmacro ; READ_V_PIXEL
%macro WRITE_V_PIXEL 2
%assign %%off 0
+
+%if %1 >= 8
+
%rep %1/mmsize
movu [%2+%%off], m0
%assign %%off %%off+mmsize
@@ -378,27 +376,29 @@ VERTICAL_EXTEND 16, 22
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
movq [%2+%1-8], m0
%assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
- movd [%2+%%off], m0
-%assign %%off %%off+4
%else
- mov [%2+%%off], vald
+ movd [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4
-%if %1-%%off >= 2
-%if %1 >= 8
- movd [%2+%1-4], m0
-%else
+%else ; %1 < 8
+
+%rep %1/4
+ mov [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
mov [%2+%%off], valw
-%endif
%endif ; (%1-%%off)/2
%endmacro ; WRITE_V_PIXEL
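The READ_V_PIXEL/WRITE_V_PIXEL rework above hinges on a byte splat: "imul vald, 0x01010101" replicates one edge byte into all four bytes of a GPR, which the SSE path then broadcasts across an XMM register and stores in the widest chunks available. A scalar sketch of the horizontal fill this supports (assumption: illustrative only, not the macro's exact code path):

#include <stdint.h>
#include <string.h>

/* Fill w bytes at dst with the single edge byte pix, using the same
 * multiply-splat idea as READ_V_PIXEL, then dword stores plus a byte tail. */
static void edge_hfill_sketch(uint8_t *dst, int w, uint8_t pix)
{
    uint32_t splat = pix * 0x01010101u;   /* pix replicated into 4 bytes */
    int x = 0;

    for (; x + 4 <= w; x += 4)
        memcpy(dst + x, &splat, 4);
    for (; x < w; x++)
        dst[x] = pix;
}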
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index 8ee837096a..3218abdd88 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
/*
+ * Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
@@ -28,11 +30,11 @@
#include "libavcodec/videodsp.h"
#if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
- x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
- x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+ const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
@@ -141,14 +143,16 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
if (!w || !h)
- return;
+ return;
if (src_y >= h) {
- src -= src_y * src_stride;
- src_y = src_y_add = h - 1;
+ src -= src_y*src_stride;
+ src_y_add = h - 1;
+ src_y = h - 1;
} else if (src_y <= -block_h) {
- src -= src_y*src_stride;
- src_y = src_y_add = 1 - block_h;
+ src -= src_y*src_stride;
+ src_y_add = 1 - block_h;
+ src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
@@ -162,18 +166,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
- assert(start_x < end_x && block_w > 0);
- assert(start_y < end_y && block_h > 0);
+ av_assert2(start_x < end_x && block_w > 0);
+ av_assert2(start_y < end_y && block_h > 0);
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
- vfix_tbl[w - 1](dst + start_x, src,
- dst_stride, src_stride,
+ vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
- v_extend_var(dst + start_x, src, dst_stride, src_stride,
+ v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
@@ -212,7 +215,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
ptrdiff_t buf_stride,
ptrdiff_t src_stride,
int block_w, int block_h,
@@ -231,8 +234,8 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
int src_x, int src_y, int w,
int h)
{
- emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
- src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+ emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
#endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650eef5..b25d838868 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
;* Vorbis x86 optimizations
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index 2a978b66aa..284a528a0c 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc8a047224..24496ae8cf 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 9e38014e4d..1f02a6f709 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +23,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/vp3dsp.h"
@@ -40,10 +43,68 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
+#if HAVE_MMX_INLINE
+
+#define MOVQ_BFE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \
+ "paddb %%"#regd", %%"#regd" \n\t" ::)
+
+#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
+ "movq "#rega", "#regr" \n\t" \
+ "movq "#regc", "#regp" \n\t" \
+ "pand "#regb", "#regr" \n\t" \
+ "pand "#regd", "#regp" \n\t" \
+ "pxor "#rega", "#regb" \n\t" \
+ "pxor "#regc", "#regd" \n\t" \
+ "pand %%mm6, "#regb" \n\t" \
+ "pand %%mm6, "#regd" \n\t" \
+ "psrlq $1, "#regb" \n\t" \
+ "psrlq $1, "#regd" \n\t" \
+ "paddb "#regb", "#regr" \n\t" \
+ "paddb "#regd", "#regp" \n\t"
+
+static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
+{
+// START_TIMER
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "movq (%1,%4), %%mm2 \n\t"
+ "movq (%2,%4), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, (%3,%4) \n\t"
+
+ "movq (%1,%4,2), %%mm0 \n\t"
+ "movq (%2,%4,2), %%mm1 \n\t"
+ "movq (%1,%5), %%mm2 \n\t"
+ "movq (%2,%5), %%mm3 \n\t"
+ "lea (%1,%4,4), %1 \n\t"
+ "lea (%2,%4,4), %2 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3,%4,2) \n\t"
+ "movq %%mm5, (%3,%5) \n\t"
+ "lea (%3,%4,4), %3 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
+ :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
+ :"memory");
+// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
+}
+#endif /* HAVE_MMX_INLINE */
+
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
+#if HAVE_MMX_INLINE
+ c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
+#endif /* HAVE_MMX_INLINE */
+
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_mmx;
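For reference, MOVQ_BFE loads 0xFE into every byte of %%mm6 (all-ones from pcmpeqd, then paddb doubles each byte), and PAVGBP_MMX_NO_RND then computes a per-byte average without upward rounding: (a & b) + (((a ^ b) & 0xFE) >> 1), which equals (a + b) >> 1. A scalar sketch of put_vp_no_rnd_pixels8_l2_mmx under that reading (assumption: illustrative only):

#include <stdint.h>
#include <stddef.h>

static void put_no_rnd_pixels8_l2_sketch(uint8_t *dst, const uint8_t *a,
                                         const uint8_t *b, ptrdiff_t stride,
                                         int h)
{
    for (int row = 0; row < h; row++) {
        for (int x = 0; x < 8; x++)
            /* & 0xFE keeps the per-byte shift from borrowing across lanes,
             * exactly what the pand/psrlq pair does in the MMX macro. */
            dst[x] = (a[x] & b[x]) + (((a[x] ^ b[x]) & 0xFE) >> 1);
        dst += stride;
        a   += stride;
        b   += stride;
    }
}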
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a693684af..e71dbf8ed0 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,20 +4,20 @@
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index 80f8ca5f38..3d874ea62a 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index cd94f3e038..82baee7e97 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 2f05334374..4407dd3dd0 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 69460aa73f..d9ce9772ea 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -315,18 +315,22 @@ DECLARE_LOOP_FILTER(sse4)
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
-av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
+av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c, int vp7)
{
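+    /* The idct and loop-filter pointers below are specific to the VP8
+     * bitstream, so they are only installed when !vp7; the subpel and
+     * bilinear MC tables are shared between VP7 and VP8. */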
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
+ if (!vp7) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
+ }
#if ARCH_X86_32
+ if (!vp7) {
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
+ }
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
@@ -334,6 +338,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
#if ARCH_X86_32
+ if (!vp7) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
@@ -346,6 +351,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
+ }
#endif
}
@@ -360,6 +366,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
+ if (!vp7) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
@@ -372,22 +379,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+ }
#endif
}
if (EXTERNAL_SSE(cpu_flags)) {
+ if (!vp7) {
c->vp8_idct_add = ff_vp8_idct_add_sse;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
+ }
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
- if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
+ if (!vp7) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -395,9 +406,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
+ }
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ if (!vp7) {
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
@@ -407,6 +420,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
+ }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
@@ -417,6 +431,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
+ if (!vp7) {
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
@@ -429,14 +444,17 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
+ }
}
if (EXTERNAL_SSE4(cpu_flags)) {
+ if (!vp7) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
+ }
}
#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 8305881532..98bd8a5e38 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index ce58c08a3b..011e13d9da 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,102 +1,90 @@
/*
* VP9 SIMD optimizations
*
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
#if HAVE_YASM
-#define fpel_func(avg, sz, opt) \
-void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src, \
- ptrdiff_t dst_stride, \
- ptrdiff_t src_stride, \
- int h, int mx, int my)
-
+#define fpel_func(avg, sz, opt) \
+void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
fpel_func(put, 4, mmx);
fpel_func(put, 8, mmx);
fpel_func(put, 16, sse);
fpel_func(put, 32, sse);
fpel_func(put, 64, sse);
-fpel_func(avg, 4, sse);
-fpel_func(avg, 8, sse);
+fpel_func(avg, 4, mmxext);
+fpel_func(avg, 8, mmxext);
fpel_func(avg, 16, sse2);
fpel_func(avg, 32, sse2);
fpel_func(avg, 64, sse2);
#undef fpel_func
-#define mc_func(avg, sz, dir, opt) \
-void \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t dst_stride, \
- ptrdiff_t src_stride, \
- int h, \
- const int8_t (*filter)[16])
-
-#define mc_funcs(sz) \
- mc_func(put, sz, h, ssse3); \
- mc_func(avg, sz, h, ssse3); \
- mc_func(put, sz, v, ssse3); \
- mc_func(avg, sz, v, ssse3)
+#define mc_func(avg, sz, dir, opt) \
+void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const int8_t (*filter)[16])
+#define mc_funcs(sz) \
+mc_func(put, sz, h, ssse3); \
+mc_func(avg, sz, h, ssse3); \
+mc_func(put, sz, v, ssse3); \
+mc_func(avg, sz, v, ssse3)
mc_funcs(4);
mc_funcs(8);
+#if ARCH_X86_64
+mc_funcs(16);
+#endif
#undef mc_funcs
#undef mc_func
-#define mc_rep_func(avg, sz, hsz, dir, opt) \
-static av_always_inline void \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t dst_stride, \
- ptrdiff_t src_stride, \
- int h, \
- const int8_t (*filter)[16]) \
-{ \
- ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src, \
- dst_stride, \
- src_stride, \
- h, \
- filter); \
- ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz, \
- src + hsz, \
- dst_stride, \
- src_stride, \
- h, filter); \
+#define mc_rep_func(avg, sz, hsz, dir, opt) \
+static av_always_inline void \
+ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, const int8_t (*filter)[16]) \
+{ \
+ ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \
+ src_stride, h, filter); \
+ ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
+ src_stride, h, filter); \
}
-#define mc_rep_funcs(sz, hsz) \
- mc_rep_func(put, sz, hsz, h, ssse3); \
- mc_rep_func(avg, sz, hsz, h, ssse3); \
- mc_rep_func(put, sz, hsz, v, ssse3); \
- mc_rep_func(avg, sz, hsz, v, ssse3)
+#define mc_rep_funcs(sz, hsz) \
+mc_rep_func(put, sz, hsz, h, ssse3); \
+mc_rep_func(avg, sz, hsz, h, ssse3); \
+mc_rep_func(put, sz, hsz, v, ssse3); \
+mc_rep_func(avg, sz, hsz, v, ssse3)
+#if ARCH_X86_32
mc_rep_funcs(16, 8);
+#endif
mc_rep_funcs(32, 16);
mc_rep_funcs(64, 32);
@@ -105,36 +93,29 @@ mc_rep_funcs(64, 32);
extern const int8_t ff_filters_ssse3[3][15][4][16];
-#define filter_8tap_2d_fn(op, sz, f, fname) \
-static void \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t dst_stride, \
- ptrdiff_t src_stride, \
- int h, int mx, int my) \
-{ \
- LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \
- ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride, \
- 64, src_stride, \
- h + 7, \
- ff_filters_ssse3[f][mx - 1]); \
- ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64, \
- dst_stride, 64, \
- h, \
- ff_filters_ssse3[f][my - 1]); \
+#define filter_8tap_2d_fn(op, sz, f, fname) \
+static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \
+ ff_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, \
+ h + 7, ff_filters_ssse3[f][mx - 1]); \
+ ff_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, \
+ h, ff_filters_ssse3[f][my - 1]); \
}
-#define filters_8tap_2d_fn(op, sz) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \
- filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
+#define filters_8tap_2d_fn(op, sz) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
#define filters_8tap_2d_fn2(op) \
- filters_8tap_2d_fn(op, 64) \
- filters_8tap_2d_fn(op, 32) \
- filters_8tap_2d_fn(op, 16) \
- filters_8tap_2d_fn(op, 8) \
- filters_8tap_2d_fn(op, 4)
+filters_8tap_2d_fn(op, 64) \
+filters_8tap_2d_fn(op, 32) \
+filters_8tap_2d_fn(op, 16) \
+filters_8tap_2d_fn(op, 8) \
+filters_8tap_2d_fn(op, 4)
filters_8tap_2d_fn2(put)
filters_8tap_2d_fn2(avg)
@@ -143,36 +124,30 @@ filters_8tap_2d_fn2(avg)
#undef filters_8tap_2d_fn
#undef filter_8tap_2d_fn
-#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \
-static void \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t dst_stride, \
- ptrdiff_t src_stride, \
- int h, int mx, \
- int my) \
-{ \
- ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src, \
- dst_stride, \
- src_stride, h, \
- ff_filters_ssse3[f][dvar - 1]); \
+#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \
+static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ ff_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \
+ h, ff_filters_ssse3[f][dvar - 1]); \
}
-#define filters_8tap_1d_fn(op, sz, dir, dvar) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \
- filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
+#define filters_8tap_1d_fn(op, sz, dir, dvar) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
-#define filters_8tap_1d_fn2(op, sz) \
- filters_8tap_1d_fn(op, sz, h, mx) \
- filters_8tap_1d_fn(op, sz, v, my)
+#define filters_8tap_1d_fn2(op, sz) \
+filters_8tap_1d_fn(op, sz, h, mx) \
+filters_8tap_1d_fn(op, sz, v, my)
#define filters_8tap_1d_fn3(op) \
- filters_8tap_1d_fn2(op, 64) \
- filters_8tap_1d_fn2(op, 32) \
- filters_8tap_1d_fn2(op, 16) \
- filters_8tap_1d_fn2(op, 8) \
- filters_8tap_1d_fn2(op, 4)
+filters_8tap_1d_fn2(op, 64) \
+filters_8tap_1d_fn2(op, 32) \
+filters_8tap_1d_fn2(op, 16) \
+filters_8tap_1d_fn2(op, 8) \
+filters_8tap_1d_fn2(op, 4)
filters_8tap_1d_fn3(put)
filters_8tap_1d_fn3(avg)
@@ -182,6 +157,94 @@ filters_8tap_1d_fn3(avg)
#undef filters_8tap_1d_fn3
#undef filter_8tap_1d_fn
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct, idct, size, opt); \
+itxfm_func(iadst, idct, size, opt); \
+itxfm_func(idct, iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+
+#undef itxfm_func
+#undef itxfm_funcs
+
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int E, int I, int H)
+
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+lpf_funcs(44, 16, sse2);
+lpf_funcs(44, 16, ssse3);
+lpf_funcs(44, 16, avx);
+lpf_funcs(84, 16, sse2);
+lpf_funcs(84, 16, ssse3);
+lpf_funcs(84, 16, avx);
+lpf_funcs(48, 16, sse2);
+lpf_funcs(48, 16, ssse3);
+lpf_funcs(48, 16, avx);
+lpf_funcs(88, 16, sse2);
+lpf_funcs(88, 16, ssse3);
+lpf_funcs(88, 16, avx);
+
+#undef lpf_funcs
+
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *l, const uint8_t *a)
+
+#define ipred_funcs(type, opt) \
+ipred_func(4, type, opt); \
+ipred_func(8, type, opt); \
+ipred_func(16, type, opt); \
+ipred_func(32, type, opt)
+
+ipred_funcs(dc, ssse3);
+ipred_funcs(dc_left, ssse3);
+ipred_funcs(dc_top, ssse3);
+
+#undef ipred_funcs
+
+ipred_func(8, v, mmx);
+ipred_func(16, v, sse2);
+ipred_func(32, v, sse2);
+
+#define ipred_func_set(size, type, opt1, opt2) \
+ipred_func(size, type, opt1); \
+ipred_func(size, type, opt2)
+
+#define ipred_funcs(type, opt1, opt2) \
+ipred_func(4, type, opt1); \
+ipred_func_set(8, type, opt1, opt2); \
+ipred_func_set(16, type, opt1, opt2); \
+ipred_func_set(32, type, opt1, opt2)
+
+ipred_funcs(h, ssse3, avx);
+ipred_funcs(tm, ssse3, avx);
+ipred_funcs(dl, ssse3, avx);
+ipred_funcs(dr, ssse3, avx);
+ipred_funcs(hu, ssse3, avx);
+ipred_funcs(hd, ssse3, avx);
+ipred_funcs(vl, ssse3, avx);
+ipred_funcs(vr, ssse3, avx);
+
+#undef ipred_funcs
+#undef ipred_func_set
+#undef ipred_func
+
#endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
@@ -189,52 +252,140 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
-#define init_fpel(idx1, idx2, sz, type, opt) \
- dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
- dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
- dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
- dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt
-
+#define init_fpel(idx1, idx2, sz, type, opt) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_##type##sz##_##opt
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
- dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv] = type ## _8tap_smooth_ ## sz ## dir ## _ ## opt; \
- dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
- dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _ ## opt
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
-#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
+#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \
init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt)
-#define init_subpel3(idx, type, opt) \
+#define init_subpel3(idx, type, opt) \
init_subpel2(idx, 1, 1, hv, type, opt); \
- init_subpel2(idx, 0, 1, v, type, opt); \
- init_subpel2(idx, 1, 0, h, type, opt)
+ init_subpel2(idx, 0, 1, v, type, opt); \
+ init_subpel2(idx, 1, 0, h, type, opt)
+
+#define init_lpf(opt) do { \
+ if (ARCH_X86_64) { \
+ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
+ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
+ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
+ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
+ dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
+ dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
+ dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
+ dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
+ dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
+ dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
+ } \
+} while (0)
+
+#define init_ipred(tx, sz, opt) do { \
+ dsp->intra_pred[tx][HOR_PRED] = ff_vp9_ipred_h_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = ff_vp9_ipred_dl_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = ff_vp9_ipred_dr_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][HOR_DOWN_PRED] = ff_vp9_ipred_hd_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][VERT_LEFT_PRED] = ff_vp9_ipred_vl_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][HOR_UP_PRED] = ff_vp9_ipred_hu_##sz##x##sz##_##opt; \
+ if (ARCH_X86_64 || tx != TX_32X32) { \
+ dsp->intra_pred[tx][VERT_RIGHT_PRED] = ff_vp9_ipred_vr_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][TM_VP8_PRED] = ff_vp9_ipred_tm_##sz##x##sz##_##opt; \
+ } \
+} while (0)
+#define init_dc_ipred(tx, sz, opt) do { \
+ init_ipred(tx, sz, opt); \
+ dsp->intra_pred[tx][DC_PRED] = ff_vp9_ipred_dc_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_##sz##x##sz##_##opt; \
+ dsp->intra_pred[tx][TOP_DC_PRED] = ff_vp9_ipred_dc_top_##sz##x##sz##_##opt; \
+} while (0)
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel(4, 0, 4, put, mmx);
init_fpel(3, 0, 8, put, mmx);
+ dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+ dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+ dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+ dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+ dsp->intra_pred[TX_8X8][VERT_PRED] = ff_vp9_ipred_v_8x8_mmx;
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_fpel(4, 1, 4, avg, mmxext);
+ init_fpel(3, 1, 8, avg, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
init_fpel(2, 0, 16, put, sse);
init_fpel(1, 0, 32, put, sse);
init_fpel(0, 0, 64, put, sse);
- init_fpel(4, 1, 4, avg, sse);
- init_fpel(3, 1, 8, avg, sse);
}
if (EXTERNAL_SSE2(cpu_flags)) {
init_fpel(2, 1, 16, avg, sse2);
init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2);
+ init_lpf(sse2);
+ dsp->intra_pred[TX_16X16][VERT_PRED] = ff_vp9_ipred_v_16x16_sse2;
+ dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
init_subpel3(0, put, ssse3);
init_subpel3(1, avg, ssse3);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
+ dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+ if (ARCH_X86_64) {
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
+ }
+ init_lpf(ssse3);
+ init_dc_ipred(TX_4X4, 4, ssse3);
+ init_dc_ipred(TX_8X8, 8, ssse3);
+ init_dc_ipred(TX_16X16, 16, ssse3);
+ init_dc_ipred(TX_32X32, 32, ssse3);
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ if (ARCH_X86_64) {
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx;
+ dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
+ }
+ init_lpf(avx);
+ init_ipred(TX_8X8, 8, avx);
+ init_ipred(TX_16X16, 16, avx);
+ init_ipred(TX_32X32, 32, avx);
}
#undef init_fpel
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000000..3faf1c564d
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,1405 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Jason Garrett-Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_m256: times 8 dw -256
+pw_m255: times 8 dw -255
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+pw_4096: times 8 dw 4096
+pw_8192: times 8 dw 8192
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+ times 4 db 2
+ times 4 db 1
+ times 4 db 0
+pb_8x1_8x0: times 8 db 1
+ times 8 db 0
+pb_8x3_8x2: times 8 db 3
+ times 8 db 2
+pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
+ times 8 db -1
+pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
+ times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+ times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+ times 11 db 7
+pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+ times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+ times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+ db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+ db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+pb_7to1_9x0: db 7, 6, 5, 4
+pb_3to1_5x0: db 3, 2, 1
+ times 9 db 0
+pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+pb_2: times 16 db 2
+pb_15: times 16 db 15
+
+cextern pb_1
+cextern pb_3
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
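+; psadbw against a zero register sums the boundary bytes, pmulhrsw by
+; pw_4096/2048/1024/512 turns that sum into (sum + n/2) >> log2(n) (the rounded
+; average of n = 8/16/32/64 edge pixels), and pshufb with the zero register
+; broadcasts the DC value to every byte of the output rows.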
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq]
+ pxor m1, m1
+ psadbw m0, m1
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [lq]
+ movq m1, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ mova m2, [aq]
+ mova m3, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m4, m4
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+ pmulhrsw m0, [pw_512]
+ pshufb m0, m4
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [%2q]
+ pxor m1, m1
+ psadbw m0, m1
+ pmulhrsw m0, [pw_8192]
+ pshufb m0, m1
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ RET
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ psadbw m0, m1
+ pmulhrsw m0, [pw_4096]
+ pshufb m0, m1
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+ pmulhrsw m0, [pw_2048]
+ pshufb m0, m2
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+ mova m0, [%2q]
+ mova m1, [%2q+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+ pmulhrsw m0, [pw_1024]
+ pshufb m0, m2
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DC_1D_FUNCS top, a
+DC_1D_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+ mova m0, [aq]
+ mova m1, [aq+16]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ mov cntd, 8
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+; h
+
+INIT_XMM ssse3
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+ movd m0, [lq]
+ pshufb m0, [pb_4x3_4x2_4x1_4x0]
+ lea stride3q, [strideq*3]
+ movd [dstq+strideq*0], m0
+ psrldq m0, 4
+ movd [dstq+strideq*1], m0
+ psrldq m0, 4
+ movd [dstq+strideq*2], m0
+ psrldq m0, 4
+ movd [dstq+stride3q ], m0
+ RET
+
+%macro H_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
+ mova m2, [pb_8x1_8x0]
+ mova m3, [pb_8x3_8x2]
+ lea stride3q, [strideq*3]
+ mov cntq, 1
+.loop:
+ movd m0, [lq+cntq*4]
+ pshufb m1, m0, m3
+ pshufb m0, m2
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+ lea stride3q, [strideq*3]
+ mov cntq, 3
+.loop:
+ movd m3, [lq+cntq*4]
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m2, m3, m5
+ pshufb m3, m4
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ mova m7, [pb_3]
+ pxor m4, m4
+ lea stride3q, [strideq*3]
+ mov cntq, 7
+.loop:
+ movd m3, [lq+cntq*4]
+ pshufb m0, m3, m7
+ pshufb m1, m3, m6
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ pshufb m2, m3, m5
+ pshufb m3, m4
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4]
+ dec cntq
+ jge .loop
+ RET
+%endmacro
+
+H_XMM_FUNCS ssse3
+H_XMM_FUNCS avx
+
+; tm
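+; TM ("true motion") prediction: pred[y][x] = clip(left[y] + above[x] - topleft).
+; pw_m256/pw_m255 serve as pshufb masks that zero-extend byte 0 / byte 1 of a
+; register into every word lane, broadcasting one left pixel per output row.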
+
+INIT_MMX ssse3
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+ pxor m1, m1
+ pinsrw m2, [aq-1], 0
+ movd m0, [aq]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m3, [pw_m256]
+ mova m4, [pw_m255]
+ pshufb m2, m3
+ punpcklbw m0, m1
+ psubw m0, m2
+ mov cntq, 1
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+ pshufb m1, m2, m4
+ pshufb m2, m3
+ paddw m1, m0
+ paddw m2, m0
+ packuswb m1, m1
+ packuswb m2, m2
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+%macro TM_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+ pxor m1, m1
+ pinsrw m2, [aq-1], 0
+ movh m0, [aq]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m3, [pw_m256]
+ mova m4, [pw_m255]
+ pshufb m2, m3
+ punpcklbw m0, m1
+ psubw m0, m2
+ mov cntq, 3
+.loop:
+ pinsrw m2, [lq+cntq*2], 0
+ pshufb m1, m2, m4
+ pshufb m2, m3
+ paddw m1, m0
+ paddw m2, m0
+ packuswb m1, m2
+ movh [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+ pxor m3, m3
+ pinsrw m2, [aq-1], 0
+ mova m0, [aq]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m4, [pw_m256]
+ mova m5, [pw_m255]
+ pshufb m2, m4
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ psubw m1, m2
+ psubw m0, m2
+ mov cntq, 7
+.loop:
+ pinsrw m7, [lq+cntq*2], 0
+ pshufb m3, m7, m5
+ pshufb m7, m4
+ paddw m2, m3, m0
+ paddw m3, m1
+ paddw m6, m7, m0
+ paddw m7, m1
+ packuswb m2, m3
+ packuswb m6, m7
+ mova [dstq+strideq*0], m2
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+
+%if ARCH_X86_64
+INIT_XMM %1
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
+ pxor m5, m5
+ pinsrw m4, [aq-1], 0
+ mova m0, [aq]
+ mova m2, [aq+16]
+ DEFINE_ARGS dst, stride, l, cnt
+ mova m8, [pw_m256]
+ mova m9, [pw_m255]
+ pshufb m4, m8
+ punpckhbw m1, m0, m5
+ punpckhbw m3, m2, m5
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ psubw m1, m4
+ psubw m0, m4
+ psubw m3, m4
+ psubw m2, m4
+ mov cntq, 15
+.loop:
+ pinsrw m13, [lq+cntq*2], 0
+ pshufb m7, m13, m9
+ pshufb m13, m8
+ paddw m4, m7, m0
+ paddw m5, m7, m1
+ paddw m6, m7, m2
+ paddw m7, m3
+ paddw m10, m13, m0
+ paddw m11, m13, m1
+ paddw m12, m13, m2
+ paddw m13, m3
+ packuswb m4, m5
+ packuswb m6, m7
+ packuswb m10, m11
+ packuswb m12, m13
+ mova [dstq+strideq*0+ 0], m4
+ mova [dstq+strideq*0+16], m6
+ mova [dstq+strideq*1+ 0], m10
+ mova [dstq+strideq*1+16], m12
+ lea dstq, [dstq+strideq*2]
+ dec cntq
+ jge .loop
+ RET
+%endif
+%endmacro
+
+TM_XMM_FUNCS ssse3
+TM_XMM_FUNCS avx
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+ pxor m%4, m%1, m%3
+ pand m%4, [pb_1]
+ pavgb m%1, m%3
+ psubusb m%1, m%4
+ pavgb m%1, m%2
+%endmacro
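+; LOWPASS implements the 3-tap edge filter (left + 2*center + right + 2) >> 2
+; on bytes: pavgb/psubusb compute floor((left + right) / 2), with the parity
+; bit in the tmp register undoing pavgb's round-up, and the final pavgb adds
+; the center tap with rounding, which matches the exact expression.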
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq]
+ pshufb m0, m1, [pb_0to5_2x7]
+ pshufb m2, m1, [pb_2to6_3x7]
+ psrlq m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ pshufw m1, m0, q3321
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ psrlq m0, 8
+ psrlq m1, 8
+ add dstq, strideq
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*2], m1
+ RET
+
+%macro DL_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+ movq m0, [aq]
+ lea stride5q, [strideq*5]
+ pshufb m1, m0, [pb_1to6_10x7]
+ psrldq m2, m1, 1
+ shufps m0, m1, q3210
+ LOWPASS 0, 1, 2, 3
+
+ pshufd m1, m0, q3321
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ lea dstq, [dstq+strideq*2]
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*4], m1
+ psrldq m0, 1
+ psrldq m1, 1
+ movq [dstq+strideq*1], m0
+ movq [dstq+stride5q ], m1
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+ mova m5, [pb_1toE_2xF]
+ mova m0, [aq]
+ pshufb m1, m0, m5
+ pshufb m2, m1, m5
+ pshufb m4, m0, [pb_15]
+ LOWPASS 0, 1, 2, 3
+ DEFINE_ARGS dst, stride, cnt, stride9
+ lea stride9q, [strideq*3]
+ mov cntd, 4
+ lea stride9q, [stride9q*3]
+
+.loop:
+ movhlps m4, m0
+ mova [dstq+strideq*0], m0
+ pshufb m0, m5
+ mova [dstq+strideq*8], m4
+ movhlps m4, m0
+ mova [dstq+strideq*1], m0
+ pshufb m0, m5
+ mova [dstq+stride9q ], m4
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+ mova m5, [pb_1toE_2xF]
+ mova m0, [aq]
+ mova m1, [aq+16]
+ palignr m2, m1, m0, 1
+ palignr m3, m1, m0, 2
+ LOWPASS 0, 2, 3, 4
+ pshufb m2, m1, m5
+ pshufb m3, m2, m5
+ pshufb m6, m1, [pb_15]
+ LOWPASS 1, 2, 3, 4
+ mova m7, m6
+ lea dst16q, [dstq +strideq*8]
+ mov cntd, 8
+ lea dst16q, [dst16q+strideq*8]
+.loop:
+ movhlps m7, m1
+ mova [dstq +strideq*0+ 0], m0
+ mova [dstq +strideq*0+16], m1
+ movhps [dstq+strideq*8+ 0], m0
+ movq [dstq +strideq*8+ 8], m1
+ mova [dstq +strideq*8+16], m7
+ mova [dst16q+strideq*0+ 0], m1
+ mova [dst16q+strideq*0+16], m6
+ mova [dst16q+strideq*8+ 0], m7
+ mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+ vpalignr m0, m1, m0, 1
+ pshufb m1, m5
+%else
+ palignr m2, m1, m0, 1
+ pshufb m1, m5
+ mova m0, m2
+%endif
+ add dstq, strideq
+ add dst16q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DL_XMM_FUNCS ssse3
+DL_XMM_FUNCS avx
+
+; dr
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ movd m1, [aq+3]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ palignr m1, m0, 1
+ psrlq m2, m1, 8
+ LOWPASS 0, 1, 2, 3
+
+ movd [dstq+stride3q ], m0
+ psrlq m0, 8
+ movd [dstq+strideq*2], m0
+ psrlq m0, 8
+ movd [dstq+strideq*1], m0
+ psrlq m0, 8
+ movd [dstq+strideq*0], m0
+ RET
+
+%macro DR_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+ movq m1, [lq]
+ movhps m1, [aq-1]
+ movd m2, [aq+7]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pslldq m0, m1, 1
+ palignr m2, m1, 1
+ LOWPASS 0, 1, 2, 3
+
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m0
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+ mova m1, [lq]
+ movu m2, [aq-1]
+ movd m4, [aq+15]
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq *3]
+ mov cntd, 4
+ lea stride9q, [stride9q*3]
+ palignr m4, m2, 1
+ palignr m3, m2, m1, 15
+ LOWPASS 3, 2, 4, 5
+ pslldq m0, m1, 1
+ palignr m2, m1, 1
+ LOWPASS 0, 1, 2, 4
+
+.loop:
+ mova [dstq+strideq*0 ], m3
+ movhps [dstq+strideq*8+0], m0
+ movq [dstq+strideq*8+8], m3
+ palignr m3, m0, 15
+ pslldq m0, 1
+ mova [dstq+strideq*1 ], m3
+ movhps [dstq+stride9q +0], m0
+ movq [dstq+stride9q +8], m3
+ palignr m3, m0, 15
+ pslldq m0, 1
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+ mova m1, [lq]
+ mova m2, [lq+16]
+ movu m3, [aq-1]
+ movu m4, [aq+15]
+ movd m5, [aq+31]
+ DEFINE_ARGS dst, stride, stride8, cnt
+ lea stride8q, [strideq*8]
+ palignr m5, m4, 1
+ palignr m6, m4, m3, 15
+ LOWPASS 5, 4, 6, 7
+ palignr m4, m3, 1
+ palignr m6, m3, m2, 15
+ LOWPASS 4, 3, 6, 7
+ palignr m3, m2, 1
+ palignr m6, m2, m1, 15
+ LOWPASS 3, 2, 6, 7
+ palignr m2, m1, 1
+ pslldq m0, m1, 1
+ LOWPASS 2, 1, 0, 6
+ mov cntd, 16
+
+ ; out=m2/m3/m4/m5
+.loop:
+ mova [dstq+stride8q*0+ 0], m4
+ mova [dstq+stride8q*0+16], m5
+ mova [dstq+stride8q*2+ 0], m3
+ mova [dstq+stride8q*2+16], m4
+ palignr m5, m4, 15
+ palignr m4, m3, 15
+ palignr m3, m2, 15
+ pslldq m2, 1
+ add dstq, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+DR_XMM_FUNCS ssse3
+DR_XMM_FUNCS avx
+
+; vl
+
+INIT_MMX ssse3
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+ movq m0, [aq]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 8
+ psrlq m2, 8
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ RET
+
+%macro VL_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+ movq m0, [aq]
+ pshufb m0, [pb_0to6_9x7]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrldq m1, m0, 1
+ psrldq m2, m0, 2
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m2
+ psrldq m1, 1
+ psrldq m2, 1
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+ mova m0, [aq]
+ mova m4, [pb_1toE_2xF]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ pshufb m1, m0, m4
+ pshufb m2, m1, m4
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ mov cntd, 4
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ pshufb m1, m4
+ pshufb m2, m4
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ pshufb m1, m4
+ pshufb m2, m4
+ lea dstq, [dstq+strideq*4]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+ mova m0, [aq]
+ mova m5, [aq+16]
+ mova m4, [pb_1toE_2xF]
+ DEFINE_ARGS dst, stride, dst16, cnt
+ palignr m2, m5, m0, 1
+ palignr m3, m5, m0, 2
+ lea dst16q, [dstq +strideq*8]
+ LOWPASS 3, 2, 0, 6
+ pavgb m2, m0
+ pshufb m0, m5, m4
+ pshufb m1, m0, m4
+ lea dst16q, [dst16q+strideq*8]
+ LOWPASS 1, 0, 5, 6
+ pavgb m0, m5
+ pshufb m5, [pb_15]
+ mov cntd, 8
+
+.loop:
+%macro %%write 3
+ mova [dstq+stride%1+ 0], %2
+ mova [dstq+stride%1+16], %3
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+ palignr %2, %3, %2, 1
+ pshufb %3, m4
+%else
+ palignr m6, %3, %2, 1
+ pshufb %3, m4
+ mova %2, m6
+%endif
+%endmacro
+
+ %%write q*0, m2, m0
+ %%write q*1, m3, m1
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+VL_XMM_FUNCS ssse3
+VL_XMM_FUNCS avx
+
+; vr
+
+INIT_MMX ssse3
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+ movq m1, [aq-1]
+ punpckldq m2, [lq]
+ movd m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ palignr m1, m2, 5
+ psrlq m2, m1, 8
+ psllq m3, m1, 8
+ LOWPASS 2, 1, 3, 4
+
+ ; ABCD <- for the following predictor:
+ ; EFGH
+ ; IABC | m0 contains ABCDxxxx
+ ; JEFG | m2 contains xJIEFGHx
+
+ punpckldq m0, m2
+ pshufb m2, [pb_13456_3xm1]
+ movd [dstq+strideq*0], m0
+ pshufb m0, [pb_6012_4xm1]
+ movd [dstq+stride3q ], m2
+ psrlq m2, 8
+ movd [dstq+strideq*2], m0
+ movd [dstq+strideq*1], m2
+ RET
+
+%macro VR_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+ movu m1, [aq-1]
+ movhps m2, [lq]
+ movq m0, [aq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pavgb m0, m1
+ palignr m1, m2, 9
+ pslldq m2, m1, 1
+ pslldq m3, m1, 2
+ LOWPASS 1, 2, 3, 4
+
+ ; ABCDEFGH <- for the following predictor:
+ ; IJKLMNOP
+ ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
+ ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
+ ; SQABCDEF
+ ; TRIJKLMN
+ ; USQABCDE
+ ; VTRIJKLM
+
+ punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
+ movq [dstq+strideq*0], m0
+ pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
+ movhps [dstq+strideq*1], m1
+ pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+ movhps [dstq+strideq*2], m0
+ pslldq m0, 1
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ pslldq m1, 1
+ movhps [dstq+strideq*0], m0
+ pslldq m0, 1
+ movhps [dstq+strideq*1], m1
+ pslldq m1, 1
+ movhps [dstq+strideq*2], m0
+ movhps [dstq+stride3q ], m1
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
+ mova m0, [aq]
+ movu m1, [aq-1]
+ mova m2, [lq]
+ DEFINE_ARGS dst, stride, stride3, cnt
+ lea stride3q, [strideq*3]
+ palignr m3, m1, m2, 15
+ LOWPASS 3, 1, 0, 4
+ pavgb m0, m1
+ palignr m1, m2, 1
+ pslldq m4, m2, 1
+ LOWPASS 1, 2, 4, 5
+ pshufb m1, [pb_02468ACE_13579BDF]
+ mov cntd, 4
+
+.loop:
+ movlhps m2, m1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m3
+ palignr m4, m0, m1, 15
+ palignr m5, m3, m2, 15
+ mova [dstq+strideq*2], m4
+ mova [dstq+stride3q ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m0, m1, 14
+ palignr m3, m2, 14
+ pslldq m1, 2
+ dec cntd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+INIT_XMM %1
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+ mova m0, [aq]
+ mova m2, [aq+16]
+ movu m1, [aq-1]
+ palignr m3, m2, m0, 15
+ palignr m4, m2, m0, 14
+ LOWPASS 4, 3, 2, 5
+ pavgb m3, m2
+ mova m2, [lq+16]
+ palignr m5, m1, m2, 15
+ LOWPASS 5, 1, 0, 6
+ pavgb m0, m1
+ mova m6, [lq]
+ palignr m1, m2, 1
+ palignr m7, m2, m6, 15
+ LOWPASS 1, 2, 7, 8
+ palignr m2, m6, 1
+ pslldq m7, m6, 1
+ LOWPASS 2, 6, 7, 8
+ pshufb m1, [pb_02468ACE_13579BDF]
+ pshufb m2, [pb_02468ACE_13579BDF]
+ DEFINE_ARGS dst, stride, dst16, cnt
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ SBUTTERFLY qdq, 2, 1, 6
+ mov cntd, 8
+
+.loop:
+ ; even lines (0, 2, 4, ...): m1 | m0, m3
+ ; odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+ mova [dstq+stride%1+ 0], %3
+ mova [dstq+stride%1+16], %4
+ movhps [dst16q+stride%1 ], %2
+ movu [dst16q+stride%1+ 8], %3
+ movq [dst16q+stride%1+24], %4
+ palignr %4, %3, 15
+ palignr %3, %2, 15
+ pslldq %2, 1
+%endmacro
+
+ %%write q*0, m1, m0, m3
+ %%write q*1, m2, m5, m4
+ lea dstq, [dstq +strideq*2]
+ lea dst16q, [dst16q+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+%endif
+%endmacro
+
+VR_XMM_FUNCS ssse3
+VR_XMM_FUNCS avx
+
+; hd
+
+INIT_MMX ssse3
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+ movd m0, [lq]
+ punpckldq m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; DHIJ <- for the following predictor:
+ ; CGDH
+ ; BFCG | m1 contains ABCDxxxx
+ ; AEBF | m2 contains EFGHIJxx
+
+ punpcklbw m1, m2
+ punpckhdq m0, m1, m2
+
+ ; m1 contains AEBFCGDH
+ ; m0 contains CGDHIJxx
+
+ movd [dstq+stride3q ], m1
+ movd [dstq+strideq*1], m0
+ psrlq m1, 16
+ psrlq m0, 16
+ movd [dstq+strideq*2], m1
+ movd [dstq+strideq*0], m0
+ RET
+
+%macro HD_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
+ movq m0, [lq]
+ movhps m0, [aq-1]
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+
+ ; HPQRSTUV <- for the following predictor
+ ; GOHPQRST
+ ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
+ ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
+ ; DLEMFNGO
+ ; CKDLEMFN
+ ; BJCKDLEM
+ ; AIBJCKDL
+
+ punpcklbw m1, m2
+ movhlps m2, m2
+
+ ; m1 contains AIBJCKDLEMFNGOHP
+ ; m2 contains QRSTUVxxxxxxxxxx
+
+ movhps [dstq +stride3q ], m1
+ movq [dst4q+stride3q ], m1
+ palignr m3, m2, m1, 2
+ movhps [dstq +strideq*2], m3
+ movq [dst4q+strideq*2], m3
+ palignr m3, m2, m1, 4
+ movhps [dstq +strideq*1], m3
+ movq [dst4q+strideq*1], m3
+ palignr m2, m1, 6
+ movhps [dstq +strideq*0], m2
+ movq [dst4q+strideq*0], m2
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+ mova m0, [lq]
+ movu m3, [aq-1]
+ DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+ lea stride4q, [strideq*4]
+ lea dst4q, [dstq +stride4q]
+ lea dst8q, [dst4q+stride4q]
+ lea dst12q, [dst8q+stride4q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 2
+ LOWPASS 2, 1, 0, 6
+ pavgb m1, m0
+ SBUTTERFLY bw, 1, 2, 6
+
+    ; FIXME: L0 and L16 are probably inverted here
+ ; m1, m2, m5
+.loop:
+ sub stride4q, strideq
+ movhps [dstq +stride4q +0], m2
+ movq [dstq +stride4q +8], m5
+ mova [dst4q+stride4q ], m2
+ movhps [dst8q+stride4q +0], m1
+ movq [dst8q+stride4q +8], m2
+ mova [dst12q+stride4q ], m1
+%if cpuflag(avx)
+ palignr m1, m2, m1, 2
+ palignr m2, m5, m2, 2
+%else
+ palignr m3, m2, m1, 2
+ palignr m0, m5, m2, 2
+ mova m1, m3
+ mova m2, m0
+%endif
+ psrldq m5, 2
+ jg .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+ mova m0, [lq]
+ mova m1, [lq+16]
+ movu m2, [aq-1]
+ movu m3, [aq+15]
+ DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+ lea stride8q, [strideq*8]
+ lea dst8q, [dstq +stride8q]
+ lea dst16q, [dst8q +stride8q]
+ lea dst24q, [dst16q+stride8q]
+ psrldq m4, m3, 1
+ psrldq m5, m3, 2
+ LOWPASS 5, 4, 3, 6
+ palignr m4, m3, m2, 2
+ palignr m3, m2, 1
+ LOWPASS 4, 3, 2, 6
+ palignr m3, m2, m1, 2
+ palignr m2, m1, 1
+ LOWPASS 3, 2, 1, 6
+ pavgb m2, m1
+ palignr m6, m1, m0, 1
+ palignr m1, m0, 2
+ LOWPASS 1, 6, 0, 7
+ pavgb m0, m1
+ SBUTTERFLY bw, 2, 3, 6
+ SBUTTERFLY bw, 0, 1, 6
+
+ ; m0, m1, m2, m3, m4, m5
+.loop:
+ sub stride8q, strideq
+ mova [dstq +stride8q+ 0], m3
+ mova [dstq +stride8q+16], m4
+ mova [dst8q +stride8q+ 0], m2
+ mova [dst8q +stride8q+16], m3
+ mova [dst16q+stride8q+ 0], m1
+ mova [dst16q+stride8q+16], m2
+ mova [dst24q+stride8q+ 0], m0
+ mova [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+ palignr m0, m1, m0, 2
+ palignr m1, m2, m1, 2
+ palignr m2, m3, m2, 2
+ palignr m3, m4, m3, 2
+ palignr m4, m5, m4, 2
+ psrldq m5, 2
+%else
+ psrldq m6, m5, 2
+ palignr m5, m4, 2
+ palignr m4, m3, 2
+ palignr m3, m2, 2
+ palignr m2, m1, 2
+ palignr m1, m0, 2
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+%endif
+ jg .loop
+ RET
+%endmacro
+
+HD_XMM_FUNCS ssse3
+HD_XMM_FUNCS avx
+
+INIT_MMX ssse3
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+ movd m0, [lq]
+ pshufb m0, [pb_3to1_5x0]
+ psrlq m1, m0, 8
+ psrlq m2, m1, 8
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ SBUTTERFLY bw, 1, 2, 0
+ palignr m2, m1, 2
+ movd [dstq+strideq*0], m1
+ movd [dstq+strideq*1], m2
+ punpckhdq m1, m1
+ punpckhdq m2, m2
+ movd [dstq+strideq*2], m1
+ movd [dstq+stride3q ], m2
+ RET
+
+%macro HU_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+ movq m0, [lq]
+ pshufb m0, [pb_7to1_9x0]
+ psrldq m1, m0, 1
+ psrldq m2, m1, 1
+ LOWPASS 2, 1, 0, 3
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride3, dst4
+ lea stride3q, [strideq*3]
+ lea dst4q, [dstq+strideq*4]
+ SBUTTERFLY bw, 1, 2, 0
+ movq [dstq +strideq*0], m1
+ movhps [dst4q+strideq*0], m1
+ palignr m0, m2, m1, 2
+ movq [dstq +strideq*1], m0
+ movhps [dst4q+strideq*1], m0
+ palignr m0, m2, m1, 4
+ movq [dstq +strideq*2], m0
+ movhps [dst4q+strideq*2], m0
+ palignr m2, m1, 6
+ movq [dstq +stride3q ], m2
+ movhps [dst4q+stride3q ], m2
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+ mova m0, [lq]
+ pshufb m0, [pb_Fto0]
+ mova m3, [pb_2toE_3xF]
+ pshufb m1, m0, [pb_1toE_2xF]
+ pshufb m2, m0, m3
+ LOWPASS 2, 1, 0, 4
+ pavgb m1, m0
+ DEFINE_ARGS dst, stride, stride9, cnt
+ lea stride9q, [strideq *3]
+ mov cntd, 4
+ lea stride9q, [stride9q*3]
+ SBUTTERFLY bw, 1, 2, 0
+
+.loop:
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*8], m2
+ palignr m0, m2, m1, 2
+ pshufb m2, m3
+ mova [dstq+strideq*1], m0
+ mova [dstq+stride9q ], m2
+ palignr m1, m2, m0, 2
+ pshufb m2, m3
+ lea dstq, [dstq+strideq*2]
+ dec cntd
+ jg .loop
+ RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
+ mova m0, [lq]
+ mova m1, [lq+16]
+ mova m2, [pb_Fto0]
+ mova m4, [pb_2toE_3xF]
+ pshufb m0, m2
+ pshufb m1, m2
+ palignr m2, m0, m1, 1
+ palignr m3, m0, m1, 2
+ LOWPASS 3, 2, 1, 5
+ pavgb m2, m1
+ pshufb m1, m0, m4
+ pshufb m5, m0, [pb_1toE_2xF]
+ LOWPASS 1, 5, 0, 6
+ pavgb m0, m5
+ DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+ mov cntd, 8
+ xor stride0q, stride0q
+ lea dst8q, [dstq +strideq*8]
+ lea dst16q, [dst8q +strideq*8]
+ lea dst24q, [dst16q+strideq*8]
+ SBUTTERFLY bw, 0, 1, 5
+ SBUTTERFLY bw, 2, 3, 5
+ pshufb m6, m1, [pb_15]
+
+.loop:
+ mova [dstq +stride0q+ 0], m2
+ mova [dstq +stride0q+16], m3
+ mova [dst8q +stride0q+ 0], m3
+ mova [dst8q +stride0q+16], m0
+ mova [dst16q+stride0q+ 0], m0
+ mova [dst16q+stride0q+16], m1
+ mova [dst24q+stride0q+ 0], m1
+ mova [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+ palignr m2, m3, m2, 2
+ palignr m3, m0, m3, 2
+ palignr m0, m1, m0, 2
+ pshufb m1, m4
+%else
+ pshufb m5, m1, m4
+ palignr m1, m0, 2
+ palignr m0, m3, 2
+ palignr m3, m2, 2
+ mova m2, m3
+ mova m3, m0
+ mova m0, m1
+ mova m1, m5
+%endif
+ add stride0q, strideq
+ dec cntd
+ jg .loop
+ RET
+%endmacro
+
+HU_XMM_FUNCS ssse3
+HU_XMM_FUNCS avx
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000000..8087c2e336
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,1669 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pw_m11585x2: times 8 dw -23170
+
+%macro VP9_IDCT_COEFFS 2-3 0
+pw_%1x2: times 8 dw %1*2
+pw_m%1x2: times 8 dw -%1*2
+pw_%2x2: times 8 dw %2*2
+pw_m%2x2: times 8 dw -%2*2
+pw_m%1_%2: times 4 dw -%1, %2
+pw_%2_%1: times 4 dw %2, %1
+pw_m%2_m%1: times 4 dw -%2, -%1
+%if %3 == 1
+pw_m%2_%1: times 4 dw -%2, %1
+pw_%1_%2: times 4 dw %1, %2
+%endif
+%endmacro
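+
+; each (%1, %2) pair below is a cos/sin coefficient pair of the VP9 DCT in
+; 14-bit fixed point, e.g. 15137 ~= 16384*cos(pi/8) and 6270 ~= 16384*sin(pi/8);
+; pw_11585x2 above is 2*round(16384/sqrt(2)) = 23170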
+
+VP9_IDCT_COEFFS 15137, 6270, 1
+VP9_IDCT_COEFFS 16069, 3196, 1
+VP9_IDCT_COEFFS 9102, 13623, 1
+VP9_IDCT_COEFFS 16305, 1606
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS 14449, 7723
+VP9_IDCT_COEFFS 4756, 15679
+VP9_IDCT_COEFFS 16364, 804
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 14811, 7005
+VP9_IDCT_COEFFS 5520, 15426
+VP9_IDCT_COEFFS 15893, 3981
+VP9_IDCT_COEFFS 8423, 14053
+VP9_IDCT_COEFFS 13160, 9760
+VP9_IDCT_COEFFS 2404, 16207
+
+pw_5283_13377: times 4 dw 5283, 13377
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2
+
+pd_8192: times 4 dd 8192
+pw_2048: times 8 dw 2048
+pw_1024: times 8 dw 1024
+pw_512: times 8 dw 512
+pw_m1: times 8 dw -1
+
+SECTION .text
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+ pmaddwd m%1, m%2, %4
+ pmaddwd m%2, %5
+ paddd m%1, %3
+ paddd m%2, %3
+ psrad m%1, 14
+ psrad m%2, 14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+ VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
+ VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
+ packssdw m%1, m%7
+ packssdw m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+ punpckhwd m%6, m%2, m%1
+ punpcklwd m%2, m%1
+ VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
+%else
+ punpckhwd m%8, m%4, m%3
+ punpcklwd m%2, m%4, m%3
+ VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
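+; In the common 7-argument form the inputs start in dst1/dst2 and the result is
+;   dst1 = (src1*coef2 - src2*coef1 + 8192) >> 14
+;   dst2 = (src1*coef1 + src2*coef2 + 8192) >> 14
+; i.e. one butterfly rotation, done with pmaddwd on word-interleaved pairs and
+; packed back to words (rnd is pd_8192, giving round-to-nearest in Q14).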
+
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+ punpckhwd m%4, m%2, m%1
+ punpcklwd m%2, m%1
+ pmaddwd m%3, m%4, [pw_m%5_%6]
+ pmaddwd m%4, [pw_%6_%5]
+ pmaddwd m%1, m%2, [pw_m%5_%6]
+ pmaddwd m%2, [pw_%6_%5]
+%endmacro
+
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+ SUMSUB_BA d, %1, %2, %5
+ SUMSUB_BA d, %3, %4, %5
+ paddd m%1, %6
+ paddd m%2, %6
+ paddd m%3, %6
+ paddd m%4, %6
+ psrad m%1, 14
+ psrad m%2, 14
+ psrad m%3, 14
+ psrad m%4, 14
+ packssdw m%1, m%3
+ packssdw m%2, m%4
+%endmacro
+
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+ movh m%3, [%6]
+ movh m%4, [%6+strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%3, m%1
+ paddw m%4, m%2
+ packuswb m%3, m%5
+ packuswb m%4, m%5
+ movh [%6], m%3
+ movh [%6+strideq], m%4
+%endmacro
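+; VP9_STORE_2X adds two rows of word residuals (reg1/reg2) to two rows of 8
+; destination pixels: unpack bytes to words against the zero register, add,
+; then pack back with unsigned saturation (clip to 0..255).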
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+ mova [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
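+; ZERO_BLOCK clears an nnzc x nnzc block of int16 coefficients: %3 rows of
+; %3*2 bytes each, %2 bytes apart, using the all-zero register %4.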
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IWHT4_1D 0
+ SWAP 1, 2, 3
+ paddw m0, m2
+ psubw m3, m1
+ psubw m4, m0, m3
+ psraw m4, 1
+ psubw m5, m4, m1
+ SWAP 5, 1
+ psubw m4, m2
+ SWAP 4, 2
+ psubw m0, m1
+ paddw m3, m2
+ SWAP 3, 2, 1
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+ mova m0, [blockq+0*8]
+ mova m1, [blockq+1*8]
+ mova m2, [blockq+2*8]
+ mova m3, [blockq+3*8]
+ psraw m0, 2
+ psraw m1, 2
+ psraw m2, 2
+ psraw m3, 2
+
+ VP9_IWHT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IWHT4_1D
+
+ pxor m4, m4
+ VP9_STORE_2X 0, 1, 5, 6, 4
+ lea dstq, [dstq+strideq*2]
+ VP9_STORE_2X 2, 3, 5, 6, 4
+ ZERO_BLOCK blockq, 8, 4, m4
+ RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3, 2 ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+ SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+ pmulhrsw m2, m6 ; m2=t0
+ pmulhrsw m0, m6 ; m0=t1
+ VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+ pmulhrsw m0, m5 ; m0=t1
+ mova m2, m0 ; m2=t0
+ mova m3, m1
+ pmulhrsw m1, m6 ; m1=t2
+ pmulhrsw m3, m7 ; m3=t3
+ VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+ mova m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ pmulhrsw m1, m5
+ VP9_STORE_2X 0, 1, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ VP9_STORE_2X 2, 3, 6, 7, 4
+%endmacro
+
+INIT_MMX ssse3
+cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
+
+ cmp eobd, 4 ; 2x2 or smaller
+ jg .idctfull
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct2x2
+
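+; dc-only: with eob == 1 only the DC coefficient is set, so each 1D idct
+; collapses to a multiply by 11585/16384 (~1/sqrt(2)); two pmulhrsw by
+; pw_11585x2 therefore give the full 2D result (~in/2), which is broadcast
+; to all 16 pixels and rounded with the usual (x+8)>>4.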
+ movd m0, [blockq]
+ mova m5, [pw_11585x2]
+ pmulhrsw m0, m5
+ pmulhrsw m0, m5
+ pshufw m0, m0, 0
+ pxor m4, m4
+ movh [blockq], m4
+ pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+ movd m0, [blockq+0]
+ movd m1, [blockq+8]
+ mova m5, [pw_11585x2]
+ mova m6, [pw_6270x2]
+ mova m7, [pw_15137x2]
+ VP9_IDCT4_2x2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_2x2_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movh [blockq+ 0], m4
+ movh [blockq+ 8], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+
+.idctfull: ; generic full 4x4 idct/idct
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+ mova m6, [pw_11585x2]
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT4_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_IDCT4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IADST4_1D 0
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+ paddw m3, m0
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ pmaddwd xmm1, xmm0, [pw_5283_13377]
+ pmaddwd xmm4, xmm0, [pw_9929_13377]
+ pmaddwd xmm0, [pw_15212_m13377]
+ pmaddwd xmm3, xmm2, [pw_15212_9929]
+ pmaddwd xmm2, [pw_m5283_m15212]
+ psubw m3, m2
+ paddd xmm0, xmm2
+ paddd xmm3, [pd_8192]
+ paddd xmm2, [pd_8192]
+ paddd xmm1, xmm3
+ paddd xmm0, xmm3
+ paddd xmm4, xmm2
+ psrad xmm1, 14
+ psrad xmm0, 14
+ psrad xmm4, 14
+ pmulhrsw m3, [pw_13377x2] ; out2
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm4, xmm4
+ movdq2q m0, xmm0 ; out3
+ movdq2q m1, xmm1 ; out0
+ movdq2q m2, xmm4 ; out1
+ SWAP 0, 1, 2, 3
+%endmacro
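+; The 4-point adst needs 32-bit intermediates, so the inputs are widened into
+; xmm registers for pmaddwd/psrad and packed back to MMX at the end; only
+; out2 = ((in0 + in3 - in2) * 13377 + 8192) >> 14 stays in MMX as a single
+; pmulhrsw against pw_13377x2.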
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+ mova m6, [pw_11585x2]
+ mova m7, [pd_8192] ; rounding
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
+%if ARCH_X86_64 ; TODO: 32-bit? (32-bit limited to 8 xmm reg, we use more)
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+ SUMSUB_BA w, 3, 10, 4 ; m3=t0+t7, m10=t0-t7
+ SUMSUB_BA w, 1, 2, 4 ; m1=t1+t6, m2=t1-t6
+ SUMSUB_BA w, 11, 0, 4 ; m11=t2+t5, m0=t2-t5
+ SUMSUB_BA w, 9, 8, 4 ; m9=t3+t4, m8=t3-t4
+ SWAP 11, 10, 2
+ SWAP 3, 9, 0
+%endmacro
+
+%macro VP9_IDCT8_1D 0
+ SUMSUB_BA w, 8, 0, 4 ; m8=IN(0)+IN(4) m0=IN(0)-IN(4)
+ pmulhrsw m8, m12 ; m8=t0a
+ pmulhrsw m0, m12 ; m0=t1a
+ VP9_UNPACK_MULSUB_2W_4X 2, 10, 15137, 6270, m7, 4, 5 ; m2=t2a, m10=t3a
+ VP9_UNPACK_MULSUB_2W_4X 1, 11, 16069, 3196, m7, 4, 5 ; m1=t4a, m11=t7a
+ VP9_UNPACK_MULSUB_2W_4X 9, 3, 9102, 13623, m7, 4, 5 ; m9=t5a, m3=t6a
+ SUMSUB_BA w, 10, 8, 4 ; m10=t0a+t3a (t0), m8=t0a-t3a (t3)
+ SUMSUB_BA w, 2, 0, 4 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+ SUMSUB_BA w, 9, 1, 4 ; m9=t4a+t5a (t4), m1=t4a-t5a (t5a)
+ SUMSUB_BA w, 3, 11, 4 ; m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
+ SUMSUB_BA w, 1, 11, 4 ; m1=t6a+t5a (t6), m11=t6a-t5a (t5)
+ pmulhrsw m1, m12 ; m1=t6
+ pmulhrsw m11, m12 ; m11=t5
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+ pmulhrsw m0, m12 ; m0=t1a/t0a
+ pmulhrsw m10, m2, [pw_15137x2] ; m10=t3a
+ pmulhrsw m2, [pw_6270x2] ; m2=t2a
+ pmulhrsw m11, m1, [pw_16069x2] ; m11=t7a
+ pmulhrsw m1, [pw_3196x2] ; m1=t4a
+ pmulhrsw m9, m3, [pw_9102x2] ; m9=-t5a
+ pmulhrsw m3, [pw_13623x2] ; m3=t6a
+ psubw m8, m0, m10 ; m8=t0a-t3a (t3)
+ paddw m10, m0 ; m10=t0a+t3a (t0)
+ SUMSUB_BA w, 2, 0, 4 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+ SUMSUB_BA w, 9, 1, 4 ; m1=t4a+t5a (t4), m9=t4a-t5a (t5a)
+ SWAP 1, 9
+ SUMSUB_BA w, 3, 11, 4 ; m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
+ SUMSUB_BA w, 1, 11, 4 ; m1=t6a+t5a (t6), m11=t6a-t5a (t5)
+ pmulhrsw m1, m12 ; m1=t6
+ pmulhrsw m11, m12 ; m11=t5
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+; TODO: a lot of t* copies can probably be removed and merged with
+; following SUMSUBs from VP9_IDCT8_1D_FINALIZE with AVX
+%macro VP9_IDCT8_2x2_1D 0
+ pmulhrsw m0, m12 ; m0=t0
+ mova m3, m1
+ pmulhrsw m1, m6 ; m1=t4
+ pmulhrsw m3, m7 ; m3=t7
+ mova m2, m0 ; m2=t1
+ mova m10, m0 ; m10=t2
+ mova m8, m0 ; m8=t3
+ mova m11, m3 ; t5 = t7a ...
+ mova m9, m3 ; t6 = t7a ...
+ psubw m11, m1 ; t5 = t7a - t4a
+ paddw m9, m1 ; t6 = t7a + t4a
+ pmulhrsw m11, m12 ; m11=t5
+ pmulhrsw m9, m12 ; m9=t6
+ SWAP 0, 10
+ SWAP 9, 1
+ VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_WRITEOUT 0
+ mova m5, [pw_1024]
+ pmulhrsw m0, m5 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+ pmulhrsw m1, m5
+ VP9_STORE_2X 0, 1, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ VP9_STORE_2X 2, 3, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ pmulhrsw m8, m5
+ pmulhrsw m9, m5
+ VP9_STORE_2X 8, 9, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ pmulhrsw m10, m5
+ pmulhrsw m11, m5
+ VP9_STORE_2X 10, 11, 6, 7, 4
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
+
+ mova m12, [pw_11585x2] ; often used
+
+ cmp eobd, 12 ; top left half or less
+ jg .idctfull
+
+ cmp eobd, 3 ; top left corner or less
+ jg .idcthalf
+
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idcttopleftcorner
+
+ movd m0, [blockq]
+ pmulhrsw m0, m12
+ pmulhrsw m0, m12
+ SPLATW m0, m0, 0
+ pxor m4, m4
+ movd [blockq], m4
+ mova m5, [pw_1024]
+ pmulhrsw m0, m5 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ lea dstq, [dstq+2*strideq]
+ VP9_STORE_2X 0, 0, 6, 7, 4
+ RET
+
+; faster path for when only the top-left corner is set (3 inputs: DC, the
+; coefficient right of DC and the one below DC). Note: this also works for a
+; 2x2 block
+.idcttopleftcorner:
+ movd m0, [blockq+0]
+ movd m1, [blockq+16]
+ mova m6, [pw_3196x2]
+ mova m7, [pw_16069x2]
+ VP9_IDCT8_2x2_1D
+ TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+ VP9_IDCT8_2x2_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movd [blockq+ 0], m4
+ movd [blockq+16], m4
+ VP9_IDCT8_WRITEOUT
+ RET
+
+.idcthalf:
+ movh m0, [blockq + 0]
+ movh m1, [blockq +16]
+ movh m2, [blockq +32]
+ movh m3, [blockq +48]
+ VP9_IDCT8_4x4_1D
+ TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+ VP9_IDCT8_4x4_1D
+ pxor m4, m4
+ movh [blockq+ 0], m4
+ movh [blockq+16], m4
+ movh [blockq+32], m4
+ movh [blockq+48], m4
+ VP9_IDCT8_WRITEOUT
+ RET
+
+.idctfull: ; generic full 8x8 idct/idct
+ mova m0, [blockq+ 0] ; IN(0)
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+ mova m3, [blockq+ 48] ; IN(3)
+ mova m8, [blockq+ 64] ; IN(4)
+ mova m9, [blockq+ 80] ; IN(5)
+ mova m10, [blockq+ 96] ; IN(6)
+ mova m11, [blockq+112] ; IN(7)
+ mova m7, [pd_8192] ; rounding
+ VP9_IDCT8_1D
+ TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+ VP9_IDCT8_1D
+
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ ZERO_BLOCK blockq, 16, 8, m4
+ VP9_IDCT8_WRITEOUT
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3
+VP9_IDCT_IDCT_8x8_ADD_XMM avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/8/9/10/11
+ VP9_UNPACK_MULSUB_2D_4X 11, 0, 4, 5, 16305, 1606 ; m11/4=t1[d], m0/5=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 8, 6, 13, 10394, 12665 ; m3/6=t5[d], m8/13=t4[d]
+ VP9_RND_SH_SUMSUB_BA 8, 0, 13, 5, 14, m7 ; m8=t0[w], m0=t4[w]
+ VP9_RND_SH_SUMSUB_BA 3, 11, 6, 4, 14, m7 ; m3=t1[w], m11=t5[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 9, 2, 4, 5, 14449, 7723 ; m9/4=t3[d], m2/5=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 1, 10, 6, 13, 4756, 15679 ; m1/6=t7[d], m10/13=t6[d]
+ VP9_RND_SH_SUMSUB_BA 10, 2, 13, 5, 14, m7 ; m10=t2[w], m2=t6[w]
+ VP9_RND_SH_SUMSUB_BA 1, 9, 6, 4, 14, m7 ; m1=t3[w], m9=t7[w]
+
+ ; m8=t0, m3=t1, m10=t2, m1=t3, m0=t4, m11=t5, m2=t6, m9=t7
+
+ VP9_UNPACK_MULSUB_2D_4X 0, 11, 4, 5, 15137, 6270 ; m0/4=t5[d], m11/5=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 9, 2, 6, 13, 6270, 15137 ; m9/6=t6[d], m2/13=t7[d]
+ VP9_RND_SH_SUMSUB_BA 9, 11, 6, 5, 14, m7
+ psignw m9, [pw_m1] ; m9=out1[w], m11=t6[w]
+ VP9_RND_SH_SUMSUB_BA 2, 0, 13, 4, 14, m7 ; m2=out6[w], m0=t7[w]
+
+ SUMSUB_BA w, 10, 8, 14 ; m10=out0[w], m8=t2[w]
+ SUMSUB_BA w, 1, 3, 14
+ psignw m1, [pw_m1] ; m1=out7[w], m3=t3[w]
+
+ ; m10=out0, m9=out1, m8=t2, m3=t3, m11=t6, m0=t7, m2=out6, m1=out7
+
+ SUMSUB_BA w, 3, 8, 4
+ SUMSUB_BA w, 0, 11, 5
+ pmulhrsw m3, m12
+ pmulhrsw m11, m12
+ pmulhrsw m8, m12 ; out4
+ pmulhrsw m0, m12 ; out2
+ psignw m3, [pw_m1] ; out3
+ psignw m11, [pw_m1] ; out5
+
+ ; m10=out0, m9=out1, m0=out2, m3=out3, m8=out4, m11=out5, m2=out6, m1=out7
+
+ SWAP 0, 10, 2
+ SWAP 11, 1, 9
+%endmacro
+
+%macro IADST8_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, 15, dst, stride, block, eob
+ mova m0, [blockq+ 0] ; IN(0)
+ mova m1, [blockq+ 16] ; IN(1)
+ mova m2, [blockq+ 32] ; IN(2)
+ mova m3, [blockq+ 48] ; IN(3)
+ mova m8, [blockq+ 64] ; IN(4)
+ mova m9, [blockq+ 80] ; IN(5)
+ mova m10, [blockq+ 96] ; IN(6)
+ mova m11, [blockq+112] ; IN(7)
+
+ mova m12, [pw_11585x2] ; often used
+ mova m7, [pd_8192] ; rounding
+ VP9_%2_1D
+ TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+ VP9_%4_1D
+
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ ZERO_BLOCK blockq, 16, 8, m4
+ VP9_IDCT8_WRITEOUT
+ RET
+%endmacro
+
+IADST8_FN idct, IDCT8, iadst, IADST8, ssse3
+IADST8_FN idct, IDCT8, iadst, IADST8, avx
+IADST8_FN iadst, IADST8, idct, IDCT8, ssse3
+IADST8_FN iadst, IADST8, idct, IDCT8, avx
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3
+IADST8_FN iadst, IADST8, iadst, IADST8, avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; at the end of this macro, m7 is stored in stack_scratch
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+; SUMSUB_BA w, 6, 9, 15 ; t6, t9
+; SUMSUB_BA w, 7, 8, 15 ; t7, t8
+%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
+%if %2 <= 4
+ mova m3, [%1+ 1*%3] ; IN(1)
+ mova m12, [%1+ 2*%3] ; IN(2)
+ mova m0, [%1+ 3*%3] ; IN(3)
+
+ pmulhrsw m15, m12, [pw_16069x2] ; t6-7
+ pmulhrsw m12, [pw_3196x2] ; t4-5
+ pmulhrsw m4, m3, [pw_16305x2] ; t14-15
+ pmulhrsw m3, [pw_1606x2] ; t8-9
+ pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
+ pmulhrsw m0, [pw_15679x2] ; t12-13
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ paddw m14, m15, m12
+ psubw m13, m15, m12
+ pmulhrsw m13, [pw_11585x2] ; t5
+ pmulhrsw m14, [pw_11585x2] ; t6
+
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+ mova m5, [%1+ 1*%3] ; IN(1)
+ mova m14, [%1+ 2*%3] ; IN(2)
+ mova m6, [%1+ 3*%3] ; IN(3)
+ mova m9, [%1+ 4*%3] ; IN(4)
+ mova m7, [%1+ 5*%3] ; IN(5)
+ mova m15, [%1+ 6*%3] ; IN(6)
+ mova m4, [%1+ 7*%3] ; IN(7)
+%if %2 <= 8
+ pmulhrsw m8, m9, [pw_15137x2] ; t3
+ pmulhrsw m9, [pw_6270x2] ; t2
+ pmulhrsw m13, m14, [pw_16069x2] ; t7
+ pmulhrsw m14, [pw_3196x2] ; t4
+ pmulhrsw m12, m15, [pw_m9102x2] ; t5
+ pmulhrsw m15, [pw_13623x2] ; t6
+ pmulhrsw m2, m5, [pw_16305x2] ; t15
+ pmulhrsw m5, [pw_1606x2] ; t8
+ pmulhrsw m3, m4, [pw_m10394x2] ; t9
+ pmulhrsw m4, [pw_12665x2] ; t14
+ pmulhrsw m0, m7, [pw_14449x2] ; t13
+ pmulhrsw m7, [pw_7723x2] ; t10
+ pmulhrsw m1, m6, [pw_m4756x2] ; t11
+ pmulhrsw m6, [pw_15679x2] ; t12
+%else
+ mova m3, [%1+ 9*%3] ; IN(9)
+ mova m12, [%1+10*%3] ; IN(10)
+ mova m0, [%1+11*%3] ; IN(11)
+ mova m8, [%1+12*%3] ; IN(12)
+ mova m1, [%1+13*%3] ; IN(13)
+ mova m13, [%1+14*%3] ; IN(14)
+ mova m2, [%1+15*%3] ; IN(15)
+
+ ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+ ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+ VP9_UNPACK_MULSUB_2W_4X 9, 8, 15137, 6270, [pd_8192], 10, 11 ; t2, t3
+ VP9_UNPACK_MULSUB_2W_4X 14, 13, 16069, 3196, [pd_8192], 10, 11 ; t4, t7
+ VP9_UNPACK_MULSUB_2W_4X 12, 15, 9102, 13623, [pd_8192], 10, 11 ; t5, t6
+ VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 10, 11 ; t8, t15
+ VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 10, 11 ; t9, t14
+ VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 10, 11 ; t10, t13
+ VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 10, 11 ; t11, t12
+%endif
+
+ ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+ ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+ SUMSUB_BA w, 12, 14, 10 ; t4, t5
+ SUMSUB_BA w, 15, 13, 10 ; t7, t6
+ SUMSUB_BA w, 3, 5, 10 ; t8, t9
+ SUMSUB_BA w, 7, 1, 10 ; t11, t10
+ SUMSUB_BA w, 0, 6, 10 ; t12, t13
+ SUMSUB_BA w, 4, 2, 10 ; t15, t14
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+ ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+ SUMSUB_BA w, 14, 13, 10
+ pmulhrsw m13, [pw_11585x2] ; t5
+ pmulhrsw m14, [pw_11585x2] ; t6
+ VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
+ VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
+%endif
+
+ ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+ ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+ SUMSUB_BA w, 7, 3, 10 ; t8, t11
+ SUMSUB_BA w, 6, 2, 10 ; t9, t10
+ SUMSUB_BA w, 0, 4, 10 ; t15, t12
+ SUMSUB_BA w, 1, 5, 10 ; t14, t13
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+ SUMSUB_BA w, 2, 5, 10
+ SUMSUB_BA w, 3, 4, 10
+ pmulhrsw m5, [pw_11585x2] ; t10
+ pmulhrsw m4, [pw_11585x2] ; t11
+ pmulhrsw m3, [pw_11585x2] ; t12
+ pmulhrsw m2, [pw_11585x2] ; t13
+
+ ; backup first register
+ mova [%4], m7
+
+ ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+ ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+ ; from load/start
+%if %2 <= 4
+ mova m11, [%1+ 0*%3] ; IN(0)
+ pmulhrsw m11, [pw_11585x2] ; t0-t3
+
+ psubw m8, m11, m15
+ paddw m15, m11
+ psubw m9, m11, m14
+ paddw m14, m11
+ psubw m10, m11, m13
+ paddw m13, m11
+%else
+ mova m10, [%1+ 0*%3] ; IN(0)
+%if %2 <= 8
+ pmulhrsw m10, [pw_11585x2] ; t0 and t1
+ psubw m11, m10, m8
+ paddw m8, m10
+%else
+ mova m11, [%1+ 8*%3] ; IN(8)
+
+ ; from 3 stages back
+ SUMSUB_BA w, 11, 10, 7
+ pmulhrsw m11, [pw_11585x2] ; t0
+ pmulhrsw m10, [pw_11585x2] ; t1
+
+ ; from 2 stages back
+ SUMSUB_BA w, 8, 11, 7 ; t0, t3
+%endif
+ SUMSUB_BA w, 9, 10, 7 ; t1, t2
+
+ ; from 1 stage back
+ SUMSUB_BA w, 15, 8, 7 ; t0, t7
+ SUMSUB_BA w, 14, 9, 7 ; t1, t6
+ SUMSUB_BA w, 13, 10, 7 ; t2, t5
+%endif
+ SUMSUB_BA w, 12, 11, 7 ; t3, t4
+
+ SUMSUB_BA w, 0, 15, 7 ; t0, t15
+ SUMSUB_BA w, 1, 14, 7 ; t1, t14
+ SUMSUB_BA w, 2, 13, 7 ; t2, t13
+ SUMSUB_BA w, 3, 12, 7 ; t3, t12
+ SUMSUB_BA w, 4, 11, 7 ; t4, t11
+ SUMSUB_BA w, 5, 10, 7 ; t5, t10
+%endmacro
+
+%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
+ VP9_IDCT16_1D_START %1, %3, 32, tmpq+32
+
+%if %2 == 1
+ ; backup a different register
+ mova [tmpq+16], m15
+ mova m7, [tmpq+32]
+
+ SUMSUB_BA w, 6, 9, 15 ; t6, t9
+ SUMSUB_BA w, 7, 8, 15 ; t7, t8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
+ mova [tmpq+ 0], m0
+ mova [tmpq+ 32], m1
+ mova [tmpq+ 64], m2
+ mova [tmpq+ 96], m3
+ mova [tmpq+128], m4
+ mova [tmpq+160], m5
+ mova [tmpq+192], m6
+ mova [tmpq+224], m7
+
+ mova m15, [tmpq+16]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 16], m8
+ mova [tmpq+ 48], m9
+ mova [tmpq+ 80], m10
+ mova [tmpq+112], m11
+ mova [tmpq+144], m12
+ mova [tmpq+176], m13
+ mova [tmpq+208], m14
+ mova [tmpq+240], m15
+%else ; %2 == 2
+ ; backup more registers
+ mova [tmpq+64], m8
+ mova [tmpq+96], m9
+
+ pxor m7, m7
+ pmulhrsw m0, [pw_512]
+ pmulhrsw m1, [pw_512]
+ VP9_STORE_2X 0, 1, 8, 9, 7
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m2, [pw_512]
+ pmulhrsw m3, [pw_512]
+ VP9_STORE_2X 2, 3, 8, 9, 7
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m4, [pw_512]
+ pmulhrsw m5, [pw_512]
+ VP9_STORE_2X 4, 5, 8, 9, 7
+ lea dstq, [dstq+strideq*2]
+
+ ; restore from cache
+ SWAP 0, 7 ; move zero from m7 to m0
+ mova m7, [tmpq+32]
+ mova m8, [tmpq+64]
+ mova m9, [tmpq+96]
+
+ SUMSUB_BA w, 6, 9, 1 ; t6, t9
+ SUMSUB_BA w, 7, 8, 1 ; t7, t8
+
+ pmulhrsw m6, [pw_512]
+ pmulhrsw m7, [pw_512]
+ VP9_STORE_2X 6, 7, 1, 2, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m8, [pw_512]
+ pmulhrsw m9, [pw_512]
+ VP9_STORE_2X 8, 9, 1, 2, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m10, [pw_512]
+ pmulhrsw m11, [pw_512]
+ VP9_STORE_2X 10, 11, 1, 2, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m12, [pw_512]
+ pmulhrsw m13, [pw_512]
+ VP9_STORE_2X 12, 13, 1, 2, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m14, [pw_512]
+ pmulhrsw m15, [pw_512]
+ VP9_STORE_2X 14, 15, 1, 2, 0
+%endif ; %2 == 1/2
+%endmacro
+
+%macro VP9_STORE_2XFULL 6-7 strideq ; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+ mova m%3, [dstq]
+ mova m%5, [dstq+%7]
+ punpcklbw m%2, m%3, m%6
+ punpckhbw m%3, m%6
+ punpcklbw m%4, m%5, m%6
+ punpckhbw m%5, m%6
+ paddw m%2, m%1
+ paddw m%3, m%1
+ paddw m%4, m%1
+ paddw m%5, m%1
+ packuswb m%2, m%3
+ packuswb m%4, m%5
+ mova [dstq], m%2
+ mova [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+ ; 2x2=eob=3, 4x4=eob=10
+ cmp eobd, 38
+ jg .idctfull
+ cmp eobd, 1 ; faster path for when only DC is set
+ jne .idct8x8
+
+ ; dc-only
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ SPLATW m0, m0, q0000
+ pmulhrsw m0, [pw_512]
+ pxor m5, m5
+ movd [blockq], m5
+%rep 7
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ lea dstq, [dstq+2*strideq]
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
+ RET
+
+ DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
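+; both paths below work in two passes: pass 1 (VP9_IDCT16_1D ..., 1) transforms
+; the columns, 8 at a time, and stores the transposed result in the stack
+; buffer at tmpq; pass 2 (..., 2) transforms the rows from that buffer and adds
+; the output to dst.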
+.idct8x8:
+ mov tmpq, rsp
+ VP9_IDCT16_1D blockq, 1, 8
+
+ mov cntd, 2
+ mov dst_bakq, dstq
+.loop2_8x8:
+ VP9_IDCT16_1D tmpq, 2, 8
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 8, m0
+ RET
+
+.idctfull:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT16_1D blockq, 1
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_IDCT16_1D tmpq, 2
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+ mova m0, [%1+ 0*32] ; in0
+ mova m1, [%1+15*32] ; in15
+ mova m8, [%1+ 7*32] ; in7
+ mova m9, [%1+ 8*32] ; in8
+
+ VP9_UNPACK_MULSUB_2D_4X 1, 0, 2, 3, 16364, 804 ; m1/2=t1[d], m0/3=t0[d]
+ VP9_UNPACK_MULSUB_2D_4X 8, 9, 11, 10, 11003, 12140 ; m8/11=t9[d], m9/10=t8[d]
+ VP9_RND_SH_SUMSUB_BA 9, 0, 10, 3, 4, [pd_8192] ; m9=t0[w], m0=t8[w]
+ VP9_RND_SH_SUMSUB_BA 8, 1, 11, 2, 4, [pd_8192] ; m8=t1[w], m1=t9[w]
+
+ mova m11, [%1+ 2*32] ; in2
+ mova m10, [%1+13*32] ; in13
+ mova m3, [%1+ 5*32] ; in5
+ mova m2, [%1+10*32] ; in10
+
+ VP9_UNPACK_MULSUB_2D_4X 10, 11, 6, 7, 15893, 3981 ; m10/6=t3[d], m11/7=t2[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
+ VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; m2=t2[w], m11=t10[w]
+ VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w]
+
+ mova [tmpq+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
+ mova m4, [%1+ 4*32] ; in4
+ mova m5, [%1+11*32] ; in11
+ mova m12, [%1+ 3*32] ; in3
+ mova m13, [%1+12*32] ; in12
+
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 7, 6, 14811, 7005 ; m5/7=t5[d], m4/6=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 12, 13, 14, 15, 5520, 15426 ; m12/14=t13[d], m13/15=t12[d]
+ VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w]
+ VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w]
+
+ mova [tmpq+ 2*%%str], m8 ; t1:m8->r2
+ mova [tmpq+ 3*%%str], m2 ; t2:m2->r3
+ mova [tmpq+ 4*%%str], m3 ; t3:m3->r4
+ mova [tmpq+ 5*%%str], m13 ; t4:m13->r5
+ mova m2, [%1+ 6*32] ; in6
+ mova m3, [%1+ 9*32] ; in9
+ mova m8, [%1+ 1*32] ; in1
+ mova m9, [%1+14*32] ; in14
+
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
+ VP9_UNPACK_MULSUB_2D_4X 8, 9, 13, 14, 2404, 16207 ; m8/13=t15[d], m9/14=t14[d]
+ VP9_RND_SH_SUMSUB_BA 9, 2, 14, 6, 15, [pd_8192] ; m9=t6[w], m2=t14[w]
+ VP9_RND_SH_SUMSUB_BA 8, 3, 13, 7, 15, [pd_8192] ; m8=t7[w], m3=t15[w]
+
+ ; r0=t0, r2=t1, r3=t2, r4=t3, r5=t4, m12=t5, m9=t6, m8=t7
+ ; m0=t8, m1=t9, m11=t10, m10=t11, m4=t12, m5=t13, m2=t14, m3=t15
+
+ ; handle t8-15 first
+ VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
+ VP9_UNPACK_MULSUB_2D_4X 5, 4, 13, 14, 3196, 16069 ; m5/13=t12[d], m4/14=t13[d]
+ VP9_RND_SH_SUMSUB_BA 5, 1, 13, 7, 15, [pd_8192] ; m5=t8[w], m1=t12[w]
+ VP9_RND_SH_SUMSUB_BA 4, 0, 14, 6, 15, [pd_8192] ; m4=t9[w], m0=t13[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 11, 10, 6, 7, 9102, 13623 ; m11/6=t11[d], m10/7=t10[d]
+ VP9_UNPACK_MULSUB_2D_4X 3, 2, 13, 14, 13623, 9102 ; m3/13=t14[d], m2/14=t15[d]
+ VP9_RND_SH_SUMSUB_BA 3, 10, 13, 7, 15, [pd_8192] ; m3=t10[w], m10=t14[w]
+ VP9_RND_SH_SUMSUB_BA 2, 11, 14, 6, 15, [pd_8192] ; m2=t11[w], m11=t15[w]
+
+ ; m5=t8, m4=t9, m3=t10, m2=t11, m1=t12, m0=t13, m10=t14, m11=t15
+
+ VP9_UNPACK_MULSUB_2D_4X 1, 0, 6, 7, 15137, 6270 ; m1/6=t13[d], m0/7=t12[d]
+ VP9_UNPACK_MULSUB_2D_4X 11, 10, 13, 14, 6270, 15137 ; m11/13=t14[d], m10/14=t15[d]
+ VP9_RND_SH_SUMSUB_BA 11, 0, 13, 7, 15, [pd_8192] ; m11=out2[w], m0=t14[w]
+ VP9_RND_SH_SUMSUB_BA 10, 1, 14, 6, 15, [pd_8192]
+ psignw m10, [pw_m1] ; m10=out13[w], m1=t15[w]
+
+ SUMSUB_BA w, 3, 5, 15
+ psignw m3, [pw_m1] ; m3=out1[w], m5=t10[w]
+ SUMSUB_BA w, 2, 4, 15 ; m2=out14[w], m4=t11[w]
+
+ SUMSUB_BA w, 5, 4, 15
+ pmulhrsw m5, [pw_11585x2] ; m5=out6[w]
+ pmulhrsw m4, [pw_11585x2] ; m4=out9[w]
+ SUMSUB_BA w, 1, 0, 15
+ pmulhrsw m1, [pw_m11585x2] ; m1=out5[w]
+ pmulhrsw m0, [pw_11585x2] ; m0=out10[w]
+
+ ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
+
+ mova m6, [tmpq+ 0*%%str]
+ mova m7, [tmpq+ 2*%%str]
+ mova m13, [tmpq+ 3*%%str]
+ mova m14, [tmpq+ 4*%%str]
+ mova m15, [tmpq+ 5*%%str]
+ mova [tmpq+ 8*%%str], m5
+ mova [tmpq+ 9*%%str], m4
+ mova [tmpq+10*%%str], m0
+ mova [tmpq+11*%%str], m10
+ mova [tmpq+12*%%str], m2
+
+ ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
+ ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
+
+ SUMSUB_BA w, 15, 6, 0 ; m15=t0[w], m6=t4[w]
+ SUMSUB_BA w, 12, 7, 0 ; m12=t1[w], m7=t5[w]
+ SUMSUB_BA w, 9, 13, 0 ; m9=t2[w], m13=t6[w]
+ SUMSUB_BA w, 8, 14, 0 ; m8=t3[w], m14=t7[w]
+
+ VP9_UNPACK_MULSUB_2D_4X 6, 7, 0, 2, 15137, 6270 ; m6/0=t5[d], m7/2=t4[d]
+ VP9_UNPACK_MULSUB_2D_4X 14, 13, 4, 5, 6270, 15137 ; m14/4=t6[d], m13/5=t7[d]
+ VP9_RND_SH_SUMSUB_BA 14, 7, 4, 2, 10, [pd_8192]
+ psignw m14, [pw_m1] ; m14=out3[w], m7=t6[w]
+ VP9_RND_SH_SUMSUB_BA 13, 6, 5, 0, 10, [pd_8192] ; m13=out12[w], m6=t7[w]
+ SUMSUB_BA w, 9, 15, 10 ; m9=out0[w], m15=t2[w]
+ SUMSUB_BA w, 8, 12, 10
+ psignw m8, [pw_m1] ; m8=out15[w], m12=t3[w]
+
+ SUMSUB_BA w, 12, 15, 10
+ pmulhrsw m12, [pw_m11585x2] ; m12=out7[w]
+ pmulhrsw m15, [pw_11585x2] ; m15=out8[w]
+ SUMSUB_BA w, 7, 6, 10
+ pmulhrsw m7, [pw_11585x2] ; m7=out4[w]
+ pmulhrsw m6, [pw_11585x2] ; m6=out11[w]
+
+ ; m9=out0, m14=out3, m7=out4, m12=out7, m15=out8, m6=out11, m13=out12, m8=out15
+ ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
+
+%if %2 == 1
+ mova m0, [tmpq+ 8*%%str]
+ TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2
+ mova [tmpq+ 0*16], m9
+ mova [tmpq+ 2*16], m3
+ mova [tmpq+ 4*16], m11
+ mova [tmpq+ 6*16], m14
+ mova m9, [tmpq+ 9*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m11, [tmpq+11*%%str]
+ mova m14, [tmpq+12*%%str]
+ mova [tmpq+ 8*16], m7
+ mova [tmpq+10*16], m1
+ mova [tmpq+12*16], m0
+ mova [tmpq+14*16], m12
+
+ TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2
+ mova [tmpq+ 1*16], m15
+ mova [tmpq+ 3*16], m9
+ mova [tmpq+ 5*16], m3
+ mova [tmpq+ 7*16], m6
+ mova [tmpq+ 9*16], m13
+ mova [tmpq+11*16], m11
+ mova [tmpq+13*16], m14
+ mova [tmpq+15*16], m8
+%else
+ mova m5, [tmpq+ 8*%%str]
+ pxor m0, m0
+
+ pmulhrsw m9, [pw_512]
+ pmulhrsw m3, [pw_512]
+ VP9_STORE_2X 9, 3, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m11, [pw_512]
+ pmulhrsw m14, [pw_512]
+ VP9_STORE_2X 11, 14, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m7, [pw_512]
+ pmulhrsw m1, [pw_512]
+ VP9_STORE_2X 7, 1, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m5, [pw_512]
+ pmulhrsw m12, [pw_512]
+ VP9_STORE_2X 5, 12, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+
+ mova m9, [tmpq+ 9*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m11, [tmpq+11*%%str]
+ mova m14, [tmpq+12*%%str]
+
+ pmulhrsw m15, [pw_512]
+ pmulhrsw m9, [pw_512]
+ VP9_STORE_2X 15, 9, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m3, [pw_512]
+ pmulhrsw m6, [pw_512]
+ VP9_STORE_2X 3, 6, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m13, [pw_512]
+ pmulhrsw m11, [pw_512]
+ VP9_STORE_2X 13, 11, 2, 4, 0
+ lea dstq, [dstq+strideq*2]
+ pmulhrsw m14, [pw_512]
+ pmulhrsw m8, [pw_512]
+ VP9_STORE_2X 14, 8, 2, 4, 0
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_full:
+ VP9_%2_1D blockq, 1
+ add blockq, 16
+ add tmpq, 256
+ dec cntd
+ jg .loop1_full
+ sub blockq, 32
+
+ mov cntd, 2
+ mov tmpq, rsp
+ mov dst_bakq, dstq
+.loop2_full:
+ VP9_%4_1D tmpq, 2
+ lea dstq, [dst_bakq+8]
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m0 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 32, 16, m0
+ RET
+%endmacro
+
+IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
+IADST16_FN idct, IDCT16, iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
+IADST16_FN iadst, IADST16, idct, IDCT16, avx
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%assign %%str 16*%2*%2
+ ; first do t0-15, this can be done identically to the 16x16 idct
+ VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str
+
+ ; backup a different register
+ mova [tmpq+30*%%str], m15 ; t15
+ mova m7, [tmpq+ 4*%%str]
+
+ SUMSUB_BA w, 6, 9, 15 ; t6, t9
+ SUMSUB_BA w, 7, 8, 15 ; t7, t8
+
+ ; store everything on stack to make space available for t16-31
+ ; we store interleaved with the output of the second half (t16-31)
+ ; so we don't need to allocate extra stack space
+ mova [tmpq+ 0*%%str], m0 ; t0
+ mova [tmpq+ 4*%%str], m1 ; t1
+ mova [tmpq+ 8*%%str], m2 ; t2
+ mova [tmpq+12*%%str], m3 ; t3
+ mova [tmpq+16*%%str], m4 ; t4
+ mova [tmpq+20*%%str], m5 ; t5
+ mova [tmpq+24*%%str], m6 ; t6
+ mova [tmpq+28*%%str], m7 ; t7
+ mova [tmpq+ 2*%%str], m8 ; t8
+ mova [tmpq+ 6*%%str], m9 ; t9
+ mova [tmpq+10*%%str], m10 ; t10
+ mova [tmpq+14*%%str], m11 ; t11
+ mova [tmpq+18*%%str], m12 ; t12
+ mova [tmpq+22*%%str], m13 ; t13
+ mova [tmpq+26*%%str], m14 ; t14
+
+ ; then, secondly, do t16-31
+%if %3 <= 8
+ mova m4, [%1+ 1*64]
+ mova m3, [%1+ 3*64]
+ mova m0, [%1+ 5*64]
+ mova m7, [%1+ 7*64]
+
+ pmulhrsw m11, m4, [pw_16364x2] ;t31
+ pmulhrsw m4, [pw_804x2] ;t16
+ pmulhrsw m8, m7, [pw_m5520x2] ;t19
+ pmulhrsw m7, [pw_15426x2] ;t28
+ pmulhrsw m15, m0, [pw_15893x2] ;t27
+ pmulhrsw m0, [pw_3981x2] ;t20
+ pmulhrsw m12, m3, [pw_m2404x2] ;t23
+ pmulhrsw m3, [pw_16207x2] ;t24
+
+ ; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23,
+ ; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 10, 11, 4, 16069, 3196, [pd_8192], 6, 9 ; t17, t30
+ VP9_UNPACK_MULSUB_2W_4X 9, 6, 7, 8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29
+ ; from 1 stage forward
+ SUMSUB_BA w, 8, 4, 1
+ ; temporary storage
+ mova [tmpq+17*%%str], m8 ; t16
+ mova [tmpq+21*%%str], m4 ; t19
+ VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
+ VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
+
+ ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+ ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+%else
+ mova m10, [%1+ 1*64]
+ mova m13, [%1+ 3*64]
+ mova m14, [%1+ 5*64]
+ mova m9, [%1+ 7*64]
+ mova m8, [%1+ 9*64]
+ mova m15, [%1+11*64]
+ mova m12, [%1+13*64]
+ mova m11, [%1+15*64]
+%if %3 <= 16
+ pmulhrsw m5, m10, [pw_16364x2]
+ pmulhrsw m10, [pw_804x2]
+ pmulhrsw m4, m11, [pw_m11003x2]
+ pmulhrsw m11, [pw_12140x2]
+ pmulhrsw m7, m8, [pw_14811x2]
+ pmulhrsw m8, [pw_7005x2]
+ pmulhrsw m6, m9, [pw_m5520x2]
+ pmulhrsw m9, [pw_15426x2]
+ pmulhrsw m1, m14, [pw_15893x2]
+ pmulhrsw m14, [pw_3981x2]
+ pmulhrsw m0, m15, [pw_m8423x2]
+ pmulhrsw m15, [pw_14053x2]
+%else
+ mova m4, [%1+17*64]
+ mova m0, [%1+21*64]
+ mova m7, [%1+23*64]
+ mova m6, [%1+25*64]
+ mova m1, [%1+27*64]
+ mova m5, [%1+31*64]
+
+ ; m10=in1, m4=in17, m8=in9, m6=in25, m14=in5, m0=in21, m12=in13, m2=in29,
+ ; m13=in3, m3=in19, m15=in11, m1=in27, m9=in7, m7=in23, m11=in15, m5=in31
+
+ VP9_UNPACK_MULSUB_2W_4X 10, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31
+ VP9_UNPACK_MULSUB_2W_4X 4, 11, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+ VP9_UNPACK_MULSUB_2W_4X 8, 7, 14811, 7005, [pd_8192], 2, 3 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 6, 9, 5520, 15426, [pd_8192], 2, 3 ; t19, t28
+ VP9_UNPACK_MULSUB_2W_4X 14, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 0, 15, 8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+
+ ; from 1 stage forward
+ SUMSUB_BA w, 4, 10, 2
+ SUMSUB_BA w, 8, 6, 2
+ ; from 2 stages forward
+ SUMSUB_BA w, 8, 4, 2
+ ; temporary storage
+ mova [tmpq+17*%%str], m8 ; t16
+ mova [tmpq+21*%%str], m4 ; t19
+%if %3 <= 16
+ pmulhrsw m3, m12, [pw_13160x2]
+ pmulhrsw m12, [pw_9760x2]
+ pmulhrsw m2, m13, [pw_m2404x2]
+ pmulhrsw m13, [pw_16207x2]
+%else
+ mova m2, [%1+29*64]
+ mova m3, [%1+19*64]
+ VP9_UNPACK_MULSUB_2W_4X 12, 3, 13160, 9760, [pd_8192], 4, 8 ; t22, t25
+ VP9_UNPACK_MULSUB_2W_4X 2, 13, 2404, 16207, [pd_8192], 4, 8 ; t23, t24
+%endif
+
+ ; m10=t16, m4=t17, m8=t18, m6=t19, m14=t20, m0=t21, m12=t22, m2=t23,
+ ; m13=t24, m3=t25, m15=t26, m1=t27, m9=t28, m7=t29, m11=t30, m5=t31
+
+ SUMSUB_BA w, 0, 14, 4
+ SUMSUB_BA w, 12, 2, 4
+ SUMSUB_BA w, 3, 13, 4
+ SUMSUB_BA w, 15, 1, 4
+ SUMSUB_BA w, 7, 9, 4
+ SUMSUB_BA w, 11, 5, 4
+
+ ; m4=t16, m10=t17, m6=t18, m8=t19, m0=t20, m14=t21, m2=t22, m12=t23,
+ ; m3=t24, m13=t25, m1=t26, m15=t27, m7=t28, m9=t29, m5=t30, m11=t31
+
+ VP9_UNPACK_MULSUB_2W_4X 5, 10, 16069, 3196, [pd_8192], 4, 8 ; t17, t30
+ VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
+ VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
+%endif
+
+ ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+ ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+ SUMSUB_BA w, 9, 5, 4
+ SUMSUB_BA w, 1, 13, 4
+ SUMSUB_BA w, 0, 12, 4
+ SUMSUB_BA w, 15, 3, 4
+ SUMSUB_BA w, 14, 2, 4
+ SUMSUB_BA w, 6, 10, 4
+ SUMSUB_BA w, 7, 11, 4
+
+ ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+ ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+ mova m8, [tmpq+17*%%str] ; t16
+ ; from 2 stages forward
+ SUMSUB_BA w, 0, 8, 4
+ SUMSUB_BA w, 15, 7, 4
+ ; from 3 stages forward
+ SUMSUB_BA w, 8, 7, 4
+ pmulhrsw m7, [pw_11585x2]
+ pmulhrsw m8, [pw_11585x2]
+ ; store t16/t23
+ mova [tmpq+ 1*%%str], m0 ; t16
+ mova [tmpq+29*%%str], m7 ; t23
+
+ mova m4, [tmpq+21*%%str] ; t19
+ VP9_UNPACK_MULSUB_2W_4X 10, 5, 15137, 6270, [pd_8192], 0, 7 ; t18, t29
+ VP9_UNPACK_MULSUB_2W_4X 11, 4, 15137, 6270, [pd_8192], 0, 7 ; t19, t28
+ VP9_UNPACK_MULSUB_2W_4X 3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27
+ VP9_UNPACK_MULSUB_2W_4X 2, 13, 6270, m15137, [pd_8192], 0, 7 ; t21, t26
+
+ ; m8=t16, m9=t17, m10=t18, m11=t19, m3=t20, m2=t21, m1=t22, m0=t23,
+ ; m15=t24, m14=t25, m13=t26, m12=t27, m4=t28, m5=t29, m6=t30, m7=t31
+
+ SUMSUB_BA w, 1, 9, 0
+ SUMSUB_BA w, 2, 10, 0
+ SUMSUB_BA w, 3, 11, 0
+ SUMSUB_BA w, 12, 4, 0
+ SUMSUB_BA w, 13, 5, 0
+ SUMSUB_BA w, 14, 6, 0
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+ ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+ SUMSUB_BA w, 9, 6, 0
+ SUMSUB_BA w, 10, 5, 0
+ SUMSUB_BA w, 11, 4, 0
+
+ pmulhrsw m6, [pw_11585x2]
+ pmulhrsw m9, [pw_11585x2]
+ pmulhrsw m5, [pw_11585x2]
+ pmulhrsw m10, [pw_11585x2]
+ pmulhrsw m4, [pw_11585x2]
+ pmulhrsw m11, [pw_11585x2]
+
+ ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+ ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+ ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for
+ ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for
+ ; final sumsub of pass 2
+ mova [tmpq+ 5*%%str], m1 ; t17
+ mova [tmpq+ 9*%%str], m2 ; t18
+ mova [tmpq+13*%%str], m3 ; t19
+
+ ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+ mova [tmpq+17*%%str], m4 ; t20
+ mova [tmpq+21*%%str], m5 ; t21
+ mova [tmpq+25*%%str], m6 ; t22
+
+ mova m0, [tmpq+ 0*%%str] ; t0
+ mova m1, [tmpq+ 4*%%str] ; t1
+ mova m2, [tmpq+ 8*%%str] ; t2
+ mova m3, [tmpq+12*%%str] ; t3
+ mova m4, [tmpq+16*%%str] ; t4
+ mova m5, [tmpq+20*%%str] ; t5
+ mova m6, [tmpq+24*%%str] ; t6
+
+ SUMSUB_BA w, 15, 0, 7
+ mova [tmpq+ 3*%%str], m0 ; t15
+ mova m7, [tmpq+28*%%str] ; t7
+ SUMSUB_BA w, 14, 1, 0
+ SUMSUB_BA w, 13, 2, 0
+ SUMSUB_BA w, 12, 3, 0
+ SUMSUB_BA w, 11, 4, 0
+ SUMSUB_BA w, 10, 5, 0
+ SUMSUB_BA w, 9, 6, 0
+ SUMSUB_BA w, 8, 7, 0
+
+ TRANSPOSE8x8W 15, 14, 13, 12, 11, 10, 9, 8, 0
+ mova [tmpq+ 0*%%str], m15
+ mova [tmpq+ 4*%%str], m14
+ mova [tmpq+ 8*%%str], m13
+ mova [tmpq+12*%%str], m12
+ mova [tmpq+16*%%str], m11
+ mova [tmpq+20*%%str], m10
+ mova [tmpq+24*%%str], m9
+ mova [tmpq+28*%%str], m8
+
+ mova m0, [tmpq+ 3*%%str] ; t15
+ TRANSPOSE8x8W 7, 6, 5, 4, 3, 2, 1, 0, 8
+ mova [tmpq+ 3*%%str], m7
+ mova [tmpq+ 7*%%str], m6
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+15*%%str], m4
+ mova [tmpq+19*%%str], m3
+ mova [tmpq+23*%%str], m2
+ mova [tmpq+27*%%str], m1
+ mova [tmpq+31*%%str], m0
+
+ mova m15, [tmpq+ 2*%%str] ; t8
+ mova m14, [tmpq+ 6*%%str] ; t9
+ mova m13, [tmpq+10*%%str] ; t10
+ mova m12, [tmpq+14*%%str] ; t11
+ mova m11, [tmpq+18*%%str] ; t12
+ mova m10, [tmpq+22*%%str] ; t13
+ mova m9, [tmpq+26*%%str] ; t14
+ mova m8, [tmpq+30*%%str] ; t15
+ mova m7, [tmpq+ 1*%%str] ; t16
+ mova m6, [tmpq+ 5*%%str] ; t17
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+21*%%str] ; t21
+ mova m1, [tmpq+25*%%str] ; t22
+
+ SUMSUB_BA w, 7, 8, 0
+ mova [tmpq+ 2*%%str], m8
+ mova m0, [tmpq+29*%%str] ; t23
+ SUMSUB_BA w, 6, 9, 8
+ SUMSUB_BA w, 5, 10, 8
+ SUMSUB_BA w, 4, 11, 8
+ SUMSUB_BA w, 3, 12, 8
+ SUMSUB_BA w, 2, 13, 8
+ SUMSUB_BA w, 1, 14, 8
+ SUMSUB_BA w, 0, 15, 8
+
+ TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m8, [tmpq+ 2*%%str]
+ TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
+ mova [tmpq+ 2*%%str], m8
+ mova [tmpq+ 6*%%str], m9
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+ mova [tmpq+18*%%str], m12
+ mova [tmpq+22*%%str], m13
+ mova [tmpq+26*%%str], m14
+ mova [tmpq+30*%%str], m15
+%else
+ ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+ ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+ ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
+ ; t20-22 is in m4-6
+ ; t24-31 is in m8-15
+ pxor m7, m7
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+ SUMSUB_BA w, %4, %1, %5
+ SUMSUB_BA w, %3, %2, %5
+ pmulhrsw m%4, [pw_512]
+ pmulhrsw m%3, [pw_512]
+ VP9_STORE_2X %4, %3, %5, %6, %7
+%if %8 == 1
+ add dstq, stride2q
+%endif
+ pmulhrsw m%2, [pw_512]
+ pmulhrsw m%1, [pw_512]
+ VP9_STORE_2X %2, %1, %5, %6, %7, dst_endq
+%if %8 == 1
+ sub dst_endq, stride2q
+%endif
+%endmacro
+
+ ; store t0-1 and t30-31
+ mova m0, [tmpq+ 0*%%str]
+ mova m1, [tmpq+ 4*%%str]
+ %%STORE_2X2 0, 1, 14, 15, 2, 3, 7
+
+ ; store t2-3 and t28-29
+ mova m0, [tmpq+ 8*%%str]
+ mova m1, [tmpq+12*%%str]
+ %%STORE_2X2 0, 1, 12, 13, 2, 3, 7
+
+ ; store t4-5 and t26-27
+ mova m0, [tmpq+16*%%str]
+ mova m1, [tmpq+20*%%str]
+ %%STORE_2X2 0, 1, 10, 11, 2, 3, 7
+
+ ; store t6-7 and t24-25
+ mova m0, [tmpq+24*%%str]
+ mova m1, [tmpq+28*%%str]
+ %%STORE_2X2 0, 1, 8, 9, 2, 3, 7
+
+ ; store t8-9 and t22-23
+ mova m0, [tmpq+ 2*%%str]
+ mova m1, [tmpq+ 6*%%str]
+ mova m8, [tmpq+29*%%str]
+ %%STORE_2X2 0, 1, 6, 8, 2, 3, 7
+
+ ; store t10-11 and t20-21
+ mova m0, [tmpq+10*%%str]
+ mova m1, [tmpq+14*%%str]
+ %%STORE_2X2 0, 1, 4, 5, 2, 3, 7
+
+ ; store t12-13 and t18-19
+ mova m0, [tmpq+18*%%str]
+ mova m1, [tmpq+22*%%str]
+ mova m5, [tmpq+13*%%str]
+ mova m4, [tmpq+ 9*%%str]
+ %%STORE_2X2 0, 1, 4, 5, 2, 3, 7
+
+ ; store t14-17
+ mova m0, [tmpq+26*%%str]
+ mova m1, [tmpq+30*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m4, [tmpq+ 1*%%str]
+ %%STORE_2X2 0, 1, 4, 5, 2, 3, 7, 0
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
+ cmp eobd, 135
+ jg .idctfull
+ cmp eobd, 34
+ jg .idct16x16
+ cmp eobd, 1
+ jg .idct8x8
+
+ ; dc-only case
+ movd m0, [blockq]
+ mova m1, [pw_11585x2]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ SPLATW m0, m0, q0000
+ pmulhrsw m0, [pw_512]
+ pxor m5, m5
+ movd [blockq], m5
+ DEFINE_ARGS dst, stride, block, cnt
+%rep 31
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ add dstq, strideq
+%endrep
+ VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
+ RET
+
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+.idct8x8:
+ mov tmpq, rsp
+ VP9_IDCT32_1D blockq, 1, 8
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ sub stride30q, stride2q ; stride*30
+.loop2_8x8:
+ mov dstq, dst_bakq
+ lea dst_endq, [dst_bakq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 8
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_8x8
+
+ ; at the end of the loop, m7 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 8, m7
+ RET
+
+.idct16x16:
+ mov cntd, 2
+ mov tmpq, rsp
+.loop1_16x16:
+ VP9_IDCT32_1D blockq, 1, 16
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_16x16
+ sub blockq, 32
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_16x16:
+ mov dstq, dst_bakq
+ lea dst_endq, [dst_bakq+stride30q]
+ VP9_IDCT32_1D tmpq, 2, 16
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_16x16
+
+ ; at the end of the loop, m7 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 16, m7
+ RET
+
+.idctfull:
+ mov cntd, 4
+ mov tmpq, rsp
+.loop1_full:
+ VP9_IDCT32_1D blockq, 1
+ add blockq, 16
+ add tmpq, 512
+ dec cntd
+ jg .loop1_full
+ sub blockq, 64
+
+ mov stride30q, strideq ; stride
+ lea stride2q, [strideq*2] ; stride*2
+ shl stride30q, 5 ; stride*32
+ mov cntd, 4
+ mov tmpq, rsp
+ sub stride30q, stride2q ; stride*30
+.loop2_full:
+ mov dstq, dst_bakq
+ lea dst_endq, [dst_bakq+stride30q]
+ VP9_IDCT32_1D tmpq, 2
+ add dst_bakq, 8
+ add tmpq, 16
+ dec cntd
+ jg .loop2_full
+
+ ; at the end of the loop, m7 should still be zero
+ ; use that to zero out block coefficients
+ ZERO_BLOCK blockq, 64, 32, m7
+ RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
+
+%endif ; x86-64
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
new file mode 100644
index 0000000000..e41dd2cd47
--- /dev/null
+++ b/libavcodec/x86/vp9lpf.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%if ARCH_X86_64
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_3
+cextern pb_80
+
+pb_4: times 16 db 0x04
+pb_10: times 16 db 0x10
+pb_40: times 16 db 0x40
+pb_81: times 16 db 0x81
+pb_f8: times 16 db 0xf8
+pb_fe: times 16 db 0xfe
+
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+
+; for the mix functions, two 8-bit thresholds are packed into one 16-bit value;
+; the following mask is used to splat each of them across one half of a register
+mask_mix: times 8 db 0
+ times 8 db 1
+
+mask_mix84: times 8 db 0xff
+ times 8 db 0x00
+mask_mix48: times 8 db 0x00
+ times 8 db 0xff
+
+SECTION .text
+
+; %1 = abs(%2-%3)
+%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
+ psubusb %1, %3, %2
+ psubusb %4, %2, %3
+ por %1, %4
+%endmacro
+
+; %1 = %1<=%2
+%macro CMP_LTE 3-4 ; src/dst, cmp, tmp, pb_80
+%if %0 == 4
+ pxor %1, %4
+%endif
+ pcmpgtb %3, %2, %1 ; cmp > src?
+ pcmpeqb %1, %2 ; cmp == src? XXX: avoid this with a -1/+1 well placed?
+ por %1, %3 ; cmp >= src?
+%endmacro
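+; there is no unsigned byte compare, so operands are biased by 0x80 (pb_80) and
+; compared with the signed pcmpgtb/pcmpeqb; %2 is expected to be biased already,
+; and the optional 4th argument applies the bias to %1.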
+
+; %1 = abs(%2-%3) <= %4
+%macro ABSSUB_CMP 6-7 [pb_80] ; dst, src1, src2, cmp, tmp1, tmp2, [pb_80]
+ ABSSUB %1, %2, %3, %6 ; dst = abs(src1-src2)
+ CMP_LTE %1, %4, %6, %7 ; dst <= cmp
+%endmacro
+
+%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
+ pand %1, %3 ; new &= mask
+ pandn %4, %3, %2 ; tmp = ~mask & old
+ por %1, %4 ; new&mask | old&~mask
+%endmacro
+
+%macro FILTER_SUBx2_ADDx2 8 ; %1=dst %2=h/l %3=cache %4=sub1 %5=sub2 %6=add1 %7=add2 %8=rshift
+ punpck%2bw %1, %4, m0
+ psubw %3, %1
+ punpck%2bw %1, %5, m0
+ psubw %3, %1
+ punpck%2bw %1, %6, m0
+ paddw %3, %1
+ punpck%2bw %1, %7, m0
+ paddw %3, %1
+ mova %1, %3
+ psraw %1, %8
+%endmacro
+
+%macro FILTER_INIT 7-8 ; tmp1, tmp2, cacheL, cacheH, dstp, filterid, [source]
+ FILTER%6_INIT %1, l, %3
+ FILTER%6_INIT %2, h, %4
+ packuswb %1, %2
+%if %0 == 8
+ MASK_APPLY %1, %8, %7, %2
+%else
+ MASK_APPLY %1, %5, %7, %2
+%endif
+ mova %5, %1
+%endmacro
+
+%macro FILTER_UPDATE 11-14 ; tmp1, tmp2, cacheL, cacheH, dstp, -, -, +, +, rshift, [source], [preload reg + value]
+%if %0 == 13 ; no source + preload
+ mova %12, %13
+%elif %0 == 14 ; source + preload
+ mova %13, %14
+%endif
+ FILTER_SUBx2_ADDx2 %1, l, %3, %6, %7, %8, %9, %10
+ FILTER_SUBx2_ADDx2 %2, h, %4, %6, %7, %8, %9, %10
+ packuswb %1, %2
+%if %0 == 12 || %0 == 14
+ MASK_APPLY %1, %12, %11, %2
+%else
+ MASK_APPLY %1, %5, %11, %2
+%endif
+ mova %5, %1
+%endmacro
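+; The flat filters are computed as running sums: FILTER%6_INIT builds the first
+; window sum (kept un-shifted in cacheL/cacheH), and each FILTER_UPDATE derives
+; the next output by subtracting two taps and adding two taps of that sum
+; before the final shift, instead of recomputing the whole sum per pixel.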
+
+%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
+ mova %4, [pb_f8]
+ pand %1, %4
+ pand %2, %4
+ psrlq %1, 3
+ psrlq %2, 3
+ pxor %1, %3
+ pxor %2, %3
+ psubb %1, %3
+ psubb %2, %3
+%endmacro
+
+%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
+ pxor %3, %3
+ pxor %2, %2
+ pcmpgtb %3, %1 ; i8 < 0 mask
+ psubb %2, %1 ; %2 = -i8
+ pand %2, %3 ; keep only the originally negative values (now stored as positive)
+ pandn %3, %1 ; positive values of i8
+%endmacro
+
+; clip_u8(u8 + i8)
+%macro SIGN_ADD 5 ; dst, u8, i8, tmp1, tmp2
+ EXTRACT_POS_NEG %3, %4, %5
+ psubusb %1, %2, %4 ; sub the negatives
+ paddusb %1, %5 ; add the positives
+%endmacro
+
+; clip_u8(u8 - i8)
+%macro SIGN_SUB 5 ; dst, u8, i8, tmp1, tmp2
+ EXTRACT_POS_NEG %3, %4, %5
+ psubusb %1, %2, %5 ; sub the positives
+ paddusb %1, %4 ; add the negatives
+%endmacro
+
+%macro FILTER6_INIT 3 ; %1=dst %2=h/l %3=cache
+ punpck%2bw %3, m14, m0 ; p3: B->W
+ mova %1, %3 ; p3
+ paddw %1, %3 ; p3*2
+ paddw %1, %3 ; p3*3
+ punpck%2bw %3, m15, m0 ; p2: B->W
+ paddw %1, %3 ; p3*3 + p2
+ paddw %1, %3 ; p3*3 + p2*2
+ punpck%2bw %3, m10, m0 ; p1: B->W
+ paddw %1, %3 ; p3*3 + p2*2 + p1
+ punpck%2bw %3, m11, m0 ; p0: B->W
+ paddw %1, %3 ; p3*3 + p2*2 + p1 + p0
+ punpck%2bw %3, m12, m0 ; q0: B->W
+ paddw %1, %3 ; p3*3 + p2*2 + p1 + p0 + q0
+ paddw %1, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4
+ mova %3, %1 ; base for next line (cache)
+ psraw %1, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
+%endmacro
+
+%macro FILTER14_INIT 3 ; %1=dst %2=h/l %3=cache
+ punpck%2bw %1, m2, m0 ; p7: B->W
+ mova %3, %1
+ psllw %1, 3 ; p7*8
+ psubw %1, %3 ; p7*7
+ punpck%2bw %3, m3, m0 ; p6: B->W
+ paddw %1, %3 ; p7*7 + p6
+ paddw %1, %3 ; p7*7 + p6*2
+ punpck%2bw %3, m8, m0 ; p5: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5
+ punpck%2bw %3, m9, m0 ; p4: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5 + p4
+ punpck%2bw %3, m14, m0 ; p3: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5 + p4 + p3
+ punpck%2bw %3, m15, m0 ; p2: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p2
+ punpck%2bw %3, m10, m0 ; p1: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p1
+ punpck%2bw %3, m11, m0 ; p0: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p0
+ punpck%2bw %3, m12, m0 ; q0: B->W
+ paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p0 + q0
+ paddw %1, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
+ mova %3, %1 ; base for next line (cache)
+ psraw %1, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
+%endmacro
+
+%macro TRANSPOSE16x16B 17
+ mova %17, m%16
+ SBUTTERFLY bw, %1, %2, %16
+ SBUTTERFLY bw, %3, %4, %16
+ SBUTTERFLY bw, %5, %6, %16
+ SBUTTERFLY bw, %7, %8, %16
+ SBUTTERFLY bw, %9, %10, %16
+ SBUTTERFLY bw, %11, %12, %16
+ SBUTTERFLY bw, %13, %14, %16
+ mova m%16, %17
+ mova %17, m%14
+ SBUTTERFLY bw, %15, %16, %14
+ SBUTTERFLY wd, %1, %3, %14
+ SBUTTERFLY wd, %2, %4, %14
+ SBUTTERFLY wd, %5, %7, %14
+ SBUTTERFLY wd, %6, %8, %14
+ SBUTTERFLY wd, %9, %11, %14
+ SBUTTERFLY wd, %10, %12, %14
+ SBUTTERFLY wd, %13, %15, %14
+ mova m%14, %17
+ mova %17, m%12
+ SBUTTERFLY wd, %14, %16, %12
+ SBUTTERFLY dq, %1, %5, %12
+ SBUTTERFLY dq, %2, %6, %12
+ SBUTTERFLY dq, %3, %7, %12
+ SBUTTERFLY dq, %4, %8, %12
+ SBUTTERFLY dq, %9, %13, %12
+ SBUTTERFLY dq, %10, %14, %12
+ SBUTTERFLY dq, %11, %15, %12
+ mova m%12, %17
+ mova %17, m%8
+ SBUTTERFLY dq, %12, %16, %8
+ SBUTTERFLY qdq, %1, %9, %8
+ SBUTTERFLY qdq, %2, %10, %8
+ SBUTTERFLY qdq, %3, %11, %8
+ SBUTTERFLY qdq, %4, %12, %8
+ SBUTTERFLY qdq, %5, %13, %8
+ SBUTTERFLY qdq, %6, %14, %8
+ SBUTTERFLY qdq, %7, %15, %8
+ mova m%8, %17
+ mova %17, m%1
+ SBUTTERFLY qdq, %8, %16, %1
+ mova m%1, %17
+ SWAP %2, %9
+ SWAP %3, %5
+ SWAP %4, %13
+ SWAP %6, %11
+ SWAP %8, %15
+ SWAP %12, %14
+%endmacro
+
+; transpose 16 half lines (the low halves of the registers) to 8 full centered lines
+%macro TRANSPOSE16x8B 16
+ punpcklbw m%1, m%2
+ punpcklbw m%3, m%4
+ punpcklbw m%5, m%6
+ punpcklbw m%7, m%8
+ punpcklbw m%9, m%10
+ punpcklbw m%11, m%12
+ punpcklbw m%13, m%14
+ punpcklbw m%15, m%16
+ SBUTTERFLY wd, %1, %3, %2
+ SBUTTERFLY wd, %5, %7, %2
+ SBUTTERFLY wd, %9, %11, %2
+ SBUTTERFLY wd, %13, %15, %2
+ SBUTTERFLY dq, %1, %5, %2
+ SBUTTERFLY dq, %3, %7, %2
+ SBUTTERFLY dq, %9, %13, %2
+ SBUTTERFLY dq, %11, %15, %2
+ SBUTTERFLY qdq, %1, %9, %2
+ SBUTTERFLY qdq, %3, %11, %2
+ SBUTTERFLY qdq, %5, %13, %2
+ SBUTTERFLY qdq, %7, %15, %2
+ SWAP %5, %1
+ SWAP %6, %9
+ SWAP %7, %1
+ SWAP %8, %13
+ SWAP %9, %3
+ SWAP %10, %11
+ SWAP %11, %1
+ SWAP %12, %15
+%endmacro
+
+%macro DEFINE_REAL_P7_TO_Q7 0-1 0
+%define P7 dst1q + 2*mstrideq + %1
+%define P6 dst1q + mstrideq + %1
+%define P5 dst1q + %1
+%define P4 dst1q + strideq + %1
+%define P3 dstq + 4*mstrideq + %1
+%define P2 dstq + mstride3q + %1
+%define P1 dstq + 2*mstrideq + %1
+%define P0 dstq + mstrideq + %1
+%define Q0 dstq + %1
+%define Q1 dstq + strideq + %1
+%define Q2 dstq + 2*strideq + %1
+%define Q3 dstq + stride3q + %1
+%define Q4 dstq + 4*strideq + %1
+%define Q5 dst2q + mstrideq + %1
+%define Q6 dst2q + %1
+%define Q7 dst2q + strideq + %1
+%endmacro
+
+%macro SPLATB_MASK 2
+%if cpuflag(ssse3)
+ pshufb %1, %2
+%else
+ punpcklbw %1, %1
+ punpcklqdq %1, %1
+ pshuflw %1, %1, 0
+ pshufhw %1, %1, 0x55
+%endif
+%endmacro
+
+%macro LOOPFILTER 2 ; %1=v/h %2=size1
+ lea mstrideq, [strideq]
+ neg mstrideq
+
+ lea stride3q, [strideq+2*strideq]
+ mov mstride3q, stride3q
+ neg mstride3q
+
+%ifidn %1, h
+%if %2 > 16
+%define movx movh
+ lea dstq, [dstq + 8*strideq - 4]
+%else
+%define movx movu
+ lea dstq, [dstq + 8*strideq - 8] ; go from top center (h pos) to center left (v pos)
+%endif
+%endif
+
+ lea dst1q, [dstq + 2*mstride3q] ; dst1q = &dst[stride * -6]
+ lea dst2q, [dstq + 2* stride3q] ; dst2q = &dst[stride * +6]
+
+ DEFINE_REAL_P7_TO_Q7
+
+%ifidn %1, h
+ movx m0, [P7]
+ movx m1, [P6]
+ movx m2, [P5]
+ movx m3, [P4]
+ movx m4, [P3]
+ movx m5, [P2]
+ movx m6, [P1]
+ movx m7, [P0]
+ movx m8, [Q0]
+ movx m9, [Q1]
+ movx m10, [Q2]
+ movx m11, [Q3]
+ movx m12, [Q4]
+ movx m13, [Q5]
+ movx m14, [Q6]
+ movx m15, [Q7]
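+ ; in the horizontal case the pixels crossing the edge were loaded one line per
+ ; register; redefine P7..Q7 to point into rsp and transpose the registers into
+ ; those stack slots, so the same filter code below works for both orientations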
+%define P7 rsp + 0
+%define P6 rsp + 16
+%define P5 rsp + 32
+%define P4 rsp + 48
+%define P3 rsp + 64
+%define P2 rsp + 80
+%define P1 rsp + 96
+%define P0 rsp + 112
+%define Q0 rsp + 128
+%define Q1 rsp + 144
+%define Q2 rsp + 160
+%define Q3 rsp + 176
+%define Q4 rsp + 192
+%define Q5 rsp + 208
+%define Q6 rsp + 224
+%define Q7 rsp + 240
+
+%if %2 == 16
+ TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+ mova [P7], m0
+ mova [P6], m1
+ mova [P5], m2
+ mova [P4], m3
+%else
+ TRANSPOSE16x8B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%endif
+ mova [P3], m4
+ mova [P2], m5
+ mova [P1], m6
+ mova [P0], m7
+ mova [Q0], m8
+ mova [Q1], m9
+ mova [Q2], m10
+ mova [Q3], m11
+%if %2 == 16
+ mova [Q4], m12
+ mova [Q5], m13
+ mova [Q6], m14
+ mova [Q7], m15
+%endif
+%endif
+
+ ; calc fm mask
+%if %2 == 16
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m2, I, m0 ; I I I I ...
+ SPLATB_REG m3, E, m0 ; E E E E ...
+%else
+%if cpuflag(ssse3)
+ mova m0, [mask_mix]
+%endif
+ movd m2, Id
+ movd m3, Ed
+ SPLATB_MASK m2, m0
+ SPLATB_MASK m3, m0
+%endif
+ mova m0, [pb_80]
+ pxor m2, m0
+ pxor m3, m0
+%ifidn %1, v
+ mova m8, [P3]
+ mova m9, [P2]
+ mova m10, [P1]
+ mova m11, [P0]
+ mova m12, [Q0]
+ mova m13, [Q1]
+ mova m14, [Q2]
+ mova m15, [Q3]
+%else
+ SWAP 8, 4, 12
+ SWAP 9, 5, 13
+ SWAP 10, 6, 14
+ SWAP 11, 7, 15
+%endif
+ ABSSUB_CMP m5, m8, m9, m2, m6, m7, m0 ; m5 = abs(p3-p2) <= I
+ ABSSUB_CMP m1, m9, m10, m2, m6, m7, m0 ; m1 = abs(p2-p1) <= I
+ pand m5, m1
+ ABSSUB_CMP m1, m10, m11, m2, m6, m7, m0 ; m1 = abs(p1-p0) <= I
+ pand m5, m1
+ ABSSUB_CMP m1, m12, m13, m2, m6, m7, m0 ; m1 = abs(q1-q0) <= I
+ pand m5, m1
+ ABSSUB_CMP m1, m13, m14, m2, m6, m7, m0 ; m1 = abs(q2-q1) <= I
+ pand m5, m1
+ ABSSUB_CMP m1, m14, m15, m2, m6, m7, m0 ; m1 = abs(q3-q2) <= I
+ pand m5, m1
+ ABSSUB m1, m11, m12, m7 ; abs(p0-q0)
+ paddusb m1, m1 ; abs(p0-q0) * 2
+ ABSSUB m2, m10, m13, m7 ; abs(p1-q1)
+ pand m2, [pb_fe] ; drop lsb so shift can work
+ psrlq m2, 1 ; abs(p1-q1)/2
+ paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2
+ pxor m1, m0
+ pcmpgtb m4, m3, m1 ; E > X?
+ pcmpeqb m3, m1 ; E == X?
+ por m3, m4 ; E >= X?
+ pand m3, m5 ; fm final value
+
+ ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
+ ; calc flat8in (if not 44_16) and hev masks
+ mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80
+%if %2 != 44
+ ABSSUB_CMP m2, m8, m11, m6, m4, m5 ; abs(p3 - p0) <= 1
+ mova m8, [pb_80]
+ ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1
+ pand m2, m1
+ ABSSUB m4, m10, m11, m5 ; abs(p1 - p0)
+%if %2 == 16
+%if cpuflag(ssse3)
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, H, m0 ; H H H H ...
+%else
+ movd m7, Hd
+ SPLATB_MASK m7, [mask_mix]
+%endif
+ pxor m7, m8
+ pxor m4, m8
+ pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
+ CMP_LTE m4, m6, m5 ; abs(p1 - p0) <= 1
+ pand m2, m4 ; (flat8in)
+ ABSSUB m4, m13, m12, m1 ; abs(q1 - q0)
+ pxor m4, m8
+ pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition)
+ por m0, m5 ; hev final value
+ CMP_LTE m4, m6, m5 ; abs(q1 - q0) <= 1
+ pand m2, m4 ; (flat8in)
+ ABSSUB_CMP m1, m14, m12, m6, m4, m5, m8 ; abs(q2 - q0) <= 1
+ pand m2, m1
+ ABSSUB_CMP m1, m15, m12, m6, m4, m5, m8 ; abs(q3 - q0) <= 1
+ pand m2, m1 ; flat8in final value
+%if %2 == 84 || %2 == 48
+ pand m2, [mask_mix%2]
+%endif
+%else
+ mova m6, [pb_80]
+ movd m7, Hd
+ SPLATB_MASK m7, [mask_mix]
+ pxor m7, m6
+ ABSSUB m4, m10, m11, m1 ; abs(p1 - p0)
+ pxor m4, m6
+ pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
+ ABSSUB m4, m13, m12, m1 ; abs(q1 - q0)
+ pxor m4, m6
+ pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition)
+ por m0, m5 ; hev final value
+%endif
+
+%if %2 == 16
+ ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
+ ; calc flat8out mask
+ mova m8, [P7]
+ mova m9, [P6]
+ ABSSUB_CMP m1, m8, m11, m6, m4, m5 ; abs(p7 - p0) <= 1
+ ABSSUB_CMP m7, m9, m11, m6, m4, m5 ; abs(p6 - p0) <= 1
+ pand m1, m7
+ mova m8, [P5]
+ mova m9, [P4]
+ ABSSUB_CMP m7, m8, m11, m6, m4, m5 ; abs(p5 - p0) <= 1
+ pand m1, m7
+ ABSSUB_CMP m7, m9, m11, m6, m4, m5 ; abs(p4 - p0) <= 1
+ pand m1, m7
+ mova m14, [Q4]
+ mova m15, [Q5]
+ ABSSUB_CMP m7, m14, m12, m6, m4, m5 ; abs(q4 - q0) <= 1
+ pand m1, m7
+ ABSSUB_CMP m7, m15, m12, m6, m4, m5 ; abs(q5 - q0) <= 1
+ pand m1, m7
+ mova m14, [Q6]
+ mova m15, [Q7]
+ ABSSUB_CMP m7, m14, m12, m6, m4, m5 ; abs(q6 - q0) <= 1
+ pand m1, m7
+ ABSSUB_CMP m7, m15, m12, m6, m4, m5 ; abs(q7 - q0) <= 1
+ pand m1, m7 ; flat8out final value
+%endif
+
+ ; if (fm) {
+ ; if (out && in) filter_14()
+ ; else if (in) filter_6()
+ ; else if (hev) filter_2()
+ ; else filter_4()
+ ; }
+ ;
+ ; f14: fm & out & in
+ ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in
+ ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev
+ ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev
+
+ ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
+ ; filter2()
+%if %2 != 44
+ mova m6, [pb_80] ; already in m6 if 44_16
+%endif
+ pxor m15, m12, m6 ; q0 ^ 0x80
+ pxor m14, m11, m6 ; p0 ^ 0x80
+ psubsb m15, m14 ; (signed) q0 - p0
+ pxor m4, m10, m6 ; p1 ^ 0x80
+ pxor m5, m13, m6 ; q1 ^ 0x80
+ psubsb m4, m5 ; (signed) p1 - q1
+ paddsb m4, m15 ; (q0 - p0) + (p1 - q1)
+ paddsb m4, m15 ; 2*(q0 - p0) + (p1 - q1)
+ paddsb m4, m15 ; 3*(q0 - p0) + (p1 - q1)
+ paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127)
+ paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127)
+ mova m14, [pb_10] ; will be reused in filter4()
+ SRSHIFT3B_2X m6, m4, m14, m7 ; f1 and f2 sign byte shift by 3
+ SIGN_SUB m7, m12, m6, m5, m9 ; m7 = q0 - f1
+ SIGN_ADD m8, m11, m4, m5, m9 ; m8 = p0 + f2
+%if %2 != 44
+ pandn m6, m2, m3 ; ~mask(in) & mask(fm)
+ pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev)
+%else
+ pand m6, m3, m0
+%endif
+ MASK_APPLY m7, m12, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4()
+ MASK_APPLY m8, m11, m6, m5 ; m8 = filter2(p0) & mask / we write it in filter4()
+
+ ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m7..m8: q0' p0', m10..13: p1 p0 q0 q1, m14: pb_10, m15: q0-p0)
+ ; filter4()
+ mova m4, m15
+ paddsb m15, m4 ; 2 * (q0 - p0)
+ paddsb m15, m4 ; 3 * (q0 - p0)
+ paddsb m6, m15, [pb_4] ; m6: f1 = clip(f + 4, 127)
+ paddsb m15, [pb_3] ; m15: f2 = clip(f + 3, 127)
+ SRSHIFT3B_2X m6, m15, m14, m9 ; f1 and f2 sign byte shift by 3
+%if %2 != 44
+%define p0tmp m7
+%define q0tmp m9
+ pandn m5, m2, m3 ; ~mask(in) & mask(fm)
+ pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm))
+%else
+%define p0tmp m1
+%define q0tmp m2
+ pandn m0, m3
+%endif
+ SIGN_SUB q0tmp, m12, m6, m4, m14 ; q0 - f1
+ MASK_APPLY q0tmp, m7, m0, m5 ; filter4(q0) & mask
+ mova [Q0], q0tmp
+ SIGN_ADD p0tmp, m11, m15, m4, m14 ; p0 + f2
+ MASK_APPLY p0tmp, m8, m0, m5 ; filter4(p0) & mask
+ mova [P0], p0tmp
+ paddb m6, [pb_80] ; bias f1 into the unsigned range
+ pxor m8, m8 ; f=(f1+1)>>1
+ pavgb m6, m8 ; unsigned rounding average with 0: (f1+0x80+1)>>1
+ psubb m6, [pb_40] ; remove the residual +0x40 bias -> (f1+1)>>1
+ SIGN_ADD m7, m10, m6, m8, m9 ; p1 + f
+ SIGN_SUB m4, m13, m6, m8, m9 ; q1 - f
+ MASK_APPLY m7, m10, m0, m14 ; m7 = filter4(p1)
+ MASK_APPLY m4, m13, m0, m14 ; m4 = filter4(q1)
+ mova [P1], m7
+ mova [Q1], m4
+
+ ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
+ ; filter6()
+%if %2 != 44
+ pxor m0, m0
+%if %2 > 16
+ pand m3, m2
+%else
+ pand m2, m3 ; mask(fm) & mask(in)
+ pandn m3, m1, m2 ; ~mask(out) & (mask(fm) & mask(in))
+%endif
+ mova m14, [P3]
+ mova m15, [P2]
+ mova m8, [Q2]
+ mova m9, [Q3]
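+ ; filter6 (and filter14 below) keep a running sum of the taps: FILTER_INIT
+ ; seeds it, then each FILTER_UPDATE drops two outgoing taps, adds two
+ ; incoming ones and stores (sum + rounding) >> 3 (>> 4 in filter14)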
+ FILTER_INIT m4, m5, m6, m7, [P2], 6, m3, m15 ; [p2]
+ FILTER_UPDATE m6, m7, m4, m5, [P1], m14, m15, m10, m13, 3, m3 ; [p1] -p3 -p2 +p1 +q1
+ FILTER_UPDATE m4, m5, m6, m7, [P0], m14, m10, m11, m8, 3, m3 ; [p0] -p3 -p1 +p0 +q2
+ FILTER_UPDATE m6, m7, m4, m5, [Q0], m14, m11, m12, m9, 3, m3 ; [q0] -p3 -p0 +q0 +q3
+ FILTER_UPDATE m4, m5, m6, m7, [Q1], m15, m12, m13, m9, 3, m3 ; [q1] -p2 -q0 +q1 +q3
+ FILTER_UPDATE m6, m7, m4, m5, [Q2], m10, m13, m8, m9, 3, m3, m8 ; [q2] -p1 -q1 +q2 +q3
+%endif
+
+ ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
+ ; filter14()
+ ;
+ ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13
+ ;
+ ; q2 q3 p3 p2 p1 p0 q0 q1
+ ; p6 -7 p7 p6 p5 p4 . . . . .
+ ; p5 -6 -p7 -p6 +p5 +q1 . . . .
+ ; p4 -5 -p7 -p5 +p4 +q2 . . . q2
+ ; p3 -4 -p7 -p4 +p3 +q3 . . . q3
+ ; p2 -3 -p7 -p3 +p2 +q4 . . . q4
+ ; p1 -2 -p7 -p2 +p1 +q5 . . . q5
+ ; p0 -1 -p7 -p1 +p0 +q6 . . . q6
+ ; q0 +0 -p7 -p0 +q0 +q7 . . . q7
+ ; q1 +1 -p6 -q0 +q1 +q7 q1 . . .
+ ; q2 +2 -p5 -q1 +q2 +q7 . q2 . .
+ ; q3 +3 -p4 -q2 +q3 +q7 . q3 . .
+ ; q4 +4 -p3 -q3 +q4 +q7 . q4 . .
+ ; q5 +5 -p2 -q4 +q5 +q7 . q5 . .
+ ; q6 +6 -p1 -q5 +q6 +q7 . q6 . .
+
+%if %2 == 16
+ pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
+ mova m2, [P7]
+ mova m3, [P6]
+ mova m8, [P5]
+ mova m9, [P4]
+ FILTER_INIT m4, m5, m6, m7, [P6], 14, m1, m3
+ FILTER_UPDATE m6, m7, m4, m5, [P5], m2, m3, m8, m13, 4, m1, m8 ; [p5] -p7 -p6 +p5 +q1
+ FILTER_UPDATE m4, m5, m6, m7, [P4], m2, m8, m9, m13, 4, m1, m9, m13, [Q2] ; [p4] -p7 -p5 +p4 +q2
+ FILTER_UPDATE m6, m7, m4, m5, [P3], m2, m9, m14, m13, 4, m1, m14, m13, [Q3] ; [p3] -p7 -p4 +p3 +q3
+ FILTER_UPDATE m4, m5, m6, m7, [P2], m2, m14, m15, m13, 4, m1, m13, [Q4] ; [p2] -p7 -p3 +p2 +q4
+ FILTER_UPDATE m6, m7, m4, m5, [P1], m2, m15, m10, m13, 4, m1, m13, [Q5] ; [p1] -p7 -p2 +p1 +q5
+ FILTER_UPDATE m4, m5, m6, m7, [P0], m2, m10, m11, m13, 4, m1, m13, [Q6] ; [p0] -p7 -p1 +p0 +q6
+ FILTER_UPDATE m6, m7, m4, m5, [Q0], m2, m11, m12, m13, 4, m1, m13, [Q7] ; [q0] -p7 -p0 +q0 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q1], m3, m12, m2, m13, 4, m1, m2, [Q1] ; [q1] -p6 -q0 +q1 +q7
+ FILTER_UPDATE m6, m7, m4, m5, [Q2], m8, m2, m3, m13, 4, m1, m3, [Q2] ; [q2] -p5 -q1 +q2 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q3], m9, m3, m8, m13, 4, m1, m8, m8, [Q3] ; [q3] -p4 -q2 +q3 +q7
+ FILTER_UPDATE m6, m7, m4, m5, [Q4], m14, m8, m9, m13, 4, m1, m9, m9, [Q4] ; [q4] -p3 -q3 +q4 +q7
+ FILTER_UPDATE m4, m5, m6, m7, [Q5], m15, m9, m14, m13, 4, m1, m14, m14, [Q5] ; [q5] -p2 -q4 +q5 +q7
+ FILTER_UPDATE m6, m7, m4, m5, [Q6], m10, m14, m15, m13, 4, m1, m15, m15, [Q6] ; [q6] -p1 -q5 +q6 +q7
+%endif
+
+%ifidn %1, h
+%if %2 == 16
+ mova m0, [P7]
+ mova m1, [P6]
+ mova m2, [P5]
+ mova m3, [P4]
+ mova m4, [P3]
+ mova m5, [P2]
+ mova m6, [P1]
+ mova m7, [P0]
+ mova m8, [Q0]
+ mova m9, [Q1]
+ mova m10, [Q2]
+ mova m11, [Q3]
+ mova m12, [Q4]
+ mova m13, [Q5]
+ mova m14, [Q6]
+ mova m15, [Q7]
+ TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+ DEFINE_REAL_P7_TO_Q7
+ movu [P7], m0
+ movu [P6], m1
+ movu [P5], m2
+ movu [P4], m3
+ movu [P3], m4
+ movu [P2], m5
+ movu [P1], m6
+ movu [P0], m7
+ movu [Q0], m8
+ movu [Q1], m9
+ movu [Q2], m10
+ movu [Q3], m11
+ movu [Q4], m12
+ movu [Q5], m13
+ movu [Q6], m14
+ movu [Q7], m15
+%elif %2 == 44
+ SWAP 0, 7 ; m0 = p1
+ SWAP 3, 4 ; m3 = q1
+ DEFINE_REAL_P7_TO_Q7 2
+ SBUTTERFLY bw, 0, 1, 8
+ SBUTTERFLY bw, 2, 3, 8
+ SBUTTERFLY wd, 0, 2, 8
+ SBUTTERFLY wd, 1, 3, 8
+ SBUTTERFLY dq, 0, 4, 8
+ SBUTTERFLY dq, 1, 5, 8
+ SBUTTERFLY dq, 2, 6, 8
+ SBUTTERFLY dq, 3, 7, 8
+ movd [P7], m0
+ punpckhqdq m0, m8
+ movd [P6], m0
+ movd [Q0], m1
+ punpckhqdq m1, m9
+ movd [Q1], m1
+ movd [P3], m2
+ punpckhqdq m2, m10
+ movd [P2], m2
+ movd [Q4], m3
+ punpckhqdq m3, m11
+ movd [Q5], m3
+ movd [P5], m4
+ punpckhqdq m4, m12
+ movd [P4], m4
+ movd [Q2], m5
+ punpckhqdq m5, m13
+ movd [Q3], m5
+ movd [P1], m6
+ punpckhqdq m6, m14
+ movd [P0], m6
+ movd [Q6], m7
+ punpckhqdq m7, m8
+ movd [Q7], m7
+%else
+ ; the following code does a transpose of 8 full lines to 16 half
+ ; lines (high part). It is inlined to avoid the need for a staging area
+ mova m0, [P3]
+ mova m1, [P2]
+ mova m2, [P1]
+ mova m3, [P0]
+ mova m4, [Q0]
+ mova m5, [Q1]
+ mova m6, [Q2]
+ mova m7, [Q3]
+ DEFINE_REAL_P7_TO_Q7
+ SBUTTERFLY bw, 0, 1, 8
+ SBUTTERFLY bw, 2, 3, 8
+ SBUTTERFLY bw, 4, 5, 8
+ SBUTTERFLY bw, 6, 7, 8
+ SBUTTERFLY wd, 0, 2, 8
+ SBUTTERFLY wd, 1, 3, 8
+ SBUTTERFLY wd, 4, 6, 8
+ SBUTTERFLY wd, 5, 7, 8
+ SBUTTERFLY dq, 0, 4, 8
+ SBUTTERFLY dq, 1, 5, 8
+ SBUTTERFLY dq, 2, 6, 8
+ SBUTTERFLY dq, 3, 7, 8
+ movh [P7], m0
+ punpckhqdq m0, m8
+ movh [P6], m0
+ movh [Q0], m1
+ punpckhqdq m1, m9
+ movh [Q1], m1
+ movh [P3], m2
+ punpckhqdq m2, m10
+ movh [P2], m2
+ movh [Q4], m3
+ punpckhqdq m3, m11
+ movh [Q5], m3
+ movh [P5], m4
+ punpckhqdq m4, m12
+ movh [P4], m4
+ movh [Q2], m5
+ punpckhqdq m5, m13
+ movh [Q3], m5
+ movh [P1], m6
+ punpckhqdq m6, m14
+ movh [P0], m6
+ movh [Q6], m7
+ punpckhqdq m7, m8
+ movh [Q7], m7
+%endif
+%endif
+
+ RET
+%endmacro
+
+%macro LPF_16_VH 2
+INIT_XMM %2
+cglobal vp9_loop_filter_v_%1_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
+ LOOPFILTER v, %1
+cglobal vp9_loop_filter_h_%1_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
+ LOOPFILTER h, %1
+%endmacro
+
+%macro LPF_16_VH_ALL_OPTS 1
+LPF_16_VH %1, sse2
+LPF_16_VH %1, ssse3
+LPF_16_VH %1, avx
+%endmacro
+
+LPF_16_VH_ALL_OPTS 16
+LPF_16_VH_ALL_OPTS 44
+LPF_16_VH_ALL_OPTS 48
+LPF_16_VH_ALL_OPTS 84
+LPF_16_VH_ALL_OPTS 88
+
+%endif ; x86-64
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9mc.asm
index 6488f3092d..154ab2b4f9 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -1,22 +1,22 @@
;******************************************************************************
-;* VP9 SIMD optimizations
+;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -87,7 +87,7 @@ SECTION .text
%macro filter_h_fn 1
%assign %%px mmsize/2
-cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
+cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
mova m6, [pw_256]
mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
@@ -145,30 +145,85 @@ INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg
+%if ARCH_X86_64
+%macro filter_hx2_fn 1
+%assign %%px mmsize
+cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
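+; full-register-width (16-pixel) horizontal 8-tap filter: the eight source
+; phases at offsets -3..+4 are interleaved in pairs, multiplied against the
+; packed tap pairs with pmaddubsw, and pmulhrsw by pw_256 does the final
+; rounding shift by 7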
+ mova m13, [pw_256]
+ mova m8, [filteryq+ 0]
+ mova m9, [filteryq+16]
+ mova m10, [filteryq+32]
+ mova m11, [filteryq+48]
+.loop:
+ movu m0, [srcq-3]
+ movu m1, [srcq-2]
+ movu m2, [srcq-1]
+ movu m3, [srcq+0]
+ movu m4, [srcq+1]
+ movu m5, [srcq+2]
+ movu m6, [srcq+3]
+ movu m7, [srcq+4]
+ add srcq, sstrideq
+ SBUTTERFLY bw, 0, 1, 12
+ SBUTTERFLY bw, 2, 3, 12
+ SBUTTERFLY bw, 4, 5, 12
+ SBUTTERFLY bw, 6, 7, 12
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m0, m2
+ paddw m1, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddsw m0, m4
+ paddsw m1, m5
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+%ifidn %1, avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%endif ; ARCH_X86_64
+
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3
+cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3
+cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
- sub srcq, sstrideq
- lea sstride3q, [sstrideq*3]
- sub srcq, sstrideq
mova m6, [pw_256]
- sub srcq, sstrideq
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
mova m7, [filteryq+ 0]
- lea src4q, [srcq+sstrideq*4]
%if ARCH_X86_64 && mmsize > 8
mova m8, [filteryq+16]
mova m9, [filteryq+32]
mova m10, [filteryq+48]
%endif
.loop:
- ; FIXME maybe reuse loads from previous rows, or just more generally
- ; unroll this to prevent multiple loads of the same data?
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
movh m0, [srcq]
movh m1, [srcq+sstrideq]
movh m2, [srcq+sstrideq*2]
@@ -219,6 +274,70 @@ INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1
+%assign %%px mmsize
+cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
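+; full-register-width (16-pixel) vertical 8-tap filter: eight consecutive source
+; rows are interleaved in pairs, multiplied against the packed tap pairs with
+; pmaddubsw, and pmulhrsw by pw_256 does the final rounding shift by 7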
+ mova m13, [pw_256]
+ lea sstride3q, [sstrideq*3]
+ lea src4q, [srcq+sstrideq]
+ sub srcq, sstride3q
+ mova m8, [filteryq+ 0]
+ mova m9, [filteryq+16]
+ mova m10, [filteryq+32]
+ mova m11, [filteryq+48]
+.loop:
+ ; FIXME maybe reuse loads from previous rows, or just
+ ; more generally unroll this to prevent multiple loads of
+ ; the same data?
+ movu m0, [srcq]
+ movu m1, [srcq+sstrideq]
+ movu m2, [srcq+sstrideq*2]
+ movu m3, [srcq+sstride3q]
+ movu m4, [src4q]
+ movu m5, [src4q+sstrideq]
+ movu m6, [src4q+sstrideq*2]
+ movu m7, [src4q+sstride3q]
+ add srcq, sstrideq
+ add src4q, sstrideq
+ SBUTTERFLY bw, 0, 1, 12
+ SBUTTERFLY bw, 2, 3, 12
+ SBUTTERFLY bw, 4, 5, 12
+ SBUTTERFLY bw, 6, 7, 12
+ pmaddubsw m0, m8
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ pmaddubsw m4, m10
+ pmaddubsw m5, m10
+ pmaddubsw m6, m11
+ pmaddubsw m7, m11
+ paddw m0, m2
+ paddw m1, m3
+ paddw m4, m6
+ paddw m5, m7
+ paddsw m0, m4
+ paddsw m1, m5
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+%ifidn %1, avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dstrideq
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%endif ; ARCH_X86_64
+
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
@@ -229,11 +348,11 @@ filter_v_fn avg
%endif
%if %2 <= 16
-cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3
+cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
lea sstride3q, [sstrideq*3]
lea dstride3q, [dstrideq*3]
%else
-cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
+cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
%%srcfn m0, [srcq]
@@ -262,7 +381,7 @@ cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
-INIT_MMX sse
+INIT_MMX mmxext
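+; the avg variants use pavgb, which on MMX registers is an MMXEXT instruction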
fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
INIT_XMM sse
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 2f064cad7b..25e833fef3 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -65,6 +65,13 @@ wrap(avcodec_encode_audio2(AVCodecContext *avctx,
got_packet_ptr);
}
+wrap(avcodec_encode_video(AVCodecContext *avctx,
+ uint8_t *buf, int buf_size,
+ const AVFrame *pict))
+{
+ testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
+}
+
wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
uint8_t *buf, int buf_size,
const AVSubtitle *sub))