Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/FFmpeg.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ronald S. Bultje <rsbultje@gmail.com> 2015-09-26 00:24:07 +0300
committer Ronald S. Bultje <rsbultje@gmail.com> 2015-10-03 21:42:39 +0300
commit 26ece7a511f8905a5ddfc19c7cd4ecdca7056138 (patch)
tree 1efff584577934d49bd98d2089896f7c695ffce7 /libavcodec/x86
parent db7786e8ffa2c8f5c7da062054962ca81cf09349 (diff)
vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/Makefile 1
-rw-r--r-- libavcodec/x86/constants.c 4
-rw-r--r-- libavcodec/x86/constants.h 2
-rw-r--r-- libavcodec/x86/h264_idct_10bit.asm 5
-rw-r--r-- libavcodec/x86/h264_intrapred_10bit.asm 2
-rw-r--r-- libavcodec/x86/vp9dsp_init.h 23
-rw-r--r-- libavcodec/x86/vp9dsp_init_16bpp.c 15
-rw-r--r-- libavcodec/x86/vp9dsp_init_16bpp_template.c 7
-rw-r--r-- libavcodec/x86/vp9intrapred_16bpp.asm 615
9 files changed, 669 insertions, 5 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 01e5f18783..5ff3a77e37 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -158,6 +158,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
+ x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 9f3c8b4165..19345f56e4 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -81,3 +81,7 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x800
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+ 0x0000001000000010ULL, 0x0000001000000010ULL }; /* 32-byte aligned; each 64-bit literal packs two 32-bit values of 16 (0x10) */
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+ 0x0000002000000020ULL, 0x0000002000000020ULL }; /* 32-byte aligned; each 64-bit literal packs two 32-bit values of 32 (0x20) */
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 37a1869641..4a2451d520 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -63,5 +63,7 @@ extern const uint64_t ff_pb_FC;
extern const xmm_reg ff_ps_neg;
extern const ymm_reg ff_pd_1;
+extern const ymm_reg ff_pd_16;
+extern const ymm_reg ff_pd_32;
#endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index cc115b0ff9..f1c2c81ef8 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -24,14 +24,11 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-
-pd_32: times 4 dd 32
-
SECTION .text
cextern pw_1023
%define pw_pixel_max pw_1023
+cextern pd_32
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 9aeb70242b..9e40cfe24b 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -34,11 +34,11 @@ cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
+cextern pd_16
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
pd_17: times 4 dd 17
-pd_16: times 4 dd 16
SECTION .text
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index d1a9514c09..47d22461ae 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+#define decl_ipred_fn(type, sz, bpp, opt) /* prototype of ff_vp9_ipred_<type>_<sz>x<sz>_<bpp>_<opt> */ \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ const uint8_t *l, \
+ const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) /* declare all four block sizes; 4x4 takes its own opt suffix */ \
+decl_ipred_fn(type, 4, bpp, opt4); \
+decl_ipred_fn(type, 8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
+#define cat(a, bpp, b) a##bpp##b /* token-paste helper; callers route bpp through this extra macro level so a macro argument (e.g. BPC) is expanded before pasting */
+
+#define init_ipred_func(type, enum, sz, bpp, opt) /* store one predictor in dsp->intra_pred[TX_<sz>X<sz>][<enum>_PRED]; expects a variable named dsp in scope */ \
+ dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+ cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) /* install the 8x8, 16x16 and 32x32 variants sharing one opt suffix (4x4 is set separately) */ \
+ init_ipred_func(type, enum, 8, bpp, opt); \
+ init_ipred_func(type, enum, 16, bpp, opt); \
+ init_ipred_func(type, enum, 32, bpp, opt)
+
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index bd61e24288..f4a4a5d891 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -46,6 +46,11 @@ decl_fpel_func(avg, 32, _16, avx2);
decl_fpel_func(avg, 64, _16, avx2);
decl_fpel_func(avg, 128, _16, avx2);
+decl_ipred_fns(v, 16, mmx, sse);
+decl_ipred_fns(h, 16, mmxext, sse2);
+decl_ipred_fns(dc, 16, mmxext, sse2);
+decl_ipred_fns(dc_top, 16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
#endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel_func(4, 0, 8, put, , mmx);
+ init_ipred_func(v, VERT, 4, 16, mmx);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
init_fpel_func(4, 1, 8, avg, _16, mmxext);
+ init_ipred_func(h, HOR, 4, 16, mmxext);
+ init_ipred_func(dc, DC, 4, 16, mmxext);
+ init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext);
+ init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
@@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 0, 32, put, , sse);
init_fpel_func(1, 0, 64, put, , sse);
init_fpel_func(0, 0, 128, put, , sse);
+ init_8_16_32_ipred_funcs(v, VERT, 16, sse);
}
if (EXTERNAL_SSE2(cpu_flags)) {
@@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 1, 32, avg, _16, sse2);
init_fpel_func(1, 1, 64, avg, _16, sse2);
init_fpel_func(0, 1, 128, avg, _16, sse2);
+ init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+ init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
+ init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 56cd79e7a4..f486caf1a1 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
lpf_mix2_wrappers_set(BPC, sse2);
lpf_mix2_wrappers_set(BPC, ssse3);
lpf_mix2_wrappers_set(BPC, avx);
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
#endif /* HAVE_YASM */
av_cold void INIT_FUNC(VP9DSPContext *dsp)
@@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+ }
+
if (EXTERNAL_SSE2(cpu_flags)) {
init_subpel3(0, put, BPC, sse2);
init_subpel3(1, avg, BPC, sse2);
init_lpf_funcs(BPC, sse2);
+ init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000000..018d92de58
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,615 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_2: times 8 dd 2
+pd_4: times 8 dd 4
+pd_8: times 8 dd 8
+
+cextern pw_1
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+
+SECTION .text
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a ; vertical prediction: every output row is a copy of the row at a
+ movifnidn aq, amp ; only 2 args are declared as loaded, so fetch a explicitly when it is not already in its register
+ mova m0, [aq] ; m0 = one vector read from the above row
+ DEFINE_ARGS dst, stride, stride3 ; l/a names are dropped; third register is reused as stride*3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0 ; rows 0..3 all receive the same vector
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a ; vertical prediction: replicate the row at a into 8 output rows
+ movifnidn aq, amp ; a is not auto-loaded (2 args declared), fetch it if needed
+ mova m0, [aq] ; m0 = one vector read from the above row
+ DEFINE_ARGS dst, stride, stride3 ; third register is reused as stride*3
+ lea stride3q, [strideq*3]
+ mova [dstq+strideq*0], m0 ; rows 0..3
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ mova [dstq+strideq*0], m0 ; rows 4..7
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a ; vertical prediction: replicate two vectors from a into 16 rows
+ movifnidn aq, amp ; a is not auto-loaded (2 args declared), fetch it if needed
+ mova m0, [aq] ; first vector of the above row
+ mova m1, [aq+mmsize] ; second vector of the above row
+ DEFINE_ARGS dst, stride, stride3, cnt ; l/a registers are reused as stride*3 and the loop counter
+ lea stride3q, [strideq*3]
+ mov cntd, 4 ; 4 iterations x 4 rows per iteration
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec cntd
+ jg .loop ; repeat while cnt > 0
+ RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a ; vertical prediction: replicate four vectors from a into 32 rows
+ movifnidn aq, amp ; a is not auto-loaded (2 args declared), fetch it if needed
+ mova m0, [aq+mmsize*0] ; m0..m3 = four consecutive vectors of the above row
+ mova m1, [aq+mmsize*1]
+ mova m2, [aq+mmsize*2]
+ mova m3, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt ; third register is reused as the loop counter
+ mov cntd, 16 ; 16 iterations x 2 rows per iteration
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m2
+ mova [dstq+strideq*1+48], m3
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ dec cntd
+ jg .loop ; repeat while cnt > 0
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a ; horizontal prediction: each row is filled with one word taken from l
+ mova m3, [lq] ; four 16-bit words of the left edge
+ DEFINE_ARGS dst, stride, stride3 ; l has been consumed; its register becomes stride*3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333 ; broadcast word 3 -> row 0 (l is consumed from its last word to its first)
+ pshufw m1, m3, q2222 ; word 2 -> row 1
+ pshufw m2, m3, q1111 ; word 1 -> row 2
+ pshufw m3, m3, q0000 ; word 0 -> row 3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a ; horizontal prediction: each of 8 rows is filled with one word taken from l
+ mova m2, [lq] ; eight 16-bit words of the left edge
+ DEFINE_ARGS dst, stride, stride3 ; l has been consumed; its register becomes stride*3
+ lea stride3q, [strideq*3]
+ punpckhwd m3, m2, m2 ; duplicate words 4..7 into word pairs so pshufd can broadcast a single word
+ pshufd m0, m3, q3333 ; word 7 -> row 0
+ pshufd m1, m3, q2222 ; word 6 -> row 1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m3, q1111 ; word 5 -> row 2
+ pshufd m1, m3, q0000 ; word 4 -> row 3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ punpcklwd m2, m2 ; duplicate words 0..3 into word pairs
+ pshufd m0, m2, q3333 ; word 3 -> row 4
+ pshufd m1, m2, q2222 ; word 2 -> row 5
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufd m0, m2, q1111 ; word 1 -> row 6
+ pshufd m1, m2, q0000 ; word 0 -> row 7
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt ; horizontal prediction, 16 rows, 4 rows per loop iteration
+ mov cntd, 3 ; cnt runs 3,2,1,0 and indexes 8-byte groups of l, starting from the last group
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8] ; four words of l for this group of rows
+ punpcklwd m3, m3 ; duplicate each word into a pair so pshufd can broadcast it
+ pshufd m0, m3, q3333 ; highest word of the group -> first row written
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000 ; lowest word of the group -> fourth row written
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec cntd
+ jge .loop ; repeat while cnt >= 0 (4 iterations in total)
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt ; horizontal prediction, 32 rows, 4 rows per loop iteration
+ mov cntd, 7 ; cnt runs 7..0 and indexes 8-byte groups of l, starting from the last group
+ lea stride3q, [strideq*3]
+.loop:
+ movh m3, [lq+cntq*8] ; four words of l for this group of rows
+ punpcklwd m3, m3 ; duplicate each word into a pair so pshufd can broadcast it
+ pshufd m0, m3, q3333 ; highest word of the group -> first row written
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000 ; lowest word of the group -> fourth row written
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m1
+ mova [dstq+strideq*1+32], m1
+ mova [dstq+strideq*1+48], m1
+ mova [dstq+strideq*2+ 0], m2
+ mova [dstq+strideq*2+16], m2
+ mova [dstq+strideq*2+32], m2
+ mova [dstq+strideq*2+48], m2
+ mova [dstq+stride3q + 0], m3
+ mova [dstq+stride3q +16], m3
+ mova [dstq+stride3q +32], m3
+ mova [dstq+stride3q +48], m3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec cntd
+ jge .loop ; repeat while cnt >= 0 (8 iterations in total)
+ RET
+
+INIT_MMX mmxext
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a ; DC prediction: fill the block with the rounded mean of the words read from l and a
+ mova m0, [lq] ; four left words
+ paddw m0, [aq] ; + four above words, lane by lane
+ DEFINE_ARGS dst, stride, stride3 ; l/a consumed; third register becomes stride*3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into two dwords (pw_1 is external; presumably all-ones words)
+ pshufw m1, m0, q3232 ; m1 = high dword copied to both halves
+ paddd m0, [pd_4] ; rounding term for the >>3 below
+ paddd m0, m1 ; low dword = total sum + 4
+ psrad m0, 3 ; divide by the 8 summed samples
+ pshufw m0, m0, q0000 ; broadcast the low word of the result to all lanes
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a ; DC prediction: fill 8 rows with the rounded mean of the words read from l and a
+ mova m0, [lq] ; eight left words
+ paddw m0, [aq] ; + eight above words, lane by lane
+ DEFINE_ARGS dst, stride, stride3 ; l/a consumed; third register becomes stride*3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into four dwords (pw_1 is external; presumably all-ones words)
+ pshufd m1, m0, q3232 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, q1111 ; fold dword 1 onto dword 0
+ paddd m0, [pd_8] ; rounding term for the >>4 below
+ paddd m0, m1 ; low dword = total sum + 8
+ psrad m0, 4 ; divide by the 16 summed samples
+ pshuflw m0, m0, q0000 ; broadcast the low word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+ mova [dstq+strideq*0], m0 ; rows 0..3
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ mova [dstq+strideq*0], m0 ; rows 4..7
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a ; DC prediction: fill 16 rows with the rounded mean of the words read from l and a
+ mova m0, [lq] ; accumulate two vectors of l and two of a as word lanes
+ paddw m0, [lq+mmsize]
+ paddw m0, [aq]
+ paddw m0, [aq+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt ; l/a consumed; their registers become stride*3 and the loop counter
+ lea stride3q, [strideq*3]
+ mov cntd, 4 ; 4 iterations x 4 rows per iteration
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into four dwords (pw_1 is external; presumably all-ones words)
+ pshufd m1, m0, q3232 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, q1111 ; fold dword 1 onto dword 0
+ paddd m0, [pd_16] ; rounding term for the >>5 below (pd_16 is external)
+ paddd m0, m1 ; low dword = total sum + rounding
+ psrad m0, 5 ; divide by the 32 summed samples
+ pshuflw m0, m0, q0000 ; broadcast the low word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec cntd
+ jg .loop ; repeat while cnt > 0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a ; DC prediction: fill 32 rows with the rounded mean of the words read from l and a
+ mova m0, [lq+mmsize*0] ; accumulate four vectors of l and four of a as word lanes
+ paddw m0, [lq+mmsize*1]
+ paddw m0, [lq+mmsize*2]
+ paddw m0, [lq+mmsize*3]
+ paddw m0, [aq+mmsize*0]
+ paddw m0, [aq+mmsize*1]
+ paddw m0, [aq+mmsize*2]
+ paddw m0, [aq+mmsize*3]
+ DEFINE_ARGS dst, stride, stride3, cnt ; l/a consumed; their registers are renamed (stride3 is set but the stores below only use strideq)
+ lea stride3q, [strideq*3]
+ mov cntd, 16 ; 16 iterations x 2 rows per iteration
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into four dwords (pw_1 is external; presumably all-ones words)
+ pshufd m1, m0, q3232 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, q1111 ; fold dword 1 onto dword 0
+ paddd m0, [pd_32] ; rounding term for the >>6 below (pd_32 is external)
+ paddd m0, m1 ; low dword = total sum + rounding
+ psrad m0, 6 ; divide by the 64 summed samples
+ pshuflw m0, m0, q0000 ; broadcast the low word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ dec cntd
+ jg .loop ; repeat while cnt > 0
+ RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext ; macro: emits the one-edge DC predictors; first argument is the name suffix, second is the pointer register that is averaged
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a ; fill 4 rows with the rounded mean of four edge words
+ mova m0, [%2] ; four words of the selected edge
+ DEFINE_ARGS dst, stride, stride3 ; edge already loaded; third register becomes stride*3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into two dwords (pw_1 is external; presumably all-ones words)
+ pshufw m1, m0, q3232 ; m1 = high dword copied to both halves
+ paddd m0, [pd_2] ; rounding term for the >>2 below
+ paddd m0, m1 ; low dword = total sum + 2
+ psrad m0, 2 ; divide by the 4 summed samples
+ pshufw m0, m0, q0000 ; broadcast the low word of the result to all lanes
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a ; fill 8 rows with the rounded mean of eight edge words
+ mova m0, [%2] ; eight words of the selected edge
+ DEFINE_ARGS dst, stride, stride3 ; edge already loaded; third register becomes stride*3
+ lea stride3q, [strideq*3]
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into four dwords
+ pshufd m1, m0, q3232 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, q1111 ; fold dword 1 onto dword 0
+ paddd m0, [pd_4] ; rounding term for the >>3 below
+ paddd m0, m1 ; low dword = total sum + 4
+ psrad m0, 3 ; divide by the 8 summed samples
+ pshuflw m0, m0, q0000 ; broadcast the low word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+ mova [dstq+strideq*0], m0 ; rows 0..3
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ mova [dstq+strideq*0], m0 ; rows 4..7
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a ; fill 16 rows with the rounded mean of sixteen edge words
+ mova m0, [%2] ; accumulate two vectors of the selected edge as word lanes
+ paddw m0, [%2+mmsize]
+ DEFINE_ARGS dst, stride, stride3, cnt ; edge already loaded; registers become stride*3 and the loop counter
+ lea stride3q, [strideq*3]
+ mov cntd, 4 ; 4 iterations x 4 rows per iteration
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into four dwords
+ pshufd m1, m0, q3232 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, q1111 ; fold dword 1 onto dword 0
+ paddd m0, [pd_8] ; rounding term for the >>4 below
+ paddd m0, m1 ; low dword = total sum + 8
+ psrad m0, 4 ; divide by the 16 summed samples
+ pshuflw m0, m0, q0000 ; broadcast the low word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*2+ 0], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q + 0], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec cntd
+ jg .loop ; repeat while cnt > 0
+ RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a ; fill 32 rows with the rounded mean of thirty-two edge words
+ mova m0, [%2+mmsize*0] ; accumulate four vectors of the selected edge as word lanes
+ paddw m0, [%2+mmsize*1]
+ paddw m0, [%2+mmsize*2]
+ paddw m0, [%2+mmsize*3]
+ DEFINE_ARGS dst, stride, cnt ; edge already loaded; third register becomes the loop counter
+ mov cntd, 16 ; 16 iterations x 2 rows per iteration
+ pmaddwd m0, [pw_1] ; add adjacent word pairs into four dwords
+ pshufd m1, m0, q3232 ; fold the upper two dwords onto the lower two
+ paddd m0, m1
+ pshufd m1, m0, q1111 ; fold dword 1 onto dword 0
+ paddd m0, [pd_16] ; rounding term for the >>5 below (pd_16 is external)
+ paddd m0, m1 ; low dword = total sum + rounding
+ psrad m0, 5 ; divide by the 32 summed samples
+ pshuflw m0, m0, q0000 ; broadcast the low word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+.loop:
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m0
+ mova [dstq+strideq*0+32], m0
+ mova [dstq+strideq*0+48], m0
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+16], m0
+ mova [dstq+strideq*1+32], m0
+ mova [dstq+strideq*1+48], m0
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ dec cntd
+ jg .loop ; repeat while cnt > 0
+ RET
+%endmacro
+
+DC_1D_FNS top, aq ; dc_top_* variants: average the words read through aq
+DC_1D_FNS left, lq ; dc_left_* variants: average the words read through lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a ; TM prediction: row = clamp(left word + above row - word preceding a, 0, max)
+ mova m5, [pw_1023] ; upper clamp bound for this variant (pw_1023 is external)
+.body: ; shared entry point: the _12 variant jumps here with its own bound already in m5
+ mova m4, [aq] ; four above words
+ mova m3, [lq] ; four left words
+ movd m0, [aq-4] ; 4 bytes ending just before a; word 1 of this load is the word at a-2
+ pshufw m0, m0, q1111 ; broadcast that word (the one immediately preceding the above row)
+ psubw m4, m0 ; m4 = above - preceding word, shared by all rows
+ DEFINE_ARGS dst, stride, stride3 ; l/a consumed; third register becomes stride*3
+ lea stride3q, [strideq*3]
+ pshufw m0, m3, q3333 ; left word 3 -> row 0 (l is consumed from its last word to its first)
+ pshufw m1, m3, q2222 ; left word 2 -> row 1
+ pshufw m2, m3, q1111 ; left word 1 -> row 2
+ pshufw m3, m3, q0000 ; left word 0 -> row 3
+ paddw m0, m4 ; add the per-column delta to each row
+ paddw m1, m4
+ paddw m2, m4
+ paddw m3, m4
+ pxor m4, m4 ; m4 = 0, lower clamp bound
+ pmaxsw m0, m4 ; clamp below at 0 (signed compare)
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ pminsw m0, m5 ; clamp above at the bound in m5
+ pminsw m1, m5
+ pminsw m2, m5
+ pminsw m3, m5
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a ; same as the _10 variant except for the upper clamp bound
+ mova m5, [pw_4095] ; upper clamp bound for this variant (pw_4095 is external); the jump below reuses the _10 body
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a ; TM prediction: row = clamp(left word + above row - word preceding a, 0, max)
+ mova m4, [pw_1023] ; upper clamp bound for this variant (pw_1023 is external)
+.body: ; shared entry point: the _12 variant jumps here with its own bound already in m4
+ pxor m6, m6 ; m6 = 0, lower clamp bound
+ mova m5, [aq] ; eight above words
+ movd m0, [aq-4] ; 4 bytes ending just before a; word 1 of this load is the word at a-2
+ pshuflw m0, m0, q1111 ; broadcast that word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+ psubw m5, m0 ; m5 = above - preceding word, shared by all rows
+ DEFINE_ARGS dst, stride, l, stride3, cnt ; a consumed; its register becomes stride*3, plus a fifth register for the counter
+ lea stride3q, [strideq*3]
+ mov cntd, 1 ; cnt runs 1,0 and indexes 8-byte groups of l, last group first
+.loop:
+ movh m3, [lq+cntq*8] ; four left words for this group of rows
+ punpcklwd m3, m3 ; duplicate each word into a pair so pshufd can broadcast it
+ pshufd m0, m3, q3333 ; highest word of the group -> first row written
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000 ; lowest word of the group -> fourth row written
+ paddw m0, m5 ; add the per-column delta to each row
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ pmaxsw m0, m6 ; clamp below at 0 (signed compare)
+ pmaxsw m1, m6
+ pmaxsw m2, m6
+ pmaxsw m3, m6
+ pminsw m0, m4 ; clamp above at the bound in m4
+ pminsw m1, m4
+ pminsw m2, m4
+ pminsw m3, m4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec cntd
+ jge .loop ; repeat while cnt >= 0 (2 iterations in total)
+ RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a ; same as the _10 variant except for the upper clamp bound
+ mova m4, [pw_4095] ; upper clamp bound for this variant (pw_4095 is external); the jump below reuses the _10 body
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a ; TM prediction: row = clamp(left word + above row - word preceding a, 0, max)
+ mova m7, [pw_1023] ; upper clamp bound for this variant (pw_1023 is external)
+.body: ; shared entry point: the _12 variant jumps here with its own bound already in m7
+ pxor m6, m6 ; m6 = 0, lower clamp bound
+ mova m4, [aq] ; first vector of the above row
+ mova m5, [aq+mmsize] ; second vector of the above row
+ movd m0, [aq-4] ; 4 bytes ending just before a; word 1 of this load is the word at a-2
+ pshuflw m0, m0, q1111 ; broadcast that word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+ psubw m4, m0 ; m4/m5 = above - preceding word, shared by all rows
+ psubw m5, m0
+ DEFINE_ARGS dst, stride, l, cnt ; a consumed; its register becomes the loop counter
+ mov cntd, 7 ; cnt runs 7..0 and indexes 4-byte pairs of l, last pair first
+.loop:
+ movd m3, [lq+cntq*4] ; two left words for this pair of rows
+ punpcklwd m3, m3 ; duplicate each word into a pair so pshufd can broadcast it
+ pshufd m2, m3, q1111 ; higher word of the pair -> first row written
+ pshufd m3, m3, q0000 ; lower word of the pair -> second row written
+ paddw m0, m2, m4 ; first row, bytes 0..15
+ paddw m2, m5 ; first row, bytes 16..31
+ paddw m1, m3, m4 ; second row, bytes 0..15
+ paddw m3, m5 ; second row, bytes 16..31
+ pmaxsw m0, m6 ; clamp below at 0 (signed compare)
+ pmaxsw m2, m6
+ pmaxsw m1, m6
+ pmaxsw m3, m6
+ pminsw m0, m7 ; clamp above at the bound in m7
+ pminsw m2, m7
+ pminsw m1, m7
+ pminsw m3, m7
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m2
+ mova [dstq+strideq*1+ 0], m1
+ mova [dstq+strideq*1+16], m3
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ dec cntd
+ jge .loop ; repeat while cnt >= 0 (8 iterations in total)
+ RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a ; same as the _10 variant except for the upper clamp bound
+ mova m7, [pw_4095] ; upper clamp bound for this variant (pw_4095 is external); the jump below reuses the _10 body
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a ; TM prediction, 32 rows; requests 32 bytes of stack only when ARCH_X86_32 is 1
+ mova m0, [pw_1023] ; upper clamp bound for this variant (pw_1023 is external)
+.body: ; shared entry point: the _12 variant jumps here with its own bound already in m0
+ pxor m1, m1 ; m1 = 0, lower clamp bound
+%if ARCH_X86_64
+ SWAP 0, 8 ; x86-64: park the two bounds in m8/m9 so m0..m3 are free for the row computation
+ SWAP 1, 9
+%define reg_min m9
+%define reg_max m8
+%else
+ mova [rsp+ 0], m0 ; x86-32: spill the two bounds to the reserved stack area instead
+ mova [rsp+16], m1
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+ mova m4, [aq+mmsize*0] ; m4..m7 = four consecutive vectors of the above row
+ mova m5, [aq+mmsize*1]
+ mova m6, [aq+mmsize*2]
+ mova m7, [aq+mmsize*3]
+ movd m0, [aq-4] ; 4 bytes ending just before a; word 1 of this load is the word at a-2
+ pshuflw m0, m0, q1111 ; broadcast that word across the low 64 bits...
+ punpcklqdq m0, m0 ; ...and then across the whole register
+ psubw m4, m0 ; m4..m7 = above - preceding word, shared by all rows
+ psubw m5, m0
+ psubw m6, m0
+ psubw m7, m0
+ DEFINE_ARGS dst, stride, l, cnt ; a consumed; its register becomes the loop counter
+ mov cntd, 31 ; cnt runs 31..0 and indexes single words of l, last word first
+.loop:
+ pinsrw m3, [lq+cntq*2], 0 ; insert one left word into lane 0 (other lanes are overwritten by the broadcast below)
+ punpcklwd m3, m3 ; duplicate it into a word pair
+ pshufd m3, m3, q0000 ; broadcast it to every lane
+ paddw m0, m3, m4 ; one output row, four vectors wide
+ paddw m1, m3, m5
+ paddw m2, m3, m6
+ paddw m3, m7
+ pmaxsw m0, reg_min ; clamp below at 0 (signed compare)
+ pmaxsw m1, reg_min
+ pmaxsw m2, reg_min
+ pmaxsw m3, reg_min
+ pminsw m0, reg_max ; clamp above at the variant's bound
+ pminsw m1, reg_max
+ pminsw m2, reg_max
+ pminsw m3, reg_max
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+16], m1
+ mova [dstq+strideq*0+32], m2
+ mova [dstq+strideq*0+48], m3
+ add dstq, strideq ; advance dst by one row
+ dec cntd
+ jge .loop ; repeat while cnt >= 0 (32 iterations in total)
+ RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a ; same register/stack declaration as the _10 variant, so its body can be entered directly
+ mova m0, [pw_4095] ; upper clamp bound for this variant (pw_4095 is external); the jump below reuses the _10 body
+ jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body