From ebd6fb23c5373858343816a4dd53726929cc36f7 Mon Sep 17 00:00:00 2001
From: Michael Stoner
Date: Sat, 16 Mar 2019 09:24:47 -0700
Subject: libavcodec Adding ff_v210_planar_unpack AVX2

Replaced VSHUFPS with VPBLENDD to relieve port 5 bottleneck
AVX2 is 1.4x faster than AVX
---
 libavcodec/x86/v210-init.c |  8 ++++++
 libavcodec/x86/v210.asm    | 72 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 64 insertions(+), 16 deletions(-)

(limited to 'libavcodec/x86')

diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
index d64dbca1a8..cb9a6cbd6a 100644
--- a/libavcodec/x86/v210-init.c
+++ b/libavcodec/x86/v210-init.c
@@ -21,9 +21,11 @@
 
 extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 
 extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 
 av_cold void ff_v210_x86_init(V210DecContext *s)
 {
@@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
         if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
             s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+
+        if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
     }
     else {
         if (cpu_flags & AV_CPU_FLAG_SSSE3)
@@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
 
         if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
             s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+
+        if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
     }
 #endif
 }
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index c24c765e5b..706712313d 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -22,9 +22,14 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
+
+; for AVX2 version only
+v210_luma_permute: dd 0,1,2,4,5,6,7,7                  ; 32-byte alignment required
+v210_chroma_shuf2: db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1
+v210_luma_shuf_avx2: db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1
+v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1
 
-v210_mask: times 4 dd 0x3ff
 v210_mult: dw 64,4,64,4,64,4,64,4
 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
@@ -34,40 +39,65 @@ SECTION .text
 %macro v210_planar_unpack 1
 
 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
-cglobal v210_planar_unpack_%1, 5, 5, 7
+cglobal v210_planar_unpack_%1, 5, 5, 8
    movsxdifnidn r4, r4d
    lea    r1, [r1+2*r4]
    add    r2, r4
    add    r3, r4
    neg    r4
 
-    mova   m3, [v210_mult]
-    mova   m4, [v210_mask]
-    mova   m5, [v210_luma_shuf]
-    mova   m6, [v210_chroma_shuf]
+    VBROADCASTI128   m3, [v210_mult]
+    VBROADCASTI128   m5, [v210_chroma_shuf]
+
+%if cpuflag(avx2)
+    VBROADCASTI128   m4, [v210_luma_shuf_avx2]
+    VBROADCASTI128   m5, [v210_chroma_shuf_avx2]
+    mova             m6, [v210_luma_permute]
+    VBROADCASTI128   m7, [v210_chroma_shuf2]
+%else
+    VBROADCASTI128   m4, [v210_luma_shuf]
+    VBROADCASTI128   m5, [v210_chroma_shuf]
+%endif
+
 .loop:
 %ifidn %1, unaligned
-    movu   m0, [r0]
+    movu   m0, [r0]                       ; yB v5 yA u5 y9 v4 y8 u4 y7 v3 y6 u3 y5 v2 y4 u2 y3 v1 y2 u1 y1 v0 y0 u0
 %else
    mova   m0, [r0]
 %endif
 
    pmullw m1, m0, m3
-    psrld  m0, 10
-    psrlw  m1, 6               ; u0 v0 y1 y2 v1 u2 y4 y5
-    pand   m0, m4              ; y0 __ u1 __ y3 __ v2 __
+    pslld  m0, 12
+    psrlw  m1, 6                          ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0
+    psrld  m0, 22                         ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0
+
+%if cpuflag(avx2)
+    vpblendd m2, m1, m0, 0x55             ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0
+    pshufb m2, m4                         ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
+    vpermd m2, m6, m2                     ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 y3 y2 y1 y0
+    movu   [r1+2*r4], m2
 
-    shufps m2, m1, m0, 0x8d    ; y1 y2 y4 y5 y0 __ y3 __
-    pshufb m2, m5              ; y0 y1 y2 y3 y4 y5 __ __
+    vpblendd m1, m0, 0xaa                 ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0
+    pshufb m1, m5                         ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0
+    vpermq m1, m1, 0xd8                   ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 00 u2 u1 u0
+    pshufb m1, m7                         ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 u3 u2 u1 u0
+
+    movu   [r2+r4], xm1
+    vextracti128 [r3+r4], m1, 1
+%else
+    shufps m2, m1, m0, 0x8d               ; 00 y9 00 y6 yB yA y8 y7 00 y3 00 y0 y5 y4 y2 y1
+    pshufb m2, m4                         ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
    movu   [r1+2*r4], m2
 
-    shufps m1, m0, 0xd8        ; u0 v0 v1 u2 u1 __ v2 __
-    pshufb m1, m6              ; u0 u1 u2 __ v0 v1 v2 __
+    shufps m1, m0, 0xd8                   ; 00 v5 00 u4 u5 v4 v3 u3 00 v2 00 u1 u2 v1 v0 u0
+    pshufb m1, m5                         ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0
+
    movq   [r2+r4], m1
    movhps [r3+r4], m1
+%endif
 
    add r0, mmsize
-    add r4, 6
+    add r4, (mmsize*3)/8
    jl .loop
 
    REP_RET
@@ -81,6 +111,11 @@
 INIT_XMM avx
 v210_planar_unpack unaligned
 %endif
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+v210_planar_unpack unaligned
+%endif
 
 INIT_XMM ssse3
 v210_planar_unpack aligned
@@ -88,3 +123,8 @@ v210_planar_unpack aligned
 INIT_XMM avx
 v210_planar_unpack aligned
 %endif
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+v210_planar_unpack aligned
+%endif
--
cgit v1.2.3
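
For readers unfamiliar with the packing these kernels undo: v210 stores three 10-bit components in each little-endian 32-bit word, so every 16-byte group decodes to 6 luma, 3 Cb and 3 Cr samples. The scalar sketch below is illustrative only and is not part of the patch; the function name and the assumption that width is a multiple of 6 are mine, and FFmpeg's actual C fallback lives in libavcodec/v210dec.c.

#include <stdint.h>

/* Illustrative scalar model of one v210 group: four little-endian 32-bit
 * words carry three 10-bit components each (bits 9..0, 19..10, 29..20),
 * yielding 6 luma, 3 Cb and 3 Cr samples per 16 bytes of input. */
static void v210_unpack_scalar(const uint32_t *src, uint16_t *y,
                               uint16_t *u, uint16_t *v, int width)
{
    for (int i = 0; i < width; i += 6) {
        uint32_t w0 = *src++;   /* Cr0 | Y0  | Cb0 */
        uint32_t w1 = *src++;   /* Y2  | Cb1 | Y1  */
        uint32_t w2 = *src++;   /* Cb2 | Y3  | Cr1 */
        uint32_t w3 = *src++;   /* Y5  | Cr2 | Y4  */

        *u++ =  w0        & 0x3ff;  *y++ = (w0 >> 10) & 0x3ff;  *v++ = (w0 >> 20) & 0x3ff;
        *y++ =  w1        & 0x3ff;  *u++ = (w1 >> 10) & 0x3ff;  *y++ = (w1 >> 20) & 0x3ff;
        *v++ =  w2        & 0x3ff;  *y++ = (w2 >> 10) & 0x3ff;  *u++ = (w2 >> 20) & 0x3ff;
        *y++ =  w3        & 0x3ff;  *v++ = (w3 >> 10) & 0x3ff;  *y++ = (w3 >> 20) & 0x3ff;
    }
}

This also explains the loop-increment change in the patch: the xmm (SSSE3/AVX) path consumes one such 16-byte group per iteration, so r4 advances by 6 samples, while the ymm (AVX2) path consumes 32 bytes per iteration; (mmsize*3)/8 evaluates to 6 for xmm and 12 for ymm.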