From b0907cf96b1756b50a321e6ed5b8b6910b35d701 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 14 Jun 2022 22:34:45 +0200 Subject: x86: Add high bit-depth loopfilter AVX-512 (Ice Lake) asm --- src/meson.build | 1 + src/x86/loopfilter16_avx512.asm | 912 ++++++++++++++++++++++++++++++++++++++++ src/x86/loopfilter_init_tmpl.c | 2 - 3 files changed, 913 insertions(+), 2 deletions(-) create mode 100644 src/x86/loopfilter16_avx512.asm diff --git a/src/meson.build b/src/meson.build index d75406d..5c41e37 100644 --- a/src/meson.build +++ b/src/meson.build @@ -229,6 +229,7 @@ if is_asm_enabled 'x86/cdef16_avx512.asm', 'x86/filmgrain16_avx512.asm', 'x86/ipred16_avx512.asm', + 'x86/loopfilter16_avx512.asm', 'x86/looprestoration16_avx512.asm', 'x86/mc16_avx512.asm', 'x86/cdef16_avx2.asm', diff --git a/src/x86/loopfilter16_avx512.asm b/src/x86/loopfilter16_avx512.asm new file mode 100644 index 0000000..b7bc3aa --- /dev/null +++ b/src/x86/loopfilter16_avx512.asm @@ -0,0 +1,912 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +l_shuf_v: times 2 db 0, 32 +pw_1: times 2 dw 1 + times 2 db 4, 36 +pw_3: times 2 dw 3 + times 2 db 8, 40 +pw_4: times 2 dw 4 + times 2 db 12, 44 +pw_16: times 2 dw 16 + times 2 db 16, 48 +pw_4096: times 2 dw 4096 + times 2 db 20, 52 +pw_16384: times 2 dw 16384 + times 2 db 24, 56 +pw_32767: times 2 dw 32767 + times 2 db 28, 60 + times 2 dw 0 +filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128 +stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25 +l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1 +clip_max: dw 511, 511, 2047, 2047 +clip_min: dw -512, -512, -2048, -2048 + +SECTION .text + +%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp + punpckhwd m%9, m%5, m%6 + punpcklwd m%5, m%6 + punpckhwd m%6, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%7, m%8 + punpcklwd m%7, m%8 + punpckhwd m%8, m%3, m%4 + punpcklwd m%3, m%4 + punpckhdq m%4, m%1, m%3 + punpckldq m%1, m%3 + punpckldq m%3, m%5, m%7 + punpckhdq m%5, m%7 + punpckhdq m%7, m%6, m%8 + punpckldq m%6, m%8 + punpckldq m%8, m%9, m%2 + punpckhdq m%9, m%2 + punpckhqdq m%2, m%1, m%3 + punpcklqdq m%1, m%3 + punpcklqdq m%3, m%4, m%5 + punpckhqdq m%4, m%5 + punpcklqdq m%5, m%6, m%8 + punpckhqdq m%6, m%8 + punpckhqdq m%8, m%7, m%9 + punpcklqdq m%7, m%9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%ifidn %2, v +%if %1 == 16 + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1 ] + mova m1, [tmpq+strideq*2 ] ; p5 + mova m2, [tmpq+stride3q ] ; p4 + mova m3, [tmpq+strideq*4 ] ; p3 + mova m4, [tmpq+stride5q ] ; p2 +%elif %1 == 6 || %1 == 8 + lea tmpq, [dstq+mstrideq*4] +%if %1 == 8 + mova m3, [tmpq+strideq*0 ] +%endif + mova m4, [tmpq+strideq*1 ] +%endif + mova m5, [dstq+mstrideq*2] ; p1 + mova m6, [dstq+mstrideq*1] ; p0 + mova m7, [dstq+strideq*0 ] ; q0 + mova m8, [dstq+strideq*1 ] ; q1 +%if %1 != 4 + mova m9, [dstq+strideq*2 ] ; q2 +%endif +%if %1 == 8 || %1 == 16 + mova m10, [dstq+stride3q ] ; q3 +%endif +%if %1 == 16 + mova m11, [dstq+strideq*4 ] ; q4 + mova m22, [dstq+stride5q ] ; q5 + mova m23, [dstq+stride3q*2] +%endif +%else ; h +%if %1 == 16 + movu ym16, [dstq+strideq*0 -16] + movu ym17, [dstq+strideq*1 -16] + movu ym18, [dstq+strideq*2 -16] + movu ym19, [dstq+stride3q -16] + movu ym20, [dstq+strideq*4 -16] + movu ym22, [dstq+stride5q -16] + movu ym23, [dstq+stride3q*2-16] + movu ym28, [dstq+stride7q -16] + lea tmpq, [dstq+strideq*8 -16] + vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m10, m19, [tmpq+stride3q ], 1 + vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1 + vinserti32x8 m22, m22, [tmpq+stride5q ], 1 + vinserti32x8 m23, m23, [tmpq+stride3q*2], 1 + vinserti32x8 m28, m28, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8] + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27 + movu ym16, [tmpq+strideq*0 ] + movu ym17, [tmpq+strideq*1 ] + movu ym18, [tmpq+strideq*2 ] + movu ym19, [tmpq+stride3q ] + movu ym24, [tmpq+strideq*4 ] + movu ym25, [tmpq+stride5q ] + movu ym26, [tmpq+stride3q*2] + movu ym20, [tmpq+stride7q ] + lea tmpq, [tmpq+strideq*8] + vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m3, m19, [tmpq+stride3q ], 1 + vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1 + vinserti32x8 m5, m25, [tmpq+stride5q ], 1 + vinserti32x8 m6, m26, [tmpq+stride3q*2], 1 + vinserti32x8 m20, m20, [tmpq+stride7q ], 1 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 
27 + vshufi32x4 m27, m7, m0, q2020 + vshufi32x4 m7, m0, q3131 + vshufi32x4 m0, m8, m1, q2020 + vshufi32x4 m8, m1, q3131 + vshufi32x4 m1, m9, m2, q2020 + vshufi32x4 m9, m2, q3131 + vshufi32x4 m2, m10, m3, q2020 + vshufi32x4 m10, m3, q3131 + vshufi32x4 m3, m11, m4, q2020 + vshufi32x4 m11, m4, q3131 + vshufi32x4 m4, m22, m5, q2020 + vshufi32x4 m22, m5, q3131 + vshufi32x4 m5, m23, m6, q2020 + vshufi32x4 m23, m6, q3131 + vshufi32x4 m6, m28, m20, q2020 + vshufi32x4 m28, m20, q3131 +%elif %1 == 6 || %1 == 8 +%if %1 == 8 + sub dstq, 8 + movu xm16, [dstq+strideq*0 ] + movu xm17, [dstq+strideq*1 ] + movu xm18, [dstq+strideq*2 ] + movu xm19, [dstq+stride3q ] + movu xm24, [dstq+strideq*4 ] + movu xm25, [dstq+stride5q ] + movu xm26, [dstq+stride3q*2] + movu xm27, [dstq+stride7q ] + lea tmpq, [dstq+strideq*8 ] + vinserti128 ym16, [tmpq+strideq*0 ], 1 + vinserti128 ym17, [tmpq+strideq*1 ], 1 + vinserti128 ym18, [tmpq+strideq*2 ], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + vinserti128 ym24, [tmpq+strideq*4 ], 1 + vinserti128 ym25, [tmpq+stride5q ], 1 + vinserti128 ym26, [tmpq+stride3q*2], 1 + vinserti128 ym27, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2 + vinserti32x4 m9, m25, [tmpq+stride5q ], 2 + vinserti32x4 m3, m26, [tmpq+stride3q*2], 2 + vinserti32x4 m4, m27, [tmpq+stride7q ], 2 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, [tmpq+strideq*0 ], 3 + vinserti32x4 m8, [tmpq+strideq*1 ], 3 + vinserti32x4 m5, [tmpq+strideq*2 ], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + vinserti32x4 m2, [tmpq+strideq*4 ], 3 + vinserti32x4 m9, [tmpq+stride5q ], 3 + vinserti32x4 m3, [tmpq+stride3q*2], 3 + vinserti32x4 m4, [tmpq+stride7q ], 3 +%else ; %1 == 6 + movu xm16, [dstq+strideq*0-8] + movu xm17, [dstq+strideq*1-8] + movu xm18, [dstq+strideq*2-8] + movu xm19, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4-8] + movu xm2, [tmpq+strideq*0] + movu xm9, [tmpq+strideq*1] + movu xm3, [tmpq+strideq*2] + movu xm4, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 ym16, [tmpq+strideq*0], 1 + vinserti128 ym17, [tmpq+strideq*1], 1 + vinserti128 ym18, [tmpq+strideq*2], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 ym2, [tmpq+strideq*0], 1 + vinserti128 ym9, [tmpq+strideq*1], 1 + vinserti128 ym3, [tmpq+strideq*2], 1 + vinserti128 ym4, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, m16, [tmpq+strideq*0], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 2 + vinserti32x4 m9, [tmpq+strideq*1], 2 + vinserti32x4 m3, [tmpq+strideq*2], 2 + vinserti32x4 m4, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, [tmpq+strideq*0], 3 + vinserti32x4 m8, [tmpq+strideq*1], 3 + vinserti32x4 m5, [tmpq+strideq*2], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 3 + vinserti32x4 m9, [tmpq+strideq*1], 3 + vinserti32x4 m3, [tmpq+strideq*2], 3 + vinserti32x4 m4, [tmpq+stride3q ], 3 +%endif + punpcklwd m6, m10, m8 + punpckhwd m10, m8 + punpcklwd m8, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m2, m9 + punpckhwd m2, m9 + punpcklwd m9, m3, m4 + punpckhwd m3, m4 + punpckldq m4, m6, m8 + punpckhdq m6, m8 + punpckldq m8, m10, m5 + punpckhdq 
m10, m5 + punpckldq m5, m7, m9 + punpckhdq m7, m9 + punpckldq m9, m2, m3 + punpckhdq m2, m3 +%if %1 == 8 + punpcklqdq m3, m4, m5 +%endif + punpckhqdq m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m9 + punpckhqdq m8, m9 + punpcklqdq m9, m10, m2 +%if %1 == 8 + punpckhqdq m10, m2 +%endif +%else ; %1 == 4 + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdq m7{k1}, [dstq+ym12-4] + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpgatherdq m4{k2}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpgatherdq m5{k1}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + vpgatherdq m6{k2}, [tmpq+ym12] + punpcklwd m8, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m8, m7 + punpckhwd m8, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m4 + punpckhqdq m8, m4 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu ym16, [lq+l_strideq*1] + movsldup m17, [l_shuf_v] + vptestnmb k1, ym16, ym16 + vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][] + vpermb m16, m17, m16 ; l[x][1] +%else + movq xm16, [lq+l_strideq*0] + movq xm17, [lq+l_strideq*1] + vinserti128 ym16, [lq+l_strideq*2], 1 + vinserti128 ym17, [lq+l_stride3q ], 1 + lea tmpq, [lq+l_strideq*4] + vinserti32x4 m16, [tmpq+l_strideq*0], 2 + vinserti32x4 m17, [tmpq+l_strideq*1], 2 + vinserti32x4 m16, [tmpq+l_strideq*2], 3 + vinserti32x4 m17, [tmpq+l_stride3q ], 3 + punpcklqdq m16, m17 + vbroadcasti32x4 m17, [l_shuf_h] + vptestnmb k1, m16, m16 + vpalignr m16{k1}, m16, 12 + pshufb m16, m17 ; l[x][1] +%endif + vpbroadcastd m20, [pw_32767] + psubw m17, m5, m6 ; p1-p0 + psubw m18, m7, m8 ; q1-q0 + vptestmw k1, m16, m16 ; L + pabsw m17, m17 + pabsw m18, m18 + vpmaxuw m20{k1}, m17, m18 + vpbroadcastw m17, [lutq+136] + psrlw m18, m16, [lutq+128] + vpbroadcastd m19, [pw_1] + pminuw m18, m17 + psrlw m17, m16, 4 ; H + paddw m16, m16 + pmaxuw m18, m19 ; I + vpaddd m16, [pw_4] {1to16} + paddw m16, m18 ; E + REPX {pmullw x, m13}, m17, m18, m16 + vpcmpw k4, m20, m17, 6 ; hev +%if %1 != 4 + psubw m19, m4, m5 ; p2-p1 + pabsw m19, m19 +%if %1 == 8 || %1 == 16 + psubw m17, m3, m4 ; p3-p2 + pabsw m17, m17 + pmaxuw m19, m17 + psubw m17, m9, m10 ; q3-q2 + pabsw m17, m17 + pmaxuw m19, m17 +%endif + psubw m17, m9, m8 ; q2-q1 + pabsw m17, m17 + pmaxuw m19, m17 +%if %1 == 16 + vpbroadcastd ym17, [maskq+4] + vpord ym17, [maskq+8] {1to8} + vptestmd k1, ym17, ym21 +%else + vptestmd k1, ym21, [maskq+4] {1to8} +%endif + pmaxuw m19, m20 + psubw m17, m4, m6 ; p2-p0 + pabsw m17, m17 + pmaxuw m17, m20 + vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks +%if %1 == 8 || %1 == 16 + psubw m19, m3, m6 ; p3-p0 + pabsw m19, m19 + pmaxuw m17, m19 + psubw m19, m7, m10 ; q3-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + psubw m19, m7, m9 ; q2-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + vpcmpw k1, m20, m18, 2 + psubw m18, m5, m8 ; p1-q1 + psubw m19, m6, m7 ; p0-q0 + pabsw m18, m18 + pabsw m19, m19 + psrlw m18, 1 + paddw m19, m19 + paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E +%if %1 != 4 + vpcmpw k2{k1}, m17, m13, 2 ; flat8in +%endif +%if %1 == 16 + psubw m20, m0, m6 + psubw m16, m1, m6 + pabsw m20, m20 + psubw m17, m2, m6 + pabsw m16, m16 + psubw m18, m11, m7 + pabsw m17, m17 + psubw m19, m22, m7 + pabsw m18, m18 + pmaxuw m20, m16 + psubw m16, m23, m7 + pabsw m19, m19 + pmaxuw m17, m18 + pabsw m16, m16 + vpandd ym18, ym21, [maskq+8] {1to8} + pmaxuw m20, m17 + pmaxuw m19, m16 + pcmpeqd 
ym16, ym21, ym18 + vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8 + pmaxuw m20, m19 + pcmpeqd ym17, ym21, ym18 + vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8 + vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out + pcmpeqd ym18, ym21 + vptestmb k3{k3}, ym16, ym16 ; flat8 & fm + vptestmb k2{k2}, ym17, ym17 ; flat8in + vptestmb k1{k1}, ym18, ym18 + kandnd k1, k2, k1 ; fm & !flat8 & !flat16 + kandnd k2, k3, k2 ; flat8 & !flat16 +%elif %1 == 6 || %1 == 8 + vpandd ym17, ym21, [maskq+4] {1to8} + pcmpeqd ym16, ym21, ym17 + vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8 + pcmpeqd ym17, ym21 + vptestmb k2{k2}, ym16, ym16 ; flat8 & fm + vptestmb k1{k1}, ym17, ym17 + kandnd k1, k2, k1 ; fm & !flat8 +%else ; %1 == 4 + vpandd ym16, ym21, [maskq+0] {1to8} + pcmpeqd ym16, ym21 + vptestmb k1{k1}, ym16, ym16 +%endif + + ; short filter + psubw m16, m7, m6 + vpbroadcastd m17, [pw_3] + paddw m18, m16, m16 + paddw m18, m16 + psubw m16, m5, m8 ; iclip_diff(p1-q1) + pminsw m16, m14 + vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev + knotd k4, k4 ; !hev + paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f) + vpbroadcastd m18, [pw_4] + pminsw m16, m14 + vpmaxsw m16{k1}{z}, m15 ; f&=fm + paddw m17, m16 + paddw m16, m18 + vpbroadcastd m18, [pw_16384] + pminsw m17, m14 + pminsw m16, m14 + psraw m17, 3 ; f2 + psraw m16, 3 ; f1 + paddw m6, m17 + psubw m7, m16 + vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev + psubw m17, m14, m15 ; 1023 or 4095 + pxor m18, m18 + paddw m5, m16 + psubw m8, m16 + REPX {pminsw x, m17}, m6, m7, m5, m8 + REPX {pmaxsw x, m18}, m6, m7, m5, m8 + +%if %1 == 16 ; flat16 filter + vpaddd m19, m0, [pw_1] {1to16} + paddw m16, m1, m2 ; p5+p4 + paddw m26, m1, m6 ; p5+p0 + paddw m24, m2, m7 ; p4+q0 + paddw m16, m4 ; p5+p4+p3 + paddw m17, m3, m5 ; p2+p1 + psllw m19, 3 + paddw m16, m26 ; p5*2+p4+p3+p0 + paddw m17, m24 ; p4+p2+p1+q0 + psubw m19, m0 ; p6*7+8 + paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0 + paddw m18, m3, m8 + paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0 + paddw m25, m1, m0 + paddw m16, m0, m0 + psrlw m1{k3}, m19, 4 + paddw m19, m18 + psubw m19, m16 ; +p3+q1-p6*2 + paddw m16, m2, m0 + psrlw m2{k3}, m19, 4 + psubw m19, m25 + paddw m25, m4, m9 + paddw m20, m10, m5 + paddw m19, m25 ; +p2+q2-p6-p5 + paddw m17, m0, m3 + psubw m16, m20, m16 + psrlw m3{k3}, m19, 4 + paddw m19, m16 ; +p1+q3-p6-p4 + paddw m16, m11, m6 + psubw m16, m17 + paddw m17, m0, m4 + psrlw m4{k3}, m19, 4 + paddw m19, m16 ; +p0+q4-p6-p3 + paddw m16, m22, m7 + psubw m16, m17 + paddw m17, m0, m5 + psrlw m5{k3}, m19, 4 + paddw m19, m16 ; +q0+q5-p6-p2 + paddw m16, m23, m8 + psrlw m6{k3}, m19, 4 + psubw m16, m17 + paddw m19, m16 ; +q1+q6-p6-p1 + paddw m16, m23, m9 + psrlw m7{k3}, m19, 4 + psubw m16, m26 + paddw m19, m16 ; +q2+q6-p5-p0 + paddw m16, m23, m10 + psrlw m8{k3}, m19, 4 + psubw m16, m24 + paddw m19, m16 ; +q3+q6-p4-p0 + paddw m16, m23, m11 + psrlw m9{k3}, m19, 4 + psubw m16, m18 + paddw m19, m16 ; +q4+q6-p3-q1 + paddw m16, m23, m22 + psrlw m10{k3}, m19, 4 + psubw m16, m25 + paddw m19, m16 ; +q5+q6-p2-q2 + paddw m16, m23, m23 + psrlw m11{k3}, m19, 4 + psubw m16, m20 + paddw m19, m16 ; +q6*2-p1-q3 + psrlw m22{k3}, m19, 4 +%endif +%if %1 == 8 || %1 == 16 ; flat8 filter + vpbroadcastd m20, [pw_4096] + paddw m16, m3, m4 ; p3+p2 + paddw m19, m5, m6 ; p1+p0 + paddw m17, m16, m16 ; 2*(p3+p2) + paddw m19, m3 ; p1+p0+p3 + paddw m17, m7 ; 2*(p3+p2)+q0 + paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0 + paddw m18, m4, m7 + pmulhrsw m4{k2}, m19, m20 + psubw m19, m16 + paddw m17, m5, m8 + paddw m16, m3, m5 + paddw m19, m17 + pmulhrsw m5{k2}, m19, m20 + psubw m19, m16 
+ paddw m16, m6, m9 + paddw m19, m16 + paddw m16, m3, m6 + pmulhrsw m6{k2}, m19, m20 + paddw m19, m10 + psubw m16, m7, m16 + paddw m19, m16 + psubw m16, m10, m18 + pmulhrsw m7{k2}, m19, m20 + paddw m16, m8 + paddw m19, m16 + psubw m16, m10, m17 + pmulhrsw m8{k2}, m19, m20 + paddw m16, m9 + paddw m19, m16 + pmulhrsw m9{k2}, m19, m20 +%elif %1 == 6 ; flat6 filter + vpbroadcastd m10, [pw_4096] + paddw m2, m5, m6 + paddw m0, m4, m7 + paddw m1, m2, m4 ; p2+p1+p0 + paddw m3, m4, m4 + paddw m1, m1 + paddw m4, m5 + paddw m1, m0 ; p2+2*(p2+p1+p0)+q0 + psubw m3, m7, m3 + pmulhrsw m5{k2}, m1, m10 + paddw m3, m8 + psubw m4, m8, m4 + paddw m1, m3 + pmulhrsw m6{k2}, m1, m10 + paddw m4, m9 + paddw m9, m9 + paddw m1, m4 + pmulhrsw m7{k2}, m1, m10 + psubw m9, m2 + paddw m1, m9 + pmulhrsw m8{k2}, m1, m10 +%endif + +%ifidn %2, v +%if %1 == 16 + mova [tmpq+strideq*2 ], m1 ; p5 + mova [tmpq+stride3q ], m2 ; p4 + mova [tmpq+strideq*4 ], m3 ; p3 + mova [tmpq+stride5q ], m4 ; p2 +%elif %1 == 8 + mova [tmpq+strideq*1 ], m4 ; p2 +%endif + mova [dstq+mstrideq*2], m5 ; p1 + mova [dstq+mstrideq ], m6 ; p0 + mova [dstq+strideq*0 ], m7 ; q0 + mova [dstq+strideq*1 ], m8 ; q1 +%if %1 == 8 || %1 == 16 + mova [dstq+strideq*2 ], m9 ; q2 +%endif +%if %1 == 16 + mova [dstq+stride3q ], m10 ; q3 + mova [dstq+strideq*4 ], m11 ; q4 + mova [dstq+stride5q ], m22 ; q5 +%endif +%else +%if %1 == 16 + TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20 + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20 + mova [dstq+strideq*0 -16], xm27 + mova [dstq+strideq*0 ], xm7 + mova [dstq+strideq*1 -16], xm0 + mova [dstq+strideq*1 ], xm8 + mova [dstq+strideq*2 -16], xm1 + mova [dstq+strideq*2 ], xm9 + mova [dstq+stride3q -16], xm2 + mova [dstq+stride3q ], xm10 + mova [dstq+strideq*4 -16], xm3 + mova [dstq+strideq*4 ], xm11 + mova [dstq+stride5q -16], xm4 + mova [dstq+stride5q ], xm22 + mova [dstq+stride3q*2-16], xm5 + mova [dstq+stride3q*2 ], xm23 + mova [dstq+stride7q -16], xm6 + mova [dstq+stride7q ], xm28 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 -16], ym27, 1 + vextracti128 [dstq+strideq*0 ], ym7, 1 + vextracti128 [dstq+strideq*1 -16], ym0, 1 + vextracti128 [dstq+strideq*1 ], ym8, 1 + vextracti128 [dstq+strideq*2 -16], ym1, 1 + vextracti128 [dstq+strideq*2 ], ym9, 1 + vextracti128 [dstq+stride3q -16], ym2, 1 + vextracti128 [dstq+stride3q ], ym10, 1 + vextracti128 [dstq+strideq*4 -16], ym3, 1 + vextracti128 [dstq+strideq*4 ], ym11, 1 + vextracti128 [dstq+stride5q -16], ym4, 1 + vextracti128 [dstq+stride5q ], ym22, 1 + vextracti128 [dstq+stride3q*2-16], ym5, 1 + vextracti128 [dstq+stride3q*2 ], ym23, 1 + vextracti128 [dstq+stride7q -16], ym6, 1 + vextracti128 [dstq+stride7q ], ym28, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 2 + vextracti32x4 [dstq+strideq*0 ], m7, 2 + vextracti32x4 [dstq+strideq*1 -16], m0, 2 + vextracti32x4 [dstq+strideq*1 ], m8, 2 + vextracti32x4 [dstq+strideq*2 -16], m1, 2 + vextracti32x4 [dstq+strideq*2 ], m9, 2 + vextracti32x4 [dstq+stride3q -16], m2, 2 + vextracti32x4 [dstq+stride3q ], m10, 2 + vextracti32x4 [dstq+strideq*4 -16], m3, 2 + vextracti32x4 [dstq+strideq*4 ], m11, 2 + vextracti32x4 [dstq+stride5q -16], m4, 2 + vextracti32x4 [dstq+stride5q ], m22, 2 + vextracti32x4 [dstq+stride3q*2-16], m5, 2 + vextracti32x4 [dstq+stride3q*2 ], m23, 2 + vextracti32x4 [dstq+stride7q -16], m6, 2 + vextracti32x4 [dstq+stride7q ], m28, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 3 + vextracti32x4 [dstq+strideq*0 ], m7, 3 + vextracti32x4 [dstq+strideq*1 -16], m0, 3 + 
vextracti32x4 [dstq+strideq*1 ], m8, 3 + vextracti32x4 [dstq+strideq*2 -16], m1, 3 + vextracti32x4 [dstq+strideq*2 ], m9, 3 + vextracti32x4 [dstq+stride3q -16], m2, 3 + vextracti32x4 [dstq+stride3q ], m10, 3 + vextracti32x4 [dstq+strideq*4 -16], m3, 3 + vextracti32x4 [dstq+strideq*4 ], m11, 3 + vextracti32x4 [dstq+stride5q -16], m4, 3 + vextracti32x4 [dstq+stride5q ], m22, 3 + vextracti32x4 [dstq+stride3q*2-16], m5, 3 + vextracti32x4 [dstq+stride3q*2 ], m23, 3 + vextracti32x4 [dstq+stride7q -16], m6, 3 + vextracti32x4 [dstq+stride7q ], m28, 3 +%elif %1 == 8 + TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2 + movu [dstq+strideq*0 ], xm3 + movu [dstq+strideq*1 ], xm4 + movu [dstq+strideq*2 ], xm5 + movu [dstq+stride3q ], xm6 + movu [dstq+strideq*4 ], xm7 + movu [dstq+stride5q ], xm8 + movu [dstq+stride3q*2], xm9 + movu [dstq+stride7q ], xm10 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 ], ym3, 1 + vextracti128 [dstq+strideq*1 ], ym4, 1 + vextracti128 [dstq+strideq*2 ], ym5, 1 + vextracti128 [dstq+stride3q ], ym6, 1 + vextracti128 [dstq+strideq*4 ], ym7, 1 + vextracti128 [dstq+stride5q ], ym8, 1 + vextracti128 [dstq+stride3q*2], ym9, 1 + vextracti128 [dstq+stride7q ], ym10, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 2 + vextracti32x4 [dstq+strideq*1 ], m4, 2 + vextracti32x4 [dstq+strideq*2 ], m5, 2 + vextracti32x4 [dstq+stride3q ], m6, 2 + vextracti32x4 [dstq+strideq*4 ], m7, 2 + vextracti32x4 [dstq+stride5q ], m8, 2 + vextracti32x4 [dstq+stride3q*2], m9, 2 + vextracti32x4 [dstq+stride7q ], m10, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 3 + vextracti32x4 [dstq+strideq*1 ], m4, 3 + vextracti32x4 [dstq+strideq*2 ], m5, 3 + vextracti32x4 [dstq+stride3q ], m6, 3 + vextracti32x4 [dstq+strideq*4 ], m7, 3 + vextracti32x4 [dstq+stride5q ], m8, 3 + vextracti32x4 [dstq+stride3q*2], m9, 3 + vextracti32x4 [dstq+stride7q ], m10, 3 + lea dstq, [dstq+strideq*8+8] +%else ; %1 == 4 || %1 == 6 + punpcklwd m9, m5, m6 + punpckhwd m5, m6 + kxnorb k1, k1, k1 + punpcklwd m6, m7, m8 + punpckhwd m7, m8 + kmovb k2, k1 + punpckldq m8, m9, m6 + vpscatterdq [dstq+ym12-4]{k1}, m8 + punpckhdq m9, m6 + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpscatterdq [tmpq+ym12]{k2}, m9 + punpckldq m6, m5, m7 + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpscatterdq [tmpq+ym12]{k1}, m6 + punpckhdq m5, m7 + lea tmpq, [tmpq+strideq*2] + vpscatterdq [tmpq+ym12]{k2}, m5 +%endif +%endif +%endmacro + +INIT_ZMM avx512icl +cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride, tmp, \ + mask_bits, stride5 +%define base tmpq-filter_mask + SWAP 12, 26 ; avoids clobbering xmm10 on WIN64 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, v + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call .v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET +ALIGN function_align +.v4: ; called by both luma and chroma + FILTER 4, v + ret + +cglobal lpf_h_sb_y_16bpc, 6, 
13, 29, dst, stride, mask, l, l_stride, \ + lut, h, stride3, l_stride3, tmp, \ + mask_bits, stride5, stride7 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + vpbroadcastd ym12, strided + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride7q, [strideq+stride3q*2] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, h + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, h + jmp .end2 +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + call .h4 +.no_filter: + lea dstq, [dstq+stride3q*8] +.end: + lea dstq, [dstq+strideq*8] +.end2: + shl mask_bitsd, 8 + pslld ym21, 8 + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET +ALIGN function_align +.h4: ; called by both luma and chroma + FILTER 4, h + ret + +cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + shl l_strideq, 2 + lea stride3q, [strideq*3] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET + +cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + vpbroadcastd ym12, strided + shl l_strideq, 2 + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride3q, [strideq*3] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, h + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4 +.end: + lea tmpq, [strideq+stride3q] + shl mask_bitsd, 8 + pslld ym21, 8 + lea dstq, [dstq+tmpq*8] + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/src/x86/loopfilter_init_tmpl.c b/src/x86/loopfilter_init_tmpl.c index 932fada..1c085d9 100644 --- a/src/x86/loopfilter_init_tmpl.c +++ b/src/x86/loopfilter_init_tmpl.c @@ -58,11 +58,9 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; -#if BITDEPTH == 8 c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl); c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl); #endif -#endif } -- cgit v1.2.3
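
The loopfilter_init_tmpl.c hunk above only drops the BITDEPTH == 8 guard around the AVX-512 (Ice Lake) function-pointer assignments, so the 16 bpc template now picks up the new asm as well. As a rough sketch, the tail of bitfn(dav1d_loop_filter_dsp_init_x86)() reads as follows once the hunk is applied; only the lines visible in the diff are shown, and the earlier SSSE3/AVX2 assignments plus the enclosing guards are assumed unchanged from the existing template:

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;

    /* With the bitdepth guard gone, the BF() macro resolves these to the
     * bitdepth-suffixed symbols of whichever template is being compiled,
     * e.g. dav1d_lpf_h_sb_y_16bpc_avx512icl from loopfilter16_avx512.asm
     * (the 8 bpc variants were already hooked up the same way). */
    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
#endif
}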