; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12
wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13
wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47
               db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8
          db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128
sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
r_ext_mask: times 68 db -1
            times  4 db  0
wiener_x_shuf: db 0, 2, -1, 0
wiener_x_add: db 0, 1,127, 0
pw_61448: times 2 dw 61448
pw_164_455: dw 164, 455
pd_m16380: dd -16380
pd_m4096: dd -4096
pd_m25: dd -25
pd_m9: dd -9
pd_34816: dd 34816
pd_8421376: dd 8421376

cextern sgr_x_by_x

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

INIT_ZMM avx512icl
cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt
    mov fltq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    vbroadcasti32x4 m6, [wiener_shufA]
    vbroadcasti32x4 m7, [wiener_shufB]
    mov r10d, 0xfffe
    vbroadcasti32x4 m8, [wiener_shufC]
    vbroadcasti32x4 m9, [wiener_shufD]
    kmovw k1, r10d
    vpbroadcastd m0, [wiener_x_shuf]
    vpbroadcastd m1, [wiener_x_add]
    mov r10, 0xaaaaaaaaaaaaaaaa
    vpbroadcastd m11, [fltq+ 0]
    vpbroadcastd m12, [fltq+ 4]
    kmovq k2, r10
    vpbroadcastd m10, [pd_m16380]
    packsswb m11, m11 ; x0 x1 x0 x1
    vpbroadcastd m14, [fltq+16]
    pshufb m12, m0
    vpbroadcastd m15, [fltq+20]
    paddb m12, m1 ; x2 x3+1 x2 127
    vpbroadcastd m13, [pd_8421376]
    psllw m14, 5 ; y0 y1
    psllw m15, 5 ; y2 y3
    cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32
    jle .w32   ; pixels, so we need a special case for small widths
    lea t1, [rsp+wq*2+16]
    add lpfq, wq
    add dstq, wq
    neg wq
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    mov t4, t1
    add t1, 384*2
    add r10, strideq
    mov [rsp], r10 ; below
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.h:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm16, [leftq]
    vmovdqu32 m16{k1}, [lpfq+r10-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm16, [lpfq+r10]   ; the masked load ensures that no exception
    vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
    jmp .h_main
.h_top:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m16, [lpfq+r10-4]
.h_main:
    movu m17, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -66
    jl .h_have_right
    push r0
    lea r0, [r_ext_mask+65]
    vpbroadcastb m0, [lpfq-1]
    vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
    vpternlogd m17, m0, [r0+r10+8], 0xe4
    pop r0
.h_have_right:
    pshufb m4, m16, m6
    mova m0, m10
    vpdpbusd m0, m4, m11
    pshufb m4, m16, m7
    mova m2, m10
    vpdpbusd m2, m4, m11
    pshufb m4, m17, m6
    mova m1, m10
    vpdpbusd m1, m4, m11
    pshufb m4, m17, m7
    mova m3, m10
    vpdpbusd m3, m4, m11
    pshufb m4, m16, m8
    vpdpbusd m0, m4, m12
    pshufb m16, m9
    vpdpbusd m2, m16, m12
    pshufb m4, m17, m8
    vpdpbusd m1, m4, m12
    pshufb m17, m9
    vpdpbusd m3, m17, m12
    packssdw m0, m2
    packssdw m1, m3
    psraw m0, 3
    psraw m1, 3
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+64], m1
    add r10, 64
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, strideq
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm16, [leftq]
    vmovdqu32 m16{k1}, [lpfq+r10-4]
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb xm16, [lpfq+r10]
    vmovdqu32 m16{k1}, [lpfq+r10-4]
    jmp .hv_main
.hv_bottom:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m16, [lpfq+r10-4]
.hv_main:
    movu m17, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -66
    jl .hv_have_right
    push r0
    lea r0, [r_ext_mask+65]
    vpbroadcastb m0, [lpfq-1]
    vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
    vpternlogd m17, m0, [r0+r10+8], 0xe4
    pop r0
.hv_have_right:
    pshufb m4, m16, m6
    mova m0, m10
    vpdpbusd m0, m4, m11
    pshufb m4, m16, m7
    mova m2, m10
    vpdpbusd m2, m4, m11
    pshufb m4, m17, m6
    mova m1, m10
    vpdpbusd m1, m4, m11
    pshufb m4, m17, m7
    mova m3, m10
    vpdpbusd m3, m4, m11
    pshufb m4, m16, m8
    vpdpbusd m0, m4, m12
    pshufb m16, m9
    vpdpbusd m2, m16, m12
    pshufb m4, m17, m8
    vpdpbusd m1, m4, m12
    pshufb m17, m9
    vpdpbusd m3, m17, m12
    packssdw m0, m2
    packssdw m1, m3
    psraw m0, 3
    psraw m1, 3
    mova m16, [t4+r10*2]
    paddw m16, [t2+r10*2]
    mova m3, [t3+r10*2]
    mova m17, [t4+r10*2+64]
    paddw m17, [t2+r10*2+64]
    mova m5, [t3+r10*2+64]
    punpcklwd m4, m16, m3
    mova m2, m13
    vpdpwssd m2, m4, m15
    punpcklwd m18, m17, m5
    mova m4, m13
    vpdpwssd m4, m18, m15
    punpckhwd m16, m3
    mova m3, m13
    vpdpwssd m3, m16, m15
    punpckhwd m17, m5
    mova m5, m13
    vpdpwssd m5, m17, m15
    mova m17, [t5+r10*2]
    paddw m17, [t1+r10*2]
    paddw m16, m0, [t6+r10*2]
    mova m19, [t5+r10*2+64]
    paddw m19, [t1+r10*2+64]
    paddw m18, m1, [t6+r10*2+64]
    mova [t0+r10*2+ 0], m0
    mova [t0+r10*2+64], m1
    punpcklwd m0, m16, m17
    vpdpwssd m2, m0, m14
    punpcklwd m1, m18, m19
    vpdpwssd m4, m1, m14
    punpckhwd m16, m17
    vpdpwssd m3, m16, m14
    punpckhwd m18, m19
    vpdpwssd m5, m18, m14
    packuswb m2, m4
    psrlw m2, 8
    vpackuswb m2{k2}, m3, m5
    movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
    add r10, 64         ; function is used for chroma as well, and in some
    jl .hv_loop         ; esoteric edge cases chroma dst pointers may only
    mov t6, t5          ; have a 32-byte alignment despite having a width
    mov t5, t4          ; larger than 32, so use an unaligned store here.
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
    add dstq, strideq
    ret
.v:
    mov r10, wq
.v_loop:
    mova m4, [t4+r10*2+ 0]
    paddw m4, [t2+r10*2+ 0]
    mova m1, [t3+r10*2+ 0]
    mova m5, [t4+r10*2+64]
    paddw m5, [t2+r10*2+64]
    mova m3, [t3+r10*2+64]
    punpcklwd m6, m4, m1
    mova m0, m13
    vpdpwssd m0, m6, m15
    punpcklwd m6, m5, m3
    mova m2, m13
    vpdpwssd m2, m6, m15
    punpckhwd m4, m1
    mova m1, m13
    vpdpwssd m1, m4, m15
    punpckhwd m5, m3
    mova m3, m13
    vpdpwssd m3, m5, m15
    mova m5, [t1+r10*2+ 0]
    paddw m4, m5, [t6+r10*2+ 0]
    paddw m5, [t5+r10*2+ 0]
    mova m7, [t1+r10*2+64]
    paddw m6, m7, [t6+r10*2+64]
    paddw m7, [t5+r10*2+64]
    punpcklwd m8, m4, m5
    vpdpwssd m0, m8, m14
    punpcklwd m8, m6, m7
    vpdpwssd m2, m8, m14
    punpckhwd m4, m5
    vpdpwssd m1, m4, m14
    punpckhwd m6, m7
    vpdpwssd m3, m6, m14
    packuswb m0, m2
    psrlw m0, 8
    vpackuswb m0{k2}, m1, m3
    movu [dstq+r10], m0
    add r10, 64
    jl .v_loop
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, strideq
    ret
.w32:
    lea r10, [r_ext_mask+73]
    mova ym18, [wiener_perm32]
    lea t1, [rsp+16]
    sub r10, wq
    test edgeb, 4 ; LR_HAVE_TOP
    jz .w32_no_top
    call .w32_h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 32*2
    call .w32_h_top
    lea r9, [lpfq+strideq*4]
    mov lpfq, dstq
    mov t4, t1
    add t1, 32*2
    add r9, strideq
    mov [rsp], r9 ; below
    call .w32_h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .w32_v1
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    mov t2, t1
    dec hd
    jz .w32_v2
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    dec hd
    jz .w32_v3
.w32_main:
    lea t0, [t1+32*2]
.w32_main_loop:
    call .w32_hv
    dec hd
    jnz .w32_main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .w32_v3
    mov lpfq, [rsp]
    call .w32_hv_bottom
    add lpfq, strideq
    call .w32_hv_bottom
.w32_v1:
    call .w32_v
    RET
.w32_no_top:
    lea r9, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r9, [r9+strideq*2]
    mov [rsp], r9
    call .w32_h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .w32_v1
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    mov t2, t1
    dec hd
    jz .w32_v2
    add lpfq, strideq
    add t1, 32*2
    call .w32_h
    dec hd
    jz .w32_v3
    lea t0, [t1+32*2]
    call .w32_hv
    dec hd
    jz .w32_v3
    add t0, 32*8
    call .w32_hv
    dec hd
    jnz .w32_main
.w32_v3:
    call .w32_v
.w32_v2:
    call .w32_v
    jmp .w32_v1
.w32_h:
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_h_extend_left
    movd xm16, [leftq]
    vmovdqu32 ym16{k1}, [lpfq-4]
    add leftq, 4
    jmp .w32_h_main
.w32_h_extend_left:
    vpbroadcastb xm16, [lpfq]    ; the masked load ensures that no exception
    vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
    jmp .w32_h_main
.w32_h_top:
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_h_extend_left
    movu ym16, [lpfq-4]
.w32_h_main:
    vinserti32x8 m16, [lpfq+4], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .w32_h_have_right
    vpbroadcastb m0, [lpfq+wq-1]
    movu ym17, [r10-8]
    vinserti32x8 m17, [r10+0], 1
    vpternlogd m16, m0, m17, 0xe4 ; c ? a : b
.w32_h_have_right:
    pshufb m2, m16, m6
    mova m0, m10
    vpdpbusd m0, m2, m11
    pshufb m2, m16, m7
    mova m1, m10
    vpdpbusd m1, m2, m11
    pshufb m2, m16, m8
    vpdpbusd m0, m2, m12
    pshufb m16, m9
    vpdpbusd m1, m16, m12
    packssdw m0, m1
    psraw m0, 3
    mova [t1], m0
    ret
.w32_hv:
    add lpfq, strideq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_hv_extend_left
    movd xm16, [leftq]
    vmovdqu32 ym16{k1}, [lpfq-4]
    add leftq, 4
    jmp .w32_hv_main
.w32_hv_extend_left:
    vpbroadcastb xm16, [lpfq]
    vmovdqu32 ym16{k1}, [lpfq-4]
    jmp .w32_hv_main
.w32_hv_bottom:
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .w32_hv_extend_left
    movu ym16, [lpfq-4]
.w32_hv_main:
    vinserti32x8 m16, [lpfq+4], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .w32_hv_have_right
    vpbroadcastb m0, [lpfq+wq-1]
    movu ym17, [r10-8]
    vinserti32x8 m17, [r10+0], 1
    vpternlogd m16, m0, m17, 0xe4
.w32_hv_have_right:
    mova m3, [t4]
    paddw m3, [t2]
    mova m2, [t3]
    pshufb m4, m16, m6
    mova m0, m10
    vpdpbusd m0, m4, m11
    pshufb m4, m16, m7
    mova m5, m10
    vpdpbusd m5, m4, m11
    punpcklwd m4, m3, m2
    mova m1, m13
    vpdpwssd m1, m4, m15
    punpckhwd m3, m2
    mova m2, m13
    vpdpwssd m2, m3, m15
    pshufb m4, m16, m8
    vpdpbusd m0, m4, m12
    pshufb m16, m9
    vpdpbusd m5, m16, m12
    packssdw m0, m5
    psraw m0, 3
    mova m4, [t5]
    paddw m4, [t1]
    paddw m3, m0, [t6]
    mova [t0], m0
    punpcklwd m0, m3, m4
    vpdpwssd m1, m0, m14
    punpckhwd m3, m4
    vpdpwssd m2, m3, m14
    packuswb m1, m2
    vpermb m16, m18, m1
    mova [dstq], ym16
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
    add dstq, strideq
    ret
.w32_v:
    mova m2, [t4]
    paddw m2, [t2]
    mova m1, [t3]
    mova m4, [t1]
    paddw m3, m4, [t6]
    paddw m4, [t5]
    punpcklwd m5, m2, m1
    mova m0, m13
    vpdpwssd m0, m5, m15
    punpckhwd m2, m1
    mova m1, m13
    vpdpwssd m1, m2, m15
    punpcklwd m2, m3, m4
    vpdpwssd m0, m2, m14
    punpckhwd m3, m4
    vpdpwssd m1, m3, m14
    packuswb m0, m1
    vpermb m16, m18, m0
    mova [dstq], ym16
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, strideq
    ret

cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
                                                   w, h, edge, params
    mov paramsq, r6mp
    mov wd, wm
    mov hd, hm
    mov edged, r7m
    vbroadcasti32x4 m5, [sgr_shuf+1]
    add lpfq, wq
    vbroadcasti32x4 m6, [sgr_shuf+9]
    add dstq, wq
    vbroadcasti32x4 m7, [sgr_shuf+3]
    lea t3, [rsp+wq*4+16+416*12]
    vbroadcasti32x4 m8, [sgr_shuf+7]
    pxor m4, m4
    vpbroadcastd m9, [pd_m25]
    vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
    vpbroadcastw m15, [paramsq+8] ; w0
    lea t1, [rsp+wq*2+20]
    vpbroadcastd m10, [pw_164_455]
    neg wq
    vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
    mov r10d, 0xfe
    vpbroadcastd m13, [pd_m4096]
    kmovb k1, r10d
    vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
    mov r10, 0x3333333333333333
    mova m18, [sgr_x_by_x+64*0]
    kmovq k2, r10
    mova m19, [sgr_x_by_x+64*1]
    lea r12, [r_ext_mask+75]
    mova m20, [sgr_x_by_x+64*2]
    psllw m15, 4
    mova m21, [sgr_x_by_x+64*3]
    lea r10, [lpfq+strideq*4]
    mova ym22, [sgr_shuf]
    add r10, strideq
    mov [rsp], r10 ; below
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    call .top_fixup
    add t1, 416*6
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    mov t0, t2
    dec hd
    jz .height1
    or edged, 16
    call .h
.main:
    add lpfq, strideq
    call .hv
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    test hd, hd
    jz .odd_height
    call .h
    add lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .h_top
    add lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea t2, [t1+416*6]
    call .top_fixup
    dec hd
    jz .no_top_height1
    or edged, 16
    mov t0, t1
    mov t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.h: ; horizontal boxsum
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.h_have_right:
    pshufb m3, m17, m5
    pmullw m2, m3, m3
    pshufb m1, m17, m6
    paddw m0, m3, m1
    shufps m3, m1, q2121
    paddw m0, m3
    punpcklwd m16, m3, m1
    punpckhwd m3, m1
    punpcklwd m1, m2, m4
    vpdpwssd m1, m16, m16
    punpckhwd m2, m4
    vpdpwssd m2, m3, m3
    pshufb m16, m17, m7
    paddw m0, m16
    pshufb m17, m8
    paddw m0, m17 ; sum
    punpcklwd m3, m16, m17
    vpdpwssd m1, m3, m3 ; sumsq
    punpckhwd m16, m17
    vpdpwssd m2, m16, m16
    test edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw m0, [t1+r10*2+416*0]
    paddd m1, [t1+r10*2+416*2]
    paddd m2, [t1+r10*2+416*4]
.h_loop_end:
    mova [t1+r10*2+416*0], m0
    mova [t1+r10*2+416*2], m1
    mova [t1+r10*2+416*4], m2
    add r10, 32
    jl .h_loop
    ret
.top_fixup:
    lea r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova m0, [t1+r10*2+416*0]
    mova m1, [t1+r10*2+416*2]
    mova m2, [t1+r10*2+416*4]
    paddw m0, m0
    paddd m1, m1
    paddd m2, m2
    mova [t2+r10*2+416*0], m0
    mova [t2+r10*2+416*2], m1
    mova [t2+r10*2+416*4], m2
    add r10, 32
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu ym17, [lpfq+r10-2]
.hv_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -34
    jl .hv_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv_have_right:
    pshufb m1, m17, m5
    pmullw m3, m1, m1
    pshufb m2, m17, m6
    paddw m0, m1, m2
    shufps m1, m2, q2121
    paddw m0, m1
    punpcklwd m16, m1, m2
    punpckhwd m1, m2
    punpcklwd m2, m3, m4
    vpdpwssd m2, m16, m16
    punpckhwd m3, m4
    vpdpwssd m3, m1, m1
    pshufb m16, m17, m7
    paddw m0, m16
    pshufb m17, m8
    paddw m0, m17 ; h sum
    punpcklwd m1, m16, m17
    vpdpwssd m2, m1, m1 ; h sumsq
    punpckhwd m16, m17
    vpdpwssd m3, m16, m16
    paddw m1, m0, [t1+r10*2+416*0]
    paddd m16, m2, [t1+r10*2+416*2]
    paddd m17, m3, [t1+r10*2+416*4]
    test hd, hd
    jz .hv_last_row
.hv_main2:
    paddd m16, [t2+r10*2+416*2] ; hv sumsq
    paddd m17, [t2+r10*2+416*4]
    paddw m1, [t2+r10*2+416*0] ; hv sum
    mova [t0+r10*2+416*2], m2
    mova [t0+r10*2+416*4], m3
    mova [t0+r10*2+416*0], m0
    pmulld m16, m9 ; -a * 25
    pmulld m17, m9
    punpcklwd m0, m1, m4 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 164
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16              ; The neighbor calculations require
    mova [t3+r10*4+ 24], xm17            ; 13 bits for a and 21 bits for b.
    vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
    mova [t3+r10*4+ 72], m17             ; that gets us most of the way.
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10*2+416*0], m1
    paddw m1, m0
    mova [t1+r10*2+416*2], m16
    paddd m16, m2
    mova [t1+r10*2+416*4], m17
    paddd m17, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea r10, [wq-2]
.v_loop:
    mova m2, [t1+r10*2+416*2]
    paddd m16, m2, [t2+r10*2+416*2]
    mova m3, [t1+r10*2+416*4]
    paddd m17, m3, [t2+r10*2+416*4]
    paddd m2, m2
    paddd m3, m3
    paddd m16, m2 ; hv sumsq
    paddd m17, m3
    pmulld m16, m9 ; -a * 25
    pmulld m17, m9
    mova m0, [t1+r10*2+416*0]
    paddw m1, m0, [t2+r10*2+416*0]
    paddw m0, m0
    paddw m1, m0 ; hv sum
    punpcklwd m0, m1, m4 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 164
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16
    mova [t3+r10*4+ 24], xm17
    vextracti32x4 [t3+r10*4+ 56], m17, 2
    mova [t3+r10*4+ 72], m17
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t3+r10*4+ 4]
    movu m1, [t3+r10*4+68]
    paddd m2, m0, [t3+r10*4+ 0]
    paddd m3, m1, [t3+r10*4+64]
    paddd m2, [t3+r10*4+ 8]
    paddd m3, [t3+r10*4+72]
    paddd m0, m2
    pslld m2, 2
    paddd m1, m3
    pslld m3, 2
    paddd m2, m0 ; ab 565
    paddd m3, m1
    pandn m0, m13, m2 ; a
    psrld m2, 12 ; b
    pandn m1, m13, m3
    psrld m3, 12
    mova [t3+r10*4+416*4+ 0], m0
    mova [t3+r10*4+416*8+ 0], m2
    mova [t3+r10*4+416*4+64], m1
    mova [t3+r10*4+416*8+64], m3
    add r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m16, [t3+r10*4+ 4]
    movu m17, [t3+r10*4+68]
    paddd m0, m16, [t3+r10*4+ 0]
    paddd m1, m17, [t3+r10*4+64]
    paddd m0, [t3+r10*4+ 8]
    paddd m1, [t3+r10*4+72]
    paddd m16, m0
    pslld m0, 2
    paddd m17, m1
    pslld m1, 2
    paddd m0, m16
    paddd m1, m17
    pandn m16, m13, m0
    psrld m0, 12
    pandn m17, m13, m1
    psrld m1, 12
    paddd m2, m16, [t3+r10*4+416*4+ 0] ; a
    paddd m3, m17, [t3+r10*4+416*4+64]
    mova [t3+r10*4+416*4+ 0], m16
    mova [t3+r10*4+416*4+64], m17
    paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
    paddd m17, m1, [t3+r10*4+416*8+64]
    mova [t3+r10*4+416*8+ 0], m0
    mova [t3+r10*4+416*8+64], m1
    pmovzxbd m0, [dstq+r10+ 0]
    pmovzxbd m1, [dstq+r10+16]
    pmaddwd m2, m0 ; a * src
    pmaddwd m3, m1
    packssdw m0, m1
    psubd m16, m2 ; b - a * src + (1 << 8)
    psubd m17, m3
    psrad m16, 9
    psrad m17, 9
    packssdw m16, m17
    pmulhrsw m16, m15
    paddw m16, m0
    packuswb m16, m16
    vpermd m16, m22, m16
    mova [dstq+r10], ym16
    add r10, 32
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    pmovzxbd m0, [dstq+r10+ 0]
    pmovzxbd m1, [dstq+r10+16]
    pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src
    pmaddwd m3, m1, [t3+r10*4+416*4+64]
    mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7)
    mova m17, [t3+r10*4+416*8+64]
    packssdw m0, m1
    psubd m16, m2 ; b - a * src + (1 << 7)
    psubd m17, m3
    psrad m16, 8
    psrad m17, 8
    packssdw m16, m17
    pmulhrsw m16, m15
    paddw m16, m0
    packuswb m16, m16
    vpermd m16, m22, m16
    mova [dstq+r10], ym16
    add r10, 32
    jl .n1_loop
    add dstq, strideq
    ret

cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
                                                    w, h, edge, params
    mov paramsq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    vbroadcasti32x4 m5, [sgr_shuf+3]
    add lpfq, wq
    vbroadcasti32x4 m6, [sgr_shuf+5]
    add dstq, wq
    vbroadcasti32x4 m7, [sgr_shuf+7]
    pxor m4, m4
    vpbroadcastd m8, [pd_m9]
    vpsubd m11, m4, [paramsq+4] {1to16} ; -s1
    vpbroadcastw m15, [paramsq+10] ; w1
    lea t1, [rsp+wq*2+20]
    vpbroadcastd m10, [pw_164_455]
    lea t3, [rsp+wq*4+16+416*12]
    vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
    neg wq
    vpbroadcastd m13, [pd_m4096]
    mov r10d, 0xfe
    vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
    kmovb k1, r10d
    mova m18, [sgr_x_by_x+64*0]
    mov r10, 0x3333333333333333
    mova m19, [sgr_x_by_x+64*1]
    kmovq k2, r10
    mova m20, [sgr_x_by_x+64*2]
    psllw m15, 4
    mova m21, [sgr_x_by_x+64*3]
    lea r14, [r_ext_mask+75]
    mova ym9, [sgr_shuf]
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    add t1, 416*6
    call .h_top
    lea t4, [lpfq+strideq*4]
    mov lpfq, dstq
    add t4, strideq
    mov [rsp], t4 ; below
    mov t0, t2
    call .hv
.main:
    mov t5, t3
    add t3, 416*4
    dec hd
    jz .height1
    add lpfq, strideq
    call .hv
    call .prep_n
    dec hd
    jz .extend_bottom
.main_loop:
    add lpfq, strideq
    call .hv
    call .n
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv_bottom
    call .n
    add lpfq, strideq
    call .hv_bottom
.end:
    call .n
    RET
.height1:
    call .v
    call .prep_n
    mov t2, t1
    call .v
    jmp .end
.extend_bottom:
    call .v
    call .n
    mov t2, t1
    call .v
    jmp .end
.no_top:
    lea t4, [lpfq+strideq*4]
    mov lpfq, dstq
    lea t4, [t4+strideq*2]
    mov [rsp], t4
    call .h
    lea t0, [t1+416*6]
    mov t2, t1
    call .v
    jmp .main
.h: ; horizontal boxsum
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -33
    jl .h_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r14+r10-8]
    vinserti32x8 m16, [r14+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.h_have_right:
    pshufb m0, m17, m5
    pmullw m2, m0, m0
    pshufb m16, m17, m6
    paddw m0, m16
    pshufb m17, m7
    paddw m0, m17 ; sum
    punpcklwd m3, m16, m17
    punpcklwd m1, m2, m4
    vpdpwssd m1, m3, m3 ; sumsq
    punpckhwd m16, m17
    punpckhwd m2, m4
    vpdpwssd m2, m16, m16
    mova [t1+r10*2+416*0], m0
    mova [t1+r10*2+416*2], m1
    mova [t1+r10*2+416*4], m2
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu ym17, [lpfq+r10-2]
.hv_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -33
    jl .hv_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r14+r10-8]
    vinserti32x8 m16, [r14+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv_have_right:
    pshufb m0, m17, m5
    pmullw m3, m0, m0
    pshufb m1, m17, m6
    paddw m0, m1
    pshufb m17, m7
    paddw m0, m17 ; h sum
    punpcklwd m16, m17, m1
    punpcklwd m2, m3, m4
    vpdpwssd m2, m16, m16 ; h sumsq
    punpckhwd m17, m1
    punpckhwd m3, m4
    vpdpwssd m3, m17, m17
    paddw m1, m0, [t2+r10*2+416*0]
    paddw m1, [t1+r10*2+416*0] ; hv sum
    paddd m16, m2, [t2+r10*2+416*2]
    paddd m17, m3, [t2+r10*2+416*4]
    paddd m16, [t1+r10*2+416*2] ; hv sumsq
    paddd m17, [t1+r10*2+416*4]
    mova [t0+r10*2+416*0], m0
    mova [t0+r10*2+416*2], m2
    mova [t0+r10*2+416*4], m3
    pmulld m16, m8 ; -a * 9
    pmulld m17, m8
    punpcklwd m0, m4, m1 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16
    mova [t3+r10*4+ 24], xm17
    vextracti32x4 [t3+r10*4+ 56], m17, 2
    mova [t3+r10*4+ 72], m17
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.v: ; vertical boxsum + ab
    lea r10, [wq-2]
.v_loop:
    mova m16, [t1+r10*2+416*2]
    mova m17, [t1+r10*2+416*4]
    paddd m16, m16
    paddd m17, m17
    paddd m16, [t2+r10*2+416*2] ; hv sumsq
    paddd m17, [t2+r10*2+416*4]
    pmulld m16, m8 ; -a * 9
    pmulld m17, m8
    mova m1, [t1+r10*2+416*0]
    paddw m1, m1
    paddw m1, [t2+r10*2+416*0] ; hv sum
    punpcklwd m0, m4, m1 ; b
    vpdpwssd m16, m0, m0 ; -p
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    pmulld m16, m11 ; p * s
    pmulld m17, m11
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m20
    paddusw m17, m12
    psraw m17, 4 ; min(z, 255) - 256
    vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x
    pandn m16, m13, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m14
    vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
    vpternlogd m17, m1, m13, 0xd8
    mova [t3+r10*4+ 8], m16
    mova [t3+r10*4+ 24], xm17
    vextracti32x4 [t3+r10*4+ 56], m17, 2
    mova [t3+r10*4+ 72], m17
    vextracti128 [t3+r10*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+104], m16, 3
    add r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
    mov t4, t3
    add t3, 416*4
.prep_n_loop:
    mova m2, [t5+r10*4+0]
    mova m3, [t4+r10*4+0]
    paddd m2, [t5+r10*4+8]
    paddd m3, [t4+r10*4+8]
    paddd m0, m2, [t5+r10*4+4]
    paddd m1, m3, [t4+r10*4+4]
    pslld m0, 2
    paddd m1, m1 ; ab[ 0] 222
    psubd m0, m2 ; ab[-1] 343
    mova [t3+r10*4+416*4], m1
    paddd m1, m1
    mova [t5+r10*4], m0
    psubd m1, m3 ; ab[ 0] 343
    mova [t4+r10*4], m1
    add r10, 16
    jl .prep_n_loop
    ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
ALIGN function_align
.n: ; neighbor + output
    mov r10, wq
.n_loop:
    mova m16, [t3+r10*4+ 0]
    paddd m16, [t3+r10*4+ 8]
    paddd m17, m16, [t3+r10*4+ 4]
    paddd m17, m17 ; ab[+1] 222
    mova m2, [t3+r10*4+416*4+ 0]
    paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
    mova m3, [t3+r10*4+416*4+64]
    paddd m1, m3, [t5+r10*4+64]
    mova [t3+r10*4+416*4+ 0], m17
    paddd m17, m17
    psubd m17, m16 ; ab[+1] 343
    mova [t5+r10*4+ 0], m17
    paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343
    mova m16, [t3+r10*4+64]
    paddd m16, [t3+r10*4+72]
    paddd m17, m16, [t3+r10*4+68]
    paddd m17, m17
    mova [t3+r10*4+416*4+64], m17
    paddd m17, m17
    psubd m17, m16
    mova [t5+r10*4+64], m17
    pandn m16, m13, m0
    psrld m0, 12
    paddd m3, m17
    pandn m17, m13, m2
    psrld m2, 12
    paddd m16, m17 ; a
    pandn m17, m13, m1
    psrld m1, 12
    paddd m0, m2 ; b + (1 << 8)
    pandn m2, m13, m3
    psrld m3, 12
    paddd m17, m2
    pmovzxbd m2, [dstq+r10+ 0]
    paddd m1, m3
    pmovzxbd m3, [dstq+r10+16]
    pmaddwd m16, m2 ; a * src
    pmaddwd m17, m3
    packssdw m2, m3
    psubd m0, m16 ; b - a * src + (1 << 8)
    psubd m1, m17
    psrad m0, 9
    psrad m1, 9
    packssdw m0, m1
    pmulhrsw m0, m15
    paddw m0, m2
    packuswb m0, m0
    vpermd m16, m9, m0
    mova [dstq+r10], ym16
    add r10, 32
    jl .n_loop
    mov r10, t5
    mov t5, t4
    mov t4, r10
    add dstq, strideq
    ret

cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
                                                  w, h, edge, params
    mov paramsq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    vbroadcasti128 m5, [sgr_shuf+1]
    add lpfq, wq
    vbroadcasti128 m6, [sgr_shuf+9]
    add dstq, wq
    vbroadcasti128 m7, [sgr_shuf+3]
    lea t3, [rsp+wq*4+416*24+8]
    vbroadcasti128 m8, [sgr_shuf+7]
    pxor m4, m4
    vpbroadcastd m9, [pd_m9]
    vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
    vpbroadcastd m14, [pw_61448]
    vpsubd m12, m4, [paramsq+4] {1to16} ; -s1
    vpbroadcastd m26, [paramsq+8] ; w0 w1
    lea t1, [rsp+wq*2+12]
    vpbroadcastd m10, [pd_m25]
    neg wq
    vpbroadcastd m13, [pw_164_455]
    mov r10d, 0xfe
    vpbroadcastd m15, [pd_34816]
    kmovb k1, r10d
    mova m20, [sgr_x_by_x+64*0]
    mov r10, 0x3333333333333333
    mova m21, [sgr_x_by_x+64*1]
    kmovq k2, r10
    mova m22, [sgr_x_by_x+64*2]
    lea r12, [r_ext_mask+75]
    mova m23, [sgr_x_by_x+64*3]
    vpbroadcastd m24, [pd_m4096]
    vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____
    psllw m26, 5
    mova xm27, [sgr_mix_perm]
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
    add t1, 416*12
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    call .hv0
.main:
    dec hd
    jz .height1
    add lpfq, strideq
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    call .hv0
    test hd, hd
    jz .odd_height
    add lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv0_bottom
    add lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea t2, [t1+416*12]
    lea r10, [wq-2]
.top_fixup_loop:
    mova m0, [t1+r10*2+416* 0]
    mova m1, [t1+r10*2+416* 2]
    mova m2, [t1+r10*2+416* 4]
    paddw m0, m0
    mova m3, [t1+r10*2+416* 6]
    paddd m1, m1
    mova m16, [t1+r10*2+416* 8]
    paddd m2, m2
    mova m17, [t1+r10*2+416*10]
    mova [t2+r10*2+416* 0], m0
    mova [t2+r10*2+416* 2], m1
    mova [t2+r10*2+416* 4], m2
    mova [t2+r10*2+416* 6], m3
    mova [t2+r10*2+416* 8], m16
    mova [t2+r10*2+416*10], m17
    add r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsums
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.h_have_right:
    pshufb m3, m17, m5
    pshufb m18, m17, m6
    shufps m0, m3, m18, q2121
    pmullw m2, m0, m0
    pshufb m19, m17, m7
    paddw m0, m19
    pshufb m17, m8
    paddw m0, m17 ; sum3
    punpcklwd m16, m19, m17
    punpcklwd m1, m2, m4
    vpdpwssd m1, m16, m16 ; sumsq3
    punpckhwd m19, m17
    punpckhwd m2, m4
    vpdpwssd m2, m19, m19
    mova [t1+r10*2+416* 6], m0
    mova [t1+r10*2+416* 8], m1
    mova [t1+r10*2+416*10], m2
    punpcklwd m19, m3, m18
    paddw m0, m3
    vpdpwssd m1, m19, m19 ; sumsq5
    punpckhwd m3, m18
    paddw m0, m18 ; sum5
    vpdpwssd m2, m3, m3
    mova [t1+r10*2+416* 0], m0
    mova [t1+r10*2+416* 2], m1
    mova [t1+r10*2+416* 4], m2
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv0_main
.hv0_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu ym17, [lpfq+r10-2]
.hv0_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp r10d, -34
    jl .hv0_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv0_have_right:
    pshufb m18, m17, m5
    pshufb m19, m17, m6
    shufps m1, m18, m19, q2121
    pmullw m3, m1, m1
    pshufb m0, m17, m7
    paddw m1, m0
    pshufb m17, m8
    paddw m1, m17 ; sum3
    punpcklwd m16, m0, m17
    punpcklwd m2, m3, m4
    vpdpwssd m2, m16, m16 ; sumsq3
    punpckhwd m0, m17
    punpckhwd m3, m4
    vpdpwssd m3, m0, m0
    paddw m0, m1, [t1+r10*2+416* 6]
    paddd m16, m2, [t1+r10*2+416* 8]
    paddd m17, m3, [t1+r10*2+416*10]
    mova [t1+r10*2+416* 6], m1
    mova [t1+r10*2+416* 8], m2
    mova [t1+r10*2+416*10], m3
    paddw m1, m18
    paddw m1, m19 ; sum5
    mova [t3+r10*4+416*8+ 8], m1
    paddw m1, [t1+r10*2+416* 0]
    mova [t1+r10*2+416* 0], m1
    punpcklwd m1, m18, m19
    vpdpwssd m2, m1, m1 ; sumsq5
    punpckhwd m18, m19
    vpdpwssd m3, m18, m18
    mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
    mova [t3+r10*4+416*0+72], m3 ; in case height is odd
    paddd m2, [t1+r10*2+416* 2]
    paddd m3, [t1+r10*2+416* 4]
    mova [t1+r10*2+416* 2], m2
    mova [t1+r10*2+416* 4], m3
    paddw m1, m0, [t2+r10*2+416* 6]
    paddd m2, m16, [t2+r10*2+416* 8]
    paddd m3, m17, [t2+r10*2+416*10]
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416* 8], m16
    mova [t2+r10*2+416*10], m17
    pmulld m16, m2, m9 ; -a3 * 9
    pmulld m17, m3, m9
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m22
    paddusw m17, m14
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x3
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*4+ 8], m16
    mova [t3+r10*4+416*4+ 24], xm17
    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
    mova [t3+r10*4+416*4+ 72], m17
    vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
    add r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movd xm17, [leftq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    add leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    vpbroadcastb xm17, [lpfq+wq]
    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    jmp .hv1_main
.hv1_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu ym17, [lpfq+r10-2]
.hv1_main:
    vinserti32x8 m17, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp r10d, -34
    jl .hv1_have_right
    vpbroadcastb m0, [lpfq-1]
    movu ym16, [r12+r10-8]
    vinserti32x8 m16, [r12+r10+0], 1
    vpternlogd m17, m0, m16, 0xe4
.hv1_have_right:
    pshufb m3, m17, m5
    pshufb m19, m17, m6
    shufps m2, m3, m19, q2121
    pmullw m1, m2, m2
    pshufb m18, m17, m7
    paddw m2, m18
    pshufb m17, m8
    paddw m2, m17 ; sum3
    punpcklwd m16, m17, m18
    punpcklwd m0, m1, m4
    vpdpwssd m0, m16, m16 ; sumsq3
    punpckhwd m17, m18
    punpckhwd m1, m4
    vpdpwssd m1, m17, m17
    paddd m16, m0, [t2+r10*2+416* 8]
    paddd m17, m1, [t2+r10*2+416*10]
    mova [t2+r10*2+416* 8], m0
    mova [t2+r10*2+416*10], m1
    punpcklwd m18, m3, m19
    vpdpwssd m0, m18, m18 ; sumsq5
    punpckhwd m18, m3, m19
    vpdpwssd m1, m18, m18
    paddw m3, m19
    pmulld m16, m9 ; -a3 * 9
    pmulld m17, m9
    paddd m18, m0, [t2+r10*2+416*2]
    paddd m19, m1, [t2+r10*2+416*4]
    paddd m18, [t1+r10*2+416*2]
    paddd m19, [t1+r10*2+416*4]
    mova [t2+r10*2+416*2], m0
    mova [t2+r10*2+416*4], m1
    pmulld m18, m10 ; -a5 * 25
    pmulld m19, m10
    paddw m1, m2, [t2+r10*2+416* 6]
    mova [t2+r10*2+416* 6], m2
    paddw m2, m3 ; sum5
    paddw m3, m2, [t2+r10*2+416*0]
    paddw m3, [t1+r10*2+416*0]
    mova [t2+r10*2+416*0], m2
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    punpcklwd m2, m3, m4 ; b5
    vpdpwssd m18, m2, m2 ; -p5
    punpckhwd m3, m4
    vpdpwssd m19, m3, m3
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmulld m18, m11 ; p5 * s0
    pmulld m19, m11
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    pmaddwd m2, m13 ; b5 * 164
    pmaddwd m3, m13
    vpalignr m17{k2}, m16, m16, 2
    vpalignr m19{k2}, m18, m18, 2
    paddusw m17, m14
    mova m16, m22
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    paddusw m19, m14
    mova m18, m22
    psraw m19, 4 ; min(z5, 255) - 256
    vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k4, m19
    vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x3
    vmovdqu8 m19{k4}, m18 ; x5
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    pandn m18, m24, m19
    psrld m19, 16
    pmulld m2, m18
    pmulld m3, m19
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*8+ 8], m16
    mova [t3+r10*4+416*8+ 24], xm17
    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
    paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m3, m15
    mova [t3+r10*4+416*8+ 72], m17
    vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
    vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
    vpternlogd m19, m3, m24, 0xd8
    mova [t3+r10*4+416*0+ 8], m18
    mova [t3+r10*4+416*0+ 24], xm19
    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
    mova [t3+r10*4+416*0+ 72], m19
    vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
    add r10, 32
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea r10, [wq-2]
.v0_loop:
    mova m2, [t1+r10*2+416* 8]
    mova m3, [t1+r10*2+416*10]
    paddd m2, m2
    paddd m3, m3
    paddd m16, m2, [t2+r10*2+416* 8]
    paddd m17, m3, [t2+r10*2+416*10]
    mova m0, [t1+r10*2+416* 6]
    paddw m0, m0
    paddw m1, m0, [t2+r10*2+416* 6]
    pmulld m16, m9 ; -a3 * 9
    pmulld m17, m9
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416* 8], m2
    mova [t2+r10*2+416*10], m3
    mova m2, [t1+r10*2+416*0]
    mova m3, [t1+r10*2+416*2]
    mova m18, [t1+r10*2+416*4]
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    mova [t3+r10*4+416*8+ 8], m2
    mova [t3+r10*4+416*0+ 8], m3
    mova [t3+r10*4+416*0+72], m18
    vpalignr m17{k2}, m16, m16, 2
    mova m16, m22
    paddusw m17, m14
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x3
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    paddw m2, m2 ; cc5
    paddd m3, m3
    paddd m18, m18
    mova [t1+r10*2+416*0], m2
    mova [t1+r10*2+416*2], m3
    mova [t1+r10*2+416*4], m18
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*4+ 8], m16
    mova [t3+r10*4+416*4+ 24], xm17
    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
    mova [t3+r10*4+416*4+ 72], m17
    vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
    add r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea r10, [wq-2]
.v1_loop:
    mova m0, [t1+r10*2+416* 8]
    paddd m16, m0, [t2+r10*2+416* 8]
    mova m1, [t1+r10*2+416*10]
    paddd m17, m1, [t2+r10*2+416*10]
    mova m2, [t3+r10*4+416*0+ 8]
    paddd m18, m2, [t2+r10*2+416* 2]
    mova m3, [t3+r10*4+416*0+72]
    paddd m19, m3, [t2+r10*2+416* 4]
    paddd m18, [t1+r10*2+416* 2]
    paddd m19, [t1+r10*2+416* 4]
    mova [t2+r10*2+416* 8], m0
    mova [t2+r10*2+416*10], m1
    mova [t2+r10*2+416* 2], m2
    mova [t2+r10*2+416* 4], m3
    pmulld m16, m9 ; -a3 * 9
    pmulld m17, m9
    pmulld m18, m10 ; -a5 * 25
    pmulld m19, m10
    mova m0, [t1+r10*2+416* 6]
    paddw m1, m0, [t2+r10*2+416* 6]
    mova m2, [t3+r10*4+416*8+ 8]
    paddw m3, m2, [t2+r10*2+416*0]
    paddw m3, [t1+r10*2+416*0]
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416*0], m2
    punpcklwd m0, m4, m1 ; b3
    vpdpwssd m16, m0, m0 ; -p3
    punpckhwd m1, m4, m1
    vpdpwssd m17, m1, m1
    punpcklwd m2, m3, m4 ; b5
    vpdpwssd m18, m2, m2 ; -p5
    punpckhwd m3, m4
    vpdpwssd m19, m3, m3
    pmulld m16, m12 ; p3 * s1
    pmulld m17, m12
    pmulld m18, m11 ; p5 * s0
    pmulld m19, m11
    pmaddwd m0, m13 ; b3 * 455
    pmaddwd m1, m13
    pmaddwd m2, m13 ; b5 * 164
    pmaddwd m3, m13
    vpalignr m17{k2}, m16, m16, 2
    vpalignr m19{k2}, m18, m18, 2
    paddusw m17, m14
    mova m16, m22
    psraw m17, 4 ; min(z3, 255) - 256
    vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k3, m17
    vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
    paddusw m19, m14
    mova m18, m22
    psraw m19, 4 ; min(z5, 255) - 256
    vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
    vpmovb2m k4, m19
    vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8 m17{k3}, m16 ; x3
    vmovdqu8 m19{k4}, m18 ; x5
    pandn m16, m24, m17
    psrld m17, 16
    pmulld m0, m16
    pmulld m1, m17
    pandn m18, m24, m19
    psrld m19, 16
    pmulld m2, m18
    pmulld m3, m19
    paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m15
    vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd m17, m1, m24, 0xd8
    mova [t3+r10*4+416*8+ 8], m16
    mova [t3+r10*4+416*8+ 24], xm17
    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
    paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m3, m15
    mova [t3+r10*4+416*8+ 72], m17
    vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
    vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
    vpternlogd m19, m3, m24, 0xd8
    mova [t3+r10*4+416*0+ 8], m18
    mova [t3+r10*4+416*0+ 24], xm19
    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
    mova [t3+r10*4+416*0+ 72], m19
    vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
    add r10, 32
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t3+r10*4+416*0+4]
    paddd m1, m0, [t3+r10*4+416*0+0]
    mova m16, [t3+r10*4+416*4+0]
    paddd m1, [t3+r10*4+416*0+8]
    mova m17, [t3+r10*4+416*8+0]
    paddd m16, [t3+r10*4+416*4+8]
    paddd m17, [t3+r10*4+416*8+8]
    paddd m2, m16, [t3+r10*4+416*4+4]
    paddd m3, m17, [t3+r10*4+416*8+4]
    paddd m0, m1
    pslld m1, 2
    pslld m2, 2
    paddd m1, m0 ; ab5 565
    paddd m3, m3 ; ab3[ 0] 222
    psubd m2, m16 ; ab3[-1] 343
    mova [t3+r10*4+416*20], m3
    pandn m0, m24, m1 ; a5 565
    mova [t3+r10*4+416*24], m2
    psrld m1, 12 ; b5 565
    mova [t3+r10*4+416*12], m0
    paddd m3, m3
    mova [t3+r10*4+416*16], m1
    psubd m3, m17 ; ab3[ 0] 343
    mova [t3+r10*4+416*28], m3
    add r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m2, [t3+r10*4+4]
    paddd m3, m2, [t3+r10*4+0]
    paddd m3, [t3+r10*4+8]
    mova m1, [t3+r10*4+416*4+0]
    paddd m2, m3
    pslld m3, 2
    paddd m1, [t3+r10*4+416*4+8]
    paddd m3, m2
    pandn m2, m24, m3
    psrld m3, 12
    paddd m0, m2, [t3+r10*4+416*12] ; a5
    paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
    mova [t3+r10*4+416*12], m2
    mova [t3+r10*4+416*16], m3
    paddd m2, m1, [t3+r10*4+416*4+4]
    paddd m2, m2 ; ab3[ 1] 222
    mova m3, [t3+r10*4+416*20]
    paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+416*20], m2
    paddd m2, m2
    psubd m2, m1 ; ab3[ 1] 343
    mova [t3+r10*4+416*24], m2
    paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343
    pandn m1, m24, m17
    psrld m17, 12
    pandn m3, m24, m2
    psrld m2, 12
    paddd m1, m3 ; a3
    pmovzxbd m3, [dstq+r10]
    paddd m17, m2 ; b3 + (1 << 8)
    pmaddwd m0, m3 ; a5 * src
    pmaddwd m1, m3 ; a3 * src
    vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
    psubd m16, m0 ; b5 - a5 * src + (1 << 8)
    psubd m17, m1 ; b3 - a3 * src + (1 << 8)
    psrld m16, 9
    pslld m17, 7
    vmovdqu8 m17{k2}, m16
    vpdpwssd m3, m17, m26
    packuswb m3, m2
    vpermb m16, m27, m3
    mova [dstq+r10], xm16
    add r10, 16
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    mova m1, [t3+r10*4+416*8+0]
    paddd m1, [t3+r10*4+416*8+8]
    paddd m2, m1, [t3+r10*4+416*8+4]
    paddd m2, m2 ; ab3[ 1] 222
    mova m0, [t3+r10*4+416*20]
    paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
    pmovzxbd m3, [dstq+r10]
    mova [t3+r10*4+416*20], m2
    paddd m2, m2
    psubd m2, m1 ; ab3[ 1] 343
    mova [t3+r10*4+416*28], m2
    paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343
    pandn m1, m24, m17
    psrld m17, 12
    pandn m2, m24, m0
    psrld m0, 12
    paddd m1, m2 ; a3
    paddd m17, m0 ; b3 + (1 << 8)
    mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7)
    pmaddwd m1, m3 ; a3 * src
    pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src
    vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
    psubd m17, m1 ; b3 - a3 * src + (1 << 8)
    psubd m16, m0 ; b5 - a5 * src + (1 << 7)
    pslld m17, 7
    palignr m17{k2}, m16, m16, 1
    vpdpwssd m3, m17, m26
    packuswb m3, m3
    vpermb m16, m27, m3
    mova [dstq+r10], xm16
    add r10, 16
    jl .n1_loop
    add dstq, strideq
    ret

%endif ; ARCH_X86_64