author     Henrik Gramner <gramner@twoorioles.com>   2021-05-04 15:02:57 +0300
committer  Henrik Gramner <henrik@gramner.com>       2021-05-04 18:00:05 +0300
commit     787862dbd7b6fc3d48b24ba09039fd33ab652811 (patch)
tree       69bcded48494b7e6dc4c816c6cfa2ed7cb31c76d
parent     f37bb252d2dbbbdcbe4300fb54eb07fb1a1339b0 (diff)
x86: Add high bitdepth (10-bit) sgr AVX2 asm
-rw-r--r--  src/looprestoration.h                |    4
-rw-r--r--  src/looprestoration_tmpl.c           |    8
-rw-r--r--  src/meson.build                      |    2
-rw-r--r--  src/ppc/looprestoration_init_tmpl.c  |    4
-rw-r--r--  src/x86/looprestoration16_avx2.asm   | 1929
-rw-r--r--  src/x86/looprestoration_avx2.asm     |   24
-rw-r--r--  src/x86/looprestoration_init_tmpl.c  |   64
7 files changed, 1977 insertions, 58 deletions
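
The bulk of the patch is the new ~1900-line 16bpc SGR implementation in src/x86/looprestoration16_avx2.asm below. As orientation for the recurring comments in that code ("(a + 8) >> 4", "(b + 2) >> 2", "a * 25", "p * s", "min(z, 255) - 256", "b * 164"), here is a rough scalar sketch of the per-pixel a/b computation they implement for the 5x5 (radius-2) filter at 10 bpc. It follows the AV1 self-guided-restoration formulas and is illustrative only, not dav1d's actual C code; the function and parameter names are invented:

    #include <stdint.h>

    /* Illustrative only: one a/b output pair from the 5x5 box sums.
     * sumsq/sum are the 25-pixel sums of squared/plain pixel values,
     * s is the SGR strength parameter (s0), x_by_x the division table. */
    static void sgr_ab_5x5_10bpc(int *A, int *B, int32_t sumsq, int32_t sum,
                                 unsigned s, const uint8_t x_by_x[256])
    {
        const int32_t a = (sumsq + 8) >> 4;           /* "(a + 8) >> 4" */
        const int32_t b = (sum + 2) >> 2;             /* "(b + 2) >> 2" */
        const int32_t v = a * 25 - b * b;             /* "a * 25", "b * b" */
        const int32_t p = v > 0 ? v : 0;              /* "p" */
        const unsigned z = (unsigned)(((uint64_t)p * s + (1 << 19)) >> 20);
        const int x = x_by_x[z > 255 ? 255 : z];      /* table gather via r13 */
        *A = 256 - x;                                 /* stored as "a" */
        *B = (x * b * 164 + (1 << 11) + (1 << 15)) >> 12;  /* "b * 164" */
    }

The division table it indexes is exported from the 8bpc file as sgr_x_by_x_avx2 (see the cextern in the new file) so both bitdepths share one copy; stored as dwords for vpgatherdd, its entries appear to be round(256 / (z + 1)).
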
diff --git a/src/looprestoration.h b/src/looprestoration.h
index 0b7defc..db410f9 100644
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -76,7 +76,7 @@ typedef struct Dav1dLoopRestorationDSPContext {
 bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
 bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);
+bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c, int bpc);
+bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c, int bpc);
 
 #endif /* DAV1D_SRC_LOOPRESTORATION_H */
diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c
index ab5421b..c456db2 100644
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -529,7 +529,9 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
     }
 }
 
-COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
+                                                 const int bpc)
+{
     c->wiener[0] = c->wiener[1] = wiener_c;
     c->sgr[0] = sgr_5x5_c;
     c->sgr[1] = sgr_3x3_c;
@@ -539,9 +541,9 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
 #if ARCH_AARCH64 || ARCH_ARM
     bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
 #elif ARCH_PPC64LE
-    bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
+    bitfn(dav1d_loop_restoration_dsp_init_ppc)(c, bpc);
 #elif ARCH_X86
-    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+    bitfn(dav1d_loop_restoration_dsp_init_x86)(c, bpc);
 #endif
 #endif
 }
diff --git a/src/meson.build b/src/meson.build
index eef5df8..8ecbd95 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -189,6 +189,7 @@ if is_asm_enabled
                 'x86/cpuid.asm',
                 'x86/msac.asm',
                 'x86/cdef_avx2.asm',
+                'x86/looprestoration_avx2.asm',
                 'x86/cdef_sse.asm',
             )
 
@@ -201,7 +202,6 @@ if is_asm_enabled
                 'x86/ipred_avx2.asm',
                 'x86/itx_avx2.asm',
                 'x86/loopfilter_avx2.asm',
-                'x86/looprestoration_avx2.asm',
                 'x86/film_grain_sse.asm',
                 'x86/ipred_sse.asm',
                 'x86/itx_sse.asm',
diff --git a/src/ppc/looprestoration_init_tmpl.c b/src/ppc/looprestoration_init_tmpl.c
index 41d79be..448663c 100644
--- a/src/ppc/looprestoration_init_tmpl.c
+++ b/src/ppc/looprestoration_init_tmpl.c
@@ -324,8 +324,8 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
 }
 #endif
 
-COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)
-    (Dav1dLoopRestorationDSPContext *const c)
+COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)(Dav1dLoopRestorationDSPContext *const c,
+                                                     const int bpc)
 {
     const unsigned flags = dav1d_get_cpu_flags();
 
diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm
index 4551c3d..c1ebdc4 100644
--- a/src/x86/looprestoration16_avx2.asm
+++ b/src/x86/looprestoration16_avx2.asm
@@ -30,6 +30,8 @@
 
 SECTION_RODATA 32
 
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
 wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
@@ -49,7 +51,18 @@ pb_m6_m5: times 2 db -6, -5
 pb_m2_m1: times 2 db -2, -1
 pb_2_3: times 2 db 2, 3
 pb_6_7: times 2 db 6, 7
+pw_1023: times 2
dw 1023 +pd_8: dd 8 +pd_25: dd 25 +pd_4096: dd 4096 +pd_34816: dd 34816 pd_m262128 dd -262128 +pd_0xf00800a4: dd 0xf00800a4 +pd_0xf00801c7: dd 0xf00801c7 + +%define pw_256 sgr_lshuf5 + +cextern sgr_x_by_x_avx2 SECTION .text @@ -649,4 +662,1920 @@ ALIGN function_align jl .v_loop ret +cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + mov hd, r6m + add wd, wd + vpbroadcastw m7, [paramsq+8] ; w0 + add lpfq, wq + vpbroadcastd m8, [pd_8] + lea t1, [rsp+wq+20] + vpbroadcastd m9, [pd_25] + add dstq, wq + vpbroadcastd m10, [paramsq+0] ; s0 + lea t3, [rsp+wq*2+400*12+16] + vpbroadcastd m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + vpbroadcastd m12, [pw_256] + neg wq + vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + vpbroadcastd m14, [pw_1023] + psllw m7, 4 + mova xm15, [sgr_lshuf5] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + test hd, hd + jz .odd_height + call .h + add lpfq, dst_strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .h_top + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov [rsp+8*0], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 0] + movu m2, [r13+r10+16] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm15 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10+400*0] + paddd m1, [t1+r10+400*2] + paddd m2, 
[t1+r10+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+r10+400*0], m0 + mova [t1+r10+400*2], m1 + mova [t1+r10+400*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-4] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm15 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv_main +.hv_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10- 2] +.hv_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10+400*0] ; hv sum + paddd m4, [t2+r10+400*2] ; hv sumsq + paddd m5, [t2+r10+400*4] + mova [t0+r10+400*0], m0 + mova [t0+r10+400*2], m2 + mova [t0+r10+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + psubw m2, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10+400*0], m1 + paddw m1, m0 + mova [t1+r10+400*2], m4 + paddd m4, m2 + mova [t1+r10+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-4] +.v_loop: + mova m0, [t1+r10+400*0] + mova m2, [t1+r10+400*2] + mova m3, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) 
>> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + psubw m2, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+r10*1+400*2+ 0] + paddd m4, m1, [t3+r10*2+400*4+ 0] + paddd m5, m2, [t3+r10*2+400*4+32] + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m0, [dstq+r10] + mova m3, [t4+r10*1+400*2+ 0] + mova m4, [t3+r10*2+400*4+ 0] + mova m5, [t3+r10*2+400*4+32] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 <<7) + paddd m3, m4 + psrld m2, 8 + psrld m3, 8 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + add wd, wd 
+ mov hd, r6m + add lpfq, wq + vpbroadcastw m7, [paramsq+10] ; w1 + lea t1, [rsp+wq+12] + vpbroadcastd m8, [pd_8] + add dstq, wq + vpbroadcastd m9, [paramsq+ 4] ; s1 + lea t3, [rsp+wq*2+400*12+8] + vpbroadcastd m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + vpbroadcastd m11, [pd_34816] + neg wq + vpbroadcastd m12, [pw_256] + pxor m6, m6 + vpbroadcastd m13, [pw_1023] + psllw m7, 4 + mova xm14, [sgr_lshuf3] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 2] + movu m2, [r13+r10+18] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10+ 0] +.h_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10+ 0] +.hv0_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 
+ paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10+ 0] +.hv1_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m2 + mova [t2+r10+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 
+ psrld m1, 12 + mova [t4+r10*1+400*2 +4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + mova xm0, [t4+r10*1+400*0+0] + paddw xm0, [t4+r10*1+400*0+4] + paddw xm2, xm0, [t4+r10*1+400*0+2] + mova m1, [t3+r10*2+400*0+0] + paddd m1, [t3+r10*2+400*0+8] + paddd m3, m1, [t3+r10*2+400*0+4] + psllw xm2, 2 ; a[-1] 444 + pslld m3, 2 ; b[-1] 444 + psubw xm2, xm0 ; a[-1] 343 + psubd m3, m1 ; b[-1] 343 + mova [t4+r10*1+400* 4], xm2 + mova [t3+r10*2+400* 8], m3 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, 
[t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a[ 0] 444 + pslld m3, 2 ; b[ 0] 444 + mova [t4+r10*1+400* 6], xm2 + mova [t3+r10*2+400*12], m3 + psubw xm2, xm0 ; a[ 0] 343 + psubd m3, m1 ; b[ 0] 343 + mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + mova m3, [t4+r10*1+400*0+0] + paddw m3, [t4+r10*1+400*0+4] + paddw m1, m3, [t4+r10*1+400*0+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*4] + paddw m3, [t4+r10*1+400*6] + mova [t4+r10*1+400*4], m2 + mova [t4+r10*1+400*6], m1 + mova m4, [t3+r10*2+400*0+0] + paddd m4, [t3+r10*2+400*0+8] + paddd m1, m4, [t3+r10*2+400*0+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400* 8+ 0] + paddd m4, [t3+r10*2+400*12+ 0] + mova [t3+r10*2+400* 8+ 0], m2 + mova [t3+r10*2+400*12+ 0], m1 + mova m5, [t3+r10*2+400*0+32] + paddd m5, [t3+r10*2+400*0+40] + paddd m1, m5, [t3+r10*2+400*0+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400* 8+32] + paddd m5, [t3+r10*2+400*12+32] + mova [t3+r10*2+400* 8+32], m2 + mova [t3+r10*2+400*12+32], m1 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t4+r10*1+400*2+0] + paddw m3, [t4+r10*1+400*2+4] + paddw m1, m3, [t4+r10*1+400*2+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*6] + paddw m3, [t4+r10*1+400*8] + mova [t4+r10*1+400*6], m1 + mova [t4+r10*1+400*8], m2 + mova m4, [t3+r10*2+400*4+0] + paddd m4, [t3+r10*2+400*4+8] + paddd m1, m4, [t3+r10*2+400*4+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400*12+ 0] + paddd m4, [t3+r10*2+400*16+ 0] + mova [t3+r10*2+400*12+ 0], m1 + mova [t3+r10*2+400*16+ 0], m2 + mova m5, [t3+r10*2+400*4+32] + paddd m5, [t3+r10*2+400*4+40] + paddd m1, m5, [t3+r10*2+400*4+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400*12+32] + paddd m5, [t3+r10*2+400*16+32] + mova [t3+r10*2+400*12+32], m1 + mova [t3+r10*2+400*16+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + add wd, wd + mov hd, r6m + add lpfq, wq + vpbroadcastd m9, [pd_8] + lea t1, [rsp+wq+12] + vpbroadcastd m10, [pd_34816] + add dstq, wq + vpbroadcastd m11, [pw_256] + 
lea t3, [rsp+wq*2+400*24+8] + vpbroadcastd m12, [pd_0xf00801c7] + lea t4, [rsp+wq+400*52+8] + vpbroadcastd m15, [paramsq+8] ; w0 w1 + neg wq + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + psllw m15, 2 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+r10+400* 0] + mova m1, [t1+r10+400* 2] + mova m2, [t1+r10+400* 4] + paddw m0, m0 + mova m3, [t1+r10+400* 6] + paddd m1, m1 + mova m4, [t1+r10+400* 8] + paddd m2, m2 + mova m5, [t1+r10+400*10] + mova [t2+r10+400* 0], m0 + mova [t2+r10+400* 2], m1 + mova [t2+r10+400* 4], m2 + mova [t2+r10+400* 6], m3 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, 
[lpfq+wq+10], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10- 2] +.hv0_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -36 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; h sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m5, m2 ; h sumsq5 + paddd m6, m3 + mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4? + mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*2+400*0+40], m6 + paddw m8, [t1+r10+400* 0] + paddd m5, [t1+r10+400* 2] + paddd m6, [t1+r10+400* 4] + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + paddw m0, m1, [t1+r10+400* 6] + paddd m4, m2, [t1+r10+400* 8] + paddd m5, m3, [t1+r10+400*10] + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10- 2] +.hv1_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -36 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv1_have_right: + palignr m6, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m6, m3 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m7 + 
pmaddwd m1, m1 + punpckhwd m3, m7 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + shufpd m1, m4, m5, 0x05 + punpckhwd m5, m4, m1 + paddw m8, m4, m1 + pmaddwd m5, m5 + punpcklwd m4, m1 + pmaddwd m4, m4 + paddd m6, m3 + paddw m1, m2, [t2+r10+400* 6] + mova [t2+r10+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 8], m0 + mova [t2+r10+400*10], m6 + paddd m4, m0 ; h sumsq5 + paddd m5, m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m6, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m6 + psrlw m6, m1, 1 + pavgw m6, m7 ; (b3 + 2) >> 2 + punpcklwd m0, m6, m7 + pmaddwd m0, m0 + punpckhwd m6, m7 + pmaddwd m6, m6 + pmaxud m2, m0 + psubd m2, m0 ; p3 + pmaxud m3, m6 + psubd m3, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmulld m2, m14 ; p3 * s1 + pmulld m3, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m2, m12 + paddusw m3, m12 + psrad m7, m2, 20 ; min(z3, 255) - 256 + vpgatherdd m6, [r13+m7*4], m2 + psrad m2, m3, 20 + vpgatherdd m7, [r13+m2*4], m3 + pmulld m0, m6 + packssdw m6, m7 + pmulld m7, m1 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m7, m10 + psubw m6, m11, m6 + psrld m0, 12 + psrld m7, 12 + paddw m1, m8, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m8 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + mova [t4+r10*1+400*4 +4], m6 + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm7 + vextracti128 [t3+r10*2+400*8+56], m7, 1 + vpbroadcastd m4, [pd_25] + pxor m7, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m4, m11, m4 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400* 6] + mova m4, [t1+r10+400* 8] + mova m5, [t1+r10+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + 
psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + mova [t3+r10*2+400*8+ 8], m3 + mova [t3+r10*2+400*0+ 8], m4 + mova [t3+r10*2+400*0+40], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+r10+400*0], m3 + mova [t1+r10+400*2], m4 + mova [t1+r10+400*4], m5 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m4, [t1+r10+400* 6] + mova m5, [t1+r10+400* 8] + mova m6, [t1+r10+400*10] + paddw m1, m4, [t2+r10+400* 6] + paddd m2, m5, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 6], m4 + mova [t2+r10+400* 8], m5 + mova [t2+r10+400*10], m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m8, m1, 12 + mova [t4+r10*1+400*4+4], m2 + mova m4, [t3+r10*2+400*8+ 8] + mova m5, [t3+r10*2+400*0+ 8] + mova m6, [t3+r10*2+400*0+40] + paddw m1, m4, [t2+r10+400*0] + paddd m2, m5, [t2+r10+400*2] + paddd m3, m6, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m4 + mova [t2+r10+400*2], m5 + mova [t2+r10+400*4], m6 + vpbroadcastd m4, [pd_25] + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm8 + vextracti128 [t3+r10*2+400*8+56], m8, 1 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m4, m11, m4 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova 
[t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu xm0, [t4+r10*1+400*0+2] + paddw xm2, xm0, [t4+r10*1+400*0+0] + paddw xm2, [t4+r10*1+400*0+4] + movu m1, [t3+r10*2+400*0+4] + paddd m3, m1, [t3+r10*2+400*0+0] + paddd m3, [t3+r10*2+400*0+8] + paddw xm0, xm2 + paddd m1, m3 + psllw xm2, 2 + pslld m3, 2 + paddw xm0, xm2 ; a5 565 + paddd m1, m3 ; b5 565 + mova [t4+r10*1+400* 6], xm0 + mova [t3+r10*2+400*12], m1 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a3[-1] 444 + pslld m3, 2 ; b3[-1] 444 + psubw xm2, xm0 ; a3[-1] 343 + psubd m3, m1 ; b3[-1] 343 + mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + mova xm0, [t4+r10*1+400*4+0] + paddw xm0, [t4+r10*1+400*4+4] + paddw xm2, xm0, [t4+r10*1+400*4+2] + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m3, m1, [t3+r10*2+400*8+4] + psllw xm2, 2 ; a3[ 0] 444 + pslld m3, 2 ; b3[ 0] 444 + mova [t4+r10*1+400*10], xm2 + mova [t3+r10*2+400*20], m3 + psubw xm2, xm0 ; a3[ 0] 343 + psubd m3, m1 ; b3[ 0] 343 + mova [t4+r10*1+400*12], xm2 + mova [t3+r10*2+400*24], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu xm2, [t4+r10*1+2] + paddw xm0, xm2, [t4+r10*1+0] + paddw xm0, [t4+r10*1+4] + paddw xm2, xm0 + psllw xm0, 2 + paddw xm0, xm2 ; a5 + movu m1, [t3+r10*2+4] + paddd m4, m1, [t3+r10*2+0] + paddd m4, [t3+r10*2+8] + paddd m1, m4 + pslld m4, 2 + paddd m4, m1 ; b5 + paddw xm2, xm0, [t4+r10*1+400* 6] + mova [t4+r10*1+400* 6], xm0 + paddd m0, m4, [t3+r10*2+400*12] + mova [t3+r10*2+400*12], m4 + mova xm3, [t4+r10*1+400*2+0] + paddw xm3, [t4+r10*1+400*2+4] + paddw xm5, xm3, [t4+r10*1+400*2+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400* 8] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400* 8], xm4 + mova [t4+r10*1+400*10], xm5 + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m5, m1, [t3+r10*2+400*4+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*16] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*16], m4 + mova [t3+r10*2+400*20], m5 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, xm2 ; a5 + pmovzxwd m3, xm3 ; a3 + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 13 + psubd m0, m4 + psubd m1, m4 + paddd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + vpbroadcastd m1, [pd_4096] + paddd m4, m1 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova xm3, [t4+r10*1+400*4+0] + paddw xm3, [t4+r10*1+400*4+4] + paddw xm5, xm3, [t4+r10*1+400*4+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400*12] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400*10], xm5 + mova [t4+r10*1+400*12], xm4 + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m5, m1, 
[t3+r10*2+400*8+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*24] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*20], m5 + mova [t3+r10*2+400*24], m4 + pmovzxwd m4, [dstq+r10] + pmovzxwd m0, [t4+r10*1+400* 6] + pmovzxwd m3, xm3 + pmaddwd m0, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 12 + psubd m2, m4, [t3+r10*2+400*12] + paddd m4, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + vpbroadcastd m1, [pd_4096] + paddd m4, m1 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, dst_strideq + ret + %endif ; ARCH_X86_64 diff --git a/src/x86/looprestoration_avx2.asm b/src/x86/looprestoration_avx2.asm index 71e3e0d..67ea6cc 100644 --- a/src/x86/looprestoration_avx2.asm +++ b/src/x86/looprestoration_avx2.asm @@ -41,7 +41,8 @@ sgr_r_ext: times 16 db 1 ; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of ; cache but eliminates some shifts in the inner sgr loop which is overall a win -sgr_x_by_x: dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 +const sgr_x_by_x_avx2 + dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 @@ -58,17 +59,18 @@ sgr_x_by_x: dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + times 4 db -1 ; needed for 16-bit sgr +pb_m5: times 4 db -5 +pb_3: times 4 db 3 +pw_5_6: dw 5, 6 + sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 db 9, -1, 10, -1, 11, -1, 12, -1 -pb_3: times 4 db 3 -pb_m5: times 4 db -5 -pw_16: times 2 dw 16 pw_256: times 2 dw 256 pw_2056: times 2 dw 2056 pw_m16380: times 2 dw -16380 -pw_5_6: dw 5, 6 pd_25: dd 25 pd_34816: dd 34816 pd_m4096: dd -4096 @@ -729,8 +731,8 @@ ALIGN function_align cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, params, h -%define base r12-sgr_x_by_x-256*4 - lea r12, [sgr_x_by_x+256*4] +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, paramsmp mov wd, wm mov edged, r8m @@ -1189,12 +1191,12 @@ ALIGN function_align cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, params, h -%define base r14-sgr_x_by_x-256*4 +%define base r14-sgr_x_by_x_avx2-256*4 mov paramsq, paramsmp mov edged, r8m mov wd, wm mov hd, r6m - lea r14, [sgr_x_by_x+256*4] + lea r14, [sgr_x_by_x_avx2+256*4] vbroadcasti128 m8, [base+sgr_shuf+2] add lpfq, wq vbroadcasti128 m9, [base+sgr_shuf+4] @@ -1548,8 +1550,8 @@ ALIGN function_align cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ lpf_stride, w, edge, params, h -%define base r12-sgr_x_by_x-256*4 - lea r12, [sgr_x_by_x+256*4] +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, paramsmp mov wd, wm mov edged, r8m diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration_init_tmpl.c index 9d6c639..f7819a1 100644 --- a/src/x86/looprestoration_init_tmpl.c +++ 
b/src/x86/looprestoration_init_tmpl.c
@@ -35,21 +35,9 @@ decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
 decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
 
 #define decl_sgr_filter_fns(ext) \
-void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
-                                   const pixel (*left)[4], const pixel *lpf, \
-                                   ptrdiff_t lpf_stride, int w, int h, \
-                                   const LooprestorationParams *params, \
-                                   enum LrEdgeFlags edges); \
-void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
-                                   const pixel (*left)[4], const pixel *lpf, \
-                                   ptrdiff_t lpf_stride, int w, int h, \
-                                   const LooprestorationParams *params, \
-                                   enum LrEdgeFlags edges); \
-void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
-                                   const pixel (*left)[4], const pixel *lpf, \
-                                   ptrdiff_t lpf_stride, int w, int h, \
-                                   const LooprestorationParams *params, \
-                                   enum LrEdgeFlags edges);
+decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
 
 /* FIXME: Replace with a port of the AVX2 code */
 #define SGR_FILTER_OLD(ext) \
@@ -63,13 +51,13 @@ void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
                                const enum LrEdgeFlags edges); \
 void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
                                  const int w, const int h, const unsigned s); \
-void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
+void BF(dav1d_sgr_finish_filter1, ext)(int16_t *tmp, \
                                        const pixel *src, const ptrdiff_t stride, \
                                        const int32_t *a, const int16_t *b, \
                                        const int w, const int h); \
 \
 /* filter with a 3x3 box (radius=1) */ \
-static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
+static void BF(dav1d_sgr_filter1, ext)(int16_t *tmp, \
                                        const pixel *src, const ptrdiff_t stride, \
                                        const pixel (*left)[4], \
                                        const pixel *lpf, const ptrdiff_t lpf_stride, \
@@ -106,13 +94,13 @@ void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
                                const enum LrEdgeFlags edges); \
 void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
                                  const int w, const int h, const int strength); \
-void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
+void BF(dav1d_sgr_finish_filter2, ext)(int16_t *tmp, \
                                        const pixel *src, const ptrdiff_t stride, \
                                        const int32_t *a, const int16_t *b, \
                                        const int w, const int h); \
 \
 /* filter with a 5x5 box (radius=2) */ \
-static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
+static void BF(dav1d_sgr_filter2, ext)(int16_t *tmp, \
                                        const pixel *src, const ptrdiff_t stride, \
                                        const pixel (*left)[4], \
                                        const pixel *lpf, const ptrdiff_t lpf_stride, \
@@ -140,10 +128,10 @@ static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
 } \
 \
 void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
-                                  const coef *t1, const int w, const int h, \
+                                  const int16_t *t1, const int w, const int h, \
                                   const int wt); \
 void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
-                                  const coef *t1, const coef *t2, \
+                                  const int16_t *t1, const int16_t *t2, \
                                   const int w, const int h, \
                                   const uint32_t wt); \
 \
@@ -152,9 +140,9 @@ static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride
                                     const pixel *lpf, const ptrdiff_t lpf_stride, \
                                     const int w, const int h, \
                                     const LooprestorationParams *const params, \
-                                    const enum LrEdgeFlags edges) \
+                                    const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
 { \
-    ALIGN_STK_32(coef, tmp, 64 * 384,); \
+    ALIGN_STK_32(int16_t, tmp, 64 * 384,); \
     BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
                                w, h, params->sgr.s0, edges); \
     BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
@@ -164,9 +152,9 @@ static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride
                                     const pixel *lpf, const ptrdiff_t lpf_stride, \
                                     const int w, const int h, \
                                     const LooprestorationParams *const params, \
-                                    const enum LrEdgeFlags edges) \
+                                    const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
 { \
-    ALIGN_STK_32(coef, tmp, 64 * 384,); \
+    ALIGN_STK_32(int16_t, tmp, 64 * 384,); \
     BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
                                w, h, params->sgr.s1, edges); \
     BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
@@ -176,10 +164,10 @@ static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride
                                     const pixel *lpf, const ptrdiff_t lpf_stride, \
                                     const int w, const int h, \
                                     const LooprestorationParams *const params, \
-                                    const enum LrEdgeFlags edges) \
+                                    const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \
 { \
-    ALIGN_STK_32(coef, tmp1, 64 * 384,); \
-    ALIGN_STK_32(coef, tmp2, 64 * 384,); \
+    ALIGN_STK_32(int16_t, tmp1, 64 * 384,); \
+    ALIGN_STK_32(int16_t, tmp2, 64 * 384,); \
     BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
                                w, h, params->sgr.s0, edges); \
     BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
@@ -191,17 +179,15 @@ static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride
 decl_wiener_filter_fns(sse2);
 decl_wiener_filter_fns(ssse3);
 decl_wiener_filter_fns(avx2);
-decl_sgr_filter_fns(avx2)
+decl_sgr_filter_fns(avx2);
 
 #if BITDEPTH == 8
 SGR_FILTER_OLD(ssse3)
 #endif
 
-#if ARCH_X86_64
-decl_wiener_filter_fns(avx2);
-#endif
-
-COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c,
+                                                     const int bpc)
+{
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
@@ -224,10 +210,10 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
 
     c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
     c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
-#if BITDEPTH == 8
-    c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
-    c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
-    c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
-#endif
+    if (bpc <= 10) {
+        c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+        c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+        c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+    }
 #endif
 }
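
A note on the bpc <= 10 gate in the init code above (the reason is not stated in the patch, so this is an inference from the asm): the new code accumulates the horizontal and vertical box sums in 16-bit lanes with paddw, and the output clamp hardcodes pw_1023, so the largest 5x5 sum of pixel values must fit in a 16-bit word. That holds at 10 bits per pixel but not at 12:

    #include <stdint.h>

    /* 10-bit pixels: the 25-pixel box sum fits in a 16-bit lane. */
    _Static_assert(25 * 1023 <= UINT16_MAX, "25575 fits");
    /* 12-bit pixels would overflow the same accumulator. */
    _Static_assert(25 * 4095 > UINT16_MAX, "102375 does not fit");

so 12-bit content presumably keeps using the C fallback for sgr after this change.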