diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2021-05-04 15:02:10 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-04 17:59:16 +0300 |
commit | 0d3a9e01edff9e5450486a5b2080c5652e5ab1fa (patch) | |
tree | 1d4409efdacf6858f5b4ddd2d1662037a898e782 | |
parent | 901062f23f8ba60d9cef5efe17a55f6ac1ad84b7 (diff) |
x86: Add high bitdepth prep_bilin AVX2 asm
-rw-r--r-- | src/x86/mc16_avx2.asm | 500 | ||||
-rw-r--r-- | src/x86/mc_init_tmpl.c | 3 |
2 files changed, 501 insertions, 2 deletions
diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 2ee3fc8..e3e1ebc 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -31,11 +31,14 @@ SECTION_RODATA put_bilin_h_rnd: dw 8, 8, 10, 10 +prep_mul: dw 16, 16, 4, 4 + +%define pw_16 prep_mul pw_2: times 2 dw 2 -pw_16: times 2 dw 16 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 +pw_32766: times 2 dw 32766 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -48,8 +51,10 @@ pw_8192: times 2 dw 8192 %endmacro %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep) BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) @@ -84,6 +89,7 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 %endmacro HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX @@ -612,4 +618,496 @@ INIT_YMM avx2 jg .hv_w16_loop0 RET +cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep_avx2] +%if UNIX64 + DECLARE_REG_TMP 7 + %define org_w r7d +%else + DECLARE_REG_TMP 6 + %define org_w r5m +%endif + mov org_w, wd + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [r6-prep_avx2+pw_8192] + add wq, r6 + shr r5d, 11 + vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + vpbroadcastq m1, [srcq+strideq*2] + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + pmullw m0, m4 + psubw m0, m5 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*2] + vinserti128 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] + pmullw m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmullw m0, m4, [srcq+strideq*0+32*0] + pmullw m1, m4, [srcq+strideq*0+32*1] + pmullw m2, m4, [srcq+strideq*1+32*0] + pmullw m3, m4, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .prep_w32 + RET +.prep_w64: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .prep_w64 + RET +.prep_w128: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + pmullw m0, m4, [srcq+32*4] + pmullw m1, m4, [srcq+32*5] + pmullw m2, m4, [srcq+32*6] + pmullw m3, m4, [srcq+32*7] + add tmpq, 32*8 + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r6m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + psubw m4, m5 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m4, 2 + psllw m5, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m0, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*0+2] + vinserti128 m1, [srcq+strideq*1+2], 1 + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w32_loop0: + mov r3d, t0d +.h_w32_loop: + pmullw m0, m4, [srcq+r3*2-32*1] + pmullw m1, m5, [srcq+r3*2-32*1+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r3*2-32*2] + pmullw m2, m5, [srcq+r3*2-32*2+2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+r3*2-32*1], m0 + mova [tmpq+r3*2-32*2], m1 + sub r3d, 32 + jg .h_w32_loop + add srcq, strideq + lea tmpq, [tmpq+t0*2] + dec hd + jg .h_w32_loop0 + RET +.v: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + movd xm5, mxyd + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + add wq, r6 + lea stride3q, [strideq*3] + psubw m4, m5 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m4, 2 + psllw m5, 2 +.v_12bpc: + jmp wq +.v_w4: + movq xm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq xm1, [srcq+strideq*1] + vpblendd m2, m0, 0x03 ; 0 2 2 2 + vpbroadcastq m0, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0xf0 ; 1 1 3 3 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m1, m2, 0x33 ; 0 1 2 3 + vpblendd m0, m2, 0x0c ; 4 2 4 4 + punpckhqdq m2, m1, m0 ; 1 2 3 4 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+strideq*0] +.v_w8_loop: + vbroadcasti128 m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m1, m0, m2, 0xf0 ; 0 1 + vbroadcasti128 m0, [srcq+strideq*0] + vpblendd m2, m0, 0xf0 ; 1 2 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+32*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.v_w32_loop0: + movu m0, [srcq+strideq*0] +.v_w32_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+r7*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+r7*1], m1 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .v_w32_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + POP r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + movd xm6, mxyd + add wq, r6 + lea stride3q, [strideq*3] + vpbroadcastw m6, xm6 + jmp wq +.hv_w4: + movu xm1, [srcq+strideq*0] +%if WIN64 + movaps [rsp+24], xmm7 +%endif + pmullw xm0, xm4, xm1 + psrldq xm1, 2 + pmullw xm1, xm5 + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vpbroadcastq m0, xm0 +.hv_w4_loop: + movu xm1, [srcq+strideq*1] + vinserti128 m1, [srcq+stride3q ], 1 + movu xm2, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vinserti128 m2, [srcq+strideq*0], 1 + punpcklqdq m7, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m7, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m7, m3 + paddw m1, m7 + psraw m1, 2 ; 1 2 3 4 + vpblendd m0, m1, 0x3f + vpermq m2, m0, q2103 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop +%if WIN64 + movaps xmm7, [rsp+24] +%endif + RET +.hv_w8: + pmullw xm0, xm4, [srcq+strideq*0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+strideq*1] + movu xm2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + vinserti128 m1, [srcq+strideq*0], 1 + vinserti128 m2, [srcq+strideq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.hv_w16_loop0: + pmullw m0, m4, [srcq] + pmullw m1, m5, [srcq+2] + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+r7*0], m2 + pmullw m0, m4, [srcq+strideq*0] + pmullw m2, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+r7*1], m2 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .hv_w16_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .hv_w16_loop0 +%if WIN64 + POP r7 +%endif + RET + %endif ; ARCH_X86_64 diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c index 6cf1a56..fa3d486 100644 --- a/src/x86/mc_init_tmpl.c +++ b/src/x86/mc_init_tmpl.c @@ -222,6 +222,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; init_mc_fn (FILTER_2D_BILINEAR, bilin, avx2); + + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); #if BITDEPTH == 8 init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); @@ -243,7 +245,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); - init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); |