author:    Francois Cartegnie <fcvlcdev@free.fr>  2019-02-15 14:17:54 +0300
committer: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>  2019-03-04 10:55:45 +0300
commit:    0afec6b13fbacdc4fb25117c17ce4472945f901d
tree:      236161978c2caa9f004880114d962980b60a46d7
parent:    65ee1233cf86f03e029d0520f7cc5a3e152d3bbd
x86: add SSSE3 mc prep_8tap implementation
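For reference, the prep/mct path computes a separable 8-tap subpel filter into a
16-bit intermediate buffer. Below is a minimal C sketch of the 2-D (hv) case; it
mirrors the rounding used by the asm in this patch (pmulhrsw with pw_8192 is a
rounded >>2 on the horizontal stage, and pd_32 followed by psrad 6 rounds the
vertical stage), but the function name and the simplified edge handling are
illustrative only; dav1d's real scalar reference lives in src/mc_tmpl.c.

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative sketch, not dav1d's actual prep_8tap_c. */
    static void prep_8tap_hv_sketch(int16_t *tmp, const uint8_t *src,
                                    ptrdiff_t stride, int w, int h,
                                    const int8_t fh[8], const int8_t fv[8])
    {
        int16_t mid[(128 + 7) * 128]; /* worst case: (h + 7) rows of w <= 128 */
        src -= 3 * stride + 3;        /* the 8 taps are centered on index 3 */
        for (int y = 0; y < h + 7; y++, src += stride)
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += fh[k] * src[x + k];
                mid[y * w + x] = (int16_t)((sum + 2) >> 2); /* pmulhrsw, pw_8192 */
            }
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += fv[k] * mid[(y + k) * w + x];
                *tmp++ = (int16_t)((sum + 32) >> 6); /* pd_32, psrad 6 */
            }
    }

The h-only and v-only paths keep a single pass each (pmaddubsw on 8-bit pixels,
rounded with pw_8192), which is why they benchmark far cheaper than hv below.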
---------------------
x86_64:
------------------------------------------
mct_8tap_regular_w4_0_8bpc_c: 115.6
mct_8tap_regular_w4_0_8bpc_ssse3: 13.1
mct_8tap_regular_w4_0_8bpc_avx2: 13.3
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 363.0
mct_8tap_regular_w4_h_8bpc_ssse3: 19.1
mct_8tap_regular_w4_h_8bpc_avx2: 16.5
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 832.2
mct_8tap_regular_w4_hv_8bpc_ssse3: 113.4
mct_8tap_regular_w4_hv_8bpc_avx2: 53.1
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 488.5
mct_8tap_regular_w4_v_8bpc_ssse3: 38.9
mct_8tap_regular_w4_v_8bpc_avx2: 26.0
------------------------------------------
mct_8tap_regular_w8_0_8bpc_c: 259.3
mct_8tap_regular_w8_0_8bpc_ssse3: 20.4
mct_8tap_regular_w8_0_8bpc_avx2: 18.0
------------------------------------------
mct_8tap_regular_w8_h_8bpc_c: 1124.3
mct_8tap_regular_w8_h_8bpc_ssse3: 67.7
mct_8tap_regular_w8_h_8bpc_avx2: 43.3
------------------------------------------
mct_8tap_regular_w8_hv_8bpc_c: 2155.0
mct_8tap_regular_w8_hv_8bpc_ssse3: 340.8
mct_8tap_regular_w8_hv_8bpc_avx2: 151.3
------------------------------------------
mct_8tap_regular_w8_v_8bpc_c: 1195.4
mct_8tap_regular_w8_v_8bpc_ssse3: 72.4
mct_8tap_regular_w8_v_8bpc_avx2: 39.8
------------------------------------------
mct_8tap_regular_w16_0_8bpc_c: 158.3
mct_8tap_regular_w16_0_8bpc_ssse3: 52.9
mct_8tap_regular_w16_0_8bpc_avx2: 30.2
------------------------------------------
mct_8tap_regular_w16_h_8bpc_c: 4267.4
mct_8tap_regular_w16_h_8bpc_ssse3: 211.9
mct_8tap_regular_w16_h_8bpc_avx2: 121.4
------------------------------------------
mct_8tap_regular_w16_hv_8bpc_c: 5430.9
mct_8tap_regular_w16_hv_8bpc_ssse3: 986.8
mct_8tap_regular_w16_hv_8bpc_avx2: 428.4
------------------------------------------
mct_8tap_regular_w16_v_8bpc_c: 4604.2
mct_8tap_regular_w16_v_8bpc_ssse3: 199.1
mct_8tap_regular_w16_v_8bpc_avx2: 100.7
------------------------------------------
mct_8tap_regular_w32_0_8bpc_c: 372.9
mct_8tap_regular_w32_0_8bpc_ssse3: 231.9
mct_8tap_regular_w32_0_8bpc_avx2: 99.7
------------------------------------------
mct_8tap_regular_w32_h_8bpc_c: 15975.0
mct_8tap_regular_w32_h_8bpc_ssse3: 802.9
mct_8tap_regular_w32_h_8bpc_avx2: 468.5
------------------------------------------
mct_8tap_regular_w32_hv_8bpc_c: 18555.5
mct_8tap_regular_w32_hv_8bpc_ssse3: 3673.5
mct_8tap_regular_w32_hv_8bpc_avx2: 1587.6
------------------------------------------
mct_8tap_regular_w32_v_8bpc_c: 16632.4
mct_8tap_regular_w32_v_8bpc_ssse3: 743.5
mct_8tap_regular_w32_v_8bpc_avx2: 337.8
------------------------------------------
mct_8tap_regular_w64_0_8bpc_c: 675.9
mct_8tap_regular_w64_0_8bpc_ssse3: 513.6
mct_8tap_regular_w64_0_8bpc_avx2: 285.4
------------------------------------------
mct_8tap_regular_w64_h_8bpc_c: 37161.3
mct_8tap_regular_w64_h_8bpc_ssse3: 1929.7
mct_8tap_regular_w64_h_8bpc_avx2: 1138.1
------------------------------------------
mct_8tap_regular_w64_hv_8bpc_c: 42434.0
mct_8tap_regular_w64_hv_8bpc_ssse3: 8822.1
mct_8tap_regular_w64_hv_8bpc_avx2: 3853.5
------------------------------------------
mct_8tap_regular_w64_v_8bpc_c: 37969.1
mct_8tap_regular_w64_v_8bpc_ssse3: 1805.6
mct_8tap_regular_w64_v_8bpc_avx2: 826.1
------------------------------------------
mct_8tap_regular_w128_0_8bpc_c: 1532.7
mct_8tap_regular_w128_0_8bpc_ssse3: 1397.7
mct_8tap_regular_w128_0_8bpc_avx2: 813.8
------------------------------------------
mct_8tap_regular_w128_h_8bpc_c: 91204.3
mct_8tap_regular_w128_h_8bpc_ssse3: 4783.0
mct_8tap_regular_w128_h_8bpc_avx2: 2767.2
------------------------------------------
mct_8tap_regular_w128_hv_8bpc_c: 102396.0
mct_8tap_regular_w128_hv_8bpc_ssse3: 22202.3
mct_8tap_regular_w128_hv_8bpc_avx2: 9637.2
------------------------------------------
mct_8tap_regular_w128_v_8bpc_c: 92294.3
mct_8tap_regular_w128_v_8bpc_ssse3: 4952.8
mct_8tap_regular_w128_v_8bpc_avx2: 2370.1
------------------------------------------
---------------------
x86_32:
------------------------------------------
mct_8tap_regular_w4_0_8bpc_c: 131.3
mct_8tap_regular_w4_0_8bpc_ssse3: 18.7
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 422.0
mct_8tap_regular_w4_h_8bpc_ssse3: 27.3
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 1012.6
mct_8tap_regular_w4_hv_8bpc_ssse3: 123.6
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 589.6
mct_8tap_regular_w4_v_8bpc_ssse3: 48.9
------------------------------------------
mct_8tap_regular_w8_0_8bpc_c: 278.5
mct_8tap_regular_w8_0_8bpc_ssse3: 26.3
------------------------------------------
mct_8tap_regular_w8_h_8bpc_c: 1129.3
mct_8tap_regular_w8_h_8bpc_ssse3: 80.6
------------------------------------------
mct_8tap_regular_w8_hv_8bpc_c: 2556.4
mct_8tap_regular_w8_hv_8bpc_ssse3: 354.6
------------------------------------------
mct_8tap_regular_w8_v_8bpc_c: 1460.2
mct_8tap_regular_w8_v_8bpc_ssse3: 103.8
------------------------------------------
mct_8tap_regular_w16_0_8bpc_c: 218.9
mct_8tap_regular_w16_0_8bpc_ssse3: 58.4
------------------------------------------
mct_8tap_regular_w16_h_8bpc_c: 4471.8
mct_8tap_regular_w16_h_8bpc_ssse3: 237.2
------------------------------------------
mct_8tap_regular_w16_hv_8bpc_c: 5570.5
mct_8tap_regular_w16_hv_8bpc_ssse3: 1044.1
------------------------------------------
mct_8tap_regular_w16_v_8bpc_c: 4885.5
mct_8tap_regular_w16_v_8bpc_ssse3: 268.3
------------------------------------------
mct_8tap_regular_w32_0_8bpc_c: 495.6
mct_8tap_regular_w32_0_8bpc_ssse3: 236.6
------------------------------------------
mct_8tap_regular_w32_h_8bpc_c: 15903.5
mct_8tap_regular_w32_h_8bpc_ssse3: 872.5
------------------------------------------
mct_8tap_regular_w32_hv_8bpc_c: 19402.2
mct_8tap_regular_w32_hv_8bpc_ssse3: 3832.8
------------------------------------------
mct_8tap_regular_w32_v_8bpc_c: 17119.5
mct_8tap_regular_w32_v_8bpc_ssse3: 935.2
------------------------------------------
mct_8tap_regular_w64_0_8bpc_c: 877.0
mct_8tap_regular_w64_0_8bpc_ssse3: 515.7
------------------------------------------
mct_8tap_regular_w64_h_8bpc_c: 36832.1
mct_8tap_regular_w64_h_8bpc_ssse3: 2094.1
------------------------------------------
mct_8tap_regular_w64_hv_8bpc_c: 43965.3
mct_8tap_regular_w64_hv_8bpc_ssse3: 9423.0
------------------------------------------
mct_8tap_regular_w64_v_8bpc_c: 37041.2
mct_8tap_regular_w64_v_8bpc_ssse3: 2348.9
------------------------------------------
mct_8tap_regular_w128_0_8bpc_c: 1929.9
mct_8tap_regular_w128_0_8bpc_ssse3: 1392.3
------------------------------------------
mct_8tap_regular_w128_h_8bpc_c: 86022.5
mct_8tap_regular_w128_h_8bpc_ssse3: 5110.8
------------------------------------------
mct_8tap_regular_w128_hv_8bpc_c: 105793.5
mct_8tap_regular_w128_hv_8bpc_ssse3: 23278.8
------------------------------------------
mct_8tap_regular_w128_v_8bpc_c: 88223.5
mct_8tap_regular_w128_v_8bpc_ssse3: 7442.7
------------------------------------------
-rw-r--r--  src/x86/mc_init_tmpl.c |  18
-rw-r--r--  src/x86/mc_ssse3.asm   | 887
2 files changed, 905 insertions, 0 deletions
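The mc_init_tmpl.c hunk wires the new entry points into the mct function-pointer
table, guarded by runtime CPU detection. Roughly, each init_mct_fn() line expands
to an assignment like the following sketch (simplified; the real macro and the
SSSE3 flag check are in the file itself):

    if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSSE3)
        c->mct[FILTER_2D_8TAP_REGULAR] = dav1d_prep_8tap_regular_ssse3;

One assignment per horizontal/vertical filter-type pair; all nine stubs created
by the PREP_8TAP_FN macro below fall through to the same prep_8tap entry point.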
diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
index 246e3b5..0e33cd4 100644
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -50,14 +50,23 @@ decl_mc_fn(dav1d_put_bilin_avx2);
 decl_mc_fn(dav1d_put_bilin_ssse3);
 
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
 decl_mct_fn(dav1d_prep_bilin_avx2);
 decl_mct_fn(dav1d_prep_bilin_ssse3);
@@ -108,6 +117,15 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
 
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
diff --git a/src/x86/mc_ssse3.asm b/src/x86/mc_ssse3.asm
index b218d16..abca6cf 100644
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -64,6 +64,7 @@ pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
 pw_6903: times 8 dw 6903
 pw_8192: times 8 dw 8192
+pd_32:   times 4 dd 32
 pd_512:  times 4 dd 512
 
 pw_258:  times 2 dw 258
@@ -141,6 +142,7 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
 %endmacro
 
 HV_JMP_TABLE put,  8tap,  ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap,  ssse3, 1,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
@@ -2424,6 +2426,891 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
     jg .hv_w8_loop0
     RET
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+    mov t0d, FILTER_%2
+    mov t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+%if ARCH_X86_32
+    %define base_reg r2
+    %define base base_reg-prep_ssse3
+    %define W32_RESTORE_SSQ mov strideq, stridem
+%else
+    %define base_reg r7
+    %define base 0
+    %define W32_RESTORE_SSQ
+%endif
+
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+    imul mxd, mxm, 0x010101
+    add mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul myd, mym, 0x010101
+    add myd, t1d ; 8tap_v, my, 4tap_v
+    movsxd wq, wm
+    movifnidn srcd, srcm
+    movifnidn hd, hm
+    LEA base_reg, prep_ssse3
+    test mxd, 0xf00
+    jnz .h
+    test myd, 0xf00
+    jnz .v
+    tzcnt wd, wd
+    movzx wd, word [base_reg+wq*2+table_offset(prep,)]
+    add wq, base_reg
+    movifnidn strided, stridem
+    lea r6, [strideq*3]
+    %assign stack_offset org_stack_offset
+%if WIN64
+    pop r8
+    pop r7
+%endif
+    jmp wq
+.h:
+    test myd, 0xf00
+    jnz .hv
+    WIN64_SPILL_XMM 12
+    cmp wd, 4
+    je .h_w4
+    tzcnt wd, wd
+%if ARCH_X86_64
+    mova m10, [base+subpel_h_shufA]
+    mova m11, [base+subpel_h_shufB]
+    mova m9,  [base+subpel_h_shufC]
+%endif
+    shr mxd, 16
+    sub srcq, 3
+    movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+    movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
+    pshufd m5, m5, q0000
+    movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
+    pshufd m6, m6, q0000
+    mova m7, [base+pw_8192]
+    add wq, base_reg
+    jmp wq
+.h_w4:
+%if ARCH_X86_32
+    and mxd, 0xff
+%else
+    movzx mxd, mxb
+%endif
+    dec srcq
+    movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+    pshufd m4, m4, q0000
+    mova m6, [base+pw_8192]
+    mova m5, [base+subpel_h_shufA]
+    W32_RESTORE_SSQ
+%if ARCH_X86_64
+    lea stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+    movq m0, [srcq+strideq*0] ; 0
+    movq m1, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+    lea srcq, [srcq+strideq*2]
+    movq m2, [srcq+strideq*0] ; 2
+    movq m3, [srcq+strideq*1] ; 3
+    lea srcq, [srcq+strideq*2]
+%else
+    movq m2, [srcq+strideq*2] ; 2
+    movq m3, [srcq+stride3q ] ; 3
+    lea srcq, [srcq+strideq*4]
+%endif
+    pshufb m0, m5 ; subpel_h_shufA
+    pshufb m1, m5
+    pshufb m2, m5
+    pshufb m3, m5
+    pmaddubsw m0, m4 ; subpel_filters + 2
+    pmaddubsw m1, m4
+    pmaddubsw m2, m4
+    pmaddubsw m3, m4
+    phaddw m0, m1
+    phaddw m2, m3
+    pmulhrsw m0, m6 ; pw_8192
+    pmulhrsw m2, m6 ; pw_8192
+    mova [tmpq+16*0], m0
+    mova [tmpq+16*1], m2
+    add tmpq, 32
+    sub hd, 4
+    jg .h_w4_loop
+    RET
+    ;
+%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
+%if ARCH_X86_32
+    pshufb %2, %1, [base+subpel_h_shufB]
+    pshufb %3, %1, [base+subpel_h_shufC]
+    pshufb %1, [base+subpel_h_shufA]
+%else
+    pshufb %2, %1, m11 ; subpel_h_shufB
+    pshufb %3, %1, m9  ; subpel_h_shufC
+    pshufb %1, m10     ; subpel_h_shufA
+%endif
+    pmaddubsw %4, %2, m5 ; subpel +0 B0
+    pmaddubsw %2, m6     ; subpel +4 B4
+    pmaddubsw %3, m6     ; subpel +4 C4
+    pmaddubsw %1, m5     ; subpel +0 A0
+    paddw %3, %4
+    paddw %1, %2
+    phaddw %1, %3
+    pmulhrsw %1, m7 ; 8192
+%endmacro
+    ;
+.h_w8:
+%if ARCH_X86_32
+    mov r3, r2
+    %define base_reg r3
+    W32_RESTORE_SSQ
+%endif
+.h_w8_loop:
+    movu m0, [srcq+strideq*0]
+    movu m1, [srcq+strideq*1]
+    lea srcq, [srcq+strideq*2]
+    PREP_8TAP_H m0, m2, m3, m4
+    PREP_8TAP_H m1, m2, m3, m4
+    mova [tmpq+16*0], m0
+    mova [tmpq+16*1], m1
+    add tmpq, 32
+    sub hd, 2
+    jg .h_w8_loop
+    RET
+.h_w16:
+    xor r6d, r6d
+    jmp .h_start
+.h_w32:
+    mov r6, -16*1
+    jmp .h_start
+.h_w64:
+    mov r6, -16*3
+    jmp .h_start
+.h_w128:
+    mov r6, -16*7
+.h_start:
+%if ARCH_X86_32
+    mov r3, r2
+    %define base_reg r3
+%endif
+    sub srcq, r6
+    mov r5, r6
+    W32_RESTORE_SSQ
+.h_loop:
+    movu m0, [srcq+r6+8*0]
+    movu m1, [srcq+r6+8*1]
+    PREP_8TAP_H m0, m2, m3, m4
+    PREP_8TAP_H m1, m2, m3, m4
+    mova [tmpq+16*0], m0
+    mova [tmpq+16*1], m1
+    add tmpq, 32
+    add r6, 16
+    jle .h_loop
+    add srcq, strideq
+    mov r6, r5
+    dec hd
+    jg .h_loop
+    RET
+%if ARCH_X86_32
+    %define base_reg r2
+%endif
+
+.v:
+%if ARCH_X86_32
+    mov mxd, myd
+    and mxd, 0xff
+%else
+    %assign stack_offset org_stack_offset
+    WIN64_SPILL_XMM 16
+    movzx mxd, myb
+%endif
+    shr myd, 16
+    cmp hd, 4
+    cmovle myd, mxd
+    lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    mova m2, [base+pw_512]
+    psrlw m2, m2, 1 ; 0x0100
+    mova m7, [base+pw_8192]
+%if ARCH_X86_32
+    %define subpel0 [rsp+mmsize*0]
+    %define subpel1 [rsp+mmsize*1]
+    %define subpel2 [rsp+mmsize*2]
+    %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+    ALLOC_STACK -mmsize*4
+%assign regs_used 7
+    movd m0, [myq+0]
+    pshufb m0, m2
+    mova subpel0, m0
+    movd m0, [myq+2]
+    pshufb m0, m2
+    mova subpel1, m0
+    movd m0, [myq+4]
+    pshufb m0, m2
+    mova subpel2, m0
+    movd m0, [myq+6]
+    pshufb m0, m2
+    mova subpel3, m0
+    mov strideq, [rstk+stack_offset+gprsize*3]
+    lea strideq, [strideq*3]
+    sub [rstk+stack_offset+gprsize*2], strideq
+    mov strideq, [rstk+stack_offset+gprsize*3]
+    mov srcq, [rstk+stack_offset+gprsize*2]
+%else
+    %define subpel0 m8
+    %define subpel1 m9
+    %define subpel2 m10
+    %define subpel3 m11
+    movd subpel0, [myq+0]
+    pshufb subpel0, m2
+    movd subpel1, [myq+2]
+    pshufb subpel1, m2
+    movd subpel2, [myq+4]
+    pshufb subpel2, m2
+    movd subpel3, [myq+6]
+    pshufb subpel3, m2
+    lea stride3q, [strideq*3]
+    sub srcq, stride3q
+    cmp wd, 8
+    jg .v_w16
+    je .v_w8
+%endif
+.v_w4:
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+    %define srcm [rsp+mmsize*4+gprsize*1]
+    %define tmpm [rsp+mmsize*4+gprsize*2]
+%endif
+    mov tmpm, tmpq
+    mov srcm, srcq
+    lea r5d, [wq - 4] ; horizontal loop
+    shl r5d, (16 - 2) ; (wq / 4) << 16
+    mov r5w, hw
+.v_w4_loop0:
+%endif
+    movd m2, [srcq+strideq*0]   ; 0
+    movhps m2, [srcq+strideq*2] ; 0 _ 2
+    movd m3, [srcq+strideq*1]   ; 1
+%if ARCH_X86_32
+    lea srcq, [srcq+strideq*2]
+    movhps m3, [srcq+strideq*1] ; 1 _ 3
+    lea srcq, [srcq+strideq*2]
+%else
+    movhps m3, [srcq+stride3q ] ; 1 _ 3
+    lea srcq, [srcq+strideq*4]
+%endif
+    pshufd m2, m2, q2020 ; 0 2 0 2
+    pshufd m3, m3, q2020 ; 1 3 1 3
+    punpckldq m2, m3 ; 0 1 2 3
+    movd m3, [srcq+strideq*0] ; 4
+    movd m1, [srcq+strideq*1] ; 5
+    movd m0, [srcq+strideq*2] ; 6
+%if ARCH_X86_32
+    lea srcq, [srcq+strideq*2]
+    add srcq, strideq
+%else
+    add srcq, stride3q
+%endif
+    punpckldq m3, m1 ; 4 5 _ _
+    punpckldq m1, m0 ; 5 6 _ _
+    palignr m4, m3, m2, 4 ; 1 2 3 4
+    punpcklbw m3, m1 ; 45 56
+    punpcklbw m1, m2, m4 ; 01 12
+    punpckhbw m2, m4 ; 23 34
+.v_w4_loop:
+    pmaddubsw m5, m1, subpel0 ; a0 b0
+    mova m1, m2
+    pmaddubsw m2, subpel1 ; a1 b1
+    paddw m5, m2
+    mova m2, m3
+    pmaddubsw m3, subpel2 ; a2 b2
+    paddw m5, m3
+    movd m4, [srcq+strideq*0]
+    punpckldq m3, m0, m4 ; 6 7 _ _
+    movd m0, [srcq+strideq*1]
+    lea srcq, [srcq+strideq*2]
+    punpckldq m4, m0 ; 7 8 _ _
+    punpcklbw m3, m4 ; 67 78
+    pmaddubsw m4, m3, subpel3 ; a3 b3
+    paddw m5, m4
+    pmulhrsw m5, m7
+    movq [tmpq+wq*0], m5
+    movhps [tmpq+wq*2], m5
+    lea tmpq, [tmpq+wq*4]
+    sub hd, 2
+    jg .v_w4_loop
+%if ARCH_X86_32
+    mov hw, r5w ; reset vertical loop
+    mov tmpq, tmpm
+    mov srcq, srcm
+    add tmpq, 8
+    add srcq, 4
+    mov tmpm, tmpq
+    mov srcm, srcq
+    sub r5d, 1<<16 ; horizontal--
+    jg .v_w4_loop0
+%endif
+    RET
+
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+    lea r5d, [wq - 8] ; horizontal loop
+    mov r8, tmpq
+    mov r6, srcq
+    shl r5d, 8 - 3 ; (wq / 8) << 8
+    mov r5b, hb
+.v_w8_loop0:
+    movq m4, [srcq+strideq*0] ; 0
+    movq m5, [srcq+strideq*1] ; 1
+    lea srcq, [srcq+strideq*2]
+    movq m6, [srcq+strideq*0] ; 2
+    movq m0, [srcq+strideq*1] ; 3
+    lea srcq, [srcq+strideq*2]
+    movq m1, [srcq+strideq*0] ; 4
+    movq m2, [srcq+strideq*1] ; 5
+    lea srcq, [srcq+strideq*2] ;
+    movq m3, [srcq+strideq*0] ; 6
+    shufpd m4, m0, 0x0c
+    shufpd m5, m1, 0x0c
+    punpcklbw m1, m4, m5 ; 01
+    punpckhbw m4, m5 ; 34
+    shufpd m6, m2, 0x0c
+    punpcklbw m2, m5, m6 ; 12
+    punpckhbw m5, m6 ; 45
+    shufpd m0, m3, 0x0c
+    punpcklbw m3, m6, m0 ; 23
+    punpckhbw m6, m0 ; 56
+.v_w8_loop:
+    movq m12, [srcq+strideq*1] ; 8
+    lea srcq, [srcq+strideq*2]
+    movq m13, [srcq+strideq*0] ; 9
+    pmaddubsw m14, m1, subpel0 ; a0
+    pmaddubsw m15, m2, subpel0 ; b0
+    mova m1, m3
+    mova m2, m4
+    pmaddubsw m3, subpel1 ; a1
+    pmaddubsw m4, subpel1 ; b1
+    paddw m14, m3
+    paddw m15, m4
+    mova m3, m5
+    mova m4, m6
+    pmaddubsw m5, subpel2 ; a2
+    pmaddubsw m6, subpel2 ; b2
+    paddw m14, m5
+    paddw m15, m6
+    shufpd m6, m0, m12, 0x0d
+    shufpd m0, m12, m13, 0x0c
+    punpcklbw m5, m6, m0 ; 67
+    punpckhbw m6, m0 ; 78
+    pmaddubsw m12, m5, subpel3 ; a3
+    pmaddubsw m13, m6, subpel3 ; b3
+    paddw m14, m12
+    paddw m15, m13
+    pmulhrsw m14, m7
+    pmulhrsw m15, m7
+    movu [tmpq+wq*0], xm14
+    movu [tmpq+wq*2], xm15
+    lea tmpq, [tmpq+wq*4]
+    sub hd, 2
+    jg .v_w8_loop
+    movzx hd, r5b ; reset vertical loop
+    add r8, 16
+    add r6, 8
+    mov tmpq, r8
+    mov srcq, r6
+    sub r5d, 1<<8 ; horizontal--
+    jg .v_w8_loop0
+    RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+
+.hv:
+    %assign stack_offset org_stack_offset
+    cmp wd, 4
+    jg .hv_w8
+    and mxd, 0xff
+    movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+%if ARCH_X86_32
+    mov mxd, myd
+    and mxd, 0xff
+    shr myd, 16
+    cmp hd, 4
+    cmovle myd, mxd
+    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    mov r5, r2 ; use as new base
+    %define base_reg r5
+    %assign regs_used 2
+    ALLOC_STACK -mmsize*14
+    %assign regs_used 7
+    mov strideq, [rstk+stack_offset+gprsize*3]
+    lea strideq, [strideq*3 + 1]
+    sub [rstk+stack_offset+gprsize*2], strideq
+    mov strideq, [rstk+stack_offset+gprsize*3]
+    mov srcq, [rstk+stack_offset+gprsize*2]
+    %define subpelv0 [rsp+mmsize*0]
+    %define subpelv1 [rsp+mmsize*1]
+    %define subpelv2 [rsp+mmsize*2]
+    %define subpelv3 [rsp+mmsize*3]
+    punpcklbw m0, m0
+    psraw m0, 8 ; sign-extend
+    pshufd m6, m0, q0000
+    mova subpelv0, m6
+    pshufd m6, m0, q1111
+    mova subpelv1, m6
+    pshufd m6, m0, q2222
+    mova subpelv2, m6
+    pshufd m6, m0, q3333
+    mova subpelv3, m6
+%else
+    movzx mxd, myb
+    shr myd, 16
+    cmp hd, 4
+    cmovle myd, mxd
+    movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    ALLOC_STACK mmsize*14, 14
+    lea stride3q, [strideq*3]
+    sub srcq, stride3q
+    dec srcq
+    %define subpelv0 m10
+    %define subpelv1 m11
+    %define subpelv2 m12
+    %define subpelv3 m13
+    punpcklbw m0, m0
+    psraw m0, 8 ; sign-extend
+    mova m8, [base+pw_8192]
+    mova m9, [base+pd_32]
+    pshufd m10, m0, q0000
+    pshufd m11, m0, q1111
+    pshufd m12, m0, q2222
+    pshufd m13, m0, q3333
+%endif
+    pshufd m7, m1, q0000
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+    ;
+    ;
+%if ARCH_X86_32
+    %define w8192reg [base+pw_8192]
+    %define d32reg   [base+pd_32]
+%else
+    %define w8192reg m8
+    %define d32reg   m9
+%endif
+    ; lower shuffle 0 1 2 3 4
+    mova m6, [base+subpel_h_shuf4]
+    movq m5, [srcq+strideq*0]   ; 0 _ _ _
+    movhps m5, [srcq+strideq*1] ; 0 _ 1 _
+    movq m4, [srcq+strideq*2]   ; 2 _ _ _
+%if ARCH_X86_32
+    lea srcq, [srcq+strideq*2]
+    add srcq, strideq
+    movhps m4, [srcq+strideq*0] ; 2 _ 3 _
+    add srcq, strideq
+%else
+    movhps m4, [srcq+stride3q ] ; 2 _ 3 _
+    lea srcq, [srcq+strideq*4]
+%endif
+    pshufb m2, m5, m6 ; H subpel_h_shuf4 0 ~ 1 ~
+    pshufb m0, m4, m6 ; H subpel_h_shuf4 2 ~ 3 ~
+    pmaddubsw m2, m7  ; H subpel_filters
+    pmaddubsw m0, m7  ; H subpel_filters
+    phaddw m2, m0     ; H 0 1 2 3
+    pmulhrsw m2, w8192reg ; H pw_8192
+    SAVELINE_W4 m2, 2, 0
+    ; upper shuffle 2 3 4 5 6
+    mova m6, [base+subpel_h_shuf4+16]
+    pshufb m2, m5, m6 ; H subpel_h_shuf4 0 ~ 1 ~
+    pshufb m0, m4, m6 ; H subpel_h_shuf4 2 ~ 3 ~
+    pmaddubsw m2, m7  ; H subpel_filters
+    pmaddubsw m0, m7  ; H subpel_filters
+    phaddw m2, m0     ; H 0 1 2 3
+    pmulhrsw m2, w8192reg ; H pw_8192
+    ;
+    ; lower shuffle
+    mova m6, [base+subpel_h_shuf4]
+    movq m5, [srcq+strideq*0]   ; 4 _ _ _
+    movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+    movq m4, [srcq+strideq*2]   ; 6 _ _ _
+    pshufb m3, m5, m6 ; H subpel_h_shuf4 4 ~ 5 ~
+    pshufb m0, m4, m6 ; H subpel_h_shuf4 6 ~ 6 ~
+    pmaddubsw m3, m7  ; H subpel_filters
+    pmaddubsw m0, m7  ; H subpel_filters
+    phaddw m3, m0     ; H 4 5 6 7
+    pmulhrsw m3, w8192reg ; H pw_8192
+    SAVELINE_W4 m3, 3, 0
+    ; upper shuffle
+    mova m6, [base+subpel_h_shuf4+16]
+    pshufb m3, m5, m6 ; H subpel_h_shuf4 4 ~ 5 ~
+    pshufb m0, m4, m6 ; H subpel_h_shuf4 6 ~ 6 ~
+    pmaddubsw m3, m7  ; H subpel_filters
+    pmaddubsw m0, m7  ; H subpel_filters
+    phaddw m3, m0     ; H 4 5 6 7
+    pmulhrsw m3, w8192reg ; H pw_8192
+    ;
+%if ARCH_X86_32
+    lea srcq, [srcq+strideq*2]
+    add srcq, strideq
+%else
+    add srcq, stride3q
+%endif
+    ; process high
+    palignr m4, m3, m2, 4 ; V 1 2 3 4
+    punpcklwd m1, m2, m4  ; V 01 12
+    punpckhwd m2, m4      ; V 23 34
+    pshufd m0, m3, q2121  ; V 5 6 5 6
+    punpcklwd m3, m0      ; V 45 56
+    SAVELINE_W4 m0, 0, 1
+    SAVELINE_W4 m1, 1, 1
+    SAVELINE_W4 m2, 2, 1
+    SAVELINE_W4 m3, 3, 1
+    ; process low
+    RESTORELINE_W4 m2, 2, 0
+    RESTORELINE_W4 m3, 3, 0
+    palignr m4, m3, m2, 4 ; V 1 2 3 4
+    punpcklwd m1, m2, m4  ; V 01 12
+    punpckhwd m2, m4      ; V 23 34
+    pshufd m0, m3, q2121  ; V 5 6 5 6
+    punpcklwd m3, m0      ; V 45 56
+.hv_w4_loop:
+    ; process low
+    pmaddwd m5, m1, subpelv0 ; V a0 b0
+    mova m1, m2
+    pmaddwd m2, subpelv1 ; V a1 b1
+    paddd m5, m2
+    mova m2, m3
+    pmaddwd m3, subpelv2 ; V a2 b2
+    paddd m5, m3
+    ;
+    mova m6, [base+subpel_h_shuf4]
+    movq m4, [srcq+strideq*0]   ; 7
+    movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+    pshufb m4, m6    ; H subpel_h_shuf4 7 ~ 8 ~
+    pmaddubsw m4, m7 ; H subpel_filters
+    phaddw m4, m4    ; H 7 8 7 8
+    pmulhrsw m4, w8192reg ; H pw_8192
+    palignr m3, m4, m0, 12 ; 6 7 8 7
+    mova m0, m4
+    punpcklwd m3, m4 ; 67 78
+    pmaddwd m4, m3, subpelv3 ; a3 b3
+    paddd m5, d32reg ; pd_32
+    paddd m5, m4
+    psrad m5, 6
+    SAVELINE_W4 m0, 0, 0
+    SAVELINE_W4 m1, 1, 0
+    SAVELINE_W4 m2, 2, 0
+    SAVELINE_W4 m3, 3, 0
+    SAVELINE_W4 m5, 5, 0
+    ; process high
+    RESTORELINE_W4 m0, 0, 1
+    RESTORELINE_W4 m1, 1, 1
+    RESTORELINE_W4 m2, 2, 1
+    RESTORELINE_W4 m3, 3, 1
+    pmaddwd m5, m1, subpelv0 ; V a0 b0
+    mova m1, m2
+    pmaddwd m2, subpelv1 ; V a1 b1
+    paddd m5, m2
+    mova m2, m3
+    pmaddwd m3, subpelv2 ; V a2 b2
+    paddd m5, m3
+    ;
+    mova m6, [base+subpel_h_shuf4+16]
+    movq m4, [srcq+strideq*0]   ; 7
+    movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+    pshufb m4, m6    ; H subpel_h_shuf4 7 ~ 8 ~
+    pmaddubsw m4, m7 ; H subpel_filters
+    phaddw m4, m4    ; H 7 8 7 8
+    pmulhrsw m4, w8192reg ; H pw_8192
+    palignr m3, m4, m0, 12 ; 6 7 8 7
+    mova m0, m4
+    punpcklwd m3, m4 ; 67 78
+    pmaddwd m4, m3, subpelv3 ; a3 b3
+    paddd m5, d32reg ; pd_32
+    paddd m5, m4
+    psrad m4, m5, 6
+    ;
+    RESTORELINE_W4 m5, 5, 0
+    packssdw m5, m4
+    pshufd m5, m5, q3120
+    movu [tmpq], m5
+    lea srcq, [srcq+strideq*2]
+    add tmpq, 16
+    sub hd, 2
+    SAVELINE_W4 m0, 0, 1
+    SAVELINE_W4 m1, 1, 1
+    SAVELINE_W4 m2, 2, 1
+    SAVELINE_W4 m3, 3, 1
+    RESTORELINE_W4 m0, 0, 0
+    RESTORELINE_W4 m1, 1, 0
+    RESTORELINE_W4 m2, 2, 0
+    RESTORELINE_W4 m3, 3, 0
+    jg .hv_w4_loop
+    RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+    ;
+
+.hv_w8:
+    %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+    shr mxd, 16
+%if ARCH_X86_32
+    %define base_reg r2
+    %define subpelh0 [rsp+mmsize*5]
+    %define subpelh1 [rsp+mmsize*6]
+    %define subpelv0 [rsp+mmsize*7]
+    %define subpelv1 [rsp+mmsize*8]
+    %define subpelv2 [rsp+mmsize*9]
+    %define subpelv3 [rsp+mmsize*10]
+    %define accuv0   [rsp+mmsize*11]
+    %define accuv1   [rsp+mmsize*12]
+    movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+    movzx mxd, myw
+    and mxd, 0xff
+    shr myd, 16
+    cmp hd, 4
+    cmovle myd, mxd
+    movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < mmsize
+    mov rstk, r2m
+    %define tmpm    [rsp+mmsize*13+gprsize*1]
+    %define srcm    [rsp+mmsize*13+gprsize*2]
+    %define stridem [rsp+mmsize*13+gprsize*3]
+    mov stridem, rstk
+%endif
+    mov r6, r2
+%define base_reg r6
+    pshufd m0, m1, q0000
+    pshufd m1, m1, q1111
+    punpcklbw m5, m5
+    psraw m5, 8 ; sign-extend
+    pshufd m2, m5, q0000
+    pshufd m3, m5, q1111
+    pshufd m4, m5, q2222
+    pshufd m5, m5, q3333
+    mova subpelh0, m0
+    mova subpelh1, m1
+    mova subpelv0, m2
+    mova subpelv1, m3
+    mova subpelv2, m4
+    mova subpelv3, m5
+    W32_RESTORE_SSQ
+    lea strided, [strided*3]
+    sub srcd, strided
+    sub srcd, 3
+    mov srcm, srcd
+    W32_RESTORE_SSQ
+%else
+    ALLOC_STACK mmsize*5, 16
+    %define subpelh0 m10
+    %define subpelh1 m11
+    %define subpelv0 m12
+    %define subpelv1 m13
+    %define subpelv2 m14
+    %define subpelv3 m15
+    %define accuv0   m8
+    %define accuv1   m9
+    movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+    movzx mxd, myb
+    shr myd, 16
+    cmp hd, 4
+    cmovle myd, mxd
+    movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    pshufd subpelh0, m0, q0000
+    pshufd subpelh1, m0, q1111
+    punpcklbw m1, m1
+    psraw m1, 8 ; sign-extend
+    pshufd subpelv0, m1, q0000
+    pshufd subpelv1, m1, q1111
+    pshufd subpelv2, m1, q2222
+    pshufd subpelv3, m1, q3333
+    lea stride3q, [strideq*3]
+    sub srcq, 3
+    sub srcq, stride3q
+    mov r6, srcq
+%endif
+    lea r5d, [wq-4]
+%if ARCH_X86_64
+    mov r8, tmpq
+%else
+    mov tmpm, tmpq
+%endif
+    shl r5d, (16 - 2)
+    mov r5w, hw
+.hv_w8_loop0:
+    movu m4, [srcq+strideq*0] ; 0 = _ _
+    movu m5, [srcq+strideq*1] ; 1 = _ _
+    lea srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+    mova m7, [base+subpel_h_shufA]
+    mova m8, [base+subpel_h_shufB]
+    mova m9, [base+subpel_h_shufC]
+%endif
+    HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+    HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+    movu m6, [srcq+strideq*0] ; 2 = _ _
+    movu m0, [srcq+strideq*1] ; 3 = _ _
+    lea srcq, [srcq+strideq*2]
+    HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+    HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+    ;
+    mova m7, [base+pw_8192]
+    pmulhrsw m4, m7 ; H pw_8192
+    pmulhrsw m5, m7 ; H pw_8192
+    pmulhrsw m6, m7 ; H pw_8192
+    pmulhrsw m0, m7 ; H pw_8192
+    punpcklwd m1, m4, m5 ; 0 1 ~
+    punpcklwd m2, m5, m6 ; 1 2 ~
+    punpcklwd m3, m6, m0 ; 2 3 ~
+    SAVELINE_W8 1, m1
+    SAVELINE_W8 2, m2
+    SAVELINE_W8 3, m3
+    ;
+    mova m7, [base+subpel_h_shufA]
+    movu m4, [srcq+strideq*0] ; 4 = _ _
+    movu m5, [srcq+strideq*1] ; 5 = _ _
+    lea srcq, [srcq+strideq*2]
+    movu m6, [srcq+strideq*0] ; 6 = _ _
+    HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+    HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+    HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+    mova m7, [base+pw_8192]
+    pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+    pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+    pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+    punpcklwd m4, m0, m1 ; 3 4 ~
+    punpcklwd m5, m1, m2 ; 4 5 ~
+    punpcklwd m6, m2, m3 ; 5 6 ~
+    ;
+    SAVELINE_W8 6, m3
+    RESTORELINE_W8 1, m1
+    RESTORELINE_W8 2, m2
+    RESTORELINE_W8 3, m3
+.hv_w8_loop:
+    ; m8 accu for V a
+    ; m9 accu for V b
+    SAVELINE_W8 1, m3
+    SAVELINE_W8 2, m4
+    SAVELINE_W8 3, m5
+    SAVELINE_W8 4, m6
+%if ARCH_X86_32
+    pmaddwd m0, m1, subpelv0 ; a0
+    pmaddwd m7, m2, subpelv0 ; b0
+    pmaddwd m3, subpelv1 ; a1
+    pmaddwd m4, subpelv1 ; b1
+    paddd m0, m3
+    paddd m7, m4
+    pmaddwd m5, subpelv2 ; a2
+    pmaddwd m6, subpelv2 ; b2
+    paddd m0, m5
+    paddd m7, m6
+    mova m5, [base+pd_32]
+    paddd m0, m5 ; pd_512
+    paddd m7, m5 ; pd_512
+    mova accuv0, m0
+    mova accuv1, m7
+%else
+    pmaddwd m8, m1, subpelv0 ; a0
+    pmaddwd m9, m2, subpelv0 ; b0
+    pmaddwd m3, subpelv1 ; a1
+    pmaddwd m4, subpelv1 ; b1
+    paddd m8, m3
+    paddd m9, m4
+    pmaddwd m5, subpelv2 ; a2
+    pmaddwd m6, subpelv2 ; b2
+    paddd m8, m5
+    paddd m9, m6
+    mova m7, [base+pd_32]
+    paddd m8, m7 ; pd_512
+    paddd m9, m7 ; pd_512
+    mova m7, [base+subpel_h_shufB]
+    mova m6, [base+subpel_h_shufC]
+    mova m5, [base+subpel_h_shufA]
+%endif
+    movu m0, [srcq+strideq*1] ; 7
+    movu m4, [srcq+strideq*2] ; 8
+    lea srcq, [srcq+strideq*2]
+    HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+    HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+    mova m5, [base+pw_8192]
+    pmulhrsw m0, m5 ; H pw_8192
+    pmulhrsw m4, m5 ; H pw_8192
+    RESTORELINE_W8 6, m6
+    punpcklwd m5, m6, m0 ; 6 7 ~
+    punpcklwd m6, m0, m4 ; 7 8 ~
+    pmaddwd m1, m5, subpelv3 ; a3
+    paddd m2, m1, accuv0
+    pmaddwd m1, m6, subpelv3 ; b3
+    paddd m1, m1, accuv1 ; H + V
+    psrad m2, 6
+    psrad m1, 6
+    packssdw m2, m1 ; d -> w
+    movq [tmpq+wq*0], m2
+    movhps [tmpq+wq*2], m2
+    lea tmpq, [tmpq+wq*4]
+    sub hd, 2
+    jle .hv_w8_outer
+    SAVELINE_W8 6, m4
+    RESTORELINE_W8 1, m1
+    RESTORELINE_W8 2, m2
+    RESTORELINE_W8 3, m3
+    RESTORELINE_W8 4, m4
+    jmp .hv_w8_loop
+.hv_w8_outer:
+    movzx hd, r5w
+%if ARCH_X86_32
+    add dword tmpm, 8
+    mov tmpq, tmpm
+    mov srcq, srcm
+    add srcq, 4
+    mov srcm, srcq
+%else
+    add r8, 8
+    mov tmpq, r8
+    add r6, 4
+    mov srcq, r6
+%endif
+    sub r5d, 1<<16
+    jg .hv_w8_loop0
+    RET
+
 %if WIN64
 DECLARE_REG_TMP 6, 4
 %else