diff options
author | Victorien Le Couviour--Tuffet <victorien@videolan.org> | 2020-05-07 18:01:56 +0300 |
---|---|---|
committer | Victorien Le Couviour--Tuffet <victorien@videolan.org> | 2020-06-01 16:30:36 +0300 |
commit | a755541faa79832a6306ec9b9789b8919e55d8f9 (patch) | |
tree | 509bc73e66555a68d4bc5ed547d0e0f769ed8ecf | |
parent | ed39e8fb63bcdb837e3f131140d2d73d02095ca1 (diff) |
x86: Add put_8tap_scaled AVX2 asm
mc_scaled_8tap_regular_w2_8bpc_c: 764.4
mc_scaled_8tap_regular_w2_8bpc_avx2: 191.3
mc_scaled_8tap_regular_w2_dy1_8bpc_c: 705.8
mc_scaled_8tap_regular_w2_dy1_8bpc_avx2: 89.5
mc_scaled_8tap_regular_w2_dy2_8bpc_c: 964.0
mc_scaled_8tap_regular_w2_dy2_8bpc_avx2: 120.3
mc_scaled_8tap_regular_w4_8bpc_c: 1355.7
mc_scaled_8tap_regular_w4_8bpc_avx2: 180.9
mc_scaled_8tap_regular_w4_dy1_8bpc_c: 1233.2
mc_scaled_8tap_regular_w4_dy1_8bpc_avx2: 115.3
mc_scaled_8tap_regular_w4_dy2_8bpc_c: 1707.6
mc_scaled_8tap_regular_w4_dy2_8bpc_avx2: 117.9
mc_scaled_8tap_regular_w8_8bpc_c: 2483.2
mc_scaled_8tap_regular_w8_8bpc_avx2: 294.8
mc_scaled_8tap_regular_w8_dy1_8bpc_c: 2166.4
mc_scaled_8tap_regular_w8_dy1_8bpc_avx2: 222.0
mc_scaled_8tap_regular_w8_dy2_8bpc_c: 3133.7
mc_scaled_8tap_regular_w8_dy2_8bpc_avx2: 292.6
mc_scaled_8tap_regular_w16_8bpc_c: 5239.2
mc_scaled_8tap_regular_w16_8bpc_avx2: 729.9
mc_scaled_8tap_regular_w16_dy1_8bpc_c: 5156.5
mc_scaled_8tap_regular_w16_dy1_8bpc_avx2: 602.2
mc_scaled_8tap_regular_w16_dy2_8bpc_c: 8018.4
mc_scaled_8tap_regular_w16_dy2_8bpc_avx2: 783.1
mc_scaled_8tap_regular_w32_8bpc_c: 14745.0
mc_scaled_8tap_regular_w32_8bpc_avx2: 2205.0
mc_scaled_8tap_regular_w32_dy1_8bpc_c: 14862.3
mc_scaled_8tap_regular_w32_dy1_8bpc_avx2: 1721.3
mc_scaled_8tap_regular_w32_dy2_8bpc_c: 23607.6
mc_scaled_8tap_regular_w32_dy2_8bpc_avx2: 2325.7
mc_scaled_8tap_regular_w64_8bpc_c: 54891.7
mc_scaled_8tap_regular_w64_8bpc_avx2: 8351.4
mc_scaled_8tap_regular_w64_dy1_8bpc_c: 50249.0
mc_scaled_8tap_regular_w64_dy1_8bpc_avx2: 5864.4
mc_scaled_8tap_regular_w64_dy2_8bpc_c: 79400.1
mc_scaled_8tap_regular_w64_dy2_8bpc_avx2: 8295.7
mc_scaled_8tap_regular_w128_8bpc_c: 121046.8
mc_scaled_8tap_regular_w128_8bpc_avx2: 21809.1
mc_scaled_8tap_regular_w128_dy1_8bpc_c: 133720.4
mc_scaled_8tap_regular_w128_dy1_8bpc_avx2: 16197.8
mc_scaled_8tap_regular_w128_dy2_8bpc_c: 218774.8
mc_scaled_8tap_regular_w128_dy2_8bpc_avx2: 22993.1
-rw-r--r-- | src/x86/mc.asm | 1754 | ||||
-rw-r--r-- | src/x86/mc_init_tmpl.c | 63 | ||||
-rw-r--r-- | tests/checkasm/mc.c | 56 |
3 files changed, 1823 insertions, 50 deletions
diff --git a/src/x86/mc.asm b/src/x86/mc.asm index 7ff0cac..0fcf8d9 100644 --- a/src/x86/mc.asm +++ b/src/x86/mc.asm @@ -133,16 +133,23 @@ subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 pb_8x0_8x8: times 8 db 0 times 8 db 8 +bdct_lb_dw: times 4 db 0 + times 4 db 4 + times 4 db 8 + times 4 db 12 ALIGN 32 -resize_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 resize_shuf: times 5 db 0 db 1, 2, 3, 4, 5, 6 times 5+8 db 7 @@ -154,8 +161,11 @@ wm_422_sign: dd 0x80808080, 0x7f7f7f7f wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040 ALIGN 4 +pb_0123: db 0, 1, 2, 3 +pb_4567: db 4, 5, 6, 7 pw_m128 times 2 dw -128 pw_m256: times 2 dw -256 +pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 pw_512: times 2 dw 512 @@ -163,11 +173,14 @@ pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 -pd_2: dd 2 -pd_32: dd 32 -pd_63: dd 63 -pd_512: dd 512 -pd_32768: dd 32768 +pd_2: dd 2 +pd_32: dd 32 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 +pd_0x3ff: dd 0x3ff +pd_0x4000: dd 0x4000 +pq_0x40000000: dq 0x40000000 %define pb_m64 (wm_sign_avx512+4) %define pb_64 (wm_sign_avx512+8) @@ -230,27 +243,52 @@ cextern mc_warp_filter %endrep %endmacro +%macro SCALED_JMP_TABLE 1-* + %xdefine %1_table (%%table - 2) + %xdefine %%base mangle(private_prefix %+ _%1) +%%table: + %rep %0 - 1 + dw %%base %+ .w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_1024: + %xdefine %1_dy1_table (%%dy_1024 - 2) + %rep %0 - 1 + dw %%base %+ .dy1_w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_2048: + %xdefine %1_dy2_table (%%dy_2048 - 2) + %rep %0 - 1 + dw %%base %+ .dy2_w%2 - %%base + %rotate 1 + %endrep +%endmacro + %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX -BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 -BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 %if HAVE_AVX512ICL BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 @@ -1943,19 +1981,22 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 + mov t1d, FILTER_%4 +%ifnidn %1, sharp_smooth ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif -%macro PUT_8TAP_FN 3 ; type, type_h, type_v -cglobal put_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX) -%endif -%endmacro + +%define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN regular, REGULAR, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP @@ -3873,6 +3914,1659 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 RET %endmacro +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH + +%if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy +%else +cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy +%endif + lea r12, [put_8tap_scaled_avx2] +%define base r12-put_8tap_scaled_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm + vpbroadcastd m14, mxm + mov dyd, dym +%if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m +%else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, _, dy, ss3 + %define hm r6m +%endif +%if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+96] + %define rX r1 + %define rXd r1d +%else + %define dsm dsq + %define rX r14 + %define rXd r14d +%endif + vpbroadcastd m10, [base+pd_0x3ff] + vpbroadcastd m12, [base+pw_8192] + vpbroadcastd m13, [base+pd_512] + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+put_8tap_scaled_avx2_table+wq*2] + add wq, r12 + jmp wq +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpblendd m15, m7, 0xaa + vpblendd m0, m2, 0xc0 ; 0 1 4 5 + vpblendd m1, m3, 0xc0 ; 2 3 6 7 + pblendvb m15, m11, m8 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 + vextracti128 xm1, m0, 1 ; 4 5 6 7 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + pmaddwd xm8, xm4, xm11 + paddd xm5, xm6 + paddd xm7, xm8 + paddd xm5, xm13 + paddd xm5, xm7 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq], xm5, 0 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd xm15, xm0 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m0, m9 + psrld m14, 10 + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m15, xm15, 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pblendvb m15, m11, m0 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + phaddw m7, m9 + phaddw m8, m10 + pmulhrsw m7, m12 ; 0 1 4 5 + pmulhrsw m8, m12 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm11, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm11 ; 67 + mova [rsp+0x00], xm7 + mova [rsp+0x10], xm8 + mova [rsp+0x20], xm9 +.w4_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm10, r6q + punpcklbw xm10, xm10 + psraw xm10, 8 + pshufd xm7, xm10, q0000 + pshufd xm8, xm10, q1111 + pshufd xm9, xm10, q2222 + pshufd xm10, xm10, q3333 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pmaddwd xm6, xm2, xm9 + pmaddwd xm7, xm3, xm10 + paddd xm4, xm5 + paddd xm6, xm7 + paddd xm4, xm13 + paddd xm4, xm6 + psrad xm4, 10 + packssdw xm4, xm4 + packuswb xm4, xm4 + movd [dstq], xm4 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu xm4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x00] + mova [rsp+0x00], xm1 + mova xm1, [rsp+0x10] + mova [rsp+0x10], xm2 + mova xm2, [rsp+0x20] + mova [rsp+0x20], xm3 + pshufb xm4, xm14 + pmaddubsw xm4, xm15 + phaddw xm4, xm4 + pmulhrsw xm4, xm12 + punpcklwd xm3, xm11, xm4 + mova xm11, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm5, [srcq+ssq*1] + movu m6, [rsp+0x10] + pshufb xm4, xm14 + pshufb xm5, xm14 + pmaddubsw xm4, xm15 + pmaddubsw xm5, xm15 + movu [rsp+0x00], m6 + phaddw xm4, xm5 + pmulhrsw xm4, xm12 + punpcklwd xm9, xm11, xm4 + mova [rsp+0x20], xm9 + psrldq xm11, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm11 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +.w8: + movifnidn dsm, dsq + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + mova [rsp], xm14 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] +%macro PUT_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movq xm%1, [srcq+r4] + movq xm%2, [srcq+r6] + movhps xm%1, [srcq+r7] + movhps xm%2, [srcq+r9] + vinserti128 m%1, [srcq+r10], 1 + vinserti128 m%2, [srcq+r11], 1 + vpbroadcastq m%5, [srcq+r13] + vpbroadcastq m%6, [srcq+rX] + add srcq, ssq + movq xm%3, [srcq+r4] + movq xm%4, [srcq+r6] + movhps xm%3, [srcq+r7] + movhps xm%4, [srcq+r9] + vinserti128 m%3, [srcq+r10], 1 + vinserti128 m%4, [srcq+r11], 1 + vpbroadcastq m%7, [srcq+r13] + vpbroadcastq m%8, [srcq+rX] + add srcq, ssq + vpblendd m%1, m%5, 0xc0 + vpblendd m%2, m%6, 0xc0 + vpblendd m%3, m%7, 0xc0 + vpblendd m%4, m%8, 0xc0 + pmaddubsw m%1, m15 + pmaddubsw m%2, m10 + pmaddubsw m%3, m15 + pmaddubsw m%4, m10 + phaddw m%1, m%2 + phaddw m%3, m%4 + phaddw m%1, m%3 + pmulhrsw m%1, m12 +%endmacro + PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.w8_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, 10 + packssdw m4, m4 + vpermq m4, m4, q3120 + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w8_loop + test myd, 0x400 + mov [rsp+16], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .w8_skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+rX] + movq xm4, [srcq+r4] + movq xm5, [srcq+r6] + movhps xm4, [srcq+r7] + movhps xm5, [srcq+r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+16] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .w8_loop +.w8_skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+rX] + movq xm3, [srcq+r4] + movq xm4, [srcq+r6] + movhps xm3, [srcq+r7] + movhps xm4, [srcq+r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+r4] + movq xm6, [srcq+r6] + movhps xm5, [srcq+r7] + movhps xm6, [srcq+r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+rX] + add srcq, ssq + mov myd, [rsp+16] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .w8_loop +.w16: + mov dword [rsp+48], 1 << 1 + jmp .w_start +.w32: + mov dword [rsp+48], 1 << 3 + jmp .w_start +.w64: + mov dword [rsp+48], 1 << 7 + jmp .w_start +.w128: + mov dword [rsp+48], 1 << 15 +.w_start: + movifnidn dsm, dsq + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+72], t0d + mov [rsp+56], srcq + mov [rsp+64], dstq +%if UNIX64 + mov hm, hd +%endif + shl dword r8m, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + shr dword [rsp+48], 1 + jz .ret + add qword [rsp+64], 8 + mov hd, hm + vpbroadcastd m8, r8m + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+16] + vpbroadcastd m15, [rsp+72] + pxor m9, m9 + mov srcq, [rsp+56] + mov dstq, [rsp+64] +.hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+16], m14 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + mova [rsp], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, 10 + packssdw m4, m4 + vpermq m4, m4, q3120 + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+52], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+rX] + movq xm4, [srcq+r4] + movq xm5, [srcq+r6] + movhps xm4, [srcq+r7] + movhps xm5, [srcq+r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+rX] + movq xm3, [srcq+r4] + movq xm4, [srcq+r6] + movhps xm3, [srcq+r7] + movhps xm4, [srcq+r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+r4] + movq xm6, [srcq+r6] + movhps xm5, [srcq+r7] + movhps xm6, [srcq+r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+rX] + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .vloop +.dy1: + movzx wd, word [base+put_8tap_scaled_avx2_dy1_table+wq*2] + add wq, r12 + jmp wq +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + pshufd xm8, xm10, q0000 + pshufd xm9, xm10, q1111 + pshufd xm11, xm10, q3333 + pshufd xm10, xm10, q2222 + vpblendd m0, m2, 0xc0 + pshufb m1, m14 + pshufb m0, m14 + pmaddubsw m1, m15 + pmaddubsw m0, m15 + phaddw m0, m1 + pmulhrsw m0, m12 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + mova xm3, xm0 + mova xm0, xm2 + paddd xm5, xm13 + paddd xm6, xm7 + pshufb xm1, xm14 + pmaddubsw xm1, xm15 + phaddw xm1, xm1 + pmulhrsw xm1, xm12 + palignr xm7, xm1, xm4, 12 + punpcklwd xm2, xm7, xm1 ; 67 78 + pmaddwd xm7, xm2, xm11 + mova xm4, xm1 + paddd xm5, xm6 + paddd xm5, xm7 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +.dy1_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + vpermq m8, m8, q3120 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r11d, xm15, 1 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + movu xm2, [srcq+ssq*0] + movu xm3, [srcq+ssq*2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pcmpeqd m8, m9 + psrld m14, 10 + pinsrd xm15, [base+subpel_filters+r11*8+2], 1 + vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 + vinserti128 m2, [srcq+ssq*1], 1 + vinserti128 m3, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m4, [srcq+ssq*1], 1 + add srcq, ss3q + vpblendd m15, m7, 0x30 + punpcklqdq m15, m15 + pblendvb m15, m11, m8 + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vinserti128 m10, xm10, 1 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb xm5, xm14 + vpermq m2, m2, q3120 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m2, m3 + phaddw m4, m5 + pmulhrsw m2, m12 + pmulhrsw m4, m12 + palignr m5, m4, m2, 4 + pshufd m3, m4, q2121 + punpcklwd m0, m2, m5 ; 01 12 + punpckhwd m1, m2, m5 ; 23 34 + punpcklwd m2, m4, m3 ; 45 56 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + vinserti128 m11, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + mova m0, m1 + mova m1, m2 + paddd m4, m13 + paddd m5, m6 + pshufb m11, m14 + vpermq m11, m11, q3120 + pmaddubsw m11, m15 + phaddw m11, m11 + pmulhrsw m11, m12 + palignr m6, m11, m3, 12 + punpcklwd m2, m6, m11 ; 67 78 + mova m3, m11 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 + psrad m4, 10 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + packuswb xm4, xm4 + pshuflw xm4, xm4, q3120 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w4_loop + RET +.dy1_w8: + movifnidn dsm, dsq + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + mov [rsp+32], r7d + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + movu [rsp], m10 + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + lea myd, [t1+myq] + mov t1d, 64 << 24 + cmovnz t1q, [base+subpel_filters+myq*8] + vbroadcasti128 m14, [base+wswap] + movq xm11, t1q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r7d, [rsp+32] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +.dy1_w8_loop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, 10 + packssdw m4, m4 + vpermq m4, m4, q3120 + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm + dec hd + jz .ret + movq xm4, [srcq+r4] + movq xm5, [srcq+r6] + movhps xm4, [srcq+r7] + movhps xm5, [srcq+r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_w8_loop +.dy1_w16: + mov dword [rsp+72], 1 << 1 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+72], 1 << 3 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+72], 1 << 7 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+72], 1 << 15 +.dy1_w_start: + movifnidn dsm, dsq + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+76], t0d + mov [rsp+80], srcq + mov [rsp+88], dstq +%if UNIX64 + mov hm, hd +%endif + shl dword r8m, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .dy1_hloop +.dy1_hloop_prep: + shr dword [rsp+72], 1 + jz .ret + add qword [rsp+88], 8 + mov hd, hm + vpbroadcastd m8, r8m + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+32] + vpbroadcastd m15, [rsp+76] + pxor m9, m9 + mov srcq, [rsp+80] + mov dstq, [rsp+88] +.dy1_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+32], m14 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+64], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + movu [rsp], m10 + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vbroadcasti128 m14, [base+wswap] + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r4d, [rsp+64] + mov r7d, [rsp+68] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, 10 + packssdw m4, m4 + vpermq m4, m4, q3120 + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm + dec hd + jz .dy1_hloop_prep + movq xm4, [srcq+r4] + movq xm5, [srcq+r6] + movhps xm4, [srcq+r7] + movhps xm5, [srcq+r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop +.dy2: + movzx wd, word [base+put_8tap_scaled_avx2_dy2_table+wq*2] + add wq, r12 + jmp wq +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*1] + movhps xm0, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + movhps xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vpblendd m0, m2, 0x30 + vpblendd m1, m4, 0xc0 + vpblendd m0, m3, 0xc0 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 01 23 + punpckhwd xm2, xm1 ; 23 45 +.dy2_w2_loop: + movq xm6, [srcq+ssq*0] + vpbroadcastq m7, [srcq+ssq*1] + movhps xm6, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm3, xm8 + pmaddwd xm5, xm2, xm9 + vpblendd m6, m7, 0x30 + vpblendd m6, m1, 0xc0 + pshufb m6, m14 + pmaddubsw m6, m15 + phaddw m6, m6 + pmulhrsw m6, m12 + palignr m0, m6, m0, 8 + pshufd m2, m0, q3221 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 45 67 + punpckhwd xm2, xm1 ; 67 89 + pmaddwd xm6, xm3, xm10 + pmaddwd xm7, xm2, xm11 + paddd xm4, xm5 + paddd xm4, xm13 + paddd xm6, xm7 + paddd xm4, xm6 + psrad xm4, 10 + packssdw xm4, xm4 + packuswb xm4, xm4 + pextrw [dstq+dsq*0], xm4, 0 + pextrw [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +.dy2_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m8, m9 + psrld m14, 10 + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*2] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm1, [srcq+ssq*1] + movu xm3, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vinserti128 m15, xm15, 1 + pshufb m14, m5 + paddb m14, m6 + vinserti128 m2, [srcq+ssq*0], 1 + vinserti128 m3, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pblendvb m15, m11, m8 + pshufb xm0, xm14 + pshufb m2, m14 + pshufb xm1, xm14 + pshufb m3, m14 + pmaddubsw xm0, xm15 + pmaddubsw m2, m15 + pmaddubsw xm1, xm15 + pmaddubsw m3, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + punpcklwd xm2, xm0, xm1 + punpckhwd m1, m0, m1 ; 23 45 + vinserti128 m0, m2, xm1, 1 ; 01 23 +.dy2_w4_loop: + movu xm6, [srcq+ssq*0] + movu xm7, [srcq+ssq*1] + vinserti128 m6, [srcq+ssq*2], 1 + vinserti128 m7, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psrld m2, m6, 16 + pslld m3, m7, 16 + paddw m6, m2 + paddw m7, m3 + pblendw m6, m7, 0xaa ; 67 89 + pmulhrsw m6, m12 + paddd m4, m5 + vpblendd m0, m1, m6, 0x0f + mova m1, m6 + vpermq m0, m0, q1032 ; 45 67 + pmaddwd m6, m0, m10 + pmaddwd m7, m1, m11 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, 10 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + packuswb xm4, xm4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w4_loop + RET +.dy2_w8: + movifnidn dsm, dsq + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + mov [rsp], r7d + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + lea myd, [t1+myq] + mov t1d, 64 << 24 + cmovnz t1q, [base+subpel_filters+myq*8] + movq xm11, t1q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r7d, [rsp] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m14, m11, q2222 + pshufd m11, m11, q3333 +.dy2_w8_loop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m14 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, 10 + packssdw m4, m4 + vpermq m4, m4, q3120 + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm + dec hd + jz .ret + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+r4] + movq xm4, [srcq+r6] + movhps xm3, [srcq+r7] + movhps xm4, [srcq+r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+r4] + movq xm5, [srcq+r6] + movhps xm4, [srcq+r7] + movhps xm5, [srcq+r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_w8_loop +.dy2_w16: + mov dword [rsp+40], 1 << 1 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+40], 1 << 3 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+40], 1 << 7 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+40], 1 << 15 +.dy2_w_start: + movifnidn dsm, dsq + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+64], t0d + mov [rsp+48], srcq + mov [rsp+56], dstq +%if UNIX64 + mov r6m, hd +%endif + shl dword r8m, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .dy2_hloop +.dy2_hloop_prep: + shr dword [rsp+40], 1 + jz .ret + add qword [rsp+56], 8 + mov hd, hm + vpbroadcastd m8, r8m + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp] + vpbroadcastd m15, [rsp+64] + pxor m9, m9 + mov srcq, [rsp+48] + mov dstq, [rsp+56] +.dy2_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp], m14 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+32], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movq xm14, r4q + punpcklbw xm14, xm14 + psraw xm14, 8 + vinserti128 m14, xm14, 1 + mov r4d, [rsp+32] + mov r7d, [rsp+36] + pshufd m8, m14, q0000 + pshufd m9, m14, q1111 + pshufd m11, m14, q2222 + pshufd m14, m14, q3333 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m14 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, 10 + packssdw m4, m4 + vpermq m4, m4, q3120 + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+r4] + movq xm4, [srcq+r6] + movhps xm3, [srcq+r7] + movhps xm4, [srcq+r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+r4] + movq xm5, [srcq+r6] + movhps xm4, [srcq+r7] + movhps xm5, [srcq+r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_vloop +.ret: + RET + %macro WARP_V 5 ; dst, 02, 46, 13, 57 ; Can be done using gathers, but that's terribly slow on many CPU:s lea tmp1d, [myq+deltaq*4] @@ -5010,7 +6704,7 @@ cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ vpbroadcastd m3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti128 m15, [base+pb_8x0_8x8] - pmaddwd m2, m5, [base+resize_mul] ; dx*[0,1,2,3,4,5,6,7] + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c index a21877c..720b168 100644 --- a/src/x86/mc_init_tmpl.c +++ b/src/x86/mc_init_tmpl.c @@ -49,6 +49,16 @@ decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3); decl_mc_fn(dav1d_put_bilin_avx2); decl_mc_fn(dav1d_put_bilin_ssse3); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2); + decl_mct_fn(dav1d_prep_8tap_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_avx2); decl_mct_fn(dav1d_prep_8tap_regular_ssse3); @@ -123,6 +133,9 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { c->mc[type] = dav1d_put_##name##_##suffix #define init_mct_fn(type, name, suffix) \ c->mct[type] = dav1d_prep_##name##_##suffix +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = dav1d_put_##name##_##suffix + const unsigned flags = dav1d_get_cpu_flags(); if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) @@ -137,16 +150,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; #if BITDEPTH == 8 - init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); @@ -187,16 +200,26 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; #if BITDEPTH == 8 - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); - init_mc_fn (FILTER_2D_BILINEAR, bilin, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); diff --git a/tests/checkasm/mc.c b/tests/checkasm/mc.c index ed0866e..8b3595b 100644 --- a/tests/checkasm/mc.c +++ b/tests/checkasm/mc.c @@ -38,6 +38,7 @@ static const char *const filter_names[] = { }; static const char *const mxy_names[] = { "0", "h", "v", "hv" }; +static const char *const scaled_paths[] = { "", "_dy1", "_dy2" }; static int mc_h_next(const int h) { switch (h) { @@ -106,6 +107,60 @@ static void check_mc(Dav1dMCDSPContext *const c) { report("mc"); } +static void check_mc_scaled(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 263 * 263,); + ALIGN_STK_64(pixel, c_dst, 128 * 128,); + ALIGN_STK_64(pixel, a_dst, 128 * 128,); + const pixel *src = src_buf + 263 * 3 + 3; + const ptrdiff_t src_stride = 263 * sizeof(pixel); +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, + ptrdiff_t src_stride, int w, int h, + int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 2; w <= 128; w <<= 1) { + const ptrdiff_t dst_stride = w * sizeof(pixel); + for (int p = 0; p < 3; ++p) { + if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc", + filter_names[filter], w, scaled_paths[p], BITDEPTH)) + { + const int h_min = w <= 32 ? 2 : w / 4; + const int h_max = imax(imin(w * 4, 128), 32); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { + const int mx = rnd() % 1024; + const int my = rnd() % 1024; + const int dx = rnd() % 2048 + 1; + const int dy = !p + ? rnd() % 2048 + 1 + : p << 10; // ystep=1.0 and ystep=2.0 paths + + for (int k = 0; k < 263 * 263; k++) + src_buf[k] = rnd() & bitdepth_max; + + call_ref(c_dst, dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + call_new(a_dst, dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel(c_dst, dst_stride, + a_dst, dst_stride, w, h, "dst"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + bench_new(a_dst, dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + } + } + } + } + report("mc_scaled"); +} + /* Generate worst case input in the topleft corner, randomize the rest */ static void generate_mct_input(pixel *const buf, const int bitdepth_max) { static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 }; @@ -632,6 +687,7 @@ void bitfn(checkasm_check_mc)(void) { bitfn(dav1d_mc_dsp_init)(&c); check_mc(&c); + check_mc_scaled(&c); check_mct(&c); check_avg(&c); check_w_avg(&c); |