; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; dav1d_obmc_masks[] * -512
const obmc_masks_avx2
                dw      0,      0,  -9728,      0, -12800,  -7168,  -2560,      0
                dw -14336, -11264,  -8192,  -5632,  -3584,  -1536,      0,      0
                dw -15360, -13824, -12288, -10752,  -9216,  -7680,  -6144,  -5120
                dw  -4096,  -3072,  -2048,  -1536,      0,      0,      0,      0
                dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
                dw  -9728,  -8704,  -8192,  -7168,  -6656,  -6144,  -5632,  -4608
                dw  -4096,  -3584,  -3072,  -2560,  -2048,  -2048,  -1536,  -1024
                dw      0,      0,      0,      0,      0,      0,      0,      0

deint_shuf:     dd 0, 4, 1, 5, 2, 6, 3, 7
subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
rescale_mul:    dd 0, 1, 2, 3, 4, 5, 6, 7
rescale_mul2:   dd 0, 1, 4, 5, 2, 3, 6, 7
resize_shuf:    db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
                db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
blend_shuf:     db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
wswap:          db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
bdct_lb_q:      times 8 db 0
                times 8 db 4
                times 8 db 8
                times 8 db 12

prep_mul:          dw 16, 16, 4, 4
put_bilin_h_rnd:   dw 8, 8, 10, 10
put_8tap_h_rnd:    dd 34, 40
s_8tap_h_rnd:      dd 2, 8
s_8tap_h_sh:       dd 2, 4
put_s_8tap_v_rnd:  dd 512, 128
put_s_8tap_v_sh:   dd 10, 8
prep_8tap_1d_rnd:  dd 8 - (8192 << 4)
prep_8tap_2d_rnd:  dd 32 - (8192 << 5)
warp8x8t_rnd:      dd 16384 - (8192 << 15)
warp8x8_shift:     dd 5, 3
warp8x8_rnd:       dw 4096, 4096, 16384, 16384
bidir_rnd:         dw -16400, -16400, -16388, -16388
bidir_mul:         dw 2048, 2048, 8192, 8192

%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

pw_2:          times 2 dw 2
pw_64:         times 2 dw 64
pw_2048:       times 2 dw 2048
pw_8192:       times 2 dw 8192
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
pw_m512:       times 2 dw -512
pd_32:         dd 32
pd_63:         dd 63
pd_64:         dd 64
pd_32768:      dd 32768
pd_65538:      dd 65538
pd_m524256:    dd -524256 ; -8192 << 6 + 32
pd_0x3ff:      dd 0x3ff
pq_0x40000000: dq 0x40000000
               dd 0

%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx2, 2, 4, 8, 16, 32, 64, 128

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
    %%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
    %%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,    4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

INIT_XMM avx2
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    mov mxyd, r6m ; mx
    lea r7, [put_avx2]
%if UNIX64
    DECLARE_REG_TMP 8
    %define org_w r8d
    mov r8d, wd
%else
    DECLARE_REG_TMP 7
    %define org_w wm
%endif
    tzcnt wd, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r7m ; my
    test mxyd, mxyd
    jnz .v
.put:
    movzx wd, word [r7+wq*2+table_offset(put,)]
    add wq, r7
    jmp wq
.put_w2:
    mov r6d, [srcq+ssq*0]
    mov r7d, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6d
    mov [dstq+dsq*1], r7d
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w2
    RET
.put_w4:
    mov r6, [srcq+ssq*0]
    mov r7, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6
    mov [dstq+dsq*1], r7
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w4
    RET
.put_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w8
    RET
INIT_YMM avx2
.put_w16:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova
[dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*1+32*0], m2 mova [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] add srcq, ssq mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 movu m0, [srcq+32*4] movu m1, [srcq+32*5] movu m2, [srcq+32*6] movu m3, [srcq+32*7] add srcq, ssq mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 add dstq, dsq dec hd jg .put_w128 RET .h: movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add wq, r7 shr r6d, 11 vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] jmp wq .h_w2: movq xm1, [srcq+ssq*0] movhps xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*0+2] movhps xm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xm0, xm4 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*0+2] vinserti128 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+32*0] pmullw m1, m5, [srcq+32*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+32*1] pmullw m2, m5, [srcq+32*1+2] add srcq, ssq paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: .h_w128: movifnidn t0d, org_w .h_w64_loop0: mov r6d, t0d .h_w64_loop: pmullw m0, m4, [srcq+r6*2-32*1] pmullw m1, m5, [srcq+r6*2-32*1+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r6*2-32*2] pmullw m2, m5, [srcq+r6*2-32*2+2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2-32*1], m0 mova [dstq+r6*2-32*2], m1 sub r6d, 32 jg .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64_loop0 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] shl mxyd, 11 movd xm5, mxyd add wq, r7 vpbroadcastw m5, xm5 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xm2, xm0, xm1 movd xm0, [srcq+ssq*0] punpckldq xm1, xm0 psubw xm1, xm2 pmulhrsw 
xm1, xm5 paddw xm1, xm2 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm0, [srcq+ssq*0] .v_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xm2, xm0, xm1 movq xm0, [srcq+ssq*0] punpcklqdq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m0, m1, 0xf0 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m1, m0, 0xf0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] .v_w32_loop: movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m5 paddw m4, m0 movu m0, [srcq+ssq*0+32*0] mova [dstq+dsq*0+32*0], m4 psubw m4, m3, m1 pmulhrsw m4, m5 paddw m4, m1 movu m1, [srcq+ssq*0+32*1] mova [dstq+dsq*0+32*1], m4 psubw m4, m0, m2 pmulhrsw m4, m5 paddw m4, m2 mova [dstq+dsq*1+32*0], m4 psubw m4, m1, m3 pmulhrsw m4, m5 paddw m4, m3 mova [dstq+dsq*1+32*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w16: .v_w64: .v_w128: movifnidn t0d, org_w add t0d, t0d mov r4, srcq lea r6d, [hq+t0*8-256] mov r7, dstq .v_w16_loop0: movu m0, [srcq+ssq*0] .v_w16_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 vpbroadcastd m3, [pw_2] movd xm6, mxyd vpbroadcastd m7, [pw_8192] add wq, r7 vpbroadcastw m6, xm6 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m7, [pw_2048] .hv_12bpc: jmp wq .hv_w2: vpbroadcastq xm1, [srcq+ssq*0] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w2_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm2, [srcq+ssq*0] pmullw xm1, xm4, xm2 psrlq xm2, 16 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 _ 2 _ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xm0, xm4, [srcq+ssq*0-8] pmullw xm1, xm5, [srcq+ssq*0-6] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w4_loop: movq xm1, [srcq+ssq*1] movq xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] movhps xm2, [srcq+ssq*0+2] pmullw xm1, xm4 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 2 shufpd xm2, xm0, xm1, 0x01 ; 0 1 mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+ssq*0] pmullw xm1, xm5, [srcq+ssq*0+2] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 vinserti128 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 
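    ; the two fresh rows are horizontally filtered above; the shift below
    ; finishes that pass before the vertical blend against the previous row (m0)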
psrlw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: .hv_w128: %if UNIX64 lea r6d, [r8*2-32] %else mov r6d, wm lea r6d, [r6*2-32] %endif mov r4, srcq lea r6d, [hq+r6*8] mov r7, dstq .hv_w16_loop0: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 mova [dstq+dsq*0], m2 pmullw m0, m4, [srcq+ssq*0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 RET cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx2] %if UNIX64 DECLARE_REG_TMP 7 %define org_w r7d %else DECLARE_REG_TMP 6 %define org_w r5m %endif mov org_w, wd tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx2+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] vpbroadcastq m1, [srcq+strideq*2] vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*2] vinserti128 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0+32*0] pmullw m1, m4, [srcq+strideq*0+32*1] pmullw m2, m4, [srcq+strideq*1+32*0] pmullw m3, m4, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] add srcq, strideq psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 pmullw m0, m4, [srcq+32*4] pmullw m1, m4, [srcq+32*5] pmullw m2, m4, 
[srcq+32*6] pmullw m3, m4, [srcq+32*7] add tmpq, 32*8 add srcq, strideq psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .prep_w128 RET .h: movd xm5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti128 m2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq m0, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m0, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*0+2] vinserti128 m1, [srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+strideq*1] pmullw m2, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 2 jg .h_w16 RET .h_w32: .h_w64: .h_w128: movifnidn t0d, org_w .h_w32_loop0: mov r3d, t0d .h_w32_loop: pmullw m0, m4, [srcq+r3*2-32*1] pmullw m1, m5, [srcq+r3*2-32*1+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r3*2-32*2] pmullw m2, m5, [srcq+r3*2-32*2+2] psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+r3*2-32*1], m0 mova [tmpq+r3*2-32*2], m1 sub r3d, 32 jg .h_w32_loop add srcq, strideq lea tmpq, [tmpq+t0*2] dec hd jg .h_w32_loop0 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] movd xm5, mxyd vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m4, m5 test dword r7m, 0x800 jnz .v_12bpc psllw m4, 2 psllw m5, 2 .v_12bpc: jmp wq .v_w4: movq xm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq xm1, [srcq+strideq*1] vpblendd m2, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0xf0 ; 1 1 3 3 vpbroadcastq m0, [srcq+strideq*0] vpblendd m1, m2, 0x33 ; 0 1 2 3 vpblendd m0, m2, 0x0c ; 4 2 4 4 punpckhqdq m2, m1, m0 ; 1 2 3 4 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vbroadcasti128 m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m1, m0, m2, 0xf0 ; 0 1 vbroadcasti128 m0, [srcq+strideq*0] vpblendd m2, m0, 0xf0 ; 1 2 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 32 sub hd, 2 jg .v_w8_loop RET .v_w16: movu m0, [srcq+strideq*0] .v_w16_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m4 mova [tmpq+32*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 PUSH r7 %endif movifnidn r7d, org_w add r7d, r7d mov r3, srcq lea r6d, [hq+r7*8-256] mov r5, tmpq .v_w32_loop0: movu m0, 
[srcq+strideq*0] .v_w32_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m4 mova [tmpq+r7*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [tmpq+r7*1], m1 lea tmpq, [tmpq+r7*2] sub hd, 2 jg .v_w32_loop add r3, 32 add r5, 32 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .v_w32_loop0 %if WIN64 POP r7 %endif RET .hv: WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd add wq, r6 lea stride3q, [strideq*3] vpbroadcastw m6, xm6 jmp wq .hv_w4: movu xm1, [srcq+strideq*0] %if WIN64 movaps [rsp+24], xmm7 %endif pmullw xm0, xm4, xm1 psrldq xm1, 2 pmullw xm1, xm5 psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vpbroadcastq m0, xm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 m1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 m2, [srcq+strideq*0], 1 punpcklqdq m7, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m7, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m7, m3 paddw m1, m7 psraw m1, 2 ; 1 2 3 4 vpblendd m0, m1, 0x3f vpermq m2, m0, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop %if WIN64 movaps xmm7, [rsp+24] %endif RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0] pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+strideq*1] movu xm2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti128 m1, [srcq+strideq*0], 1 vinserti128 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: .hv_w128: %if WIN64 PUSH r7 %endif movifnidn r7d, org_w add r7d, r7d mov r3, srcq lea r6d, [hq+r7*8-256] mov r5, tmpq .hv_w16_loop0: pmullw m0, m4, [srcq] pmullw m1, m5, [srcq+2] psubw m0, m3 paddw m0, m1 psraw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+strideq*1] pmullw m2, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m1, m3 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+r7*0], m2 pmullw m0, m4, [srcq+strideq*0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m3 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+r7*1], m2 lea tmpq, [tmpq+r7*2] sub hd, 2 jg .hv_w16_loop add r3, 32 add r5, 32 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .hv_w16_loop0 %if WIN64 POP r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; prefix, type, type_h, type_v cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal 
put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx2 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx2] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w2: movzx mxd, mxb sub srcq, 2 mova xm2, [subpel_h_shuf2] vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] pmovsxbw xm3, xm3 .h_w2_loop: movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm2 pshufb xm1, xm2 pmaddwd xm0, xm3 pmaddwd xm1, xm3 phaddd xm0, xm1 paddd xm0, xm4 psrad xm0, 6 packusdw xm0, xm0 pminsw xm0, xm5 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm3, [base+subpel_filters+mxq*8] WIN64_SPILL_XMM 8 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] pshufd xm3, xm3, q2211 vpbroadcastq m2, xm3 vpermq m3, m3, q1111 .h_w4_loop: movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 pshufb m1, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m2 pmaddwd m1, m3 paddd m0, m4 paddd m0, m1 psrad m0, 6 vextracti128 xm1, m0, 1 packusdw xm0, xm1 pminsw xm0, xm5 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv mov r7d, r8m vpbroadcastw m5, r8m shr r7d, 11 vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 je .h_w4 jl .h_w2 %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 13 shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 8 jg .h_w16 .h_w8: %macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 pmaddwd m%5, m9, m%4 ; abcd1 pmaddwd m%1, m8 ; abcd0 pshufb m%2, m7 ; 6 7 7 8 8 9 9 a shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m%5, m4 paddd m%1, m%5 pmaddwd m%5, m11, m%2 ; abcd3 paddd m%1, m%5 pmaddwd m%5, m10, m%4 ; abcd2 pshufb m%3, m7 ; a b b c c d d e pmaddwd m%4, m8 ; efgh0 paddd m%1, m%5 pmaddwd m%5, m9, m%2 ; efgh1 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m11 ; efgh3 pmaddwd m%2, m10 ; efgh2 paddd m%4, m4 paddd m%4, m%5 paddd m%3, m%4 paddd m%2, m%3 psrad m%1, 6 psrad m%2, 6 packusdw m%1, m%2 pminsw m%1, m5 %endmacro movu xm0, [srcq+ssq*0+ 0] vinserti128 m0, [srcq+ssq*1+ 0], 1 movu xm2, [srcq+ssq*0+16] vinserti128 m2, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] shufpd m1, m0, m2, 0x05 PUT_8TAP_H 0, 1, 2, 3, 12 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6*2-32] movu m1, [srcq+r6*2-24] movu m2, [srcq+r6*2-16] PUT_8TAP_H 0, 1, 2, 3, 12 mova [dstq+r6*2-32], m0 sub r6d, 16 jg .h_w16_loop add srcq, ssq add dstq, dsq dec hd jg .h_w16 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m lea r6, [ssq*3] sub srcq, r6 punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, 
m0, q3333 cmp wd, 4 jg .v_w8 je .v_w4 .v_w2: movd xm2, [srcq+ssq*0] pinsrd xm2, [srcq+ssq*1], 1 pinsrd xm2, [srcq+ssq*2], 2 pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 lea srcq, [srcq+ssq*4] movd xm3, [srcq+ssq*0] vpbroadcastd xm1, [srcq+ssq*1] vpbroadcastd xm0, [srcq+ssq*2] add srcq, r6 vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklwd xm3, xm1 ; 45 56 punpcklwd xm1, xm2, xm4 ; 01 12 punpckhwd xm2, xm4 ; 23 34 .v_w2_loop: vpbroadcastd xm4, [srcq+ssq*0] pmaddwd xm5, xm8, xm1 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm9 ; a1 b1 paddd xm5, xm6 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm10 ; a2 b2 paddd xm5, xm3 vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklwd xm3, xm4 ; 67 78 pmaddwd xm4, xm11, xm3 ; a3 b3 paddd xm5, xm4 psrad xm5, 6 packusdw xm5, xm5 pminsw xm5, xm7 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm1, [srcq+ssq*0] vpbroadcastq m0, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+ssq*4] vpbroadcastq m3, [srcq+ssq*0] vpbroadcastq m5, [srcq+ssq*1] vpblendd m1, m0, 0x30 vpblendd m0, m2, 0x30 punpcklwd m1, m0 ; 01 12 vpbroadcastq m0, [srcq+ssq*2] add srcq, r6 vpblendd m2, m4, 0x30 vpblendd m4, m3, 0x30 punpcklwd m2, m4 ; 23 34 vpblendd m3, m5, 0x30 vpblendd m5, m0, 0x30 punpcklwd m3, m5 ; 45 56 .v_w4_loop: vpbroadcastq m4, [srcq+ssq*0] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m6 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m4, m0, 0x30 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 6 vextracti128 xm4, m5, 1 packusdw xm5, xm4 pminsw xm5, xm7 movq [dstq+dsq*0], xm5 movhps [dstq+dsq*1], xm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: shl wd, 5 mov r7, srcq mov r8, dstq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+ssq*2] lea srcq, [srcq+ssq*4] vbroadcasti128 m1, [srcq+ssq*0] vbroadcasti128 m2, [srcq+ssq*1] vbroadcasti128 m3, [srcq+ssq*2] add srcq, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: vbroadcasti128 m14, [srcq+ssq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 vbroadcasti128 m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c punpcklwd m5, m6, m0 ; 67 punpckhwd m6, m0 ; 78 pmaddwd m14, m11, m5 ; a3 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 5 psrad m13, 5 packusdw m12, m13 pxor m13, m13 pavgw m12, m13 pminsw m12, m7 vpermq m12, m12, q3120 mova [dstq+dsq*0], xm12 vextracti128 [dstq+dsq*1], m12, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop add r7, 16 add r8, 16 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .v_w8_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastw m15, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 
16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] vpbroadcastd m6, [pd_512] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 pxor m7, m7 punpcklbw m7, m0 punpcklbw m1, m1 psraw m1, 8 ; sign-extend test dword r8m, 0x800 jz .hv_10bit psraw m7, 2 psllw m1, 2 .hv_10bit: pshufd m11, m1, q0000 pshufd m12, m1, q1111 pshufd m13, m1, q2222 pshufd m14, m1, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 m9, [subpel_h_shuf2] vbroadcasti128 m1, [srcq+r6 ] ; 3 3 movu xm3, [srcq+ssq*2] movu xm0, [srcq+ssq*0] movu xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*4] vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 add srcq, r6 pshufb m1, m9 pshufb m3, m9 pshufb m0, m9 pshufb m2, m9 pmaddwd m1, m7 pmaddwd m3, m7 pmaddwd m0, m7 pmaddwd m2, m7 phaddd m1, m3 phaddd m0, m2 paddd m1, m6 paddd m0, m6 psrad m1, 10 psrad m0, 10 packssdw m1, m0 ; 3 2 0 1 vextracti128 xm0, m1, 1 ; 3 4 5 6 pshufd xm2, xm1, q1301 ; 2 3 1 2 pshufd xm3, xm0, q2121 ; 4 5 4 5 punpckhwd xm1, xm2 ; 01 12 punpcklwd xm2, xm0 ; 23 34 punpckhwd xm3, xm0 ; 45 56 .hv_w2_loop: movu xm4, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm4, xm9 pshufb xm5, xm9 pmaddwd xm4, xm7 pmaddwd xm5, xm7 phaddd xm4, xm5 pmaddwd xm5, xm11, xm1 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm12 ; a1 b1 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm13 ; a2 b2 paddd xm5, xm3 paddd xm4, xm6 psrad xm4, 10 packssdw xm4, xm4 palignr xm3, xm4, xm0, 12 mova xm0, xm4 punpcklwd xm3, xm0 ; 67 78 pmaddwd xm4, xm14, xm3 ; a3 b3 paddd xm5, xm6 paddd xm5, xm4 psrad xm5, 10 packusdw xm5, xm5 pminsw xm5, xm15 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti128 m9, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] pshufd m8, m7, q1111 pshufd m7, m7, q0000 movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 vbroadcasti128 m0, [srcq+r6 ] vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 movu xm3, [srcq+ssq*1] vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 add srcq, r6 pshufb m4, m1, m9 pshufb m1, m10 pmaddwd m4, m7 pmaddwd m1, m8 pshufb m5, m2, m9 pshufb m2, m10 pmaddwd m5, m7 pmaddwd m2, m8 paddd m4, m6 paddd m1, m4 pshufb m4, m0, m9 pshufb m0, m10 pmaddwd m4, m7 pmaddwd m0, m8 paddd m5, m6 paddd m2, m5 pshufb m5, m3, m9 pshufb m3, m10 pmaddwd m5, m7 pmaddwd m3, m8 paddd m4, m6 paddd m4, m0 paddd m5, m6 paddd m5, m3 vperm2i128 m0, m1, m2, 0x21 psrld m1, 10 psrld m2, 10 vperm2i128 m3, m4, m5, 0x21 pslld m4, 6 pslld m5, 6 pblendw m2, m4, 0xaa ; 23 34 pslld m0, 6 pblendw m1, m0, 0xaa ; 01 12 psrld m3, 10 pblendw m3, m5, 0xaa ; 45 56 psrad m0, m5, 16 .hv_w4_loop: movu xm4, [srcq+ssq*0] vinserti128 m4, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m6 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 pshufb m3, m4, m9 pshufb m4, m10 pmaddwd m3, m7 pmaddwd m4, m8 paddd m3, m6 paddd m4, m3 psrad m4, 10 packssdw m0, m4 ; _ 7 6 8 vpermq m3, m0, q1122 ; _ 6 _ 7 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m4, m5 psrad m4, 10 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, xm15 movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] shl wd, 5 lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 pxor 
m0, m0 punpcklbw m0, m2 mov r7, srcq mov r8, dstq lea wd, [hq+wq-256] test dword r8m, 0x800 jz .hv_w8_10bit psraw m0, 2 psllw xm1, 2 .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 pshufd m13, m0, q2222 pshufd m14, m0, q3333 %if WIN64 %define v_mul (rsp+stack_offset+40) ; r4m %else %define v_mul (rsp-24) ; red zone %endif mova [v_mul], xm1 .hv_w8_loop0: %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 pmaddwd m3, m12, m2 pmaddwd m%1, m11 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m3, m10 paddd m%1, m3 pmaddwd m3, m14, m%2 paddd m%1, m3 pmaddwd m3, m13, m2 pshufb m%3, m9 ; a b b c c d d e pmaddwd m2, m11 paddd m%1, m3 pmaddwd m3, m12, m%2 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m14 pmaddwd m%2, m13 paddd m2, m10 paddd m2, m3 paddd m%3, m2 paddd m%2, m%3 psrad m%1, 10 psrad m%2, 10 packssdw m%1, m%2 %endmacro movu xm4, [srcq+r6 *1+ 0] vbroadcasti128 m8, [subpel_h_shufA] movu xm6, [srcq+r6 *1+ 8] vbroadcasti128 m9, [subpel_h_shufB] movu xm0, [srcq+r6 *1+16] vpbroadcastd m10, [pd_512] movu xm5, [srcq+ssq*0+ 0] vinserti128 m5, [srcq+ssq*4+ 0], 1 movu xm1, [srcq+ssq*0+16] vinserti128 m1, [srcq+ssq*4+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PUT_8TAP_HV_H 4, 6, 0 ; 3 INIT_YMM avx2 PUT_8TAP_HV_H 5, 7, 1 ; 0 4 movu xm0, [srcq+ssq*2+ 0] vinserti128 m0, [srcq+r6 *2+ 0], 1 movu xm1, [srcq+ssq*2+16] vinserti128 m1, [srcq+r6 *2+16], 1 shufpd m7, m0, m1, 0x05 PUT_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+ssq*1+ 0] movu xm1, [srcq+ssq*1+16] lea srcq, [srcq+ssq*4] vinserti128 m6, [srcq+ssq*1+ 0], 1 vinserti128 m1, [srcq+ssq*1+16], 1 add srcq, r6 shufpd m7, m6, m1, 0x05 PUT_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 vpermq m5, m5, q3120 vpermq m6, m6, q3120 vpermq m7, m0, q3120 punpcklwd m3, m7, m4 ; 23 punpckhwd m4, m5 ; 34 punpcklwd m1, m5, m6 ; 01 punpckhwd m5, m6 ; 45 punpcklwd m2, m6, m7 ; 12 punpckhwd m6, m7 ; 56 .hv_w8_loop: vpbroadcastd m9, [v_mul+4*0] vpbroadcastd m7, [v_mul+4*1] vpbroadcastd m10, [v_mul+4*2] pmaddwd m8, m9, m1 ; a0 pmaddwd m9, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m7 ; a1 pmaddwd m4, m7 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 movu xm5, [srcq+ssq*0] vinserti128 m5, [srcq+ssq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] movu xm6, [srcq+ssq*0+16] vinserti128 m6, [srcq+ssq*1+16], 1 vextracti128 [dstq], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 pmaddwd m5, m12 paddd m0, m5 pshufb m5, m6, m7 ; 89 pshufb m6, m10 ; ab pmaddwd m5, m13 pmaddwd m6, m14 paddd m6, m5 movu xm5, [srcq+ssq*0+8] vinserti128 m5, [srcq+ssq*1+8], 1 lea srcq, [srcq+ssq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 pmaddwd m7, m11 paddd m0, m10 vpbroadcastd m10, [pd_512] paddd m6, m7 pmaddwd m7, m14, m5 pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 vbroadcasti128 m6, [dstq] paddd m8, m10 paddd m9, m10 paddd m0, m10 paddd m5, m10 vpbroadcastd m10, [v_mul+4*3] psrad m0, 10 psrad m5, 10 packssdw m0, m5 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m10, m5 ; a3 pmaddwd m10, m6 ; b3 paddd m7, m8 paddd m9, m10 psrad m7, 10 psrad m9, 10 packusdw m7, m9 pminsw m7, m15 vpermq m7, m7, q3120 mova [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop add r7, 16 add r8, 16 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .hv_w8_loop0 
RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx2 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx2] movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd mov r6d, r7m ; bitdepth_max movzx wd, word [r7+wq*2+table_offset(prep,)] vpbroadcastd m5, [r7-prep_avx2+pw_8192] shr r6d, 11 add wq, r7 vpbroadcastd m4, [base+prep_mul+r6*4] lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm0, [base+subpel_filters+mxq*8] vbroadcasti128 m3, [subpel_h_shufA] vbroadcasti128 m4, [subpel_h_shufB] WIN64_SPILL_XMM 8 pshufd xm0, xm0, q2211 test dword r7m, 0x800 jnz .h_w4_12bpc psllw xm0, 2 .h_w4_12bpc: vpbroadcastq m6, xm0 vpermq m7, m0, q1111 .h_w4_loop: movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti128 m2, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m6 pmaddwd m1, m7 paddd m0, m5 paddd m0, m1 pshufb m1, m2, m3 pshufb m2, m4 pmaddwd m1, m6 pmaddwd m2, m7 paddd m1, m5 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .h_12bpc psllw m0, 2 .h_12bpc: pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 8 jg .h_w16 .h_w8: %macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 pmaddwd m%5, m9, m%4 ; abcd1 pmaddwd m%1, m8 ; abcd0 pshufb m%2, m7 ; 6 7 7 8 8 9 9 a shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m%5, m5 paddd m%1, m%5 pmaddwd m%5, m11, m%2 ; abcd3 paddd m%1, m%5 pmaddwd m%5, m10, m%4 ; abcd2 pshufb m%3, m7 ; a b b c c d d e pmaddwd m%4, m8 ; efgh0 paddd m%1, m%5 pmaddwd m%5, m9, m%2 ; efgh1 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m11 ; efgh3 pmaddwd m%2, m10 ; efgh2 paddd m%4, m5 paddd m%4, m%5 paddd m%3, m%4 paddd m%2, m%3 psrad m%1, 4 psrad m%2, 4 packssdw m%1, m%2 %endmacro movu xm0, [srcq+strideq*0+ 0] vinserti128 m0, [srcq+strideq*1+ 0], 1 movu xm2, [srcq+strideq*0+16] vinserti128 m2, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] shufpd m1, m0, m2, 0x05 PREP_8TAP_H 0, 1, 2, 3, 4 mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: add wd, wd .h_w16_loop0: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6-32] movu m1, [srcq+r6-24] movu m2, [srcq+r6-16] PREP_8TAP_H 0, 1, 2, 3, 4 mova [tmpq+r6-32], m0 sub r6d, 32 jg .h_w16_loop add srcq, strideq add tmpq, wq dec hd jg .h_w16_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd 
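    ; heights <= 4 select the packed 4-tap vertical filter index (see the
    ; "8tap_v, my, 4tap_v" packing in the prologue) instead of the 8-tap one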
vpbroadcastq m0, [base+subpel_filters+myq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] sub srcq, r6 punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m0, 2 .v_12bpc: pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 4 jg .v_w8 .v_w4: movq xm1, [srcq+strideq*0] vpbroadcastq m0, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] vpblendd m1, m0, 0x30 vpblendd m0, m2, 0x30 punpcklwd m1, m0 ; 01 12 vpbroadcastq m0, [srcq+strideq*2] add srcq, r6 vpblendd m2, m4, 0x30 vpblendd m4, m3, 0x30 punpcklwd m2, m4 ; 23 34 vpblendd m3, m5, 0x30 vpblendd m5, m0, 0x30 punpcklwd m3, m5 ; 45 56 .v_w4_loop: vpbroadcastq m4, [srcq+strideq*0] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m7 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m4, m0, 0x30 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 4 vextracti128 xm4, m5, 1 packssdw xm5, xm4 mova [tmpq], xm5 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: %if WIN64 push r8 %endif mov r8d, wd shl wd, 5 mov r5, srcq mov r7, tmpq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vbroadcasti128 m1, [srcq+strideq*0] vbroadcasti128 m2, [srcq+strideq*1] vbroadcasti128 m3, [srcq+strideq*2] add srcq, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: vbroadcasti128 m14, [srcq+strideq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m7 paddd m13, m7 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 vbroadcasti128 m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c punpcklwd m5, m6, m0 ; 67 punpckhwd m6, m0 ; 78 pmaddwd m14, m11, m5 ; a3 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 4 psrad m13, 4 packssdw m12, m13 vpermq m12, m12, q3120 mova [tmpq+r8*0], xm12 vextracti128 [tmpq+r8*2], m12, 1 lea tmpq, [tmpq+r8*4] sub hd, 2 jg .v_w8_loop add r5, 16 add r7, 16 movzx hd, wb mov srcq, r5 mov tmpq, r7 sub wd, 1<<8 jg .v_w8_loop0 %if WIN64 pop r8 %endif RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 sub srcq, r6 pxor m7, m7 punpcklbw m7, m0 punpcklbw m1, m1 psraw m7, 4 psraw m1, 8 test dword r7m, 0x800 jz .hv_w4_10bit psraw m7, 2 .hv_w4_10bit: pshufd m11, m1, q0000 pshufd m12, m1, q1111 pshufd m13, m1, q2222 pshufd m14, m1, q3333 .hv_w4: vbroadcasti128 m9, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] pshufd m8, m7, q1111 pshufd m7, m7, q0000 movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*1], 1 
; 0 1 vbroadcasti128 m0, [srcq+r6 ] vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 lea srcq, [srcq+strideq*4] vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 movu xm3, [srcq+strideq*1] vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 add srcq, r6 pshufb m4, m1, m9 pshufb m1, m10 pmaddwd m4, m7 pmaddwd m1, m8 pshufb m5, m2, m9 pshufb m2, m10 pmaddwd m5, m7 pmaddwd m2, m8 paddd m4, m15 paddd m1, m4 pshufb m4, m0, m9 pshufb m0, m10 pmaddwd m4, m7 pmaddwd m0, m8 paddd m5, m15 paddd m2, m5 pshufb m5, m3, m9 pshufb m3, m10 pmaddwd m5, m7 pmaddwd m3, m8 paddd m4, m15 paddd m4, m0 paddd m5, m15 paddd m5, m3 vperm2i128 m0, m1, m2, 0x21 psrld m1, 6 psrld m2, 6 vperm2i128 m3, m4, m5, 0x21 pslld m4, 10 pslld m5, 10 pblendw m2, m4, 0xaa ; 23 34 pslld m0, 10 pblendw m1, m0, 0xaa ; 01 12 psrld m3, 6 pblendw m3, m5, 0xaa ; 45 56 psrad m0, m5, 16 .hv_w4_loop: movu xm4, [srcq+strideq*0] vinserti128 m4, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m15 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 pshufb m3, m4, m9 pshufb m4, m10 pmaddwd m3, m7 pmaddwd m4, m8 paddd m3, m15 paddd m4, m3 psrad m4, 6 packssdw m0, m4 ; _ 7 6 8 vpermq m3, m0, q1122 ; _ 6 _ 7 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m4, m5 psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] %if WIN64 PUSH r8 %endif mov r8d, wd shl wd, 5 lea r6, [strideq*3] sub srcq, 6 sub srcq, r6 mov r5, srcq mov r7, tmpq lea wd, [hq+wq-256] pxor m0, m0 punpcklbw m0, m2 mova [v_mul], xm1 psraw m0, 4 test dword r7m, 0x800 jz .hv_w8_10bit psraw m0, 2 .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 pshufd m13, m0, q2222 pshufd m14, m0, q3333 .hv_w8_loop0: %macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 pmaddwd m3, m12, m2 pmaddwd m%1, m11 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m3, m15 paddd m%1, m3 pmaddwd m3, m14, m%2 paddd m%1, m3 pmaddwd m3, m13, m2 pshufb m%3, m9 ; a b b c c d d e pmaddwd m2, m11 paddd m%1, m3 pmaddwd m3, m12, m%2 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m14 pmaddwd m%2, m13 paddd m2, m15 paddd m2, m3 paddd m2, m%3 paddd m2, m%2 psrad m%1, 6 psrad m2, 6 packssdw m%1, m2 %endmacro movu xm4, [srcq+r6 + 0] vbroadcasti128 m8, [subpel_h_shufA] movu xm6, [srcq+r6 + 8] vbroadcasti128 m9, [subpel_h_shufB] movu xm0, [srcq+r6 +16] movu xm5, [srcq+strideq*0+ 0] vinserti128 m5, [srcq+strideq*4+ 0], 1 movu xm1, [srcq+strideq*0+16] vinserti128 m1, [srcq+strideq*4+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PREP_8TAP_HV_H 4, 6, 0 ; 3 INIT_YMM avx2 PREP_8TAP_HV_H 5, 7, 1 ; 0 4 movu xm0, [srcq+strideq*2+ 0] vinserti128 m0, [srcq+r6 *2+ 0], 1 movu xm1, [srcq+strideq*2+16] vinserti128 m1, [srcq+r6 *2+16], 1 shufpd m7, m0, m1, 0x05 PREP_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+strideq*1+ 0] movu xm1, [srcq+strideq*1+16] lea srcq, [srcq+strideq*4] vinserti128 m6, [srcq+strideq*1+ 0], 1 vinserti128 m1, [srcq+strideq*1+16], 1 add srcq, r6 shufpd m7, m6, m1, 0x05 PREP_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 vpermq m5, m5, q3120 vpermq m6, m6, q3120 vpermq m7, m0, q3120 punpcklwd m3, m7, m4 ; 23 punpckhwd m4, m5 ; 34 punpcklwd m1, m5, m6 ; 01 punpckhwd m5, m6 ; 45 punpcklwd m2, m6, m7 ; 12 punpckhwd m6, m7 ; 56 
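    ; each iteration horizontally filters two new source rows, then applies the
    ; vertical taps stored at v_mul to the sliding window of row pairs 01..78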
.hv_w8_loop: vpbroadcastd m9, [v_mul+4*0] vpbroadcastd m7, [v_mul+4*1] vpbroadcastd m10, [v_mul+4*2] pmaddwd m8, m9, m1 ; a0 pmaddwd m9, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m7 ; a1 pmaddwd m4, m7 ; b1 paddd m8, m15 paddd m9, m15 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 movu xm5, [srcq+strideq*0] vinserti128 m5, [srcq+strideq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] movu xm6, [srcq+strideq*0+16] vinserti128 m6, [srcq+strideq*1+16], 1 vextracti128 [tmpq], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 pmaddwd m5, m12 paddd m0, m15 paddd m0, m5 pshufb m5, m6, m7 ; 89 pshufb m6, m10 ; ab pmaddwd m5, m13 pmaddwd m6, m14 paddd m5, m15 paddd m6, m5 movu xm5, [srcq+strideq*0+8] vinserti128 m5, [srcq+strideq*1+8], 1 lea srcq, [srcq+strideq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 pmaddwd m7, m11 paddd m0, m10 paddd m6, m7 pmaddwd m7, m14, m5 pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 vbroadcasti128 m6, [tmpq] vpbroadcastd m10, [v_mul+4*3] psrad m0, 6 psrad m5, 6 packssdw m0, m5 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m10, m5 ; a3 pmaddwd m10, m6 ; b3 paddd m7, m8 paddd m9, m10 psrad m7, 6 psrad m9, 6 packssdw m7, m9 vpermq m7, m7, q3120 mova [tmpq+r8*0], xm7 vextracti128 [tmpq+r8*2], m7, 1 lea tmpq, [tmpq+r8*4] sub hd, 2 jg .hv_w8_loop add r5, 16 add r7, 16 movzx hd, wb mov srcq, r5 mov tmpq, r7 sub wd, 1<<8 jg .hv_w8_loop0 %if WIN64 POP r8 %endif RET %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro REMAP_REG 2 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %xdefine r14_save r14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep %xdefine r14 r14_save %undef r14_save %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd movu xm%1, [srcq+ r4*2] movu xm%2, [srcq+ r6*2] movu xm%3, [srcq+ r7*2] movu xm%4, [srcq+ r9*2] vinserti128 m%1, [srcq+r10*2], 1 vinserti128 m%2, [srcq+r11*2], 1 vinserti128 m%3, [srcq+r13*2], 1 vinserti128 m%4, [srcq+ rX*2], 1 add srcq, ssq movu xm%5, [srcq+ r4*2] movu xm%6, [srcq+ r6*2] movu xm%7, [srcq+ r7*2] movu xm%8, [srcq+ r9*2] vinserti128 m%5, [srcq+r10*2], 1 vinserti128 m%6, [srcq+r11*2], 1 vinserti128 m%7, [srcq+r13*2], 1 vinserti128 m%8, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m%1, m12 pmaddwd m%2, m13 pmaddwd m%3, m14 pmaddwd m%4, m15 pmaddwd m%5, m12 pmaddwd m%6, m13 pmaddwd m%7, m14 pmaddwd m%8, m15 phaddd m%1, m%2 %if %9 mova m10, [rsp+0x00] %endif phaddd m%3, m%4 phaddd m%5, m%6 phaddd m%7, m%8 phaddd m%1, m%3 phaddd m%5, m%7 paddd m%1, m10 paddd m%5, m10 psrad m%1, xm11 psrad m%5, xm11 packssdw m%1, m%5 %endmacro %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isput 1 %assign isprep 0 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 4, 15, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %xdefine base_reg r12 mov r7d, pxmaxm %else %assign isput 0 
%assign isprep 1 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 4, 15, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [rsp+0xd0] %endif %xdefine base_reg r11 %endif lea base_reg, [%1_8tap_scaled_16bpc_avx2] %define base base_reg-%1_8tap_scaled_16bpc_avx2 tzcnt wd, wm vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm10, mxd vpbroadcastd m10, xm10 mov r5d, t0d DECLARE_REG_TMP 5, 7 mov r6d, pxmaxm %else vpbroadcastd m10, mxm %if isput vpbroadcastw m11, pxmaxm %else mov r6d, pxmaxm %endif %endif mov dyd, dym %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x98] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x98] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif shr r7d, 11 vpbroadcastd m6, [base+pd_0x3ff] vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] movd xm7, [base+s_8tap_h_sh+r7*4] %if isput vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 %else vpbroadcastd m13, [base+pd_m524256] %endif pxor m9, m9 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg jmp wq %if isput .w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0,1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 lea srcq, [srcq+ssq*4] REPX {pshufb x, m10}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm14, r6q pmovsxbw xm14, xm14 pshufd xm8, xm14, q0000 pshufd xm9, xm14, q1111 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pshufd xm8, xm14, q2222 pshufd xm14, xm14, q3333 paddd xm5, xm6 pmaddwd xm6, xm2, xm8 pmaddwd xm8, xm4, xm14 psrldq xm9, xm7, 8 paddd xm5, xm6 paddd xm5, xm13 paddd xm5, xm8 psrad xm5, xm9 packusdw 
xm5, xm5 pminsw xm5, xm11 movd [dstq], xm5 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movu xm5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm10 pmaddwd xm5, xm15 phaddd xm5, xm5 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movu xm6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm10 pshufb xm6, xm10 pmaddwd xm5, xm15 pmaddwd xm6, xm15 phaddd xm5, xm6 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym mova [rsp+0x00], m12 %if isput mova [rsp+0x20], xm13 %else SWAP m11, m13 %endif mova [rsp+0x30], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m0, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] movu xm1, [srcq+r4 ] movu xm3, [srcq+r6 ] movu xm2, [srcq+r11 ] movu xm4, [srcq+r13 ] lea srcq, [srcq+ssq*4] vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 vinserti128 m1, [srcq+r4 ], 1 vinserti128 m3, [srcq+r6 ], 1 vinserti128 m2, [srcq+r11 ], 1 vinserti128 m4, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m0 paddb m13, m0 REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m14}, m7, m9, m8, m10 REPX {pshufb x, m13}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x30] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 4 5 packssdw m8, m10 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm10, xm3, 8 ; 7 _ punpcklwd xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm10 ; 67 mova [rsp+0x40], xm7 mova [rsp+0x50], xm8 mova [rsp+0x60], xm9 .w4_loop: and myd, 0x3ff mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq xm9, r11q pmovsxbw xm9, xm9 pshufd xm7, xm9, q0000 pshufd xm8, xm9, q1111 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 
pshufd xm7, xm9, q2222 pshufd xm9, xm9, q3333 pmaddwd xm6, xm2, xm7 pmaddwd xm8, xm3, xm9 %if isput mova xm7, [rsp+0x20] movd xm9, [rsp+0x38] %else SWAP m7, m11 %endif paddd xm4, xm5 paddd xm6, xm8 paddd xm4, xm6 paddd xm4, xm7 %if isput psrad xm4, xm9 packusdw xm4, xm4 pminuw xm4, xm11 movq [dstq], xm4 add dstq, dsq %else SWAP m11, m7 psrad xm4, 6 packssdw xm4, xm4 movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop mova xm8, [rsp+0x00] movd xm9, [rsp+0x30] movu xm4, [srcq] movu xm5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x40] mova [rsp+0x40], xm1 mova xm1, [rsp+0x50] mova [rsp+0x50], xm2 mova xm2, [rsp+0x60] mova [rsp+0x60], xm3 pshufb xm4, xm12 pshufb xm5, xm13 pmaddwd xm4, xm14 pmaddwd xm5, xm15 phaddd xm4, xm5 paddd xm4, xm8 psrad xm4, xm9 packssdw xm4, xm4 punpcklwd xm3, xm10, xm4 mova xm10, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm6, [srcq+ssq*1] movu xm7, [srcq+r6] movu m0, [rsp+0x50] pshufb xm4, xm12 pshufb xm6, xm12 pshufb xm5, xm13 pshufb xm7, xm13 pmaddwd xm4, xm14 pmaddwd xm6, xm14 pmaddwd xm5, xm15 pmaddwd xm7, xm15 mova [rsp+0x40], m0 phaddd xm4, xm5 phaddd xm6, xm7 paddd xm4, xm8 paddd xm6, xm8 psrad xm4, xm9 psrad xm6, xm9 packssdw xm4, xm6 punpcklwd xm9, xm10, xm4 mova [rsp+0x60], xm9 psrldq xm10, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm10 lea srcq, [srcq+ssq*2] jmp .w4_loop SWAP m10, m13 %if isprep SWAP m13, m11 %endif .w8: mov dword [rsp+0x80], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x80], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x80], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x80], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x80], 16 movifprep tmp_stridem, 256 .w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free %if isput movifnidn dsm, dsq mova [rsp+0xb0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 shr t0d, 16 sub srcq, 6 pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0x84], t0d mov [rsp+0x88], srcq mov [rsp+0x90], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+0x80] jz .ret add qword [rsp+0x90], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x40] vpbroadcastd m15, [rsp+0x84] pxor m9, m9 mov srcq, [rsp+0x88] mov r0q, [rsp+0x90] ; dstq / tmpq .hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x40], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq [rsp+0xa0], xm1 movq [rsp+0xa8], xm7 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb 
xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x60], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x60] vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, mym mov dyd, dym pshufb m0, m9 ; 01a 01b pshufb m1, m9 ; 23a 23b pshufb m2, m9 ; 45a 45b pshufb m3, m9 ; 67a 67b .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm9, r6q punpcklqdq xm9, xm9 pmovsxbw m9, xm9 pshufd m8, m9, q0000 pshufd m7, m9, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m7 pshufd m8, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m9 %if isput psrldq xm8, xm11, 8 %endif paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, xm8 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xb0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x60], myd mov r4d, [rsp+0xa0] mov r6d, [rsp+0xa4] mov r7d, [rsp+0xa8] mov r9d, [rsp+0xac] jz .skip_line vbroadcasti128 m9, [base+wswap] movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq mov myd, [rsp+0x60] mov dyd, dym pshufb m0, m9 pshufb m1, m9 pshufb m2, m9 pshufb m3, m9 pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 phaddd m4, m6 paddd m4, m10 psrad m4, xm11 pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1 vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, [rsp+0x60] mov dyd, dym pshufb m3, m9 jmp .vloop SWAP m1, m12, m10 SWAP m7, m11 .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*1], 1 vinserti128 m2, [srcq+ssq*2], 1 add srcq, ss3q movq xm6, r4q pmovsxbw xm6, xm6 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 REPX {pshufb x, m10}, m0, m1, m2 pshufb xm3, xm10 REPX {pmaddwd x, m15}, m0, m1, m2 pmaddwd xm3, xm15 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 vextracti128 xm1, m0, 1 palignr xm2, 
xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movu xm1, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm1, xm10 pshufb xm5, xm10 pmaddwd xm1, xm15 pmaddwd xm5, xm15 phaddd xm1, xm5 pmaddwd xm5, xm3, xm8 mova xm3, xm0 pmaddwd xm0, xm9 paddd xm1, xm12 psrad xm1, xm7 packssdw xm1, xm1 paddd xm5, xm0 mova xm0, xm2 pmaddwd xm2, xm14 paddd xm5, xm2 palignr xm2, xm1, xm4, 12 punpcklwd xm2, xm1 ; 67 78 pmaddwd xm4, xm2, xm6 paddd xm5, xm13 paddd xm5, xm4 mova xm4, xm1 psrldq xm1, xm7, 8 psrad xm5, xm1 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*2] lea r11, [r4+ssq*1] lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*2] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] movu xm3, [srcq+ssq*2] ; 6 _ movu xm10, [srcq+r6 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r11 ], 1 lea srcq, [srcq+ss3q ] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pmaddwd m0, m14 pmaddwd m1, m14 pshufb m7, m13 pshufb m8, m13 pmaddwd m7, m15 pmaddwd m8, m15 pshufb m2, m12 pshufb xm3, xm12 pmaddwd m2, m14 pmaddwd xm3, xm14 pshufb m9, m13 pshufb xm10, xm13 pmaddwd m9, m15 pmaddwd xm10, xm15 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 phaddd xm3, xm10 paddd m0, m5 paddd m1, m5 paddd m2, m5 paddd xm3, xm5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 psrad xm3, xm6 vperm2i128 m4, m0, m1, 0x21 ; 1 2 vperm2i128 m5, m1, m2, 0x21 ; 3 4 vperm2i128 m6, m2, m3, 0x21 ; 5 6 shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pslld m4, 16 pslld m5, 16 pslld m6, 16 pblendw m0, m4, 0xaa ; 01 12 pblendw m1, m5, 0xaa ; 23 34 pblendw m2, m6, 0xaa ; 45 56 movq xm10, r13q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 .dy1_w4_loop: movu xm11, [srcq+ssq*0] movu xm6, [srcq+r4 ] vinserti128 m11, [srcq+ssq*1], 1 vinserti128 m6, [srcq+r11 ], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufb m11, m12 pshufb m6, m13 pmaddwd m11, m14 pmaddwd m6, m15 paddd m4, [rsp+0x20] phaddd m11, m6 
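; dy == 1024 means exactly 1.0 source row per output row, so in the .dy1 paths the
; fractional y position never changes: the 8 vertical taps were fetched once from
; subpel_filters before the loop and stay fixed for the whole column. A minimal
; scalar sketch of one column (assumed names; the loop here emits two output rows
; per iteration and therefore also horizontally filters two fresh source rows):
;
;   for (int y = 0; y < h; y++) {
;       int sum = 0;
;       for (int k = 0; k < 8; k++)
;           sum += fv[k] * mid[y + k];   /* mid[] = horizontally filtered rows     */
;       out[y] = finish(sum);            /* round, shift, clamp (put), pack (prep) */
;   }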
pmaddwd m6, m2, m9 paddd m11, [rsp+0x00] psrad m11, [rsp+0x40] mova m0, m1 mova m1, m2 paddd m5, m6 paddd m4, m5 vinserti128 m2, m3, xm11, 1 pslld m3, m11, 16 pblendw m2, m3, 0xaa ; 67 78 pmaddwd m5, m2, m10 vextracti128 xm3, m11, 1 paddd m4, m5 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy1_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy1_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0xb8] %endif movifnidn dsm, dsq mova [rsp+0xc0], xm7 %else %if UNIX64 %define hm [rsp+0xb8] %endif %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy1_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 
01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep vbroadcasti128 m7, [base+wswap] pshufb m0, m7 pshufb m1, m7 pshufb m2, m7 pshufb m3, m7 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 phaddd m4, m6 paddd m4, [rsp+0x00] psrad m4, [rsp+0x40] pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop SWAP m1, m12, m10 SWAP m7, m11 .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*2] movu xm2, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m0, m10 pshufb m1, m10 pshufb m2, m10 pmaddwd m0, m15 pmaddwd m1, m15 pmaddwd m2, m15 movq xm6, r4q pmovsxbw xm6, xm6 phaddd m0, m1 phaddd m1, m2 paddd m0, m12 paddd m1, m12 psrad m0, xm7 psrad m1, xm7 packssdw m0, m1 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m0, 1 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 punpcklwd xm2, xm0, xm1 ; 01 23 punpckhwd xm1, xm0, xm1 ; 23 45 .dy2_w2_loop: movu xm3, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m3, [srcq+ssq*1], 1 ; 6 7 vinserti128 m5, [srcq+ss3q ], 1 ; 8 9 lea srcq, [srcq+ssq*4] pmaddwd xm4, xm2, xm8 pmaddwd xm1, xm9 pshufb m3, m10 pshufb m5, m10 pmaddwd m3, m15 pmaddwd m5, m15 phaddd m3, m5 paddd xm4, xm1 paddd m3, m12 psrad m3, xm7 packssdw m3, m3 pshufd m3, m3, q2100 palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9 vextracti128 xm1, m0, 1 punpcklwd xm2, xm0, xm1 ; 45 67 punpckhwd xm1, xm0, xm1 ; 67 89 pmaddwd xm3, xm2, xm14 pmaddwd xm5, xm1, xm6 paddd xm4, xm13 paddd xm4, xm3 psrldq xm3, xm7, 8 paddd xm4, xm5 psrad xm4, xm3 packusdw xm4, xm4 pminsw xm4, xm11 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif .dy2_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP 
m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pcmpeqd m6, m9 punpckldq m11, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 movq xm10, r13q pblendvb m14, m2, m11 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*1] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*2], 1 ; 0 2 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 1 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r6 ], 1 lea srcq, [srcq+ssq*2] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pshufb m2, m12 pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pshufb m7, m13 pshufb m8, m13 pshufb m9, m13 pmaddwd m7, m15 pmaddwd m8, m15 pmaddwd m9, m15 punpcklqdq xm10, xm10 pmovsxbw m10, xm10 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 paddd m0, m5 paddd m1, m5 paddd m2, m5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 vperm2i128 m3, m0, m2, 0x21 ; 2 4 vperm2i128 m2, m1, 0x13 ; 3 5 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 packssdw m0, m3 ; 0 2 2 4 packssdw m1, m2 ; 1 3 3 5 punpckhwd m2, m0, m1 ; 23 45 punpcklwd m0, m1 ; 01 23 .dy2_w4_loop: movu xm1, [srcq+ssq*0] movu xm6, [srcq+r4 ] movu xm3, [srcq+ssq*1] movu xm11, [srcq+r6 ] vinserti128 m1, [srcq+ssq*2], 1 ; 6 8 vinserti128 m6, [srcq+r11 ], 1 vinserti128 m3, [srcq+ss3q ], 1 ; 7 9 vinserti128 m11, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m7 pmaddwd m5, m2, m8 pshufb m1, m12 pshufb m3, m12 pmaddwd m1, m14 pmaddwd m3, m14 mova m0, [rsp+0x00] pshufb m6, m13 pshufb m11, m13 pmaddwd m6, m15 pmaddwd m11, m15 paddd m4, m5 movd xm5, [rsp+0x40] phaddd m1, m6 phaddd m3, m11 paddd m1, m0 paddd m3, m0 psrad m1, xm5 psrad m3, xm5 pslld m3, 16 pblendw m1, m3, 0xaa ; 67 89 vperm2i128 m0, m2, m1, 0x21 ; 45 67 paddd m4, [rsp+0x20] mova m2, m1 pmaddwd m5, m0, m9 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy2_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy2_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput 
movifnidn dsm, dsq mova [rsp+0xc0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy2_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movu xm3, [srcq+ r4*2] movu xm4, [srcq+ r6*2] movu xm5, [srcq+ r7*2] movu xm6, [srcq+ r9*2] vinserti128 m3, [srcq+r10*2], 1 vinserti128 m4, [srcq+r11*2], 1 vinserti128 m5, [srcq+r13*2], 1 vinserti128 m6, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m3, m12 pmaddwd m4, m13 pmaddwd m5, m14 pmaddwd m6, m15 phaddd m3, m4 phaddd m5, m6 phaddd m3, m5 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd 
m6, m7 mova m5, [rsp+0x00] movd xm7, [rsp+0x40] phaddd m4, m6 paddd m3, m5 paddd m4, m5 psrad m3, xm7 psrad m4, xm7 pslld m4, 16 pblendw m3, m4, 0xaa jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isput %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_16bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, SHARP PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep PREP_8TAP_SCALED_FN sharp, SHARP, SHARP PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep %macro WARP_V 5 ; dst, 01, 23, 45, 67 lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] vinserti128 m8, [filterq+tmp1q*8], 1 ; a e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; b f lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm9, [filterq+myq *8] vinserti128 m9, [filterq+tmp1q*8], 1 ; c g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma punpcklwd m8, m0 shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; d h punpcklwd m0, m9, m0 punpckldq m9, m8, m0 punpckhdq m0, m8, m0 punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 pmaddwd m%2, m8 pmaddwd m9, m%3 punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 pmaddwd m8, m%4 pmaddwd m0, m%5 paddd m9, m%2 mova m%2, m%3 paddd m0, m8 mova m%3, m%4 mova m%4, m%5 paddd m%1, m0, m9 %endmacro cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts mov r6d, r7m lea r9, [$$] shr r6d, 11 vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [warp8x8t_rnd] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main jmp .start .loop: call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 lea tmpq, [tmpq+tsq*4] .start: paddd m7, m14 paddd m0, m14 psrad m7, 15 psrad m0, 15 packssdw m7, m0 vpermq m7, m7, q3120 mova [tmpq+tsq*0], xm7 vextracti128 [tmpq+tsq*2], m7, 1 dec r4d jg .loop .end: RET cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ alpha, beta, filter, tmp1, delta, \ my, gamma mov r6d, r7m lea filterq, [$$] shr r6d, 11 vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] vpbroadcastw m15, r7m ; pixel_max call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad 
m7, 16 psrad m0, 16 packusdw m7, m0 pmulhrsw m7, m14 pminsw m7, m15 vpermq m7, m7, q3120 mova [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 dec r4d jg .loop .end: RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) due to call %if WIN64 mov abcdq, r5m mov mxd, r6m %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] vpbroadcastd m12, [pd_32768] pxor m11, m11 add filterq, mc_warp_filter-$$ lea tmp1q, [ssq*3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h pblendw m1, m0, 0xaa ; 01 psrld m2, m0, 16 call .h pblendw m2, m0, 0xaa ; 12 psrld m3, m0, 16 call .h pblendw m3, m0, 0xaa ; 23 psrld m4, m0, 16 call .h pblendw m4, m0, 0xaa ; 34 psrld m5, m0, 16 call .h pblendw m5, m0, 0xaa ; 45 psrld m6, m0, 16 call .h pblendw m6, m0, 0xaa ; 56 movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] add myd, 512+(64<<10) mov r4d, 4 lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 .main2: call .h psrld m7, m6, 16 pblendw m7, m0, 0xaa ; 67 WARP_V 7, 1, 3, 5, 7 call .h psrld m10, m5, 16 pblendw m10, m0, 0xaa ; 78 WARP_V 0, 2, 4, 6, 10 ret ALIGN function_align .h: lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] movu xm10, [srcq-6] vinserti128 m10, [srcq+2], 1 shr mxd, 10 ; 0 shr tmp1d, 10 ; 4 movq xm0, [filterq+mxq *8] vinserti128 m0, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] movu xm8, [srcq-4] vinserti128 m8, [srcq+4], 1 shr tmp2d, 10 ; 1 shr tmp1d, 10 ; 5 movq xm9, [filterq+tmp2q*8] vinserti128 m9, [filterq+tmp1q*8], 1 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 ; 2 shr tmp1d, 10 ; 6 punpcklbw m0, m11, m0 pmaddwd m0, m10 movu xm10, [srcq-2] vinserti128 m10, [srcq+6], 1 punpcklbw m9, m11, m9 pmaddwd m9, m8 movq xm8, [filterq+mxq *8] vinserti128 m8, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta phaddd m0, m9 ; 0 1 4 5 movu xm9, [srcq+0] vinserti128 m9, [srcq+8], 1 shr tmp2d, 10 ; 3 shr tmp1d, 10 ; 7 punpcklbw m8, m11, m8 pmaddwd m8, m10 movq xm10, [filterq+tmp2q*8] vinserti128 m10, [filterq+tmp1q*8], 1 punpcklbw m10, m11, m10 pmaddwd m9, m10 add srcq, ssq phaddd m8, m9 ; 2 3 6 7 phaddd m0, m8 ; 0 1 2 3 4 5 6 7 vpsllvd m0, m13 paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 cmp hd, 8 je .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq ], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .ret: RET .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 cmp hd, 4 jne .w8_loop_start RET .w8_loop: call .main lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 .w8_loop_start: lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova 
[dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 sub hd, 8 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx2_table lea r6, [avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m4, [base+bidir_rnd+t0*4] vpbroadcastd m5, [base+bidir_mul+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+32*0] paddsw m0, [tmp2q+32*0] mova m1, [tmp1q+32*1] paddsw m1, [tmp2q+32*1] mova m2, [tmp1q+32*2] paddsw m2, [tmp2q+32*2] mova m3, [tmp1q+32*3] paddsw m3, [tmp2q+32*3] add tmp1q, 32*4 add tmp2q, 32*4 pmaxsw m0, m4 pmaxsw m1, m4 pmaxsw m2, m4 pmaxsw m3, m4 psubsw m0, m4 psubsw m1, m4 psubsw m2, m4 psubsw m3, m4 pmulhw m0, m5 pmulhw m1, m5 pmulhw m2, m5 pmulhw m3, m5 ret cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 lea r6, [w_avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; weight vpbroadcastw m8, r7m ; pixel_max vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538] movsxd wq, [r6+wq*4] paddw m7, m8 add wq, r6 lea r6d, [t0-16] shl t0d, 16 sub t0d, r6d ; 16-weight, weight pslld m7, 7 rorx r6d, t0d, 30 ; << 2 test dword r7m, 0x800 cmovz r6d, t0d movifnidn hd, hm movd xm6, r6d vpbroadcastd m6, xm6 BIDIR_FN ALIGN function_align .main: mova m4, [tmp1q+32*0] mova m0, [tmp2q+32*0] punpckhwd m5, m0, m4 punpcklwd m0, m4 mova m4, [tmp1q+32*1] mova m1, [tmp2q+32*1] pmaddwd m5, m6 pmaddwd m0, m6 paddd m5, m7 paddd m0, m7 psrad m5, 8 psrad m0, 8 packusdw m0, m5 punpckhwd m5, m1, m4 punpcklwd m1, m4 mova m4, [tmp1q+32*2] mova m2, [tmp2q+32*2] pmaddwd m5, m6 pmaddwd m1, m6 paddd m5, m7 paddd m1, m7 psrad m5, 8 psrad m1, 8 packusdw m1, m5 punpckhwd m5, m2, m4 punpcklwd m2, m4 mova m4, [tmp1q+32*3] mova m3, [tmp2q+32*3] add tmp1q, 32*4 add tmp2q, 32*4 pmaddwd m5, m6 pmaddwd m2, m6 paddd m5, m7 paddd m2, m7 psrad m5, 8 psrad m2, 8 packusdw m2, m5 punpckhwd m5, m3, m4 punpcklwd m3, m4 pmaddwd m5, m6 pmaddwd m3, m6 paddd m5, m7 paddd m3, m7 psrad m5, 8 psrad m3, 8 packusdw m3, m5 pminsw m0, m8 pminsw m1, m8 pminsw m2, m8 pminsw m3, m8 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx2_table lea r7, [mask_avx2_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+bidir_rnd+r6*4] vpbroadcastd m10, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: %macro MASK 1 pmovzxbw m5, [maskq+16*%1] mova m%1, [tmp1q+32*%1] mova m6, [tmp2q+32*%1] punpckhwd m4, m%1, m6 punpcklwd m%1, m6 psubw m7, m8, m5 punpckhwd m6, m5, m7 ; m, 64-m punpcklwd m5, m7 
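; What one MASK step computes per pixel, as a hedged scalar sketch (assumed names;
; the prep-domain intermediates are roughly (pixel << ib) - 8192 with ib = 4 for
; 10 bpc and ib = 2 for 12 bpc, which is what the bidir_rnd/bidir_mul entries
; selected above by pixel_max >> 11 encode):
;
;   static unsigned mask_px(int t1, int t2, int m, int bdmax) {
;       const int ib  = bdmax == 4095 ? 2 : 4;
;       const int v   = (t1 * m + t2 * (64 - m)) >> 5;  /* pmaddwd + psrad 5       */
;       const int rnd = 16384 + (1 << ib);              /* == -bidir_rnd           */
;       const int d   = (v + rnd) >> (ib + 1);          /* pmaxsw/psubsw/pmulhw    */
;       return d < 0 ? 0 : d > bdmax ? bdmax : d;       /* clamping is done in the */
;   }                                                   /* asm via saturation      */
;
; avg_16bpc and the w_mask_* functions finish through the same bidir_rnd/bidir_mul
; step; only the weighting of t1/t2 differs.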
pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m) pmaddwd m%1, m5 psrad m4, 5 psrad m%1, 5 packssdw m%1, m4 pmaxsw m%1, m9 psubsw m%1, m9 pmulhw m%1, m10 %endmacro MASK 0 MASK 1 MASK 2 MASK 3 add maskq, 16*4 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movd xm0, r7m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] movd xm14, [base+pw_2] mov maskq, maskmp psubw xm14, xm0 vpbroadcastw m14, xm14 add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: phaddd m4, m5 paddw m4, m14 psrlw m4, 2 packuswb m4, m4 vextracti128 xm5, m4, 1 punpcklwd xm4, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 mova [maskq], xm4 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8: vperm2i128 m6, m4, m5, 0x21 vpblendd m4, m5, 0xf0 paddw m4, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 mova [maskq], xm4 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16: punpcklqdq m6, m4, m5 punpckhqdq m4, m5 paddw m6, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 pshufd xm4, xm4, q3120 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 mova [maskq], xm4 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m4, m14 paddw m4, m5 psrlw m15, m4, 2 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 call .main mova m6, [deint_shuf] paddw m4, m14 paddw m4, m5 psrlw m4, 2 packuswb m15, m4 vpermd m4, m6, m15 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m2 mova [dstq+stride3q +32*1], m3 mova [maskq], m4 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [maskq], m4 ; no available registers call .main paddw m4, [maskq] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 ; 0 2 4 6 1 3 5 7 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 
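; The mask bytes stored just below are the 4:2:0 subsampled weights: each .main call
; (W_MASK + phaddw, defined further down) yields per-pixel weights pre-summed in
; horizontal pairs, and the two rows are then combined with the rounding term kept
; in m14 (2 - sign). A hedged scalar sketch with assumed names (the same pw_27615
; constant and >> 10 serve both bitdepths):
;
;   int w_mask_px(int t1, int t2) {                /* pw_27615, psubusw, psrlw 10  */
;       int d = 27615 - abs(t1 - t2);
;       return 64 - ((d < 0 ? 0 : d) >> 10);       /* 38 when t1 == t2, up to 64   */
;   }
;   mask[x] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;  /* one byte per 2x2 block  */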
mova [maskq], m4 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: paddw m4, m14 paddw m5, m14 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [maskq+32*0], m4 mova [dstq+strideq], m5 call .main paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*4], m0 mova [dstq+strideq*0+32*5], m1 mova [dstq+strideq*0+32*6], m2 mova [dstq+strideq*0+32*7], m3 mova [maskq+32*1], m4 call .main paddw m4, [maskq+32*0] paddw m5, [dstq+strideq] mova m6, [deint_shuf] psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 mova [maskq+32*0], m4 call .main paddw m4, [maskq+32*1] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m6, m4 mova [dstq+strideq*1+32*4], m0 mova [dstq+strideq*1+32*5], m1 mova [dstq+strideq*1+32*6], m2 mova [dstq+strideq*1+32*7], m3 mova [maskq+32*1], m4 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: %macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul mova m%1, [tmp1q+32*%1] mova m%2, [tmp2q+32*%1] punpcklwd m8, m%2, m%1 punpckhwd m9, m%2, m%1 psubsw m%1, m%2 pabsw m%1, m%1 psubusw m7, m10, m%1 psrlw m7, 10 ; 64-m psubw m%2, m%3, m7 ; m punpcklwd m%1, m7, m%2 punpckhwd m7, m%2 pmaddwd m%1, m8 pmaddwd m7, m9 psrad m%1, 5 psrad m7, 5 packssdw m%1, m7 pmaxsw m%1, m%4 psubsw m%1, m%4 pmulhw m%1, m%5 %endmacro W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max vpbroadcastb m14, r7m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] mova m15, [base+deint_shuf] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova 
[dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 packuswb m4, m5 pxor m5, m5 psubb m4, m14 pavgb m4, m5 vpermd m4, m15, m4 mova [maskq], m4 add maskq, 32 ret cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m4, [base+pw_64] vpbroadcastd m5, [base+bidir_rnd+r6*4] vpbroadcastd m6, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end call .main lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+32*0], m0 mova [dstq+32*1], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 call .main mova [dstq+32*6], m0 mova [dstq+32*7], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2, 4, 5, 6 W_MASK 1, 3, 4, 5, 6 packuswb m2, m3 vpermq m2, m2, q3120 add tmp1q, 32*2 add tmp2q, 32*2 mova [maskq], m2 add maskq, 32 ret ; (a * (64 - m) + b * m + 32) >> 6 ; = (((b - a) * m + 32) >> 6) + a ; = (((b - a) * (m << 9) + 16384) >> 15) + a ; except m << 9 overflows int16_t when m == 64 (which is possible), ; but if we negate m it works out (-64 << 9 == -32768). 
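; A throwaway scalar check of that rearrangement (illustrative C, not part of the
; reference code; it assumes an arithmetic right shift, which is what pmulhrsw does):
;
;   int blend_ref (int a, int b, int m) { return (a * (64 - m) + b * m + 32) >> 6; }
;   int blend_fast(int a, int b, int m) { return a + (((a - b) * (m * -512) + 16384) >> 15); }
;
; Both agree for any pixel values a (dst), b (tmp) and m in [0, 64]; the functions
; below implement blend_fast with pmullw (mask * pw_m512) followed by pmulhrsw: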
; = (((a - b) * (m * -512) + 16384) >> 15) + a cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw m3, [maskq] movq xm0, [dstq+dsq*0] movhps xm0, [dstq+dsq*1] vpbroadcastq m1, [dstq+dsq*2] vpbroadcastq m2, [dstq+r6 ] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psubw m1, m0, [tmpq] add maskq, 16 add tmpq, 32 pmullw m3, m6 pmulhrsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 mova xm1, [dstq+dsq*2] vinserti128 m1, [dstq+r6 ], 1 psubw m2, m0, [tmpq+32*0] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 mova [dstq+dsq*2], xm1 vextracti128 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET .w32: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+32*0] psubw m2, m0, [tmpq+32*0] mova m1, [dstq+32*1] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .w32 RET INIT_XMM avx2 cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h %define base r5-blend_v_avx2_table lea r5, [blend_v_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: vpbroadcastd m2, [base+obmc_masks_avx2+2*2] .w2_loop: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movq m1, [tmpq] add tmpq, 4*2 psubw m1, m0, m1 pmulhrsw m1, m2 paddw m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_loop RET .w4: vpbroadcastq m2, [base+obmc_masks_avx2+4*2] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] psubw m1, m0, [tmpq] add tmpq, 8*2 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET INIT_YMM avx2 .w8: vbroadcasti128 m2, [base+obmc_masks_avx2+8*2] .w8_loop: mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 psubw m1, m0, [tmpq] add tmpq, 16*2 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: mova m4, [base+obmc_masks_avx2+16*2] .w16_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add tmpq, 32*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: %if WIN64 movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif mova m6, [base+obmc_masks_avx2+32*2] vbroadcasti128 m7, [base+obmc_masks_avx2+32*3] .w32_loop: mova m0, [dstq+dsq*0+32*0] psubw m3, m0, [tmpq +32*0] mova xm2, [dstq+dsq*0+32*1] mova xm5, [tmpq 
+32*1] mova m1, [dstq+dsq*1+32*0] psubw m4, m1, [tmpq +32*2] vinserti128 m2, [dstq+dsq*1+32*1], 1 vinserti128 m5, [tmpq +32*3], 1 add tmpq, 32*4 psubw m5, m2, m5 pmulhrsw m3, m6 pmulhrsw m4, m6 pmulhrsw m5, m7 paddw m0, m3 paddw m1, m4 paddw m2, m5 mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*1+32*0], m1 mova [dstq+dsq*0+32*1], xm2 vextracti128 [dstq+dsq*1+32*1], m2, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif RET %macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp mova m0, [dstq+32*(%1+0)] psubw m2, m0, [tmpq+32*(%2+0)] mova m1, [dstq+32*(%1+1)] psubw m3, m1, [tmpq+32*(%2+1)] %if %3 add tmpq, 32*%3 %endif pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+32*(%1+0)], m0 mova [dstq+32*(%1+1)], m1 %endmacro INIT_XMM avx2 cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_h_avx2_table lea r5, [blend_h_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movd m2, [maskq+hq*2] movq m1, [tmpq] add tmpq, 4*2 punpcklwd m2, m2 psubw m1, m0, m1 pmulhrsw m1, m2 paddw m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova m3, [blend_shuf] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] movd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 8*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET INIT_YMM avx2 .w8: vbroadcasti128 m3, [blend_shuf] shufpd m3, m3, 0x0c .w8_loop: mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 vpbroadcastd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 16*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET .w16: vpbroadcastw m4, [maskq+hq*2] vpbroadcastw m5, [maskq+hq*2+2] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add tmpq, 32*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16 RET .w32: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0, 2 add dstq, dsq inc hq jl .w32 RET .w64: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 4 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 8 BLEND_H_ROW 4, -4 BLEND_H_ROW 6, -2 add dstq, dsq inc hq jl .w128 RET cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 lea srcq, [srcq+r10*2] ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) lea bottomextq, [yq+bhq] sub bottomextq, ihq lea r3, [bhq-1] cmovs bottomextq, r12 DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ bottomext, rightext ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, r12 cmp bottomextq, bhq cmovns bottomextq, r3 cmp topextq, bhq cmovg topextq, r3 ; right_ext = iclip(x + bw - iw, 0, bw 
- 1) lea rightextq, [xq+bwq] sub rightextq, iwq lea r2, [bwq-1] cmovs rightextq, r12 DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ bottomext, rightext ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, r12 cmp rightextq, bwq cmovns rightextq, r2 cmp leftextq, bwq cmovns leftextq, r2 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ dst, dstride, src, sstride, bottomext, rightext ; center_h = bh - top_ext - bottom_ext lea r3, [bottomextq+topextq] sub centerhq, r3 ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq imul r2, dstrideq add dstq, r2 mov r9m, dstq ; center_w = bw - left_ext - right_ext mov centerwq, bwq lea r3, [rightextq+leftextq] sub centerwq, r3 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 ; left extension xor r3, r3 vpbroadcastw m0, [srcq] .left_loop_%3: mova [dstq+r3*2], m0 add r3, 16 cmp r3, leftextq jl .left_loop_%3 ; body lea r12, [dstq+leftextq*2] %endif xor r3, r3 .body_loop_%3: movu m0, [srcq+r3*2] %if %1 movu [r12+r3*2], m0 %else movu [dstq+r3*2], m0 %endif add r3, 16 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 lea r12, [r12+centerwq*2] %else lea r12, [dstq+centerwq*2] %endif xor r3, r3 vpbroadcastw m0, [srcq+centerwq*2-2] .right_loop_%3: movu [r12+r3*2], m0 add r3, 16 cmp r3, rightextq jl .right_loop_%3 %endif add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %endmacro test leftextq, leftextq jnz .need_left_ext test rightextq, rightextq jnz .need_right_ext v_loop 0, 0, 0 jmp .body_done .need_left_ext: test rightextq, rightextq jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; bottom edge extension test bottomextq, bottomextq jz .top mov srcq, dstq sub srcq, dstrideq xor r1, r1 .bottom_x_loop: mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, bottomextq .bottom_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .bottom_y_loop add r1, 16 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end mov srcq, r9m mov dstq, dstm xor r1, r1 .top_x_loop: mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, topextq .top_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .top_y_loop add r1, 16 cmp r1, bwq jl .top_x_loop .end: RET cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax sub dword mx0m, 4<<14 sub dword src_wm, 8 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_64] vpbroadcastw xm7, pxmaxm pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: vpbroadcastd m10, [base+pd_63] pxor m2, m2 pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset pand m9, m10 ; filter offset (masked) ; load source pixels movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vextracti128 xm0, m0, 1 movu xm10, [srcq+r8*2] movu xm11, [srcq+r9*2] movu xm12, [srcq+r10*2] movu xm13, [srcq+r11*2] movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vinserti128 m10, [srcq+r8*2], 1 vinserti128 m11, [srcq+r9*2], 1 vinserti128 m12, [srcq+r10*2], 1 
vinserti128 m13, [srcq+r11*2], 1 ptest m1, m1 jz .filter movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vextracti128 xm1, m1, 1 movu xm14, [base+resize_shuf+8+r8*2] movu xm15, [base+resize_shuf+8+r9*2] movu xm0, [base+resize_shuf+8+r10*2] movu xm2, [base+resize_shuf+8+r11*2] movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vinserti128 m14, [base+resize_shuf+8+r8*2], 1 vinserti128 m15, [base+resize_shuf+8+r9*2], 1 vinserti128 m0, [base+resize_shuf+8+r10*2], 1 vinserti128 m2, [base+resize_shuf+8+r11*2], 1 pshufb m10, m14 pshufb m11, m15 pshufb m12, m0 pshufb m13, m2 .filter: movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vextracti128 xm9, m9, 1 movq xm14, [base+resize_filter+r8*8] movq xm15, [base+resize_filter+r9*8] movq xm0, [base+resize_filter+r10*8] movq xm2, [base+resize_filter+r11*8] movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 movhps xm14, [base+resize_filter+r8*8] movhps xm15, [base+resize_filter+r9*8] movhps xm0, [base+resize_filter+r10*8] movhps xm2, [base+resize_filter+r11*8] pmovsxbw m14, xm14 pmovsxbw m15, xm15 pmovsxbw m0, xm0 pmovsxbw m2, xm2 pmaddwd m10, m14 pmaddwd m11, m15 pmaddwd m12, m0 pmaddwd m13, m2 phaddd m10, m11 phaddd m12, m13 phaddd m10, m12 psubd m10, m3, m10 psrad m10, 7 vextracti128 xm0, m10, 1 packusdw xm10, xm0 pminsw xm10, xm7 mova [dstq+xq*2], xm10 paddd m4, m5 add xd, 8 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64
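; For reference, a scalar model of the horizontal loop in resize_16bpc above, read
; off the code; iclip and the edge handling are paraphrased, not the C reference:
;
;   for (int x = 0; x < dst_w; x++, mx += dx) {       /* mx starts at mx0 - (4 << 14) */
;       const int8_t *F = resize_filter[(mx >> 8) & 63];
;       int x0 = mx >> 14, sum = 0;
;       for (int k = 0; k < 8; k++)                   /* out-of-range taps replicate  */
;           sum += F[k] * src[iclip(x0 + k, 0, src_w - 1)];   /* the edge pixel       */
;       dst[x] = iclip((64 - sum) >> 7, 0, pixel_max);/* psubd from pd_64, psrad 7    */
;   }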