; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

spel_h_shufA:  db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
               db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC:  db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
               db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
               db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
               db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB:  db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
               db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD:  db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
               db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
               db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
               db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8:  db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
               db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
               db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
               db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
               db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA:     db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB:     db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
               db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
               db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC:     db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
               db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
               db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
               db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a:   db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
               db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf4b:   db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46
               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
spel_shuf8a:   db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
               db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
               db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b:   db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
               db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16:   db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
               db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
               db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
               db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32:   db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
               db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
               db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
               db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
               db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
               db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2:    db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
               db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4:  db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
               db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
               db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
               db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8:  db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
               db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
               db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
               db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
               db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
               db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
               db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
               db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
               db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
               db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
               db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
               db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end:   db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
               db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
               db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
deint_q_shuf:  ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7:       dd 0, 1, 2, 3, 4, 5, 6, 7
               dd 1
pw_2048:       times 2 dw 2048
               dd 3
pw_8192:       times 2 dw 8192
avg_shift:     dw 5, 5, 3, 3
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf:    db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA:  dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB:  dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC:  dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD:  dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE:  dq 0, 2, 4, 6
resize_shufA:  db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB:  db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul:   dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf:   db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
               db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

prep_hv_shift:   dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul:        dw 16, 16, 4, 4
put_8tap_h_rnd:  dd 34, 40
prep_8tap_rnd:   dd 128 - (8192 << 8)
warp_8x8_rnd_h:  dd 512, 2048
warp_8x8_rnd_v:  dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round:       dw -16400, -16400, -16388, -16388
w_avg_round:     dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round:      dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round:    dd 128, 64
bidir_shift:     dw 6, 6, 4, 4

pb_64:    times 4 db 64
pw_m512:  times 2 dw -512
pw_2:     times 2 dw 2
pw_64:    times 2 dw 64
pd_32:    dd 32
pd_63:    dd 63
pd_128:   dd 128
pd_640:   dd 640
pd_2176:  dd 2176
pd_16384: dd 16384
pd_0_4:   dd 0, 4

%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
%%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
%if %%types & 1
    %xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
    %rep %0 - 4
        dw %%prefix %+ .h_w%5 - %%base
        %rotate 1
    %endrep
    %rotate 4
%endif
%if %%types & 2
    %xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
    %rep %0 - 4
        dw %%prefix %+ .v_w%5 - %%base
        %rotate 1
    %endrep
    %rotate 4
%endif
%if %%types & 4
    %xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
    %rep %0 - 4
        dw %%prefix %+ .hv_w%5 - %%base
        %rotate 1
    %endrep
%endif
%endmacro

%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8,
16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern obmc_masks_avx2 cextern resize_filter SECTION .text %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif INIT_ZMM avx512icl cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt t0d, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx t0d, word [r7+t0*2+table_offset(put,)] add t0, r7 jmp t0 .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu xmm0, [srcq+ssq*0] movu xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] add srcq, ssq mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .put_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add t0, r7 shr r6d, 11 vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] jmp t0 .h_w2: movq xmm1, [srcq+ssq*0] movhps xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xmm0, [srcq+ssq*0+0] movhps xmm0, [srcq+ssq*1+0] movq xmm1, [srcq+ssq*0+2] movhps xmm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xmm0, xm4 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, 
[srcq+ssq*0+0] vinserti32x4 ym0, [srcq+ssq*1+0], 1 movu xm1, [srcq+ssq*0+2] vinserti32x4 ym1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw ym0, ym4 pmullw ym1, ym5 paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 4 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu ym0, [srcq+ssq*0+0] vinserti32x8 m0, [srcq+ssq*1+0], 1 movu ym1, [srcq+ssq*0+2] vinserti32x8 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m6 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] pmullw m1, m4, [srcq+ssq*1+0] pmullw m3, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+64*0+0] pmullw m2, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m3, m5, [srcq+64*1+2] add srcq, ssq paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+64*0+0] pmullw m7, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m8, m5, [srcq+64*1+2] pmullw m2, m4, [srcq+64*2+0] pmullw m9, m5, [srcq+64*2+2] pmullw m3, m4, [srcq+64*3+0] pmullw m10, m5, [srcq+64*3+2] add srcq, ssq REPX {paddw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psrlw x, 4}, m0, m1, m2, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .h_w128 RET .v: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] shl mxyd, 11 vpbroadcastw m8, mxyd add t0, r7 jmp t0 .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xmm2, xmm0, xmm1 movd xmm0, [srcq+ssq*0] punpckldq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm0, [srcq+ssq*0] .v_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xmm2, xmm0, xmm1 movq xmm0, [srcq+ssq*0] punpcklqdq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xmm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 ymm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm2, ymm0, ymm1, 0xf0 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm1, ymm0, 0xf0 psubw ymm1, ymm2 pmulhrsw ymm1, ym8 paddw ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: movu ym0, [srcq+ssq*0] .v_w16_loop: movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw ym1, ym3, ym0 pmulhrsw ym1, ym8 paddw ym1, ym0 movu ym0, [srcq+ssq*0] psubw ym2, ym0, ym3 pmulhrsw ym2, ym8 paddw ym2, ym3 mova [dstq+dsq*0], ym1 mova [dstq+dsq*1], ym2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: movu m0, [srcq+ssq*0] .v_w32_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m8 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m8 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] .v_w64_loop: movu m2, 
[srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m8 paddw m4, m0 movu m0, [srcq+ssq*0+64*0] psubw m5, m3, m1 pmulhrsw m5, m8 paddw m5, m1 movu m1, [srcq+ssq*0+64*1] psubw m6, m0, m2 pmulhrsw m6, m8 psubw m7, m1, m3 pmulhrsw m7, m8 mova [dstq+dsq*0+64*0], m4 mova [dstq+dsq*0+64*1], m5 paddw m6, m2 paddw m7, m3 mova [dstq+dsq*1+64*0], m6 mova [dstq+dsq*1+64*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*0+64*2] movu m3, [srcq+ssq*0+64*3] .v_w128_loop: movu m4, [srcq+ssq*1+64*0] movu m5, [srcq+ssq*1+64*1] movu m6, [srcq+ssq*1+64*2] movu m7, [srcq+ssq*1+64*3] lea srcq, [srcq+ssq*2] psubw m9, m4, m0 pmulhrsw m9, m8 paddw m9, m0 movu m0, [srcq+ssq*0+64*0] psubw m10, m5, m1 pmulhrsw m10, m8 paddw m10, m1 movu m1, [srcq+ssq*0+64*1] psubw m11, m6, m2 pmulhrsw m11, m8 paddw m11, m2 movu m2, [srcq+ssq*0+64*2] psubw m12, m7, m3 pmulhrsw m12, m8 paddw m12, m3 movu m3, [srcq+ssq*0+64*3] mova [dstq+dsq*0+64*0], m9 psubw m9, m0, m4 pmulhrsw m9, m8 mova [dstq+dsq*0+64*1], m10 psubw m10, m1, m5 pmulhrsw m10, m8 mova [dstq+dsq*0+64*2], m11 psubw m11, m2, m6 pmulhrsw m11, m8 mova [dstq+dsq*0+64*3], m12 psubw m12, m3, m7 pmulhrsw m12, m8 paddw m9, m4 paddw m10, m5 mova [dstq+dsq*1+64*0], m9 mova [dstq+dsq*1+64*1], m10 paddw m11, m6 paddw m12, m7 mova [dstq+dsq*1+64*2], m11 mova [dstq+dsq*1+64*3], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w128_loop RET .hv: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] shl mxyd, 11 vpbroadcastd m6, [pw_2] vpbroadcastw m7, mxyd vpbroadcastd m8, [pw_8192] add t0, r7 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m8, [pw_2048] .hv_12bpc: jmp t0 .hv_w2: vpbroadcastq xmm1, [srcq+ssq*0] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w2_loop: movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm2, [srcq+ssq*0] pmullw xmm1, xmm2, xm4 psrlq xmm2, 16 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 _ 2 _ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xmm0, xm4, [srcq+ssq*0-8] pmullw xmm1, xm5, [srcq+ssq*0-6] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w4_loop: movq xmm1, [srcq+ssq*1+0] movq xmm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0+0] movhps xmm2, [srcq+ssq*0+2] pmullw xmm1, xm4 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 2 shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xmm0, xm4, [srcq+ssq*0+0] pmullw xmm1, xm5, [srcq+ssq*0+2] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 vinserti32x4 ym0, xmm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1+0] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x4 ym1, [srcq+ssq*0+0], 1 vinserti32x4 ym2, [srcq+ssq*0+2], 1 pmullw ym1, ym4 pmullw ym2, ym5 paddw ym1, ym6 paddw ym1, ym2 psrlw ym1, 2 ; 1 2 vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym7 paddw ym1, ym2 pmulhrsw ym1, ym8 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg 
.hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+ssq*0+0] pmullw ym1, ym5, [srcq+ssq*0+2] paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+ssq*1+0] movu ym2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0+0], 1 vinserti32x8 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m7 paddw m1, m2 pmulhrsw m1, m8 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: .hv_w64: .hv_w128: movifnidn wd, wm lea r6d, [hq+wq*8-256] mov r4, srcq mov r7, dstq .hv_w32_loop0: pmullw m0, m4, [srcq+ssq*0+0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m1 psrlw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+ssq*1+0] pmullw m1, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m3, m6 paddw m3, m1 psrlw m3, 2 psubw m1, m3, m0 paddw m1, m1 pmulhw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m3 paddw m2, m2 pmulhw m2, m7 paddw m2, m3 pmulhrsw m1, m8 pmulhrsw m2, m8 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w32_loop0 RET cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx512icl+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: movq xmm0, [srcq+strideq*0] movhps xmm0, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm0, ymm1, 0x30 vpblendd ymm0, ymm2, 0xc0 pmullw ymm0, ym4 psubw ymm0, ym5 mova [tmpq], ymm0 add tmpq, 32 sub hd, 4 jg .prep_w4 vzeroupper RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti32x4 ym0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .prep_w8 RET .prep_w16: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+strideq*0+64*0] pmullw m1, m4, [srcq+strideq*0+64*1] pmullw m2, m4, [srcq+strideq*1+64*0] pmullw m3, m4, [srcq+strideq*1+64*1] lea srcq, [srcq+strideq*2] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+64*0] pmullw m1, m4, [srcq+64*1] pmullw m2, m4, [srcq+64*2] pmullw m3, m4, [srcq+64*3] add srcq, strideq REPX {psubw x, m5}, m0, 
m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .prep_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastd m6, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti32x4 ym1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym0, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0+0] movu xm1, [srcq+strideq*0+2] vinserti32x4 ym0, [srcq+strideq*1+0], 1 vinserti32x4 ym1, [srcq+strideq*1+2], 1 vinserti32x4 m0, [srcq+strideq*2+0], 2 vinserti32x4 m1, [srcq+strideq*2+2], 2 vinserti32x4 m0, [srcq+stride3q +0], 3 vinserti32x4 m1, [srcq+stride3q +2], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8 RET .h_w16: movu ym0, [srcq+strideq*0+0] vinserti32x8 m0, [srcq+strideq*1+0], 1 movu ym1, [srcq+strideq*0+2] vinserti32x8 m1, [srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] pmullw m1, m4, [srcq+strideq*1+0] pmullw m3, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] add srcq, strideq psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+ 0] pmullw m7, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m8, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m9, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m10, m5, [srcq+194] add srcq, strideq REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psraw x, 2}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .h_w128 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] vpbroadcastw m9, mxyd vpbroadcastd m8, [pw_16] vpbroadcastd m10, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m8, m9 test dword r7m, 0x800 jnz .v_12bpc psllw m8, 2 psllw m9, 2 .v_12bpc: jmp wq .v_w4: movq xmm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq xmm2, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm2, ymm1, 0x30 vpblendd ymm2, ymm3, 0xc0 vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 movq xmm0, [srcq+strideq*0] valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 pmullw ymm1, ym8 pmullw ymm2, ym9 psubw ymm1, ym10 paddw ymm1, ymm2 psraw ymm1, 2 mova [tmpq], ymm1 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vinserti32x4 m1, [srcq+strideq*2], 2 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 lea 
srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] valignq m2, m0, m1, 2 ; 1 2 3 4 pmullw m1, m8 pmullw m2, m9 psubw m1, m10 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: movu ym0, [srcq+strideq*0] .v_w16_loop: vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 movu ym3, [srcq+strideq*2] vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 lea srcq, [srcq+strideq*4] movu ym0, [srcq+strideq*0] vshufi32x4 m3, m1, m3, q1032 ; 1 2 vshufi32x4 m4, m2, m0, q1032 ; 3 4 pmullw m1, m8 pmullw m2, m8 pmullw m3, m9 pmullw m4, m9 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: movu m0, [srcq+strideq*0] .v_w32_loop: movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m1, m8, m0 movu m0, [srcq+strideq*0] pmullw m2, m8, m3 pmullw m3, m9 pmullw m4, m9, m0 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w64_loop: add srcq, strideq pmullw m2, m8, m0 movu m0, [srcq+64*0] pmullw m3, m8, m1 movu m1, [srcq+64*1] pmullw m4, m9, m0 pmullw m5, m9, m1 psubw m2, m10 psubw m3, m10 paddw m2, m4 paddw m3, m5 psraw m2, 2 psraw m3, 2 mova [tmpq+64*0], m2 mova [tmpq+64*1], m3 add tmpq, 64*2 dec hd jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] .v_w128_loop: add srcq, strideq pmullw m4, m8, m0 movu m0, [srcq+64*0] pmullw m5, m8, m1 movu m1, [srcq+64*1] pmullw m6, m8, m2 movu m2, [srcq+64*2] pmullw m7, m8, m3 movu m3, [srcq+64*3] pmullw m11, m9, m0 pmullw m12, m9, m1 pmullw m13, m9, m2 pmullw m14, m9, m3 REPX {psubw x, m10}, m4, m5, m6, m7 paddw m4, m11 paddw m5, m12 paddw m6, m13 paddw m7, m14 REPX {psraw x, 2}, m4, m5, m6, m7 mova [tmpq+64*0], m4 mova [tmpq+64*1], m5 mova [tmpq+64*2], m6 mova [tmpq+64*3], m7 add tmpq, 64*4 dec hd jg .v_w128_loop RET .hv: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m7, mxyd add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: movq xmm0, [srcq+strideq*0+0] movq xmm1, [srcq+strideq*0+2] pmullw xmm0, xm4 pmullw xmm1, xm5 psubw xmm0, xm6 paddw xmm0, xmm1 psraw xmm0, 2 vpbroadcastq ym0, xmm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 ym1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 ym2, [srcq+strideq*0], 1 punpcklqdq ym3, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym3, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym3, ym6 paddw ym1, ym3 psraw ym1, 2 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, ym1 psubw ym1, ym2 pmulhrsw ym1, ym7 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0+0] pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm6 paddw xm0, xm1 psraw xm0, 2 vinserti32x4 m0, xm0, 3 .hv_w8_loop: movu xm1, [srcq+strideq*1+0] movu xm2, [srcq+strideq*1+2] vinserti32x4 ym1, [srcq+strideq*2+0], 1 vinserti32x4 ym2, [srcq+strideq*2+2], 1 vinserti32x4 m1, [srcq+stride3q +0], 2 vinserti32x4 m2, [srcq+stride3q +2], 2 lea srcq, [srcq+strideq*4] vinserti32x4 m1, [srcq+strideq*0+0], 3 vinserti32x4 m2, [srcq+strideq*0+2], 3 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: pmullw 
ym0, ym4, [srcq+strideq*0+0] pmullw ym1, ym5, [srcq+strideq*0+2] psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+strideq*1+0] movu ym2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti32x8 m1, [srcq+strideq*0+0], 1 vinserti32x8 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m1 psraw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+strideq*1+0] pmullw m1, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m3, m6 paddw m3, m1 psraw m3, 2 psubw m1, m3, m0 pmulhrsw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m2 psraw m0, 2 psubw m2, m0, m3 pmulhrsw m2, m7 paddw m2, m3 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 .hv_w64_loop: add srcq, strideq pmullw m2, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m3, m4, [srcq+64] pmullw m9, m5, [srcq+66] psubw m2, m6 psubw m3, m6 paddw m2, m8 paddw m3, m9 psraw m2, 2 psraw m3, 2 psubw m8, m2, m0 psubw m9, m3, m1 pmulhrsw m8, m7 pmulhrsw m9, m7 paddw m8, m0 mova m0, m2 paddw m9, m1 mova m1, m3 mova [tmpq+64*0], m8 mova [tmpq+64*1], m9 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: pmullw m0, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m9, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m10, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m11, m5, [srcq+194] REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 REPX {psraw x, 2}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq pmullw m8, m4, [srcq+ 0] pmullw m12, m5, [srcq+ 2] pmullw m9, m4, [srcq+ 64] pmullw m13, m5, [srcq+ 66] pmullw m10, m4, [srcq+128] pmullw m14, m5, [srcq+130] pmullw m11, m4, [srcq+192] pmullw m15, m5, [srcq+194] REPX {psubw x, m6}, m8, m9, m10, m11 paddw m8, m12 paddw m9, m13 paddw m10, m14 paddw m11, m15 REPX {psraw x, 2}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 paddw m12, m0 mova m0, m8 paddw m13, m1 mova m1, m9 mova [tmpq+64*0], m12 mova [tmpq+64*1], m13 paddw m14, m2 mova m2, m10 paddw m15, m3 mova m3, m11 mova [tmpq+64*2], m14 mova [tmpq+64*3], m15 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v cglobal %1_8tap_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %define buf rsp+stack_offset+8 ; shadow space %else DECLARE_REG_TMP 7, 8 %define buf rsp-40 ; red zone %endif MC_8TAP_FN put, sharp, SHARP, SHARP MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP MC_8TAP_FN put, smooth, SMOOTH, SMOOTH MC_8TAP_FN put, sharp_regular, SHARP, REGULAR MC_8TAP_FN put, 
regular_sharp, REGULAR, SHARP MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH MC_8TAP_FN put, regular, REGULAR, REGULAR cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx512icl] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w2: movzx mxd, mxb sub srcq, 2 mova ym2, [spel_h_shuf2a] pmovsxbw xmm4, [base+subpel_filters+mxq*8] pshufd xmm3, xmm4, q1111 pshufd xmm4, xmm4, q2222 .h_w2_loop: movu xm1, [srcq+ssq*0] vinserti32x4 ym1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova xmm0, xm8 vpermb ym1, ym2, ym1 vpdpwssd xmm0, xmm3, xm1 vextracti32x4 xm1, ym1, 1 vpdpwssd xmm0, xmm4, xm1 psrad xmm0, 6 packusdw xmm0, xmm0 pminsw xmm0, xm9 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] vbroadcasti32x4 ym4, [spel_h_shufA] vbroadcasti32x4 ym5, [spel_h_shufB] pshufd xmm0, xmm0, q2211 vpbroadcastq ym6, xmm0 vpermq ym7, ymm0, q1111 .h_w4_loop: movu xm2, [srcq+ssq*0] vinserti32x4 ym2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova ym0, ym8 pshufb ym1, ym2, ym4 vpdpwssd ym0, ym6, ym1 pshufb ym2, ym5 vpdpwssd ym0, ym7, ym2 psrad ym0, 6 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 pminsw xmm0, xm0, xm9 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv mov r7d, r8m vpbroadcastw m9, r8m shr r7d, 11 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 je .h_w4 jl .h_w2 shr mxd, 16 sub srcq, 6 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mova [buf], xmm0 vpbroadcastd m10, xmm0 vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] cmp wd, 16 je .h_w16 jg .h_w32 .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] mova m7, [spel_h_shufD] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m0, m8 vpermb m1, m4, m2 vpdpwssd m0, m10, m1 vpermb m1, m5, m2 vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 vpermb m1, m7, m2 vpdpwssd m0, m13, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 pminsw ym0, ym9 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET .h_w16: vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+16] vinserti32x8 m3, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m11, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m13, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a2 vpdpwssd m1, m10, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a3 vpdpwssd m1, m11, m2 ; b1 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m9 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] vbroadcasti32x4 m6, [spel_h_shufA] lea dstq, [dstq+wq*2] vbroadcasti32x4 m7, [spel_h_shufB] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, 
[srcq+r6*2+ 8] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m10, m4 ; b0 vpdpwssd m0, m12, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m11, m3 ; b1 vpdpwssd m0, m13, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m12, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m11, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m13, m4 ; b3 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m9 mova [dstq+r6*2], m0 add r6, 32 jl .h_w32_loop add srcq, ssq add dstq, dsq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m10, [pd_32] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r7d, wd vpbroadcastw m11, r8m lea r6, [ssq*3] movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] sub srcq, r6 mova [rsp+stack_offset+8], xmm0 vpbroadcastd m12, xmm0 add r7, r8 vpbroadcastd m13, [rsp+stack_offset+12] vpbroadcastd m14, [rsp+stack_offset+16] vpbroadcastd m15, [rsp+stack_offset+20] jmp r7 .v_w2: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, r6 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, r6 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklwd xmm3, xmm1 ; 45 56 punpcklwd xmm1, xmm2, xmm4 ; 01 12 punpckhwd xmm2, xmm4 ; 23 34 .v_w2_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xmm5, xm10 vpdpwssd xmm5, xm12, xmm1 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xm13, xmm2 ; a1 b1 mova xmm2, xmm3 vpdpwssd xmm5, xm14, xmm3 ; a2 b2 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm0, 0x02 ; 7 8 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xm15, xmm3 ; a3 b3 psrad xmm5, 6 packusdw xmm5, xmm5 pminsw xmm5, xm11 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm5, [srcq+ssq*2] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 psrad ymm4, 6 vextracti128 xmm5, ymm4, 1 packusdw xmm4, xmm5 pminsw xmm4, xm11 movq [dstq+dsq*0], xmm4 movhps [dstq+dsq*1], xmm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+ssq*2] vinserti32x4 m1, m2, [srcq+ssq*0], 0 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+ssq*0], 1 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+ssq*1], 3 lea srcq, [srcq+ssq*2] movu xm5, [srcq+ssq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 
vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 psrad m4, 6 vextracti32x8 ym5, m4, 1 packusdw ym4, ym5 pminsw ym4, ym11 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m1, [srcq+ssq*1] vinserti32x8 m0, m1, [srcq+ssq*0], 0 vinserti32x8 m1, [srcq+ssq*2], 1 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+ssq*0] vinserti32x8 m3, [srcq+ssq*1], 1 movu ym5, [srcq+ssq*2] add srcq, r6 vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 mova m9, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m6, m10 mova m7, m10 vpdpwssd m6, m12, m0 ; a0 mova m0, m2 vpdpwssd m7, m12, m1 ; b0 mova m1, m3 vpdpwssd m6, m13, m2 ; a1 mova m2, m4 vpdpwssd m7, m13, m3 ; b1 mova m3, m5 vpdpwssd m6, m14, m4 ; a2 mova m4, m5 vpdpwssd m7, m14, m5 ; b2 movu ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m5, 16 ; 67 vpdpwssd m6, m15, m4 ; a3 vpdpwssd m7, m15, m5 ; b3 psrad m6, 6 psrad m7, 6 packusdw m6, m7 pminsw m6, m11 vpermq m6, m9, m6 mova [dstq+dsq*0], ym6 vextracti32x8 [dstq+dsq*1], m6, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 movaps [rsp+stack_offset+8], xmm6 %endif lea wd, [hq+wq*8-256] mov r7, srcq mov r8, dstq .v_w32_loop0: movu m16, [srcq+ssq*0] movu m17, [srcq+ssq*1] movu m18, [srcq+ssq*2] add srcq, r6 movu m19, [srcq+ssq*0] movu m20, [srcq+ssq*1] movu m21, [srcq+ssq*2] add srcq, r6 movu m22, [srcq+ssq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [srcq+ssq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h REPX {psrad x, 6}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [dstq+dsq*0], m6 mova [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop add r7, 64 add r8, 64 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .v_w32_loop0 %if WIN64 movaps xmm6, [rsp+stack_offset+8] %endif vzeroupper RET .hv: vpbroadcastw m11, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 test dword r8m, 0x800 jnz .hv_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_main .hv_12bit: vpbroadcastd m10, [pd_640] 
psllw xmm0, 4 psllw xmm1, 2 .hv_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [buf+20] vpbroadcastd ym14, [buf+24] vpbroadcastd ym15, [buf+28] movu xm4, [srcq+ssq*0] vinserti32x4 ym4, [srcq+ssq*1], 1 vinserti32x4 m4, [srcq+ssq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 cmp wd, 4 je .hv_w4 vbroadcasti32x4 m2, [spel_h_shufA] mova m3, [spel_h_shuf2b] mova ym6, [spel_h_shuf2a] mova xm7, [spel_shuf2] mova m1, m10 pshufb m4, m2 pshufb m0, m2 punpcklqdq m2, m4, m0 vpdpwssd m1, m8, m2 ; 04 15 26 3_ punpckhqdq m4, m0 vpdpwssd m1, m9, m4 vpermb m1, m3, m1 ; 01 12 vextracti32x4 xm2, ym1, 1 ; 23 34 vextracti32x4 xm3, m1, 2 ; 45 56 .hv_w2_loop: movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 mova xm4, xm10 vpermb ym5, ym6, ym5 pmaddwd xmm0, xm12, xm1 ; a0 b0 vpdpwssd xm4, xm8, xm5 vextracti32x4 xm5, ym5, 1 mova xm1, xm2 vpdpwssd xmm0, xm13, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm5 ; 7 8 mova xm2, xm3 vpdpwssd xmm0, xm14, xm3 ; a2 b2 vpermt2b xm3, xm7, xm4 ; 67 78 vpdpwssd xmm0, xm15, xm3 ; a3 b3 psrad xmm0, 10 packusdw xmm0, xmm0 pminsw xmm0, xm11 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym18, [srcq+ssq*0], 1 mova ym4, ym10 pshufb ym17, ym18, ym19 pmaddwd ym16, ym12, ym1 ; a0 b0 vpdpwssd ym4, ym8, ym17 pshufb ym18, ym20 mova ym1, ym2 vpdpwssd ym16, ym13, ym2 ; a1 b1 vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 vpdpwssd ym16, ym14, ym3 ; a2 b2 vpermt2b ym3, ym7, ym4 ; 67 78 vpdpwssd ym16, ym15, ym3 ; a3 b3 psrad ym16, 10 vextracti128 xm17, ym16, 1 packusdw xm16, xm17 pminsw xm16, xm11 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 test dword r8m, 0x800 jnz .hv_w8_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_w8_main .hv_w8_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [buf+ 4] vpbroadcastd m14, [buf+ 8] vpbroadcastd m15, [buf+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [buf+20] vpbroadcastd m18, [buf+24] vpbroadcastd m19, [buf+28] cmp wd, 16 je .hv_w16 jg .hv_w32 mova m5, [spel_h_shufA] movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 movu ym9, [srcq+ssq*2] add srcq, r6 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 movu ym20, [srcq+ssq*1] vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+ssq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 
vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m0, [srcq+ssq*0], 1 mova m4, m10 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 pmaddwd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 psrad m20, 10 vextracti32x8 ym21, m20, 1 packusdw ym20, ym21 pminsw ym20, ym11 mova [dstq+dsq*0], xm20 vextracti128 [dstq+dsq*1], ym20, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: WIN64_SPILL_XMM 26 vbroadcasti32x8 m5, [srcq+ssq*0+ 8] vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 movu ym6, [srcq+ssq*1+ 0] movu ym7, [srcq+ssq*1+16] vinserti32x8 m6, [srcq+ssq*2+ 0], 1 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 add srcq, r6 movu ym22, [srcq+ssq*0+ 0] movu ym23, [srcq+ssq*0+16] vinserti32x8 m22, [srcq+ssq*1+ 0], 1 vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4 movu ym24, [srcq+ssq*2+ 0] movu ym25, [srcq+ssq*2+16] add srcq, r6 vinserti32x8 m24, [srcq+ssq*0+ 0], 1 vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m9, [spel_shuf16] pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 
; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [srcq+ssq*1+ 0] movu ym25, [srcq+ssq*1+16] lea srcq, [srcq+ssq*2] vinserti32x8 m24, [srcq+ssq*0+ 0], 1 vinserti32x8 m25, [srcq+ssq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 pmaddwd m22, m16, m1 ; A0 mova m1, m3 pmaddwd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 psrad m22, 10 psrad m23, 10 vshufi32x4 m0, m22, m23, q3232 vinserti32x8 m22, ym23, 1 packusdw m22, m0 pminsw m22, m11 mova [dstq+dsq*0], ym22 vextracti32x8 [dstq+dsq*1], m22, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 32 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m22, [spel_shuf32] lea wd, [hq+wq*8-256] mov r7, srcq mov r8, dstq .hv_w32_loop0: movu m6, [srcq+ssq*0+ 0] movu m7, [srcq+ssq*0+ 8] movu m8, [srcq+ssq*0+16] mova m0, m10 mova m23, m10 pshufb m9, m6, m20 vpdpwssd m0, m12, m9 ; a0l pshufb m9, m7, m20 vpdpwssd m23, m12, m9 ; a0h vpdpwssd m0, m14, m9 ; a2l pshufb m7, m21 vpdpwssd m23, m13, m7 ; a1h vpdpwssd m0, m15, m7 ; a3l pshufb m7, m8, m20 vpdpwssd m23, m14, m7 ; a2h pshufb m6, m21 vpdpwssd m0, m13, m6 ; a1l pshufb m8, m21 vpdpwssd m23, m15, m8 ; a3h %macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2] movu m6, [srcq+%3*%4+ 0] movu m7, [srcq+%3*%4+ 8] movu m8, [srcq+%3*%4+16] %if %4 == 2 add srcq, r6 %endif movu m29, [srcq+%3*%5+ 0] movu m30, [srcq+%3*%5+ 8] movu m31, [srcq+%3*%5+16] %if %5 == 2 add srcq, r6 %endif mova m%1, m10 mova m9, m10 pshufb m%2, m6, m20 vpdpwssd m%1, m12, m%2 ; x0l pshufb m%2, m29, m20 vpdpwssd m9, m12, m%2 ; y0l pshufb m6, m21 vpdpwssd m%1, m13, m6 ; x1l pshufb m29, m21 vpdpwssd m9, m13, m29 ; y1l pshufb m6, m7, m20 mova m%2, m10 vpdpwssd m%2, m12, m6 ; x0h pshufb m29, m30, m20 vpdpwssd m%1, m14, m6 ; y2l mova m6, m10 vpdpwssd m6, m12, m29 ; x0h pshufb m7, m21 vpdpwssd m9, m14, m29 ; y2l pshufb m30, m21 vpdpwssd m%2, m13, m7 ; x1h vpdpwssd m%1, m15, m7 ; x3l pshufb m7, m8, m20 vpdpwssd m6, m13, m30 ; y1h vpdpwssd m9, m15, m30 ; y3l pshufb m30, m31, m20 vpdpwssd m%2, m14, m7 ; x2h pshufb m8, m21 vpdpwssd m6, m14, m30 ; y2h pshufb m31, m21 vpdpwssd m%2, m15, m8 ; x3h vpdpwssd m6, m15, m31 ; y3h %if %1 == 1 vpermt2b m0, m22, m%1 ; 01l vpermt2b m23, m22, m%2 ; 01h %endif vpermt2b m%1, m22, m9 ; xyl vpermt2b m%2, m22, m6 ; xyh %endmacro PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12 PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34 PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56 vpshrdd m2, m1, m3, 16 ; 23l vpshrdd m25, m24, m26, 16 ; 23h vpshrdd m4, m3, m5, 16 ; 45l vpshrdd m27, m26, m28, 16 ; 45h .hv_w32_loop: movu m7, [srcq+ssq*1+ 0] movu m9, [srcq+ssq*2+ 0] movu m6, [srcq+ssq*1+ 8] movu m8, [srcq+ssq*2+ 8] mova m29, m10 mova m31, m10 pshufb m30, m7, m20 vpdpwssd m29, m12, m30 ; h0l pshufb m30, m9, m20 vpdpwssd m31, m12, m30 ; i0l pshufb m7, m21 vpdpwssd m29, m13, m7 ; h1l pshufb m9, m21 vpdpwssd m31, m13, m9 ; i1l pshufb m7, m6, m20 vpdpwssd m29, m14, m7 ; h2l 
pshufb m9, m8, m20 vpdpwssd m31, m14, m9 ; i2l pshufb m6, m21 vpdpwssd m29, m15, m6 ; h3l pshufb m8, m21 vpdpwssd m31, m15, m8 ; i3l mova m30, m10 vpdpwssd m30, m12, m7 ; h0h movu m7, [srcq+ssq*1+16] lea srcq, [srcq+ssq*2] vpermt2b m29, m22, m31 ; 78l mova m31, m10 vpdpwssd m31, m12, m9 ; i0h movu m9, [srcq+ssq*0+16] vpdpwssd m30, m13, m6 ; h1h pshufb m6, m7, m20 vpdpwssd m31, m13, m8 ; i1h pshufb m8, m9, m20 vpdpwssd m30, m14, m6 ; h2h pmaddwd m6, m16, m0 ; A0l pshufb m7, m21 vpdpwssd m31, m14, m8 ; i2h pmaddwd m8, m16, m23 ; A0h pshufb m9, m21 vpdpwssd m30, m15, m7 ; h3h pmaddwd m7, m16, m1 ; B0l vpdpwssd m31, m15, m9 ; i3h pmaddwd m9, m16, m24 ; B0h mova m0, m2 vpdpwssd m6, m17, m2 ; A1l mova m23, m25 vpdpwssd m8, m17, m25 ; A1h mova m1, m3 vpdpwssd m7, m17, m3 ; B1l mova m24, m26 vpdpwssd m9, m17, m26 ; B1h vpermt2b m30, m22, m31 ; 78h vpdpwssd m6, m18, m4 ; A2l mova m2, m4 vpdpwssd m8, m18, m27 ; A2h mova m25, m27 vpdpwssd m7, m18, m5 ; B2l mova m3, m5 vpdpwssd m9, m18, m28 ; B2h mova m26, m28 vpshrdd m4, m5, m29, 16 ; 67l vpdpwssd m6, m19, m4 ; A3l vpshrdd m27, m28, m30, 16 ; 67h vpdpwssd m8, m19, m27 ; A3h mova m5, m29 vpdpwssd m7, m19, m29 ; B3l mova m28, m30 vpdpwssd m9, m19, m30 ; B3h REPX {psrad x, 10}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [dstq+dsq*0], m6 mova [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r7, 64 add r8, 64 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .hv_w32_loop0 RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif MC_8TAP_FN prep, sharp, SHARP, SHARP MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH MC_8TAP_FN prep, regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 %define base r7-prep_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [pw_8192] movzx wd, word [r7+wq*2+table_offset(prep,)] shr r5d, 11 vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m vbroadcasti32x4 m4, [spel_h_shufA] vbroadcasti32x4 m5, [spel_h_shufB] shr r5d, 11 mova ym9, [prep_endA] psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m6, [tmpq+4] vpbroadcastd m7, [tmpq+8] .h_w4_loop: movu xm2, [srcq+strideq*0] vinserti32x4 ym2, [srcq+strideq*1], 1 vinserti32x4 m2, [srcq+strideq*2], 2 vinserti32x4 m2, [srcq+r6 ], 3 lea srcq, [srcq+strideq*4] mova m0, m10 pshufb m1, m2, m4 vpdpwssd m0, m6, m1 pshufb m2, m5 vpdpwssd m0, m7, m2 vpermb m0, m9, m0 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m10, [prep_8tap_rnd] lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m sub srcq, 6 shr r5d, 11 psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] cmp wd, 16 je .h_w16 jg .h_w32 .h_w8: 
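; .h_w8 (prep, horizontal 8-tap): filters 4 rows per iteration. vpermb with the
; spel_h_shuf tables gathers the sliding source windows from two rows per zmm,
; vpdpwssd accumulates them against the coefficient pairs in m12-m15, and
; vpermt2b with prep_endB packs the rounded intermediates before each 64-byte store.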
mova m6, [spel_h_shufA] movu m7, [spel_h_shufB] movu m8, [spel_h_shufC] mova m9, [spel_h_shufD] mova m11, [prep_endB] .h_w8_loop: movu ym4, [srcq+strideq*0] vinserti32x8 m4, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] vinserti32x8 m5, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] mova m0, m10 mova m1, m10 vpermb m2, m6, m4 vpermb m3, m6, m5 vpdpwssd m0, m12, m2 vpdpwssd m1, m12, m3 vpermb m2, m7, m4 vpermb m3, m7, m5 vpdpwssd m0, m13, m2 vpdpwssd m1, m13, m3 vpermb m2, m8, m4 vpermb m3, m8, m5 vpdpwssd m0, m14, m2 vpdpwssd m1, m14, m3 vpermb m2, m9, m4 vpermb m3, m9, m5 vpdpwssd m0, m15, m2 vpdpwssd m1, m15, m3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] mova m11, [prep_endC] .h_w16_loop: movu ym2, [srcq+strideq*0+ 0] vinserti32x8 m2, [srcq+strideq*1+ 0], 1 movu ym3, [srcq+strideq*0+16] vinserti32x8 m3, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m14, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m13, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m15, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m14, m4 ; a2 vpdpwssd m1, m12, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m15, m2 ; a3 vpdpwssd m1, m13, m2 ; b1 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16_loop RET .h_w32: vbroadcasti32x4 m6, [spel_h_shufA] lea srcq, [srcq+wq*2] vbroadcasti32x4 m7, [spel_h_shufB] neg wq mova m11, [prep_endC] .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b0 vpdpwssd m0, m14, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m13, m3 ; b1 vpdpwssd m0, m15, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m14, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m15, m4 ; b3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 add r6, 32 jl .h_w32_loop add srcq, strideq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m vpbroadcastd m10, [prep_8tap_rnd] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r6d, wd shr r5d, 11 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] psllw xmm0, [base+prep_hv_shift+r5*8] add r7, r6 lea r6, [strideq*3] sub srcq, r6 mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] jmp r7 .v_w4: movq xmm1, [srcq+strideq*0] vpbroadcastq ymm0, [srcq+strideq*1] vpbroadcastq ymm2, [srcq+strideq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+strideq*0] vpbroadcastq ymm3, [srcq+strideq*1] vpbroadcastq ymm5, [srcq+strideq*2] mova xm11, [prep_endA] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+strideq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+strideq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 vpermb ymm4, ym11, ymm4 mova [tmpq], xmm4 add tmpq, 16 sub hd, 2 jg .v_w4_loop 
vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+strideq*2] vinserti32x4 m1, m2, [srcq+strideq*0], 0 vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+strideq*0], 1 vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+strideq*1] vinserti32x4 ym0, [srcq+strideq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 mova ym11, [prep_endB] vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+strideq*1], 3 lea srcq, [srcq+strideq*2] movu xm5, [srcq+strideq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 vpermb m4, m11, m4 mova [tmpq], ym4 add tmpq, 32 sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m1, [srcq+strideq*1] vinserti32x8 m0, m1, [srcq+strideq*0], 0 vinserti32x8 m1, [srcq+strideq*2], 1 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+strideq*0] vinserti32x8 m3, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] add srcq, r6 vinserti32x8 m5, [srcq+strideq*0], 1 mova m11, [prep_endA] vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m6, m10 mova m7, m10 vpdpwssd m6, m12, m0 ; a0 mova m0, m2 vpdpwssd m7, m12, m1 ; b0 mova m1, m3 vpdpwssd m6, m13, m2 ; a1 mova m2, m4 vpdpwssd m7, m13, m3 ; b1 mova m3, m5 vpdpwssd m6, m14, m4 ; a2 mova m4, m5 vpdpwssd m7, m14, m5 ; b2 movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m5, [srcq+strideq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m5, 16 ; 67 vpdpwssd m6, m15, m4 ; a3 vpdpwssd m7, m15, m5 ; b3 vpermt2b m6, m11, m7 mova [tmpq], m6 add tmpq, 64 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 PUSH r8 movaps [rsp+stack_offset+8], xmm6 %endif lea r5, [hq+wq*8-256] mov r7, srcq mov r8, tmpq .v_w32_loop0: movu m16, [srcq+strideq*0] movu m17, [srcq+strideq*1] movu m18, [srcq+strideq*2] add srcq, r6 movu m19, [srcq+strideq*0] movu m20, [srcq+strideq*1] movu m21, [srcq+strideq*2] add srcq, r6 movu m22, [srcq+strideq*0] mova m11, [prep_endC] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [srcq+strideq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h vpermt2b m6, m11, m8 vpermt2b m7, m11, m9 mova [tmpq+wq*0], m6 mova [tmpq+wq*2], m7 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w32_loop add r7, 64 add r8, 64 movzx hd, r5b 
mov srcq, r7 mov tmpq, r8 sub r5d, 1<<8 jg .v_w32_loop0 %if WIN64 movaps xmm6, [rsp+stack_offset+8] POP r8 %endif vzeroupper RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 vpbroadcastd m10, [prep_8tap_rnd] vpbroadcastd ym11, [pd_128] mova xm21, [prep_endA] mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m8, [tmpq+ 4] vpbroadcastd m9, [tmpq+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [tmpq+20] vpbroadcastd ym14, [tmpq+24] vpbroadcastd ym15, [tmpq+28] movu xm4, [srcq+strideq*0] vinserti32x4 ym4, [srcq+strideq*1], 1 vinserti32x4 m4, [srcq+strideq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3 movu xm0, [srcq+strideq*1] vinserti32x4 ym0, [srcq+strideq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti128 ym18, [srcq+strideq*0], 1 mova ym16, ym11 mova ym4, ym10 pshufb ym17, ym18, ym19 vpdpwssd ym16, ym12, ym1 ; a0 b0 vpdpwssd ym4, ym8, ym17 pshufb ym18, ym20 mova ym1, ym2 vpdpwssd ym16, ym13, ym2 ; a1 b1 vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 vpdpwssd ym16, ym14, ym3 ; a2 b2 vpermt2b ym3, ym7, ym4 ; 67 78 vpdpwssd ym16, ym15, ym3 ; a3 b3 vpermb ym16, ym21, ym16 mova [tmpq], xm16 add tmpq, 16 sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 6 shr r5d, 11 sub srcq, r6 vpbroadcastd m10, [prep_8tap_rnd] vpbroadcastd m11, [pd_128] psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [tmpq+20] vpbroadcastd m18, [tmpq+24] vpbroadcastd m19, [tmpq+28] cmp wd, 16 je .hv_w16 jg .hv_w32 WIN64_SPILL_XMM 23 mova m5, [spel_h_shufA] movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 movu ym9, [srcq+strideq*2] add srcq, r6 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 movu ym20, [srcq+strideq*1] vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+strideq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] mova ym22, [prep_endB] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova 
m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*0], 1 mova m4, m10 mova m20, m11 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 vpdpwssd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 vpermb m20, m22, m20 mova [tmpq], ym20 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 27 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 movu ym6, [srcq+strideq*1+ 0] movu ym7, [srcq+strideq*1+16] vinserti32x8 m6, [srcq+strideq*2+ 0], 1 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 add srcq, r6 movu ym22, [srcq+strideq*0+ 0] movu ym23, [srcq+strideq*0+16] vinserti32x8 m22, [srcq+strideq*1+ 0], 1 vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4 movu ym24, [srcq+strideq*2+ 0] movu ym25, [srcq+strideq*2+16] add srcq, r6 vinserti32x8 m24, [srcq+strideq*0+ 0], 1 vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m9, [spel_shuf16] mova m26, [prep_endB] pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [srcq+strideq*1+ 0] movu ym25, [srcq+strideq*1+16] lea srcq, [srcq+strideq*2] vinserti32x8 m24, [srcq+strideq*0+ 0], 1 vinserti32x8 m25, [srcq+strideq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 mova m22, m11 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 mova m23, m11 vpdpwssd m22, m16, m1 ; A0 
mova m1, m3 vpdpwssd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 vpermt2b m22, m26, m23 mova [tmpq], m22 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: %if WIN64 %assign stack_offset stack_offset - stack_size_padded PUSH r8 %assign regs_used regs_used + 1 WIN64_SPILL_XMM 32 %endif vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m22, [spel_shuf32] lea r5d, [hq+wq*8-256] mov r7, srcq mov r8, tmpq .hv_w32_loop0: movu m6, [srcq+strideq*0+ 0] movu m7, [srcq+strideq*0+ 8] movu m8, [srcq+strideq*0+16] mova m0, m10 mova m23, m10 pshufb m9, m6, m20 vpdpwssd m0, m12, m9 ; a0l pshufb m9, m7, m20 vpdpwssd m23, m12, m9 ; a0h vpdpwssd m0, m14, m9 ; a2l pshufb m7, m21 vpdpwssd m23, m13, m7 ; a1h vpdpwssd m0, m15, m7 ; a3l pshufb m7, m8, m20 vpdpwssd m23, m14, m7 ; a2h pshufb m6, m21 vpdpwssd m0, m13, m6 ; a1l pshufb m8, m21 vpdpwssd m23, m15, m8 ; a3h PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12 PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34 PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56 vpshrdd m2, m1, m3, 16 ; 23l vpshrdd m25, m24, m26, 16 ; 23h vpshrdd m4, m3, m5, 16 ; 45l vpshrdd m27, m26, m28, 16 ; 45h .hv_w32_loop: movu m7, [srcq+strideq*1+ 0] movu m9, [srcq+strideq*2+ 0] movu m6, [srcq+strideq*1+ 8] movu m8, [srcq+strideq*2+ 8] mova m29, m10 mova m31, m10 pshufb m30, m7, m20 vpdpwssd m29, m12, m30 ; h0l pshufb m30, m9, m20 vpdpwssd m31, m12, m30 ; i0l pshufb m7, m21 vpdpwssd m29, m13, m7 ; h1l pshufb m9, m21 vpdpwssd m31, m13, m9 ; i1l pshufb m7, m6, m20 vpdpwssd m29, m14, m7 ; h2l pshufb m9, m8, m20 vpdpwssd m31, m14, m9 ; i2l pshufb m6, m21 vpdpwssd m29, m15, m6 ; h3l pshufb m8, m21 vpdpwssd m31, m15, m8 ; i3l mova m30, m10 vpdpwssd m30, m12, m7 ; h0h movu m7, [srcq+strideq*1+16] lea srcq, [srcq+strideq*2] vpermt2b m29, m22, m31 ; 78l mova m31, m10 vpdpwssd m31, m12, m9 ; i0h movu m9, [srcq+strideq*0+16] vpdpwssd m30, m13, m6 ; h1h pshufb m6, m7, m20 vpdpwssd m31, m13, m8 ; i1h pshufb m8, m9, m20 vpdpwssd m30, m14, m6 ; h2h mova m6, m11 vpdpwssd m6, m16, m0 ; A0l pshufb m7, m21 vpdpwssd m31, m14, m8 ; i2h mova m8, m11 vpdpwssd m8, m16, m23 ; A0h pshufb m9, m21 vpdpwssd m30, m15, m7 ; h3h mova m7, m11 vpdpwssd m7, m16, m1 ; B0l vpdpwssd m31, m15, m9 ; i3h mova m9, m11 vpdpwssd m9, m16, m24 ; B0h mova m0, m2 vpdpwssd m6, m17, m2 ; A1l mova m23, m25 vpdpwssd m8, m17, m25 ; A1h mova m1, m3 vpdpwssd m7, m17, m3 ; B1l mova m24, m26 vpdpwssd m9, m17, m26 ; B1h vpermt2b m30, m22, m31 ; 78h mova m31, [prep_endC] vpdpwssd m6, m18, m4 ; A2l mova m2, m4 vpdpwssd m8, m18, m27 ; A2h mova m25, m27 vpdpwssd m7, m18, m5 ; B2l mova m3, m5 vpdpwssd m9, m18, m28 ; B2h mova m26, m28 vpshrdd m4, m5, m29, 16 ; 67l vpdpwssd m6, m19, m4 ; A3l vpshrdd m27, m28, m30, 16 ; 67h vpdpwssd m8, m19, m27 ; A3h mova m5, m29 vpdpwssd m7, m19, m29 ; B3l mova m28, m30 vpdpwssd m9, m19, m30 ; B3h vpermt2b m6, m31, m8 vpermt2b m7, m31, m9 mova [tmpq+wq*0], m6 mova [tmpq+wq*2], m7 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w32_loop add r7, 64 add r8, 64 movzx hd, r5b mov srcq, r7 mov tmpq, r8 sub r5d, 1<<8 jg .hv_w32_loop0 RET %if 
WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts %define base r6-pd_0to7 mov t0d, r7m lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m8, [base+warp_8x8t_rnd_v] vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main psrad m14, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m16, 15 packssdw m14, m16 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m15, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 add tsq, tsq psrad m16, 15 packssdw m15, m16 jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd mov t0d, r7m ; pixel_max lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] call .main psrad m14, m16, 13 call .main2 psrad m16, 13 packusdw m14, m16 call .main2 psrad m15, m16, 13 call .main2 vpbroadcastd m0, [base+bidir_shift+t0*4] vpsrlvw m14, m0 psrad m16, 13 packusdw m15, m16 vpsrlvw m15, m0 .end: mova m0, [base+warp8x8_end] vpermb m16, m0, m14 lea r2, [dsq*3] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 vpermb m16, m0, m15 lea dstq, [dstq+dsq*4] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 RET .main: vpbroadcastd ym3, [base+pd_512] %if WIN64 mov abcdq, r5mp vpaddd ym18, ym3, r6m {1to8} ; mx %else add r5d, 512 vpbroadcastd ym18, r5d %endif vpaddd ym20, ym3, r7m {1to8} ; my mova ym16, [base+pd_0to7] vpbroadcastd ym19, [abcdq+4*0] ; alpha vpbroadcastd ym21, [abcdq+4*1] ; gamma lea r4, [ssq*3+6] vpdpwssd ym18, ym19, ym16 ; tmx vpdpwssd ym20, ym21, ym16 ; tmy sub srcq, r4 mova m10, [base+warp8x8_permA] lea r4, [mc_warp_filter+64*8] vbroadcasti32x4 m12, [base+warp8x8_permC] kxnorb k1, k1, k1 vbroadcasti32x4 m13, [base+warp8x8_permD] movu ym5, [srcq+0] vinserti32x8 m5, [srcq+8], 1 psrad ym17, ym18, 10 mova m11, [base+warp8x8_permB] kmovb k2, k1 vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 psrad ym19, 16 ; beta psrad ym21, 16 ; delta paddd ym18, ym19 vpermb m4, m10, m5 vpbroadcastq m9, [base+warp_shift_h+t0*8] pshufd m3, m3, q3120 paddd m7, m1, m1 pshufb m2, m3, m12 vpdpwssd m1, m4, m2 vpermb m5, m11, m5 vshufi32x4 m4, m5, q1021 pshufb m3, m13 vpdpwssd m1, m4, m3 call .h psllq m2, m1, 32 paddd m1, m2 vpmultishiftqb m1, m9, m1 vpshrdq m1, m0, 48 ; 01 12 call .h vpshrdq m2, m1, m0, 48 ; 23 34 call .h vpshrdq m3, m2, m0, 48 ; 45 56 .main2: call .h psrad ym6, ym20, 10 kmovb k1, k2 paddd ym17, ym20, ym21 ; my += delta vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 shufps m5, m20, m6, q2020 mova m16, m8 pshufb m4, m5, m12 vpdpwssd m16, m1, m4 ; a0 b0 pshufb m5, m13 mova m1, m2 vpdpwssd m16, m2, m5 ; a1 b1 shufps m6, m20, m6, q3131 paddd ym20, ym17, ym21 pshufb m4, m6, m12 mova m2, m3 vpdpwssd m16, m3, m4 ; a2 b2 vpshrdq m3, m0, 48 ; 67 78 pshufb m6, m13 vpdpwssd m16, m3, m6 ; a3 b3 ret ALIGN function_align .h: movu ym16, [srcq+ssq*1] psrad ym6, ym18, 10 lea srcq, [srcq+ssq*2] vinserti32x8 m5, m16, [srcq+ssq*0], 1 kmovb k1, k2 paddd ym17, ym18, ym19 ; mx += beta vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 vpermb m4, m10, m5 shufps m16, m18, m6, q2020 shufps m6, 
m18, m6, q3131 mova m0, m7 pshufb m18, m16, m12 vpdpwssd m0, m4, m18 ; a0 b0 vpermb m5, m11, m5 pshufb m18, m6, m13 vpdpwssd m0, m5, m18 ; a3 b3 paddd ym18, ym17, ym19 vshufi32x4 m17, m4, m5, q1021 pshufb m16, m13 vpdpwssd m0, m17, m16 ; a1 b1 vshufi32x4 m4, m5, q2132 pshufb m6, m12 vpdpwssd m0, m4, m6 ; a2 b2 vpmultishiftqb m0, m9, m0 ; a a b b ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm0, ym1, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m2, [base+avg_round+t0*4] vpbroadcastd m3, [base+avg_shift+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+64*0] paddsw m0, [tmp2q+64*0] mova m1, [tmp1q+64*1] paddsw m1, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 vpsrlvw m0, m3 vpsrlvw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx512icl_table lea r6, [w_avg_avx512icl_table] tzcnt wd, wm mov t0d, r7m ; pixel_max shr t0d, 11 movsxd wq, [r6+wq*4] vpbroadcastd m5, [base+w_avg_round+t0*4] vpbroadcastd m7, [base+bidir_shift+t0*4] add wq, r6 mov r6d, r6m ; weight lea t0d, [r6-16] shl r6d, 16 sub r6d, t0d ; 16-weight, weight movifnidn hd, hm vpbroadcastd m6, r6d BIDIR_FN ALIGN function_align .main: mova m3, [tmp1q+64*0] mova m1, [tmp2q+64*0] mova m0, [tmp1q+64*1] mova m4, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m2, m1, m3 punpckhwd m1, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 mova m0, m5 vpdpwssd m0, m6, m2 mova m2, m5 vpdpwssd m2, m6, m1 mova m1, m5 vpdpwssd m1, m6, m3 mova m3, m5 vpdpwssd m3, 
m6, m4 REPX {psrad x, 2}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m7 vpsrlvw m1, m7 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx512icl_table lea r7, [mask_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+mask_round+r6*4] vpbroadcastd m10, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: pmovzxbw m1, [maskq+32*0] mova m4, [tmp1q+64*0] mova m2, [tmp2q+64*0] pmovzxbw m6, [maskq+32*1] mova m5, [tmp1q+64*1] mova m3, [tmp2q+64*1] add maskq, 32*2 add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m7, m4, m2 punpckhwd m4, m2 psubw m0, m8, m1 punpcklwd m2, m1, m0 ; m, 64-m punpckhwd m1, m0 mova m0, m9 vpdpwssd m0, m7, m2 mova m2, m9 vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) punpcklwd m7, m5, m3 punpckhwd m5, m3 psubw m1, m8, m6 punpcklwd m3, m6, m1 punpckhwd m6, m1 mova m1, m9 vpdpwssd m1, m7, m3 mova m3, m9 vpdpwssd m3, m5, m6 REPX {psrad x, 4}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m10 vpsrlvw m1, m10 ret cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx512icl_table lea r7, [w_mask_420_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+mask_round+r6*4] vpbroadcastd m13, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m14, [base+w_mask_round+r6*4] mova ym15, [w_mask_end42x] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: mova m4, [w_mask_shuf4] vpermt2b m2, m4, m3 mova m3, m14 vpdpbusd m3, m2, [pb_64] {1to16} vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: mova m8, [w_mask_shuf8] vpbroadcastd m9, [pb_64] jmp .w8_start .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 mova [maskq], xm3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16: mova m8, [w_mask_shuf16] vpbroadcastd m9, [pb_64] jmp .w16_start .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 mova 
[maskq], xm3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m2, m3 mova m8, m14 vpdpwssd m8, m11, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 call .main paddw m2, m3 mova m3, m14 vpdpwssd m3, m11, m2 vpermt2b m8, m15, m3 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m1 mova [maskq], ym8 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: mova m8, m2 mova m9, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main paddw m8, m2 paddw m9, m3 mova m2, m14 vpdpwssd m2, m11, m8 mova m3, m14 vpdpwssd m3, m11, m9 vpermt2b m2, m15, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 mova [maskq], ym2 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: mova m16, m2 mova m8, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main mova m17, m2 mova m9, m3 mova [dstq+strideq*0+64*2], m0 mova [dstq+strideq*0+64*3], m1 call .main paddw m2, m16 paddw m3, m8 mova m16, m14 vpdpwssd m16, m11, m2 mova m8, m14 vpdpwssd m8, m11, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 call .main paddw m2, m17 paddw m3, m9 mova m17, m14 vpdpwssd m17, m11, m2 mova m9, m14 vpdpwssd m9, m11, m3 vpermt2b m16, m15, m8 vpermt2b m17, m15, m9 mova [dstq+strideq*1+64*2], m0 mova [dstq+strideq*1+64*3], m1 mova [maskq+32*0], ym16 mova [maskq+32*1], ym17 sub hd, 2 jg .w128_loop vzeroupper RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m10, m6 psrlw m6, 10 ; 64-m psubw m2, m11, m6 ; m punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m12 vpdpwssd m0, m5, m1 mova m1, m12 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m10, m5 psrlw m5, 10 psubw m3, m11, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m12 vpdpwssd m1, m6, m4 mova m4, m12 vpdpwssd m4, m7, m5 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpsrlvw m0, m13 vpsrlvw m1, m13 ret cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx512icl_table lea r7, [w_mask_422_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m9, [base+pw_64] vpbroadcastd m10, [base+mask_round+r6*4] vpbroadcastd m11, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m12, [base+w_mask_round+r6*4] mova ym13, [w_mask_end42x] mov maskq, maskmp add wq, r7 paddw m14, m9, m9 ; pw_128 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], 
xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m8, m6 psrlw m6, 10 psubw m2, m9, m6 punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m10 vpdpwssd m0, m5, m1 mova m1, m10 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m8, m5 psrlw m5, 10 psubw m3, m9, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m10 vpdpwssd m1, m6, m4 mova m4, m10 vpdpwssd m4, m7, m5 mova m5, m12 vpdpwssd m5, m14, m2 mova m2, m12 vpdpwssd m2, m14, m3 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpermt2b m5, m13, m2 vpsrlvw m0, m11 vpsrlvw m1, m11 mova [maskq], ym5 add maskq, 32 ret cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx512icl_table lea r7, [w_mask_444_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m9, [base+pw_64] vpbroadcastd m10, [base+mask_round+r6*4] mova m11, [w_mask_end444] vpbroadcastd m12, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET 
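; w_mask_444, w >= 16: each .main call produces 64 blended pixels in
; m0/m1 and stores the matching full-resolution 4:4:4 mask itself
; (vpermt2b with w_mask_end444, then the write to maskq), so the store
; loops below only have to distribute m0/m1 across the output rows.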
.w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m8, m6 psrlw m6, 10 psubw m2, m9, m6 punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m10 vpdpwssd m0, m5, m1 mova m1, m10 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m8, m5 psrlw m5, 10 psubw m3, m9, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m10 vpdpwssd m1, m6, m4 mova m4, m10 vpdpwssd m4, m7, m5 vpermt2b m2, m11, m3 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpsrlvw m0, m12 vpsrlvw m1, m12 mova [maskq], m2 add maskq, 64 ret cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx512icl_table lea r6, [blend_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw ym19, [maskq] movq xm16, [dstq+dsq*0] movhps xm16, [dstq+dsq*1] vpbroadcastq ym17, [dstq+dsq*2] vpbroadcastq ym18, [dstq+r6 ] pmullw ym19, ym6 vpblendd ym16, ym17, 0x30 vpblendd ym16, ym18, 0xc0 psubw ym17, ym16, [tmpq] add maskq, 16 add tmpq, 32 pmulhrsw ym17, ym19 paddw ym16, ym17 vextracti128 xm17, ym16, 1 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 movq [dstq+dsq*2], xm17 movhps [dstq+r6 ], xm17 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 vzeroupper RET .w8: pmovzxbw m2, [maskq] mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 vinserti32x4 m0, [dstq+dsq*2], 2 vinserti32x4 m0, [dstq+r6 ], 3 pmullw m2, m6 psubw m1, m0, [tmpq] add maskq, 32 add tmpq, 64 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 vextracti32x4 [dstq+dsq*2], m0, 2 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+32*0] pmovzxbw m5, [maskq+32*1] mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 mova ym1, [dstq+dsq*2] vinserti32x8 m1, [dstq+r6 ], 1 pmullw m4, m6 pmullw m5, m6 psubw m2, m0, [tmpq+64*0] psubw m3, m1, [tmpq+64*1] add maskq, 32*2 add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 mova [dstq+dsq*2], ym1 vextracti32x8 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w16 RET .w32: pmovzxbw m4, [maskq+32*0] pmovzxbw m5, [maskq+32*1] mova m0, [dstq+dsq*0] mova m1, [dstq+dsq*1] pmullw m4, m6 pmullw m5, m6 psubw m2, m0, [tmpq+ 64*0] psubw m3, m1, [tmpq+ 64*1] add maskq, 32*2 add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32 RET cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h lea r5, [blend_v_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: 
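; blend_v applies a fixed per-column OBMC mask chosen by block width
; (obmc_masks_avx2). for w == 2, two rows are blended per iteration:
; dst += pmulhrsw(dst - tmp, mask); the obmc_masks_avx2 weights are
; stored so that this blends dst towards the overlapped prediction in tmp.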
vpbroadcastd xmm2, [obmc_masks_avx2+2*2] .w2_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movq xmm1, [tmpq] add tmpq, 4*2 psubw xmm1, xmm0, xmm1 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_loop RET .w4: vpbroadcastq xmm2, [obmc_masks_avx2+4*2] .w4_loop: movq xmm0, [dstq+dsq*0] movhps xmm0, [dstq+dsq*1] psubw xmm1, xmm0, [tmpq] add tmpq, 8*2 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] .w8_loop: mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 psubw ym1, ym0, [tmpq] add tmpq, 16*2 pmulhrsw ym1, ym2 paddw ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] .w16_loop: mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 psubw m1, m0, [tmpq] add tmpq, 32*2 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: mova m4, [obmc_masks_avx2+32*2] .w32_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 64*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop RET cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask %define base r6-$$ lea r6, [$$] tzcnt wd, wm mov hd, hm movsxd wq, [base+blend_h_avx512icl_table+wq*4] lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] lea wq, [base+blend_h_avx512icl_table+wq] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movq xmm1, [tmpq] add tmpq, 4*2 punpcklwd xmm2, xmm2 psubw xmm1, xmm0, xmm1 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova xmm3, [blend_shuf] .w4_loop: movq xmm0, [dstq+dsq*0] movhps xmm0, [dstq+dsq*1] movd xmm2, [maskq+hq*2] psubw xmm1, xmm0, [tmpq] add tmpq, 8*2 pshufb xmm2, xmm3 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: vbroadcasti32x4 ym3, [blend_shuf] shufpd ym3, ym3, 0x0c .w8_loop: mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 vpbroadcastd ym2, [maskq+hq*2] psubw ym1, ym0, [tmpq] add tmpq, 16*2 pshufb ym2, ym3 pmulhrsw ym1, ym2 paddw ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET .w16: vbroadcasti32x4 m3, [blend_shuf] shufpd m3, m3, 0xf0 .w16_loop: mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 vpbroadcastd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 32*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET .w32: vpbroadcastw m4, [maskq+hq*2] vpbroadcastw m5, [maskq+hq*2+2] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 64*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w32 RET .w64: vpbroadcastw m4, [maskq+hq*2] mova m0, [dstq+64*0] psubw m2, m0, [tmpq+64*0] mova m1, [dstq+64*1] psubw m3, m1, [tmpq+64*1] add tmpq, 64*2 
pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m8, [maskq+hq*2] mova m0, [dstq+64*0] psubw m4, m0, [tmpq+64*0] mova m1, [dstq+64*1] psubw m5, m1, [tmpq+64*1] mova m2, [dstq+64*2] psubw m6, m2, [tmpq+64*2] mova m3, [dstq+64*3] psubw m7, m3, [tmpq+64*3] add tmpq, 64*4 REPX {pmulhrsw x, m8}, m4, m5, m6, m7 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq inc hq jl .w128 RET cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax sub dword mx0m, 4<<14 sub dword src_wm, 8 mov r6, ~0 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm kmovq k6, r6 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_16384] vpbroadcastd m7, [base+pd_63] mova m24, [base+resize_permA] mova m25, [base+resize_permB] mova m26, [base+resize_permC] mova m27, [base+resize_permD] vbroadcasti32x4 m28, [base+resize_shufA] vbroadcasti32x4 m29, [base+resize_shufB] mova m30, [base+resize_permE] vpbroadcastw ym31, pxmaxm vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] pslld m5, 4 ; dx*16 pslld m6, 14 pxor m2, m2 .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset vptestmd k5, m1, m1 pand m9, m7 ; filter offset (masked) ktestw k5, k5 jz .load vpbroadcastq m14, [base+pd_0_4] vpermq m10, m0, q1100 vpermq m11, m0, q3322 vpermq m20, m1, q1100 vpermq m21, m1, q3322 punpckldq m10, m10 punpckldq m11, m11 punpckldq m20, m20 punpckldq m21, m21 paddd m10, m14 paddd m11, m14 paddd m20, m14 paddd m21, m14 vextracti32x8 ym12, m10, 1 vextracti32x8 ym13, m11, 1 vextracti32x8 ym22, m20, 1 vextracti32x8 ym23, m21, 1 kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] pshufb m16, m0 pshufb m17, m1 pshufb m18, m14 pshufb m19, m15 mova m20, m24 mova m22, m24 mova m21, m25 mova m23, m25 vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd mova m15, m26 mova m17, m26 mova m16, m27 mova m18, m27 vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd kmovq k1, k6 kmovq k2, k6 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] pshufb m10, m11, m28 pshufb m11, m11, m29 pshufb m12, m13, m28 pshufb m13, m13, m29 jmp .filter .load: kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] pshufb m10, m11, m28 pshufb m11, m11, m29 pshufb m12, m13, m28 pshufb m13, m13, m29 vpgatherdd m15{k3}, [srcq+m0*2+ 0] 
    vpgatherdd   m16{k4}, [srcq+m0*2+ 4]
    kmovq             k1, k6
    kmovq             k2, k6
    vpgatherdd   m17{k1}, [srcq+m0*2+ 8]
    vpgatherdd   m18{k2}, [srcq+m0*2+12]
.filter:
    mova             m14, m2
    vpdpwssd         m14, m15, m10
    vpdpwssd         m14, m16, m11
    vpdpwssd         m14, m17, m12
    vpdpwssd         m14, m18, m13
    psubd            m14, m3, m14
    psrad            m14, 15
    packusdw         m14, m14
    vpermq           m14, m30, m14
    pminsw          ym14, ym31
    mova    [dstq+xq*2], ym14
    paddd             m4, m5
    add               xd, 16
    cmp               xd, dst_wd
    jl .loop_x
    add             dstq, dst_strideq
    add             srcq, src_strideq
    dec               hd
    jg .loop_y
    RET
%endif ; ARCH_X86_64