diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2021-05-04 15:04:33 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-05 01:14:00 +0300 |
commit | c1412688dd5c1f70560bbfdf622647d23df83ff4 (patch) | |
tree | 5581668076a1297fd5675cb540d81f4db7b3c94c | |
parent | dc7cdc0b581d52da4f5ce84841408675cf81c094 (diff) |
x86: Add high bitdepth ipred_z1 AVX2 asm
-rw-r--r-- | src/x86/ipred16_avx2.asm | 761 | ||||
-rw-r--r-- | src/x86/ipred_init_tmpl.c | 2 |
2 files changed, 761 insertions, 2 deletions
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index 0314b7c..a38823c 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -62,9 +62,21 @@ filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +pw_m1024: times 2 dw -1024 +z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +z_filter_k: dw 4, 4, 5, 5, 4, 4, 8, 8, 6, 6, 4, 4 + +%define pw_4 z_filter_k pw_2: times 2 dw 2 -pw_4: times 2 dw 4 +pw_3: times 2 dw 3 +pw_62: times 2 dw 62 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 @@ -92,6 +104,7 @@ JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 @@ -100,6 +113,7 @@ JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_p JMP_TABLE ipred_cfl_ac_422_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 +cextern dr_intra_derivative cextern filter_intra_taps SECTION .text @@ -1114,6 +1128,751 @@ ALIGN function_align jz .w64_loop_x RET +cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z1_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea r7, [dr_intra_derivative] + movsxd wq, [r6+wq*4] + add tlq, 2 + add wq, r6 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [r7+dxq] + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m5, [pw_62] + jmp wq +.w4: + ALLOC_STACK -64, 7 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + vpbroadcastw xm3, [tlq+14] + movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 + paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + add dxd, dxd + palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 + paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d + psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 + psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 + pxor xm4, xm4 + paddw xm2, xm0 + vpbroadcastw xm0, r8m ; pixel_max + mova [rsp+32], xm3 + movd xm3, dxd + pmaxsw xm2, xm4 + mov r3d, dxd + pavgw xm2, xm4 + vpbroadcastw m3, xm3 + pminsw xm2, xm0 + punpcklwd xm0, xm1, xm2 + punpckhwd xm1, xm2 + lea r5, [strideq*3] + pslldq m2, m3, 8 + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + paddw m3, m2 + vpblendd m4, m6, 0xf0 + paddw m6, m6 + paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 + vbroadcasti128 m4, [z_upsample] +.w4_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + movu xm2, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [rsp+r3*2], 1 ; 0 2 + lea r3d, [r2+dxq] + shr r2d, 6 ; base3 + vinserti128 m2, [rsp+r2*2], 1 ; 1 3 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 ; frac + psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 + psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) + pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) + paddw m3, m6 ; xpos += dx + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; w4/w8/w16 +%define base r3-z_filter_t0 + movd xm0, maxbased + lea r3, [z_filter_t0] + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + mova xm2, [r3+angleq*8] + pand m0, m1 + pcmpgtb m0, m2 + pmovmskb r5d, m0 + ret +.w4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea maxbased, [hq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastw xm3, [tlq+14] + mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 + pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + paddw xm2, xm0 + pmullw xm2, xm4 + movd [rsp+16], xm3 + cmp r5d, 3 + jne .w4_3tap + paddw xm1, xm2 + palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 + pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 + movzx r3d, word [tlq+14] + movzx r2d, word [tlq+12] + inc maxbased + paddw xm2, xm0 + sub r2d, r3d + paddw xm2, xm2 + lea r2d, [r2+r3*8+4] + shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 + mov [rsp+16], r2w +.w4_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + mov tlq, rsp + psrlw xm1, 3 + cmp hd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [tlq], xm0 +.w4_main: + movd xm3, dxd + vpbroadcastq m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd ; xpos + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 ; -max_base_x + vpblendd m3, m4, 0xcc + paddw m0, m4, m3 + vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + paddw m4, m4 + paddw m3, m1 +.w4_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [tlq+r3*2] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [tlq+r3*2], 1 ; 0 2 + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 ; 1 3 + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + vpblendd m1, m2, 0xcc + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; xpos < max_base_x + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w4_loop + lea r6, [strideq*3] +.w4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r6 ], xm6 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_end_loop +.w4_end: + RET +.w8: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 7 + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ + movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ + movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 4 + jne .w8_upsample_h8 ; awkward single-pixel edge case + vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ +.w8_upsample_h8: + paddw m2, m1 + paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + add dxd, dxd + psubw m0, m2, m0 + psraw m0, 3 + pxor m4, m4 + paddw m2, m0 + vpbroadcastw m0, r8m + movd xm3, dxd + pmaxsw m2, m4 + mov r3d, dxd + pavgw m2, m4 + vpbroadcastw m3, xm3 + pminsw m2, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vbroadcasti128 m4, [z_upsample] + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + vextracti128 [rsp+32], m0, 1 + vextracti128 [rsp+48], m1, 1 + vpblendd m3, m6, 0xf0 ; xpos0 xpos1 +.w8_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + movu xm2, [rsp+r3*2+16] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + vinserti128 m1, [rsp+r2*2], 1 + vinserti128 m2, [rsp+r2*2+16], 1 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m6 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_upsample_loop + RET +.w8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(h+7, 15) + jmp .w8_main +.w8_no_upsample: + lea maxbased, [hq+7] + test angled, 0x400 + jnz .w8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w8_main + popcnt r5d, r5d + vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m2 + cmp hd, 8 + jl .w8_filter_h4 + punpckhwd m2, m2 + vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + je .w8_filter_end ; 8x4 and 8x8 are always 3-tap + movzx r3d, word [tlq+30] + mov maxbased, 16 + mov [rsp+32], r3d + cmp r5d, 3 + jne .w8_filter_end + punpcklwd xm6, xm0, xm0 + vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g + vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq+28] + mov [rsp+34], r3w + paddw m2, m6 + sub r5d, r3d + inc maxbased + paddw m2, m2 + lea r3d, [r5+r3*8+4] + paddw m1, m2 + shr r3d, 3 + mov [rsp+32], r3w + jmp .w8_filter_end +.w8_filter_h4: + pshuflw m3, m2, q3321 + vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ +.w8_filter_end: + paddw m0, m3 + pmullw m0, m4 + mov tlq, rsp + pxor m2, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + mova [tlq], m0 +.w8_main: + movd xm3, dxd + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 + vpblendd m3, m4, 0xf0 ; xpos0 xpos1 + paddw m3, m1 +.w8_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu xm0, [tlq+r3*2] + movu xm1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + vinserti128 m0, [tlq+r5*2], 1 + vinserti128 m1, [tlq+r5*2+2], 1 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w8_loop +.w8_end_loop: + mova [dstq+strideq*0], xm6 + mova [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_end_loop +.w8_end: + RET +.w16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(h+15, 31) + jmp .w16_main +.w16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 7 + lea maxbased, [hq+15] + test angled, 0x400 + jnz .w16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w16_main + popcnt r5d, r5d + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + cmp r5d, 3 + jne .w16_filter_3tap + vpbroadcastd m2, [base+pw_3] + punpcklwd xm0, xm0 + vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m0, m2 + pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m0, m1 + psrlw m0, 2 + movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 8 + jl .w16_filter_5tap_h4 + punpckhwd m3, m3 + je .w16_filter_5tap_h8 + vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r3d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m4 + sub r2d, r3d + paddw m1, m3 + lea r2d, [r2+r3*8+4] + paddw m1, m2 + shr r2d, 3 + psrlw m1, 2 + mov [rsp+66], r3w + mov [rsp+64], r2w + mov tlq, rsp + mov r3d, 33 + cmp hd, 16 + cmovg maxbased, r3d + jmp .w16_filter_end2 +.w16_filter_5tap_h8: + vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_5tap_h4: + pshuflw xm4, xm3, q3332 ; 4 5 5 5 + pshuflw xm3, xm3, q3321 ; 3 4 5 5 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_3tap: + vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m4 + pmullw m3, m2 + paddw m0, m1 + cmp hd, 8 + je .w16_filter_3tap_h8 + jl .w16_filter_3tap_h4 + punpckhwd m2, m2 + vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + jmp .w16_filter_end +.w16_filter_3tap_h4: + pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ + jmp .w16_filter_end +.w16_filter_3tap_h8: + psrldq xm2, 2 + pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 +.w16_filter_end: + paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m2, m4 + psrlw m0, 3 + pxor m1, m1 + paddw m2, m3 + psrlw m2, 3 + pavgw m0, m1 + pavgw m1, m2 +.w16_filter_end2: + mov tlq, rsp + mova [tlq+ 0], m0 + mova [tlq+32], m1 +.w16_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w16_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m1, m0 + movu m0, [tlq+r5*2] + vpblendvb m2, m6, m1, m2 + movu m1, [tlq+r5*2+2] + mova [dstq+strideq*0], m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*1], m0 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w16_loop +.w16_end_loop: + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + RET +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 8 + lea maxbased, [hq+31] + mov r3d, 63 + cmp hd, 32 + cmova maxbased, r3d + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r3], m0 +.w32_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w32_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + jl .w32_filter_h8 + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r5d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r3+32], m0 + mov [r3+66], r5w + mov [r3+64], r2w + mov tlq, rsp + mov r3d, 65 + cmp hd, 64 + cmove maxbased, r3d + jmp .w32_main +.w32_filter_h8: + vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm3 + paddw xm0, xm1 + mov tlq, rsp + paddw xm0, xm2 + psrlw xm0, 2 + mova [r3+32], xm0 +.w32_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w32_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m7, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*1], m0 + dec hd + jz .w32_end + add dstq, strideq + cmp r5d, maxbased + jb .w32_loop +.w32_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [hq+63] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [hq+32] + psrlw m0, 2 + mova [r3], m0 +.w64_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w64_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + mov tlq, rsp + psrlw m0, 2 + mova [r3+32], m0 +.w64_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + paddw m8, m7, m7 ; -32 * 64 + psubw m3, m0 + paddw m9, m8, m7 ; -48 * 64 +.w64_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*1], m0 + movu m0, [tlq+r3*2+64] + movu m1, [tlq+r3*2+66] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*2], m0 + movu m0, [tlq+r3*2+96] + movu m1, [tlq+r3*2+98] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m9, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*3], m0 + dec hd + jz .w64_end + add dstq, strideq + cmp r5d, maxbased + jb .w64_loop +.w64_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + mova [dstq+32*2], m6 + mova [dstq+32*3], m6 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + %macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax %ifnum %4 pshufb xm%2, xm%4 diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c index 868751e..c487168 100644 --- a/src/x86/ipred_init_tmpl.c +++ b/src/x86/ipred_init_tmpl.c @@ -118,8 +118,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2); init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2); -#if BITDEPTH == 8 init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2); +#if BITDEPTH == 8 init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2); init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); #endif |