diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2021-05-04 15:04:34 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-05 01:16:23 +0300 |
commit | 1e29d3e5eb07f76f6317e27513577cf5c4dc2ef1 (patch) | |
tree | 7e554adf2af9e43b0205f3b93685007b57c449d6 | |
parent | c1412688dd5c1f70560bbfdf622647d23df83ff4 (diff) |
x86: Add high bitdepth ipred_z3 AVX2 asm
-rw-r--r-- | src/x86/ipred16_avx2.asm | 1046 | ||||
-rw-r--r-- | src/x86/ipred_init_tmpl.c | 2 |
2 files changed, 1047 insertions, 1 deletions
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index a38823c..995d693 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -105,6 +105,7 @@ JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 @@ -1873,6 +1874,1051 @@ ALIGN function_align .w64_end: RET +cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_16bpc_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + sub tlq, 2 + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m5, [pw_62] + mov org_wd, wd + jmp hq +.h4: + ALLOC_STACK -64, 7 + lea r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + pshufd xm3, xm1, q0000 + paddw xm1, xm2 + paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastw xm4, r8m ; pixel_max + add dyd, dyd + psubw xm0, xm1, xm0 + mova [rsp+ 0], xm3 + movd xm3, dyd + psraw xm0, 3 + neg dyd + paddw xm1, xm0 + pxor xm0, xm0 + lea r2d, [dyq+(16<<6)+63] ; ypos + pmaxsw xm1, xm0 + pavgw xm1, xm0 + vpbroadcastw m3, xm3 + pminsw xm1, xm4 + punpckhwd xm0, xm1, xm2 + punpcklwd xm1, xm2 + paddw m2, m3, m3 + mova [rsp+32], xm0 + punpcklwd m3, m2 + mova [rsp+16], xm1 + paddw m4, m2, m2 + paddw m2, m3 + vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 + movu xm2, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 + vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r4+dyq] + shr r4d, 6 + vinserti128 m2, [rsp+r4*2], 1 + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m2, 16 + pblendw m1, m2, 0xaa + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m4 + paddw m1, m0 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + add dstq, 8 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + pand m0, m1 + mova xm1, [r4+angleq*8] + pcmpgtb m0, m1 + pmovmskb r5d, m0 + ret +.h4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + pmullw xm2, xm0 + pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm1, xm0, xm3 + movd [rsp+12], xm0 + pmullw xm1, xm4 + cmp r5d, 3 + jne .h4_filter_3tap + pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 + vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + movzx r4d, word [tlq-14] + movzx r2d, word [tlq-12] + inc maxbased + paddw xm1, xm2 + paddw xm0, xm3 + sub r2d, r4d + paddw xm2, xm0, xm0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+14], r2w +.h4_filter_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + lea tlq, [rsp+30] + psrlw xm1, 3 + cmp wd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [rsp+16], xm0 +.h4_main: + movd xm3, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + lea r4d, [maxbaseq+3*64] + neg dyq + movd xm2, r4d + sub tlq, 8 + lea r4, [dyq+63] ; ypos + punpcklwd m1, m1 + paddw m0, m3, m3 + vpbroadcastw m2, xm2 + punpcklwd m3, m0 + paddw m4, m0, m0 + paddw m0, m3 + psubw m2, m1 + vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 + or maxbased, 63 + paddw m3, m2 +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + vinserti128 m1, [tlq+r4*2], 1 + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + pand m2, m5, m3 + palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; ypos < max_base_y + paddw m3, m4 + paddw m1, m0 + vpblendvb m1, m6, m1, m2 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + sub wd, 4 + jz .h4_end + add dstq, 8 + cmp r4d, maxbased + jg .h4_loop +.h4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r7 ], xm6 + add dstq, 8 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +.h8: + lea r4d, [angleq+216] + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 8 + mov r4b, wb + lea r7, [strideq*3] + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e + movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d + cmp wd, 8 + je .h8_upsample_w8 + pshufhw xm3, xm2, q1000 + vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d +.h8_upsample_w8: + paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastw m4, r8m ; pixel_max + add dyd, dyd + psubw m0, m1, m0 + movd xm6, dyd + psraw m0, 3 + neg dyd + paddw m1, m0 + pxor m0, m0 + pmaxsw m1, m0 + lea r4d, [dyq+(16<<6)+63] ; ypos + pavgw m1, m0 + vpbroadcastw m6, xm6 + pminsw m1, m4 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + vextracti128 [rsp+48], m0, 1 + vextracti128 [rsp+32], m1, 1 + paddw m7, m6, m6 + mova [rsp+16], xm0 + mova [rsp+ 0], xm1 + punpcklwd m6, m7 ; ypos0 ypos1 +.h8_upsample_loop: + lea r2d, [r4+dyq] + shr r4d, 6 ; base0 + movu m1, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base1 + movu m2, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base2 + movu m3, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base3 + movu m4, [rsp+r2*2] + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 + pslld m2, 16 + pblendw m1, m2, 0xaa + psrld m2, m3, 16 + pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m4, 16 + pblendw m3, m4, 0xaa + pand m4, m5, m6 + paddw m6, m7 + psllw m4, 9 + psubw m1, m0 + pmulhrsw m1, m4 + pand m4, m5, m6 + psllw m4, 9 + psubw m3, m2 + pmulhrsw m3, m4 + paddw m6, m7 + lea r2, [dstq+strideq*4] + paddw m1, m0 + paddw m3, m2 + punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + movhps [r2 +strideq*0], xm0 + movq [r2 +strideq*1], xm0 + movhps [r2 +strideq*2], xm1 + movq [r2 +r7 ], xm1 + movhps [dstq+strideq*0], xm2 + movq [dstq+strideq*1], xm2 + movhps [dstq+strideq*2], xm3 + movq [dstq+r7 ], xm3 + add dstq, 8 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main + popcnt r5d, r5d + mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m0 + cmp wd, 8 + jl .h8_filter_w4 + punpcklwd xm0, xm0 + vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movd [rsp+28], xm0 + paddw m1, m3 + mov r4d, 16 + pmullw m1, m4 + cmovg maxbased, r4d + cmp r5d, 3 + jne .h8_filter_3tap + punpckhwd m3, m3 + vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + movzx r4d, word [tlq-30] + movzx r2d, word [tlq-28] + inc maxbased + paddw m1, m2 + paddw m0, m3 + sub r2d, r4d + paddw m2, m0, m0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+30], r2w + jmp .h8_filter_3tap +.h8_filter_w4: + pshufhw xm1, xm0, q2100 + vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e + paddw m1, m3 + pmullw m1, m4 +.h8_filter_3tap: + pxor m0, m0 + paddw m1, m2 + lea tlq, [rsp+62] + psrlw m1, 3 + pavgw m0, m1 + mova [rsp+32], m0 +.h8_main: + movd xm4, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+7*64] + neg dyq + movd xm2, r4d + sub tlq, 16 + lea r4, [dyq+63] + paddw m6, m4, m4 + vpbroadcastw m2, xm2 + vpblendd m4, m6, 0xf0 ; ypos0 ypos1 + psubw m2, m1 + or maxbased, 63 + paddw m4, m2 +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm0, [tlq+r4*2+2] + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vinserti128 m0, [tlq+r5*2+2], 1 + vinserti128 m1, [tlq+r5*2], 1 + lea r5, [r4+dyq] + sar r4, 6 ; base2 + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + psraw m3, m4, 15 + paddw m4, m6 + paddw m0, m1 + movu xm1, [tlq+r4*2+2] + movu xm2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m7, m0, m3 + vinserti128 m1, [tlq+r5*2+2], 1 + vinserti128 m2, [tlq+r5*2], 1 + pand m3, m5, m4 + psllw m3, 9 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + lea r5, [dstq+strideq*4] + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 + vextracti128 xm3, m2, 1 + punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c5 b7 d7 b6 d6 b5 d5 b4 d4 + punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm3, m0, 1 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 + punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h8_end + add dstq, 8 + cmp r4d, maxbased + jg .h8_loop + lea r6, [strideq*5] + lea r2, [strideq+r7*2] ; stride*7 + test wd, 4 + jz .h8_end_loop + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r7 ], xm7 + movq [dstq+strideq*4], xm7 + movq [dstq+r6 ], xm7 + movq [dstq+r7*2 ], xm7 + movq [dstq+r2 ], xm7 + add dstq, 8 + sub wd, 4 + jz .h8_end +.h8_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r7 ], xm7 + mova [dstq+strideq*4], xm7 + mova [dstq+r6 ], xm7 + mova [dstq+r7*2 ], xm7 + mova [dstq+r2 ], xm7 + add dstq, 16 + sub wd, 8 + jg .h8_end_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 10 + lea maxbased, [wq+15] + lea r7, [strideq*3] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pmullw m1, m7 + paddw m1, m2 + cmp wd, 8 + jg .h16_filter_w16 + mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 + pmullw xm6, xm3 + jl .h16_filter_w4 + pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 +.h16_filter_w8_5tap: + punpckhwd m0, m0 + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw xm4, xm4 + paddw m0, m0 + paddw xm6, xm4 + paddw m1, m0 +.h16_filter_w8_3tap: + paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 + pmullw xm3, xm7 + pxor m0, m0 + paddw xm3, xm6 + psrlw xm3, 3 + pavgw xm3, xm0 + mova [rsp+48], xm3 + jmp .h16_filter_end +.h16_filter_w4: + pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 + jmp .h16_filter_w8_5tap +.h16_filter_w16: + mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m6, m3 + punpcklwd xm3, xm3 + vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + mov r4d, 32 + cmp wd, 16 + cmovg maxbased, r4d + movd [rsp+28], xm3 + pmullw m4, m7 + cmp r5d, 3 + jne .h16_filter_w16_3tap + punpckhwd m0, m0 + vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movzx r4d, word [tlq-62] + movzx r2d, word [tlq-60] + or maxbased, 1 + paddw m3, m3 + sub r2d, r4d + paddw m0, m0 + lea r2d, [r2+r4*8+4] + paddw m4, m3 + shr r2d, 3 + paddw m1, m0 + mov [rsp+30], r2w +.h16_filter_w16_3tap: + pxor m0, m0 + paddw m4, m6 + psrlw m4, 3 + pavgw m4, m0 + mova [rsp+32], m4 +.h16_filter_end: + psrlw m1, 3 + lea tlq, [rsp+94] + pavgw m1, m0 + mova [rsp+64], m1 +.h16_main: + movd xm8, dyd + neg maxbaseq + vpbroadcastw m9, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m8, xm8 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm7, r4d + sub tlq, 32 + lea r4, [dyq+63] + vpbroadcastw m7, xm7 + or maxbased, 63 + psubw m7, [z_base_inc] +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu m0, [tlq+r4*2+2] + movu m2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu m1, [tlq+r5*2+2] + movu m3, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base3 + pand m6, m5, m7 + psllw m6, 9 + psubw m2, m0 + pmulhrsw m2, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m0, m2 + movu m2, [tlq+r4*2+2] + movu m4, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m9, m0, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m3, m1 + pmulhrsw m3, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m1, m3 + vpblendvb m1, m9, m1, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m2 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m2, m4 + movu m3, [tlq+r5*2+2] + movu m4, [tlq+r5*2] + vpblendvb m2, m9, m2, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m3 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + lea r5, [dstq+strideq*4] + paddw m3, m4 + vpblendvb m3, m9, m3, m6 + punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 + punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 + vextracti128 xm6, m3, 1 + punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 + punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 + vextracti128 xm2, m4, 1 + movhps [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + vextracti128 xm6, m1, 1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + vextracti128 xm2, m0, 1 + movhps [r5 +strideq*0], xm6 + movq [r5 +strideq*1], xm6 + movhps [r5 +strideq*2], xm2 + movq [r5 +r7 ], xm2 + lea r5, [dstq+strideq*8] + movhps [r5 +strideq*0], xm3 + movq [r5 +strideq*1], xm3 + movhps [r5 +strideq*2], xm4 + movq [r5 +r7 ], xm4 + lea r5, [r5+strideq*4] + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h16_end + add dstq, 8 + cmp r4d, maxbased + jg .h16_loop + mov hd, 4 +.h16_end_loop0: + mov r6d, wd + mov r2, dstq + test wb, 4 + jz .h16_end_loop + movq [dstq+strideq*0], xm9 + movq [dstq+strideq*1], xm9 + movq [dstq+strideq*2], xm9 + movq [dstq+r7 ], xm9 + and r6d, 120 + jz .h16_end_w4 + add dstq, 8 +.h16_end_loop: + mova [dstq+strideq*0], xm9 + mova [dstq+strideq*1], xm9 + mova [dstq+strideq*2], xm9 + mova [dstq+r7 ], xm9 + add dstq, 16 + sub r6d, 8 + jg .h16_end_loop +.h16_end_w4: + lea dstq, [r2+strideq*4] + dec hd + jg .h16_end_loop0 +.h16_end: + RET +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 9 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 ; imin(w+31, 63) + test angled, 0x400 + jnz .h32_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+128] + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r4], m0 +.h32_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h32_filter_loop + jl .h32_filter_h8 + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq-62] + movzx r2d, word [tlq-60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r4-32], m0 + mov [r4-36], r5w + mov [r4-34], r2w + lea tlq, [rsp+158] + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + jmp .h32_main +.h32_filter_h8: + mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 + vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 + lea tlq, [rsp+158] + pavgw xm2, xm3 + paddw xm0, xm1 + paddw xm0, xm2 + psrlw xm0, 2 + mova [r4-16], xm0 +.h32_main: + movd xm6, dyd + neg maxbaseq + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m6, xm6 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm4, r4d + vpbroadcastd m8, [pw_m1024] + lea r4, [dyq+63] + vpbroadcastw m4, xm4 + or maxbased, 63 + psubw m4, [z_base_inc] +.h32_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + pcmpgtw m2, m8, m4 + paddw m0, m1 + vpblendvb m0, m7, m0, m2 + movu m2, [tlq+r5*2-32] + movu m1, [tlq+r5*2-30] + add r4, dyq + sub rsp, 64 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + mova [rsp+32*0], m0 + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + mova [rsp+32*1], m1 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 64 + mova [rsp+32*0], m7 + mova [rsp+32*1], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + lea r3, [strideq*3] + lea r4, [strideq*5] + mov r8, dstq + lea r5, [strideq+r3*2] +.h32_transpose_loop0: + lea r6, [rsp+32] + lea r2, [r8+org_wq*2-16] +.h32_transpose_loop: + mova m0, [r6+64*7] + mova m1, [r6+64*6] + mova m2, [r6+64*5] + mova m3, [r6+64*4] + mova m4, [r6+64*3] + mova m5, [r6+64*2] + mova m6, [r6+64*1] + mova m7, [r6+64*0] + punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 + punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 + punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 + punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 + punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 + lea dstq, [r2+strideq*8] + sub r6, 32 + punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 + punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 + punpckhqdq m5, m7, m1 ; 8 0 + vextracti128 [r2 +strideq*0], m5, 1 + punpcklqdq m7, m1 ; 9 1 + mova [dstq+strideq*0], xm5 + punpckhqdq m1, m8, m3 ; 10 2 + vextracti128 [r2 +strideq*1], m7, 1 + punpcklqdq m8, m3 ; 11 3 + mova [dstq+strideq*1], xm7 + punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 + vextracti128 [r2 +strideq*2], m1, 1 + punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 + mova [dstq+strideq*2], xm1 + punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 + vextracti128 [r2 +r3 ], m8, 1 + punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 + mova [dstq+r3 ], xm8 + punpckhqdq m6, m3, m2 ; 12 4 + vextracti128 [r2 +strideq*4], m6, 1 + punpcklqdq m3, m2 ; 13 5 + mova [dstq+strideq*4], xm6 + punpckhqdq m2, m0, m4 ; 14 6 + vextracti128 [r2 +r4 ], m3, 1 + punpcklqdq m0, m4 ; 15 7 + mova [dstq+r4 ], xm3 + vextracti128 [r2 +r3*2 ], m2, 1 + mova [dstq+r3*2 ], xm2 + vextracti128 [r2 +r5 ], m0, 1 + mova [dstq+r5 ], xm0 + lea r2, [dstq+strideq*8] + cmp r6, rsp + jae .h32_transpose_loop + add rsp, 64*8 + sub org_wd, 8 + jg .h32_transpose_loop0 +.h32_end: + RET +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [wq+63] + test angled, 0x400 + jnz .h64_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+224] + paddw m0, m1 + lea r5d, [wq+32] + psrlw m0, 2 + mova [r4], m0 +.h64_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h64_filter_loop + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + lea tlq, [rsp+254] + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + psrlw m0, 2 + mova [r4-32], m0 +.h64_main: + neg maxbaseq + movd xm4, dyd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + vpbroadcastd m7, [pw_m1024] + movd xm3, r4d + lea r4, [dyq+63] + paddw m8, m7, m7 + vpbroadcastw m3, xm3 + or maxbased, 63 + paddw m9, m8, m7 + psubw m3, [z_base_inc] +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-128] + movu m0, [tlq+r5*2-126] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + sub rsp, 128 + paddw m0, m1 + pcmpgtw m1, m9, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*0], m0 + movu m1, [tlq+r5*2-96] + movu m0, [tlq+r5*2-94] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*1], m0 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*2], m0 + movu m1, [tlq+r5*2-32] + movu m0, [tlq+r5*2-30] + psubw m1, m0 + pmulhrsw m1, m2 + add r4, dyq + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [rsp+32*3], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 128 + mova [rsp+32*0], m6 + mova [rsp+32*1], m6 + mova [rsp+32*2], m6 + mova [rsp+32*3], m6 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + mov r5, dstq + lea r4, [strideq+r2*2] +.h64_transpose_loop0: + lea r6, [rsp+112] + lea dstq, [r5+org_wq*2-32] +.h64_transpose_loop: + mova xm0, [r6+128*15] + vinserti128 m0, [r6+128* 7], 1 + mova xm1, [r6+128*14] + vinserti128 m1, [r6+128* 6], 1 + mova xm2, [r6+128*13] + vinserti128 m2, [r6+128* 5], 1 + mova xm3, [r6+128*12] + vinserti128 m3, [r6+128* 4], 1 + mova xm4, [r6+128*11] + vinserti128 m4, [r6+128* 3], 1 + mova xm5, [r6+128*10] + vinserti128 m5, [r6+128* 2], 1 + mova xm6, [r6+128* 9] + vinserti128 m6, [r6+128* 1], 1 + mova xm7, [r6+128* 8] + vinserti128 m7, [r6+128* 0], 1 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + sub r6, 16 + punpckhdq m7, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpckhqdq m5, m7, m1 + punpcklqdq m7, m1 + punpckhqdq m1, m8, m3 + punpcklqdq m8, m3 + punpckhdq m3, m0, m2 + mova [dstq+strideq*0], m5 + punpckldq m0, m2 + mova [dstq+strideq*1], m7 + punpckhdq m2, m4, m6 + mova [dstq+strideq*2], m1 + punpckldq m4, m6 + mova [dstq+r2 ], m8 + punpckhqdq m6, m3, m2 + mova [dstq+strideq*4], m6 + punpcklqdq m3, m2 + mova [dstq+r3 ], m3 + punpckhqdq m2, m0, m4 + mova [dstq+r2*2 ], m2 + punpcklqdq m0, m4 + mova [dstq+r4 ], m0 + lea dstq, [dstq+strideq*8] + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 128*16 + sub org_wd, 16 + jg .h64_transpose_loop0 +.h64_end: + RET + %macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax %ifnum %4 pshufb xm%2, xm%4 diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c index c487168..b2ecbf9 100644 --- a/src/x86/ipred_init_tmpl.c +++ b/src/x86/ipred_init_tmpl.c @@ -121,8 +121,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2); #if BITDEPTH == 8 init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2); - init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); #endif + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2); init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2); |