diff options
author | Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com> | 2021-05-04 15:03:20 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-04 18:00:07 +0300 |
commit | 43b72358f3b0015c064e7386a60bbbe92bceeeaf (patch) | |
tree | fa078eb83ecd1eac663cb962a090e9be02b47f1a | |
parent | eb78dbe877584c28a662103c69ed0956f7bf519f (diff) |
x86: Add high bitdepth ipred_paeth AVX2 asm
-rw-r--r-- | src/x86/ipred16_avx2.asm | 138 | ||||
-rw-r--r-- | src/x86/ipred_init_tmpl.c | 2 |
2 files changed, 138 insertions, 2 deletions
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index efefa15..d7b86c3 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -28,7 +28,10 @@ %if ARCH_X86_64 -SECTION_RODATA +SECTION_RODATA 32 + +ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 + db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 @@ -49,6 +52,7 @@ JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 SECTION .text @@ -435,4 +439,136 @@ INIT_YMM avx2 jg .w64 RET +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m1 + psubw m7, m3, m0 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m7, m7 + pabsw m0, m0 + pminsw m7, m0 + pcmpeqw m0, m7 + pcmpgtw m7, m%3, m7 + vpblendvb m0, m3, m%1, m0 + vpblendvb m0, m1, m0, m7 +%endmacro + +cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h +%define base r5-ipred_paeth_16bpc_avx2_table + movifnidn hd, hm + lea r5, [ipred_paeth_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r5 + jmp wq +.w4: + vpbroadcastq m2, [tlq+2] ; top + movsldup m6, [base+ipred_hv_shuf] + lea r3, [strideq*3] + psubw m4, m2, m3 + pabsw m5, m4 +.w4_loop: + sub tlq, 8 + vpbroadcastq m1, [tlq] + pshufb m1, m6 ; left + PAETH 2, 4, 5 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m2, [tlq+2] + movsldup m6, [base+ipred_hv_shuf] + lea r3, [strideq*3] + psubw m4, m2, m3 + pabsw m5, m4 +.w8_loop: + sub tlq, 4 + vpbroadcastd m1, [tlq] + pshufb m1, m6 + PAETH 2, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m2, [tlq+2] + psubw m4, m2, m3 + pabsw m5, m4 +.w16_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m2, [tlq+2] + movu m6, [tlq+34] +%if WIN64 + movaps r4m, xmm8 + movaps r6m, xmm9 +%endif + psubw m4, m2, m3 + psubw m8, m6, m3 + pabsw m5, m4 + pabsw m9, m8 +.w32_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps xmm8, r4m + movaps xmm9, r6m +%endif + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 16 + movu m2, [tlq+ 2] + movu m6, [tlq+34] + movu m10, [tlq+66] + movu m13, [tlq+98] + psubw m4, m2, m3 + psubw m8, m6, m3 + psubw m11, m10, m3 + psubw m14, m13, m3 + pabsw m5, m4 + pabsw m9, m8 + pabsw m12, m11 + pabsw m15, m14 +.w64_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + PAETH 10, 11, 12 + mova [dstq+32*2], m0 + PAETH 13, 14, 15 + mova [dstq+32*3], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + %endif diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c index 2d4fa00..52437ca 100644 --- a/src/x86/ipred_init_tmpl.c +++ b/src/x86/ipred_init_tmpl.c @@ -114,8 +114,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2); init_angular_ipred_fn(HOR_PRED, ipred_h, avx2); init_angular_ipred_fn(VERT_PRED, ipred_v, avx2); -#if BITDEPTH == 8 init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2); +#if BITDEPTH == 8 init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2); init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2); |