diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2021-05-04 15:02:31 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-04 17:59:16 +0300 |
commit | c8bdb78a772152e71b499977d88e0e09614cb030 (patch) | |
tree | 0823b5aa96f731b8f71c84c3b35a9276dc6b770b | |
parent | 3f7e2c154ce2502aa2965ac01b124b60e36d8087 (diff) |
x86: Add high bitdepth w_mask_422 AVX2 asm
-rw-r--r-- | src/x86/mc16_avx2.asm | 135 | ||||
-rw-r--r-- | src/x86/mc_init_tmpl.c | 1 |
2 files changed, 136 insertions, 0 deletions
diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
index 87a3848..2b6d2db 100644
--- a/src/x86/mc16_avx2.asm
+++ b/src/x86/mc16_avx2.asm
@@ -70,6 +70,7 @@ BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 ; NOTE(review): presumably emits w_mask_422_avx2_table (one entry per width 4..128) used by the new function - confirm against the macro
 
 %macro BASE_JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - %3)
@@ -2996,4 +2997,138 @@ ALIGN function_align
     add tmp2q, 32*4
     ret
 
+cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 ; high-bitdepth w_mask 4:2:2 (AVX2): writes pixels to dst and a byte mask to mask; per-width store code is reached through a jump table
+%define base r7-w_mask_422_avx2_table
+    lea r7, [w_mask_422_avx2_table] ; r7 = table address; all constants below are addressed relative to it via "base"
+    tzcnt wd, wm ; trailing-zero count of w, used below as the jump table index
+    mov r6d, r8m ; pixel_max
+    vpbroadcastb m14, r7m ; sign
+    movifnidn hd, hm
+    shr r6d, 11 ; pixel_max >> 11 selects the bidir_rnd/bidir_mul entry (presumably 0 for 10-bit, 1 for 12-bit - confirm)
+    movsxd wq, [r7+wq*4] ; load the signed 32-bit table entry for this width
+    vpbroadcastd m10, [base+pw_27615] ; m10-m13 are not referenced again in this function; presumably consumed by W_MASK - confirm
+    vpbroadcastd m11, [base+pw_64]
+    vpbroadcastd m12, [base+bidir_rnd+r6*4]
+    vpbroadcastd m13, [base+bidir_mul+r6*4]
+    mova m15, [base+deint_shuf] ; dword permutation applied to the mask bytes in .main
+    mov maskq, maskmp
+    add wq, r7 ; table entry + table address = absolute address of the .wN label
+    call .main ; first batch: pixels in m0-m3, 32 mask bytes already stored
+    lea stride3q, [strideq*3]
+    jmp wq ; dispatch to .w4/.w8/.w16/.w32/.w64/.w128
+.w4: ; 8 bytes per row (movq/movhps); one .main call covers up to 16 rows, so no loop is needed
+    movq [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    vextracti128 xm0, m0, 1 ; bring the upper 128-bit lane down for rows 2 and 3
+    movq [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm0
+    cmp hd, 8 ; flags are used by jl here and by je further down
+    jl .w4_end ; h < 8: only four rows to write
+    lea dstq, [dstq+strideq*4]
+    movq [dstq+strideq*0], xm1
+    movhps [dstq+strideq*1], xm1
+    vextracti128 xm1, m1, 1
+    movq [dstq+strideq*2], xm1
+    movhps [dstq+stride3q ], xm1
+    je .w4_end ; still the flags of "cmp hd, 8" (lea/mov*/vextracti128 leave them untouched): h == 8 is done
+    lea dstq, [dstq+strideq*4]
+    movq [dstq+strideq*0], xm2
+    movhps [dstq+strideq*1], xm2
+    vextracti128 xm2, m2, 1
+    movq [dstq+strideq*2], xm2
+    movhps [dstq+stride3q ], xm2
+    lea dstq, [dstq+strideq*4]
+    movq [dstq+strideq*0], xm3
+    movhps [dstq+strideq*1], xm3
+    vextracti128 xm3, m3, 1
+    movq [dstq+strideq*2], xm3
+    movhps [dstq+stride3q ], xm3
+.w4_end:
+    RET
+.w8_loop:
+    call .main
+    lea dstq, [dstq+strideq*4]
+.w8: ; 16 bytes per row; one .main call covers 8 rows
+    mova [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    mova [dstq+strideq*2], xm1
+    vextracti128 [dstq+stride3q ], m1, 1
+    sub hd, 8 ; flags are used by jl here and by jg at the end of the iteration
+    jl .w8_end ; fewer than 8 rows were left: stop after these four
+    lea dstq, [dstq+strideq*4]
+    mova [dstq+strideq*0], xm2
+    vextracti128 [dstq+strideq*1], m2, 1
+    mova [dstq+strideq*2], xm3
+    vextracti128 [dstq+stride3q ], m3, 1
+    jg .w8_loop ; still the flags of "sub hd, 8": loop while rows remain
+.w8_end:
+    RET
+.w16_loop:
+    call .main
+    lea dstq, [dstq+strideq*4]
+.w16: ; 32 bytes per row; one .main call covers 4 rows
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    mova [dstq+strideq*2], m2
+    mova [dstq+stride3q ], m3
+    sub hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    call .main
+    lea dstq, [dstq+strideq*2]
+.w32: ; 64 bytes per row; one .main call covers 2 rows
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m1
+    mova [dstq+strideq*1+32*0], m2
+    mova [dstq+strideq*1+32*1], m3
+    sub hd, 2
+    jg .w32_loop
+    RET
+.w64_loop:
+    call .main
+    add dstq, strideq
+.w64: ; 128 bytes per row; one .main call covers 1 row
+    mova [dstq+32*0], m0
+    mova [dstq+32*1], m1
+    mova [dstq+32*2], m2
+    mova [dstq+32*3], m3
+    dec hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    call .main
+    add dstq, strideq
+.w128: ; 256 bytes per row; two .main calls per row
+    mova [dstq+32*0], m0
+    mova [dstq+32*1], m1
+    mova [dstq+32*2], m2
+    mova [dstq+32*3], m3
+    call .main ; second half of the same row
+    mova [dstq+32*4], m0
+    mova [dstq+32*5], m1
+    mova [dstq+32*6], m2
+    mova [dstq+32*7], m3
+    dec hd
+    jg .w128_loop
+    RET
+ALIGN function_align
+.main: ; produces four 32-byte pixel vectors in m0-m3, stores 32 mask bytes at maskq, and advances tmp1q/tmp2q/maskq
+    W_MASK 0, 4 ; NOTE(review): W_MASK is defined outside this hunk; presumably pixels -> m0 and per-pixel mask words -> m4 - confirm
+    W_MASK 1, 5
+    phaddw m4, m5 ; add horizontally adjacent 16-bit words (presumably the 2:1 horizontal mask subsampling of 4:2:2 - confirm)
+    W_MASK 2, 5
+    W_MASK 3, 6
+    phaddw m5, m6
+    add tmp1q, 32*4 ; advance both intermediate buffers by 128 bytes
+    add tmp2q, 32*4
+    packuswb m4, m5 ; narrow the word sums to bytes with unsigned saturation
+    pxor m5, m5 ; m5 = 0, the second operand for pavgb
+    psubb m4, m14 ; subtract the broadcast sign byte
+    pavgb m4, m5 ; (x + 1) >> 1 on each byte
+    vpermd m4, m15, m4 ; reorder dwords with deint_shuf (presumably undoes the per-lane interleave of phaddw/packuswb - confirm)
+    mova [maskq], m4
+    add maskq, 32
+    ret
+
 %endif ; ARCH_X86_64
diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
index dc16375..ec67a20 100644
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -285,6 +285,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     c->avg = dav1d_avg_16bpc_avx2;
     c->w_avg = dav1d_w_avg_16bpc_avx2;
     c->mask = dav1d_mask_16bpc_avx2;
+    c->w_mask[1] = dav1d_w_mask_422_16bpc_avx2;
     c->w_mask[2] = dav1d_w_mask_420_16bpc_avx2;
 #endif |