diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2021-05-04 15:02:35 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-04 17:59:16 +0300 |
commit | 6519c4b882be89975f906abef41e4e4e64b71fd2 (patch) | |
tree | 42dc061a6bc56cc73423e33ca973e2c51de8fa34 | |
parent | c8bdb78a772152e71b499977d88e0e09614cb030 (diff) |
x86: Add high bitdepth w_mask_444 AVX2 asm
-rw-r--r-- | src/x86/mc16_avx2.asm | 120 | ||||
-rw-r--r-- | src/x86/mc_init_tmpl.c | 1 |
2 files changed, 121 insertions, 0 deletions
diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 2b6d2db..be77824 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -71,6 +71,7 @@ BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -3131,4 +3132,123 @@ ALIGN function_align add maskq, 32 ret +cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m4, [base+pw_64] + vpbroadcastd m5, [base+bidir_rnd+r6*4] + vpbroadcastd m6, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + call .main + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + call .main + mova [dstq+32*6], m0 + mova [dstq+32*7], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2, 4, 5, 6 + W_MASK 1, 3, 4, 5, 6 + packuswb m2, m3 + vpermq m2, m2, q3120 + add tmp1q, 32*2 + add tmp2q, 32*2 + mova [maskq], m2 + add maskq, 32 + ret + %endif ; ARCH_X86_64 diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c index ec67a20..fb973ab 100644 --- a/src/x86/mc_init_tmpl.c +++ b/src/x86/mc_init_tmpl.c @@ -285,6 +285,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { c->avg = dav1d_avg_16bpc_avx2; c->w_avg = dav1d_w_avg_16bpc_avx2; c->mask = dav1d_mask_16bpc_avx2; + c->w_mask[0] = dav1d_w_mask_444_16bpc_avx2; c->w_mask[1] = dav1d_w_mask_422_16bpc_avx2; c->w_mask[2] = dav1d_w_mask_420_16bpc_avx2; #endif |