Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Henrik Gramner <gramner@twoorioles.com> 2021-05-04 15:02:35 +0300
committer Henrik Gramner <henrik@gramner.com> 2021-05-04 17:59:16 +0300
commit 6519c4b882be89975f906abef41e4e4e64b71fd2 (patch)
tree 42dc061a6bc56cc73423e33ca973e2c51de8fa34
parent c8bdb78a772152e71b499977d88e0e09614cb030 (diff)
x86: Add high bitdepth w_mask_444 AVX2 asm
-rw-r--r-- src/x86/mc16_avx2.asm 120
-rw-r--r-- src/x86/mc_init_tmpl.c 1
2 files changed, 121 insertions, 0 deletions
diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
index 2b6d2db..be77824 100644
--- a/src/x86/mc16_avx2.asm
+++ b/src/x86/mc16_avx2.asm
@@ -71,6 +71,7 @@ BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
@@ -3131,4 +3132,123 @@ ALIGN function_align
add maskq, 32
ret
+cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 ; stores W_MASK results from tmp1/tmp2 to dst, plus one mask byte per output pixel to mask
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table] ; r7 = jump table address; also the anchor for every [base+...] constant load below
+ tzcnt wd, wm ; trailing-zero count of the width argument, used as the jump table index
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm ; get h into its register if it is not already there
+ shr r6d, 11 ; r6 = pixel_max >> 11, index into bidir_rnd/bidir_mul (0 for 1023, 1 for 4095)
+ movsxd wq, [r7+wq*4] ; sign-extended 32-bit table entry for this width
+ vpbroadcastd m10, [base+pw_27615] ; not referenced directly below; presumably consumed inside W_MASK - TODO confirm
+ vpbroadcastd m4, [base+pw_64] ; m4/m5/m6 are handed to W_MASK as its 3rd/4th/5th arguments
+ vpbroadcastd m5, [base+bidir_rnd+r6*4] ; rounding constant selected by r6
+ vpbroadcastd m6, [base+bidir_mul+r6*4] ; multiplier constant selected by r6
+ mov maskq, maskmp ; load the mask output pointer
+ add wq, r7 ; table entry + table address = address of the .wN store path
+ call .main ; produce the first m0/m1 batch before dispatching, so each .wN label starts with data ready
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, for addressing the 4th row of a group
+ jmp wq ; dispatch to the width-specific store path
+.w4:
+ movq [dstq+strideq*0], xm0 ; each row is 8 bytes; m0 holds rows 0-3, m1 rows 4-7
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1 ; bring the upper 128 bits of m0 down for rows 2-3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8 ; flags from this compare are used by the jl here and by the je further down
+ jl .w4_end ; h < 8: only the four rows above are written
+ lea dstq, [dstq+strideq*4] ; lea and the vector stores below leave the flags untouched
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end ; h == 8 (still the flags of the cmp above): done after eight rows
+ call .main ; otherwise fetch one more batch; there is no loop, so this path writes at most 16 rows
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main ; next batch of m0/m1 (four 16-byte rows)
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0 ; each 128-bit half of m0/m1 is one 16-byte row
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 4 ; four rows written per iteration
+ jg .w8_loop
+.w8_end: ; label is not referenced anywhere in this routine
+ RET
+.w16_loop:
+ call .main ; next batch of m0/m1 (two 32-byte rows)
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0 ; one full 32-byte register per row
+ mova [dstq+strideq*1], m1
+ sub hd, 2 ; two rows written per iteration
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main ; next batch of m0/m1 (one 64-byte row)
+ add dstq, strideq
+.w32:
+ mova [dstq+32*0], m0 ; m0 and m1 together form a single 64-byte row
+ mova [dstq+32*1], m1
+ dec hd ; one row written per iteration
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0 ; a 128-byte row takes two .main batches
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ dec hd ; one row written per iteration
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0 ; a 256-byte row takes four .main batches
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ call .main
+ mova [dstq+32*6], m0
+ mova [dstq+32*7], m1
+ dec hd ; one row written per iteration
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main: ; local helper: yields m0/m1 for the caller to store, writes 32 mask bytes, advances tmp1/tmp2/mask
+ W_MASK 0, 2, 4, 5, 6 ; macro defined outside this view; as used here, result lands in m0 and word-sized mask values in m2
+ W_MASK 1, 3, 4, 5, 6 ; second half: result in m1, mask values in m3
+ packuswb m2, m3 ; narrow the mask words to bytes (unsigned saturation, per 128-bit lane)
+ vpermq m2, m2, q3120 ; reorder qwords so the lane-interleaved pack result is sequential again
+ add tmp1q, 32*2 ; 64 bytes of each intermediate buffer are accounted for per call
+ add tmp2q, 32*2
+ mova [maskq], m2 ; emit 32 mask bytes
+ add maskq, 32
+ ret ; plain return to the local call site (the RET macro above ends the function itself)
+
%endif ; ARCH_X86_64
diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
index ec67a20..fb973ab 100644
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -285,6 +285,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->avg = dav1d_avg_16bpc_avx2;
c->w_avg = dav1d_w_avg_16bpc_avx2;
c->mask = dav1d_mask_16bpc_avx2;
+ c->w_mask[0] = dav1d_w_mask_444_16bpc_avx2;
c->w_mask[1] = dav1d_w_mask_422_16bpc_avx2;
c->w_mask[2] = dav1d_w_mask_420_16bpc_avx2;
#endif