Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Victorien Le Couviour--Tuffet <victorien@videolan.org> 2021-05-04 15:04:32 +0300
committer Henrik Gramner <henrik@gramner.com> 2021-05-04 23:39:17 +0300
commit dc7cdc0b581d52da4f5ce84841408675cf81c094 (patch)
tree 72ee55e195b32cd90dbc5e1023c2cca28906e7cb
parent 0d42b3030b76592d33a993423f07e144d20c5d40 (diff)
x86: Add high bitdepth pal_pred AVX2 asm
-rw-r--r-- src/ipred.h 1
-rw-r--r-- src/x86/ipred16_avx2.asm 106
-rw-r--r-- src/x86/ipred_init_tmpl.c 2
3 files changed, 109 insertions, 0 deletions
diff --git a/src/ipred.h b/src/ipred.h
index 5df2657..eaaddb3 100644
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -71,6 +71,7 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn);
/*
* dst[x,y] = pal[idx[x,y]]
* - palette indices are [0-7]
+ * - only 16-byte alignment is guaranteed for idx.
*/
#define decl_pal_pred_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm
index aaac24d..0314b7c 100644
--- a/src/x86/ipred16_avx2.asm
+++ b/src/x86/ipred16_avx2.asm
@@ -61,6 +61,7 @@ ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11
filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1
filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ; pshufb control: even-numbered source bytes go to bytes 0-7, odd-numbered source bytes to bytes 8-15
pw_2: times 2 dw 2
pw_4: times 2 dw 4
@@ -97,6 +98,7 @@ JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3
JMP_TABLE ipred_cfl_ac_422_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3
+JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 ; presumably emits pal_pred_16bpc_avx2_table, one entry per listed width label (macro body not visible here)
cextern filter_intra_taps
@@ -2167,4 +2169,108 @@ DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak
jg .sub_loop
RET
+cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h ; palette prediction: dst[x,y] = pal[idx[x,y]], one index byte -> one 16-bit pixel; 6 vector regs declared because m0-m5 are all used
+ vbroadcasti128 m3, [palq] ; 16 bytes of palette (8 x 16-bit entries) copied into both 128-bit lanes
+ lea r2, [pal_pred_16bpc_avx2_table] ; r2 = jump table base (pal pointer is no longer needed)
+ tzcnt wd, wm ; wd = tzcnt(w), used as the jump table index
+ vbroadcasti128 m4, [pal_pred_shuf] ; byte de-interleave mask in both lanes
+ movifnidn hd, hm ; make sure h lives in its register
+ movsxd wq, [r2+wq*4] ; sign-extended 32-bit table entry for this width
+ pshufb m3, m4 ; m3 = per lane: bytes 0-7 = low bytes of pal[0..7], bytes 8-15 = high bytes
+ punpckhqdq m4, m3, m3 ; m4 = per lane: high bytes of pal[0..7] duplicated into both qwords
+ add wq, r2 ; table entry + table base = branch target
+DEFINE_ARGS dst, stride, stride3, idx, w, h ; the third register (formerly pal / table base) is renamed stride3
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+ jmp wq ; dispatch to the width-specific loop
+.w4: ; 16 indices per iteration = 4 rows of 4 pixels
+ mova xm2, [idxq] ; aligned load is fine: idx is 16-byte aligned and advances by 16
+ add idxq, 16
+ pshufb xm1, xm3, xm2 ; xm1 = low byte of pal[idx] for each index (indices are 0-7)
+ pshufb xm2, xm4, xm2 ; xm2 = high byte of pal[idx] for each index
+ punpcklbw xm0, xm1, xm2 ; interleave low/high bytes -> pixels 0-7 (rows 0 and 1)
+ punpckhbw xm1, xm2 ; pixels 8-15 (rows 2 and 3)
+ movq [dstq+strideq*0], xm0 ; row 0
+ movq [dstq+strideq*2], xm1 ; row 2
+ movhps [dstq+strideq*1], xm0 ; row 1
+ movhps [dstq+stride3q ], xm1 ; row 3
+ lea dstq, [dstq+strideq*4] ; advance 4 rows
+ sub hd, 4
+ jg .w4
+ RET
+.w8: ; 32 indices per iteration = 4 rows of 8 pixels
+ movu m2, [idxq] ; only 16-byte alignment
+ add idxq, 32
+ pshufb m1, m3, m2 ; low bytes of the looked-up pixels
+ pshufb m2, m4, m2 ; high bytes of the looked-up pixels
+ punpcklbw m0, m1, m2 ; m0 = row 0 (low lane), row 2 (high lane)
+ punpckhbw m1, m2 ; m1 = row 1 (low lane), row 3 (high lane)
+ mova [dstq+strideq*0], xm0 ; row 0
+ mova [dstq+strideq*1], xm1 ; row 1
+ vextracti128 [dstq+strideq*2], m0, 1 ; row 2
+ vextracti128 [dstq+stride3q ], m1, 1 ; row 3
+ lea dstq, [dstq+strideq*4] ; advance 4 rows
+ sub hd, 4
+ jg .w8
+ RET
+.w16: ; 64 indices per iteration = 4 rows of 16 pixels
+ vpermq m2, [idxq+ 0], q3120 ; qword order 0,2,1,3 so the in-lane unpacks below yield contiguous rows
+ vpermq m5, [idxq+32], q3120 ; same for the next 32 indices (rows 2 and 3)
+ add idxq, 64
+ pshufb m1, m3, m2 ; low bytes for rows 0-1
+ pshufb m2, m4, m2 ; high bytes for rows 0-1
+ punpcklbw m0, m1, m2 ; m0 = row 0
+ punpckhbw m1, m2 ; m1 = row 1
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m1, m3, m5 ; low bytes for rows 2-3
+ pshufb m2, m4, m5 ; high bytes for rows 2-3
+ punpcklbw m0, m1, m2 ; m0 = row 2
+ punpckhbw m1, m2 ; m1 = row 3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4] ; advance 4 rows
+ sub hd, 4
+ jg .w16
+ RET
+.w32: ; 64 indices per iteration = 2 rows of 32 pixels
+ vpermq m2, [idxq+ 0], q3120 ; indices of row 0, qwords reordered for the in-lane unpacks
+ vpermq m5, [idxq+32], q3120 ; indices of row 1
+ add idxq, 64
+ pshufb m1, m3, m2 ; low bytes for row 0
+ pshufb m2, m4, m2 ; high bytes for row 0
+ punpcklbw m0, m1, m2 ; m0 = row 0, pixels 0-15
+ punpckhbw m1, m2 ; m1 = row 0, pixels 16-31
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+32], m1
+ pshufb m1, m3, m5 ; low bytes for row 1
+ pshufb m2, m4, m5 ; high bytes for row 1
+ punpcklbw m0, m1, m2 ; m0 = row 1, pixels 0-15
+ punpckhbw m1, m2 ; m1 = row 1, pixels 16-31
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+32], m1
+ lea dstq, [dstq+strideq*2] ; advance 2 rows
+ sub hd, 2
+ jg .w32
+ RET
+.w64: ; 64 indices per iteration = 1 row of 64 pixels
+ vpermq m2, [idxq+ 0], q3120 ; indices of pixels 0-31, qwords reordered for the in-lane unpacks
+ vpermq m5, [idxq+32], q3120 ; indices of pixels 32-63
+ add idxq, 64
+ pshufb m1, m3, m2 ; low bytes for pixels 0-31
+ pshufb m2, m4, m2 ; high bytes for pixels 0-31
+ punpcklbw m0, m1, m2 ; m0 = pixels 0-15
+ punpckhbw m1, m2 ; m1 = pixels 16-31
+ mova [dstq+ 0], m0
+ mova [dstq+32], m1
+ pshufb m1, m3, m5 ; low bytes for pixels 32-63
+ pshufb m2, m4, m5 ; high bytes for pixels 32-63
+ punpcklbw m0, m1, m2 ; m0 = pixels 32-47
+ punpckhbw m1, m2 ; m1 = pixels 48-63
+ mova [dstq+64], m0
+ mova [dstq+96], m1
+ add dstq, strideq ; advance 1 row
+ dec hd
+ jg .w64
+ RET
+
%endif
diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c
index 9cbd393..868751e 100644
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -136,6 +136,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
c->pal_pred = dav1d_pal_pred_avx2;
+#else
+ c->pal_pred = dav1d_pal_pred_16bpc_avx2;
#endif
#endif
}