Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com> 2021-05-04 15:03:18 +0300
committer Henrik Gramner <henrik@gramner.com> 2021-05-04 18:00:07 +0300
commit eb78dbe877584c28a662103c69ed0956f7bf519f (patch)
tree d2e9c2102e629e39bdae7c044d16fed1f88e5d6c
parent ae4b53fb4c9e8b616ea2f5ca92f9b9ee4ff33116 (diff)
x86: Add high bitdepth ipred_{h,v} AVX2 asm
-rw-r--r-- src/x86/ipred16_avx2.asm 82
-rw-r--r-- src/x86/ipred_init_tmpl.c 2
2 files changed, 83 insertions, 1 deletion
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm
index e99ae0b..efefa15 100644
--- a/src/x86/ipred16_avx2.asm
+++ b/src/x86/ipred16_avx2.asm
@@ -48,6 +48,7 @@ pw_2048: times 2 dw 2048
JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64
SECTION .text
@@ -353,4 +354,85 @@ cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
lea stride3q, [strideq*3]
jmp wq
+cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; entry point: preloads m0-m3 from tl, then tail-jumps through the dc_splat jump table indexed by tzcnt(w)
+ movifnidn hd, hm ; make sure h is in its register (x86inc movifnidn: moves only if hd and hm are different operands)
+ movu m0, [tlq+ 2] ; unaligned loads of four consecutive chunks starting 2 bytes past tl;
+ movu m1, [tlq+34] ; the offsets step by 32 bytes, so presumably 32-byte ymm registers are
+ movu m2, [tlq+66] ; active here (the INIT_* mode is set outside this view - TODO confirm).
+ movu m3, [tlq+98] ; all four loads are issued regardless of the value of w
+ lea r5, [ipred_dc_splat_16bpc_avx2_table] ; r5 = base address of the splat jump table (the table is defined outside this view)
+ tzcnt wd, wd ; w = number of trailing zero bits of w, used as the table index
+ movsxd wq, [r5+wq*4] ; fetch the sign-extended 32-bit table entry for that index
+ add wq, r5 ; entry + table base = jump target address
+ lea stride3q, [strideq*3] ; stride3 = 3*stride, prepared before leaving this block
+ jmp wq ; continue in the selected code; nothing in this block itself returns
+
+%macro IPRED_H 2 ; %1 = w (suffix of the .w<N> loop label), %2 = store_type (suffix appended to "mov" for the row stores)
+ vpbroadcastw m0, [tlq-2] ; m0 = 16-bit word at tl-2 replicated to every lane (value for row 0)
+ vpbroadcastw m1, [tlq-4] ; m1 = word at tl-4 (row 1)
+ vpbroadcastw m2, [tlq-6] ; m2 = word at tl-6 (row 2)
+ vpbroadcastw m3, [tlq-8] ; m3 = word at tl-8 (row 3)
+ sub tlq, 8 ; step tl back by the four 16-bit words just consumed
+ mov%2 [dstq+strideq*0], m0 ; write one register per row, rows 0..3 of the current group
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3 ; stride3q must already hold 3*stride
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ sub hd, 4 ; four rows done
+ jg .w%1 ; repeat while h > 0 (signed); the .w<%1> label has to be placed by the invoking code just before the macro
+ RET
+ALIGN function_align ; pad so that whatever follows the expansion starts on a function_align boundary
+%endmacro
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 ; fills each output row with one broadcast 16-bit word read from below tl; dispatches on tzcnt(w) to .w4 ... .w64
+ movifnidn hd, hm ; make sure h is in its register (x86inc movifnidn: moves only if hd and hm are different operands)
+ lea r5, [ipred_h_16bpc_avx2_table] ; r5 = base address of this function's jump table
+ tzcnt wd, wd ; w = number of trailing zero bits of w, used as the table index
+ movsxd wq, [r5+wq*4] ; fetch the sign-extended 32-bit table entry for that index
+ add wq, r5 ; entry + table base = address of the matching .w<N> label
+ lea stride3q, [strideq*3] ; stride3 = 3*stride, used for the 4th row of each group
+ jmp wq ; enter the width-specific loop
+INIT_XMM avx2 ; following code assembles with 16-byte xmm registers as m0..m3
+.w4: ; 4 rows per iteration
+ IPRED_H 4, q ; movq stores: 8 bytes per row
+.w8: ; 4 rows per iteration
+ IPRED_H 8, a ; mova stores of an xmm register: 16 bytes per row
+INIT_YMM avx2 ; following code assembles with 32-byte ymm registers as m0..m3
+.w16: ; 4 rows per iteration
+ IPRED_H 16, a ; mova stores of a ymm register: 32 bytes per row
+.w32: ; 4 rows per iteration, two 32-byte stores per row
+ vpbroadcastw m0, [tlq-2] ; m0..m3 = words at tl-2, tl-4, tl-6, tl-8, each replicated to every lane
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8 ; step tl back by the four words just consumed
+ mova [dstq+strideq*0+32*0], m0 ; row 0, bytes 0..31
+ mova [dstq+strideq*0+32*1], m0 ; row 0, bytes 32..63
+ mova [dstq+strideq*1+32*0], m1 ; row 1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2 ; row 2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3 ; row 3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ sub hd, 4 ; four rows done
+ jg .w32 ; repeat while h > 0 (signed)
+ RET
+.w64: ; 2 rows per iteration, four 32-byte stores per row
+ vpbroadcastw m0, [tlq-2] ; m0 = word at tl-2 replicated (row 0)
+ vpbroadcastw m1, [tlq-4] ; m1 = word at tl-4 replicated (row 1)
+ sub tlq, 4 ; step tl back by the two words just consumed
+ mova [dstq+strideq*0+32*0], m0 ; row 0, bytes 0..127 in four stores
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*0+32*2], m0
+ mova [dstq+strideq*0+32*3], m0
+ mova [dstq+strideq*1+32*0], m1 ; row 1, bytes 0..127 in four stores
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m1
+ mova [dstq+strideq*1+32*3], m1
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ sub hd, 2 ; two rows done
+ jg .w64 ; repeat while h > 0 (signed)
+ RET
+
%endif
diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c
index fb7e917..2d4fa00 100644
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -112,9 +112,9 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
-#if BITDEPTH == 8
init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+#if BITDEPTH == 8
init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);