Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Victorien Le Couviour--Tuffet <victorien@videolan.org> 2022-01-18 16:46:31 +0300
committer Victorien Le Couviour--Tuffet <victorien@videolan.org> 2022-01-24 17:37:16 +0300
commit 4a52aa4790a57b01ab40831ffb5eb6a53cb8a0f0 (patch)
tree 9106b1d1c3c0dddb845b075f45313e70c4193093 /src
parent 1cdde64f8297728846d164beaf2f6a74bd7ca20f (diff)
x86: Add mc.resize AVX-512 (Ice Lake) asm
resize_8bpc_c: 542599.0
resize_8bpc_ssse3: 87635.4
resize_8bpc_avx2: 67401.1
resize_8bpc_avx512icl: 50263.6
resize_16bpc_c: 573438.9
resize_16bpc_ssse3: 121505.2
resize_16bpc_avx2: 83293.4
resize_16bpc_avx512icl: 77974.8
Diffstat (limited to 'src')
-rw-r--r-- src/x86/mc16_avx2.asm 2
-rw-r--r-- src/x86/mc16_avx512.asm 155
-rw-r--r-- src/x86/mc16_sse.asm 2
-rw-r--r-- src/x86/mc_avx2.asm 4
-rw-r--r-- src/x86/mc_avx512.asm 115
-rw-r--r-- src/x86/mc_init_tmpl.c 1
-rw-r--r-- src/x86/mc_sse.asm 2
7 files changed, 265 insertions, 16 deletions
diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
index 8c9f861..3dacfe6 100644
--- a/src/x86/mc16_avx2.asm
+++ b/src/x86/mc16_avx2.asm
@@ -5789,7 +5789,7 @@ cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
vpbroadcastd m5, dxm
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr, _, pxmax
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pd_64]
diff --git a/src/x86/mc16_avx512.asm b/src/x86/mc16_avx512.asm
index c0b139f..c2ea090 100644
--- a/src/x86/mc16_avx512.asm
+++ b/src/x86/mc16_avx512.asm
@@ -131,6 +131,16 @@ warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
+resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
+resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
+resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
+resize_permE: dq 0, 2, 4, 6
+resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
+resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
@@ -151,9 +161,12 @@ pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
+pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
+pd_16384: dd 16384
+pd_0_4: dd 0, 4
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h
@@ -237,6 +250,7 @@ cextern mc_subpel_filters
cextern mc_warp_filter
cextern obmc_masks_avx2
+cextern resize_filter
SECTION .text
@@ -4708,4 +4722,145 @@ cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
jl .w128
RET
+cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax ; horizontal 8-tap resize of h rows of 16-bit pixels, 16 output pixels per .loop_x pass
+ sub dword mx0m, 4<<14 ; mx0 -= 4 pixels (positions are fixed point with 14 fractional bits, see the >>14 below)
+ sub dword src_wm, 8 ; src_w -= 8: upper bound for the clipped source position
+ mov r6, ~0 ; all-ones, source of the gather masks
+ vpbroadcastd m5, dxm ; m5 = dx in every dword
+ vpbroadcastd m8, mx0m ; m8 = biased mx0 in every dword
+ vpbroadcastd m6, src_wm ; m6 = src_w-8 in every dword
+ kmovq k6, r6 ; k6 = master all-ones mask; copied to k1-k4 before each gather (a gather zeroes its mask)
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax ; register that held src_w is reused as x; dx/mx0 slots are unnamed
+ LEA r7, $$ ; r7 = address of $$ (section start), base for all constant accesses
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_16384] ; rounding term: 64<<8 (coefficients are pre-shifted by 8, see resize_shufA/B)
+ vpbroadcastd m7, [base+pd_63] ; mask selecting one of 64 filter rows
+ mova m24, [base+resize_permA] ; vpermi2d indices: dwords a/b of 4 pixels from each source
+ mova m25, [base+resize_permB] ; vpermi2d indices: dwords c/d of 4 pixels from each source
+ mova m26, [base+resize_permC] ; vpermi2q indices: even 128-bit groups of both sources
+ mova m27, [base+resize_permD] ; vpermi2q indices: odd 128-bit groups of both sources
+ vbroadcasti32x4 m28, [base+resize_shufA] ; pshufb control: coeff bytes 0,1 of each dword -> high byte of each word (low byte zeroed)
+ vbroadcasti32x4 m29, [base+resize_shufB] ; pshufb control: coeff bytes 2,3 of each dword -> high byte of each word (low byte zeroed)
+ mova m30, [base+resize_permE] ; qword indices 0,2,4,6; NOTE(review): table is 32 bytes, a 64-byte load would also read the following data - only ym of the permuted result is used
+ vpbroadcastw ym31, pxmaxm ; ym31 = pxmax in every word (upper clamp for the output)
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]; NOTE(review): word multiply, only the low 16 bits of dx contribute - presumably dx always fits, confirm
+ pslld m5, 4 ; dx*16
+ pslld m6, 14 ; (src_w-8) in 14-bit fixed point
+ pxor m2, m2 ; m2 = 0 (lower clip bound and accumulator seed)
+.loop_y: ; one output row per iteration
+ xor xd, xd ; x = 0
+ mova m4, m8 ; per-line working version of mx
+.loop_x: ; 16 output pixels per iteration
+ pmaxsd m0, m4, m2 ; max(mx, 0)
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k5, m1, m1 ; k5 bit set for every lane whose position was clipped
+ pand m9, m7 ; filter offset (masked)
+ ktestw k5, k5 ; ZF set when no lane was clipped
+ jz .load ; fast path: no edge emulation needed
+ vpbroadcastq m14, [base+pd_0_4] ; dword pattern {0,4}: offsets of the two 4-pixel qwords fetched per output pixel
+ vpermq m10, m0, q1100 ; duplicate qwords 0,1 of each 256-bit half (src_x of pixels 0-3 / 8-B)
+ vpermq m11, m0, q3322 ; duplicate qwords 2,3 of each 256-bit half (src_x of pixels 4-7 / C-F)
+ vpermq m20, m1, q1100 ; same rearrangement for the edge offsets
+ vpermq m21, m1, q3322
+ punpckldq m10, m10 ; each src_x now appears in two adjacent dwords
+ punpckldq m11, m11
+ punpckldq m20, m20
+ punpckldq m21, m21
+ paddd m10, m14 ; index pairs {x, x+4}
+ paddd m11, m14
+ paddd m20, m14
+ paddd m21, m14
+ vextracti32x8 ym12, m10, 1 ; upper halves hold the indices for pixels 8-B
+ vextracti32x8 ym13, m11, 1 ; ... and C-F
+ vextracti32x8 ym22, m20, 1
+ vextracti32x8 ym23, m21, 1
+ kmovq k1, k6 ; fresh all-ones masks for the four source gathers
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 (8 source pixels = one 128-bit lane per output pixel)
+ vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
+ vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
+ vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
+ kmovq k1, k6 ; fresh masks for the shuffle-control gathers
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] ; 16 control bytes per pixel from resize_shuf+8+2*offset (offset 0 = identity, others repeat the edge pixel)
+ vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
+ vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
+ vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
+ pshufb m16, m0 ; apply edge emulation within each pixel's 128-bit lane
+ pshufb m17, m1
+ pshufb m18, m14
+ pshufb m19, m15
+ mova m20, m24 ; vpermi2d overwrites its index operand, so work on copies
+ mova m22, m24
+ mova m21, m25
+ mova m23, m25
+ vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
+ vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
+ vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
+ vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
+ mova m15, m26 ; index copies for vpermi2q
+ mova m17, m26
+ mova m16, m27
+ mova m18, m27
+ vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
+ vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
+ vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
+ vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
+ kmovq k1, k6 ; fresh masks for the coefficient gathers
+ kmovq k2, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] ; coefficient bytes 0-3 of each lane's filter row (8 bytes per row)
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] ; coefficient bytes 4-7
+ pshufb m10, m11, m28 ; taps 0,1 as words (coeff<<8)
+ pshufb m11, m11, m29 ; taps 2,3
+ pshufb m12, m13, m28 ; taps 4,5
+ pshufb m13, m13, m29 ; taps 6,7
+ jmp .filter
+.load: ; no clipping in any lane: read source pixel pairs directly
+ kmovq k1, k6 ; fresh masks for the four gathers below
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] ; coefficient bytes 0-3 of each lane's filter row
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] ; coefficient bytes 4-7
+ pshufb m10, m11, m28 ; taps 0,1 as words (coeff<<8)
+ pshufb m11, m11, m29 ; taps 2,3
+ pshufb m12, m13, m28 ; taps 4,5
+ pshufb m13, m13, m29 ; taps 6,7
+ vpgatherdd m15{k3}, [srcq+m0*2+ 0] ; source pixels 0,1 of each lane's window
+ vpgatherdd m16{k4}, [srcq+m0*2+ 4] ; source pixels 2,3
+ kmovq k1, k6 ; masks were consumed, refresh
+ kmovq k2, k6
+ vpgatherdd m17{k1}, [srcq+m0*2+ 8] ; source pixels 4,5
+ vpgatherdd m18{k2}, [srcq+m0*2+12] ; source pixels 6,7
+.filter: ; m15-m18 = pixel pairs, m10-m13 = matching coefficient pairs
+ mova m14, m2 ; accumulator = 0
+ vpdpwssd m14, m15, m10 ; sum += px0*c0 + px1*c1 (per dword lane)
+ vpdpwssd m14, m16, m11
+ vpdpwssd m14, m17, m12
+ vpdpwssd m14, m18, m13
+ psubd m14, m3, m14 ; 16384 - sum
+ psrad m14, 15 ; >> (7+8): the extra 8 undoes the coefficient pre-shift
+ packusdw m14, m14 ; dwords -> words with unsigned saturation (negatives become 0)
+ vpermq m14, m30, m14 ; collect the low qword of every 128-bit lane into ym14
+ pminsw ym14, ym31 ; clamp to pxmax
+ mova [dstq+xq*2], ym14 ; store 16 pixels; NOTE(review): aligned full-width store even when fewer than 16 pixels remain - presumably dst rows are aligned/padded, confirm
+ paddd m4, m5 ; mx += dx*16
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq ; advance to the next row
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
%endif ; ARCH_X86_64
diff --git a/src/x86/mc16_sse.asm b/src/x86/mc16_sse.asm
index da84684..6435bd0 100644
--- a/src/x86/mc16_sse.asm
+++ b/src/x86/mc16_sse.asm
@@ -8550,7 +8550,7 @@ cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
pshufd m5, m5, q0000
mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
%else
diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm
index a1e94f9..2719ef3 100644
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -5046,11 +5046,11 @@ cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
- vpbroadcastd m3, [base+pw_m256]
+ vpbroadcastd xm3, [base+pw_m256]
vpbroadcastd m7, [base+pd_63]
vbroadcasti128 m15, [base+pb_8x0_8x8]
pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
diff --git a/src/x86/mc_avx512.asm b/src/x86/mc_avx512.asm
index 1b02100..fb55449 100644
--- a/src/x86/mc_avx512.asm
+++ b/src/x86/mc_avx512.asm
@@ -193,29 +193,39 @@ bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 1
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+resize_permC: dd 0, 4, 8, 12
pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
wm_420_perm64: dq 0xfedcba9876543210
wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
-pb_127: times 4 db 127
-pw_m128 times 2 dw -128
-pw_1024: times 2 dw 1024
-pw_2048: times 2 dw 2048
-pw_6903: times 2 dw 6903
-pw_8192: times 2 dw 8192
-pd_32: dd 32
-pd_34: dd 34
-pd_512: dd 512
-pd_32768: dd 32768
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+pb_127: times 4 db 127
+pw_m128 times 2 dw -128
+pw_m256: times 2 dw -256
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_34: dd 34
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
%define pb_m64 (wm_sign+4)
%define pb_64 (wm_sign+8)
%define pd_2 (pd_0to7+8)
cextern mc_subpel_filters
-cextern mc_warp_filter
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+cextern mc_warp_filter
+cextern resize_filter
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
@@ -4450,4 +4460,87 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
jl .w128
RET
+cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0 ; horizontal 8-tap resize of h rows of 8-bit pixels, 16 output pixels per .loop_x pass
+ sub dword mx0m, 4<<14 ; mx0 -= 4 pixels (positions are fixed point with 14 fractional bits, see the >>14 below)
+ sub dword src_wm, 8 ; src_w -= 8: upper bound for the clipped source position
+ mov r6, ~0 ; all-ones, source of the gather masks
+ vpbroadcastd m5, dxm ; m5 = dx in every dword
+ vpbroadcastd m8, mx0m ; m8 = biased mx0 in every dword
+ vpbroadcastd m6, src_wm ; m6 = src_w-8 in every dword
+ kmovq k3, r6 ; k3 = master all-ones mask; copied to k1/k2 before each gather (a gather zeroes its mask)
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x ; register that held src_w is reused as x
+ LEA r7, $$ ; r7 = address of $$ (section start), base for all constant accesses
+%define base r7-$$
+ vpbroadcastd m3, [base+pw_m256] ; pmulhrsw by -256 computes (-v + 64) >> 7
+ vpbroadcastd m7, [base+pd_63] ; mask selecting one of 64 filter rows
+ vbroadcasti32x4 m15, [base+pb_8x0_8x8] ; per 128-bit lane: +0 for the low qword, +8 for the high qword
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]; NOTE(review): word multiply, only the low 16 bits of dx contribute - presumably dx always fits, confirm
+ pslld m5, 4 ; dx*16
+ pslld m6, 14 ; (src_w-8) in 14-bit fixed point
+ pxor m2, m2 ; m2 = 0 (lower clip bound and accumulator seed)
+ mova m16, [base+resize_permA] ; vpermi2d indices: even dwords (source bytes 0-3 of each pixel)
+ mova m17, [base+resize_permB] ; vpermi2d indices: odd dwords (source bytes 4-7 of each pixel)
+ mova xm18, [base+resize_permC] ; vpermd indices 0,4,8,12: first dword of each 128-bit lane
+.loop_y: ; one output row per iteration
+ xor xd, xd ; x = 0
+ mova m4, m8 ; per-line working version of mx
+.loop_x: ; 16 output pixels per iteration
+ pmaxsd m0, m4, m2 ; max(mx, 0)
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k4, m1, m1 ; k4 bit set for every lane whose position was clipped
+ pand m9, m7 ; filter offset (masked)
+ ktestw k4, k4 ; ZF set when no lane was clipped
+ jz .load ; fast path: no edge emulation needed
+ vextracti32x8 ym12, m0, 1 ; src_x of pixels 8-15
+ vextracti32x8 ym13, m1, 1 ; edge offsets of pixels 8-15
+ kmovq k1, k3 ; fresh all-ones masks for the source gathers
+ kmovq k2, k3
+ vpgatherdq m10{k1}, [srcq+ym0] ; 8 source bytes for each of pixels 0-7
+ vpgatherdq m11{k2}, [srcq+ym12] ; 8 source bytes for each of pixels 8-15
+ kmovq k1, k3 ; fresh masks for the shuffle-control gathers
+ kmovq k2, k3
+ vpgatherdq m14{k1}, [base+resize_shuf+4+ym1] ; 8 control bytes per pixel from resize_shuf+4+offset (offset 0 = identity, others repeat the edge pixel)
+ vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
+ mova m12, m16 ; vpermi2d overwrites its index operand, so work on copies
+ mova m13, m17
+ paddb m14, m15 ; controls in the high qword of each lane must address bytes 8-15
+ paddb m0, m15
+ pshufb m10, m14 ; apply edge emulation
+ pshufb m11, m0
+ vpermi2d m12, m10, m11 ; source bytes 0-3 of all 16 pixels
+ vpermi2d m13, m10, m11 ; source bytes 4-7 of all 16 pixels
+ jmp .filter
+.load: ; no clipping in any lane: read source bytes directly
+ kmovq k1, k3 ; fresh masks for the two gathers below
+ kmovq k2, k3
+ vpgatherdd m12{k1}, [srcq+m0+0] ; source bytes 0-3 of each lane's window
+ vpgatherdd m13{k2}, [srcq+m0+4] ; source bytes 4-7
+.filter: ; m12/m13 = source bytes 0-3 / 4-7 for each output pixel
+ kmovq k1, k3 ; fresh masks for the coefficient gathers
+ kmovq k2, k3
+ vpgatherdd m10{k1}, [base+resize_filter+m9*8+0] ; coefficient bytes 0-3 of each lane's filter row (8 bytes per row)
+ vpgatherdd m11{k2}, [base+resize_filter+m9*8+4] ; coefficient bytes 4-7
+ mova m14, m2 ; accumulator = 0
+ vpdpbusd m14, m12, m10 ; sum += unsigned pixel bytes * signed coefficient bytes (taps 0-3)
+ vpdpbusd m14, m13, m11 ; taps 4-7
+ packssdw m14, m14 ; dwords -> words, signed saturation
+ pmulhrsw m14, m3 ; (-sum + 64) >> 7
+ packuswb m14, m14 ; words -> bytes with unsigned saturation; each lane's first dword holds its 4 pixels
+ vpermd m14, m18, m14 ; collect those dwords into xm14
+ mova [dstq+xq], xm14 ; store 16 pixels; NOTE(review): aligned full-width store even when fewer than 16 pixels remain - presumably dst rows are aligned/padded, confirm
+ paddd m4, m5 ; mx += dx*16
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq ; advance to the next row
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
%endif ; ARCH_X86_64
diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc_init_tmpl.c
index b98a73b..57680ea 100644
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -294,5 +294,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->blend_h = BF(dav1d_blend_h, avx512icl);
c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ c->resize = BF(dav1d_resize, avx512icl);
#endif
}
diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
index e3214eb..54939c6 100644
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -9404,7 +9404,7 @@ cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
pshufd m5, m5, q0000
%if ARCH_X86_64
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
LEA r7, $$
%define base r7-$$
%else