Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Matthias Dressel <code@deadcode.eu> 2022-04-24 21:13:01 +0300
committer Matthias Dressel <code@deadcode.eu> 2022-04-24 21:58:00 +0300
commit ffb59680356fd210816cf9e46d9d023ade1f4d5a (patch)
tree 7bcb9a12f640777aeccf7a9d589f857928a9eb09 /src
parent e67a500054c7d5529c8e48841cf60d7b9dcb3ec5 (diff)
x86/itx: Add 32x8 12bpc AVX2 transforms
inv_txfm_add_32x8_dct_dct_0_12bpc_c: 286.7
inv_txfm_add_32x8_dct_dct_0_12bpc_avx2: 20.1
inv_txfm_add_32x8_dct_dct_1_12bpc_c: 7832.7
inv_txfm_add_32x8_dct_dct_1_12bpc_avx2: 710.6
inv_txfm_add_32x8_dct_dct_2_12bpc_c: 7838.1
inv_txfm_add_32x8_dct_dct_2_12bpc_avx2: 711.6
inv_txfm_add_32x8_dct_dct_3_12bpc_c: 7818.3
inv_txfm_add_32x8_dct_dct_3_12bpc_avx2: 710.9
inv_txfm_add_32x8_dct_dct_4_12bpc_c: 7820.6
inv_txfm_add_32x8_dct_dct_4_12bpc_avx2: 710.5
inv_txfm_add_32x8_identity_identity_0_12bpc_c: 1526.6
inv_txfm_add_32x8_identity_identity_0_12bpc_avx2: 19.3
inv_txfm_add_32x8_identity_identity_1_12bpc_c: 1519.4
inv_txfm_add_32x8_identity_identity_1_12bpc_avx2: 19.9
inv_txfm_add_32x8_identity_identity_2_12bpc_c: 1519.9
inv_txfm_add_32x8_identity_identity_2_12bpc_avx2: 43.6
inv_txfm_add_32x8_identity_identity_3_12bpc_c: 1519.4
inv_txfm_add_32x8_identity_identity_3_12bpc_avx2: 67.8
inv_txfm_add_32x8_identity_identity_4_12bpc_c: 1523.2
inv_txfm_add_32x8_identity_identity_4_12bpc_avx2: 91.6
Diffstat (limited to 'src')
-rw-r--r-- src/x86/itx16_avx2.asm 163
-rw-r--r-- src/x86/itx_init_tmpl.c 1
2 files changed, 129 insertions, 35 deletions
diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm
index df37de0..0b6a460 100644
--- a/src/x86/itx16_avx2.asm
+++ b/src/x86/itx16_avx2.asm
@@ -4363,6 +4363,10 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_16x8_internal_10bpc).pass1
.pass2:
+ call .pass2_main
+ RET
+ALIGN function_align
+.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -4406,8 +4410,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpermq m1, m5, q3120
vpermq m2, m6, q3120
vpermq m3, m7, q3120
- call m(idct_16x8_internal_10bpc).write_16x4_zero
- RET
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
.write_16x4_start:
vpbroadcastd m9, [pixel_12bpc_max]
@@ -4426,7 +4429,8 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iadst_16x8_internal_10bpc).pass1
.pass2:
call .pass2_main
- jmp m(idct_16x8_internal_12bpc).end
+ call m(idct_16x8_internal_12bpc).end
+ RET
ALIGN function_align
.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
@@ -5453,7 +5457,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
call m(idct_16x16_internal_12bpc).write_16x16
RET
-%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
+%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
mova m%4, [r6+32*(%1-4)]
mova m%2, [r5+32*(3-%1)]
mova m%5, [r4+32*(%1-4)]
@@ -5470,8 +5474,10 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
paddd m%2, m%3, m%5 ; out15 - n
psubd m%3, m%5 ; out16 + n
REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%if %7 & 1
packssdw m%1, m%3 ; out0 + n, out16 + n
packssdw m%2, m%4 ; out15 - n, out31 - n
+%endif
%endmacro
cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
@@ -6165,6 +6171,47 @@ ALIGN function_align
mova [cq+128*7], m7
add cq, 32
ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2, 0
+ mova [cq+32*16], m8
+ mova [cq+32*31], m9
+ IDCT32_END 1, 14, 8, 9, 10, 2, 0
+ mova [cq+32*17], m8
+ mova [cq+32*30], m9
+ mova [cq+32*14], m14
+ IDCT32_END 2, 14, 8, 9, 10, 2, 0
+ mova [cq+32*18], m8
+ mova [cq+32*29], m9
+ mova [cq+32*13], m14
+ IDCT32_END 3, 14, 8, 9, 10, 2, 0
+ mova [cq+32*19], m8
+ mova [cq+32*28], m9
+ mova [cq+32*12], m14
+ IDCT32_END 4, 14, 8, 9, 10, 2, 0
+ mova [cq+32*20], m8
+ mova [cq+32*27], m9
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ IDCT32_END 5, 10, 0, 1, 2, 2, 0
+ mova [cq+32*21], m0
+ mova [cq+32*26], m1
+ IDCT32_END 6, 9, 0, 1, 2, 2, 0
+ mova [cq+32*22], m0
+ mova [cq+32*25], m1
+ IDCT32_END 7, 8, 0, 1, 2, 2, 0
+ mova [cq+32*23], m0
+ mova [cq+32*24], m1
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 1]
+ mova m2, [cq+32* 2]
+ mova m11, m14
+ mova m12, [cq+32*12]
+ mova m13, [cq+32*13]
+ mova m14, [cq+32*14]
+ ret
cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
vpbroadcastd m7, [pixel_12bpc_max]
@@ -6202,6 +6249,39 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
RET
.full:
PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ call .pass1
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ lea r6, [deint_shuf+128]
+ vpbroadcastd m11, [pw_2048]
+ mov r4, dstq
+ call .pass2
+ mova m0, [r5+32*3] ; 16 17
+ mova m1, [r5+32*2] ; 30 31
+ mova m2, [r5+32*1] ; 18 19
+ mova m3, [r5+32*0] ; 28 29
+ mova m4, [r5-32*1] ; 20 21
+ mova m5, [r5-32*2] ; 26 27
+ mova m6, [r5-32*3] ; 22 23
+ mova m7, [r5-32*4] ; 24 25
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ lea dstq, [r4+32]
+ call .pass2
+ RET
+ALIGN function_align
+.pass2:
+ call m(idct_16x8_internal_8bpc).main
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.pass1:
mova m0, [cq+32* 1]
mova m1, [cq+32* 7]
mova m2, [cq+32* 9]
@@ -6211,10 +6291,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
mova m6, [cq+32*25]
mova m7, [cq+32*31]
vpbroadcastd m11, [pd_2048]
- vpbroadcastd m12, [clip_18b_min]
- vpbroadcastd m13, [clip_18b_max]
vpbroadcastd m14, [pd_2896]
- lea r6, [rsp+32*4]
call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
mova m0, [cq+32* 3]
mova m1, [cq+32* 5]
@@ -6244,37 +6321,12 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
mova m7, [cq+32*28]
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
- call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
- lea r6, [deint_shuf+128]
- vpbroadcastd m11, [pw_2048]
- mov r4, dstq
- call .pass2
- mova m0, [r5+32*3] ; 16 17
- mova m1, [r5+32*2] ; 30 31
- mova m2, [r5+32*1] ; 18 19
- mova m3, [r5+32*0] ; 28 29
- mova m4, [r5-32*1] ; 20 21
- mova m5, [r5-32*2] ; 26 27
- mova m6, [r5-32*3] ; 22 23
- mova m7, [r5-32*4] ; 24 25
- call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
- lea dstq, [r4+32]
- call .pass2
- RET
-ALIGN function_align
-.pass2:
- call m(idct_16x8_internal_8bpc).main
- REPX {pmulhrsw x, m11}, m0, m1, m2, m3
- call m(idct_16x8_internal_10bpc).write_16x4_start
- pmulhrsw m0, m11, m4
- pmulhrsw m1, m11, m5
- pmulhrsw m2, m11, m6
- pmulhrsw m3, m11, m7
- jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ ret
cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
- vpbroadcastd m5, [pw_4096]
vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_4096]
pxor m6, m6
mov r6d, eobd
add eobb, 21
@@ -6301,6 +6353,47 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
jge .loop
RET
+cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 2896
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ vpbroadcastd m4, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
+ call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
+ mov r4, dstq
+ call m(idct_16x8_internal_12bpc).pass2_main
+ mova m0, [cq+32* 0] ; 16
+ mova m1, [cq+32* 1] ; 17
+ mova m2, [cq+32* 2] ; 18
+ mova m3, [cq+32* 3] ; 19
+ mova m4, [cq+32* 4] ; 20
+ mova m5, [cq+32* 5] ; 21
+ mova m6, [cq+32* 6] ; 22
+ mova m7, [cq+32* 7] ; 23
+ mova m8, [cq+32* 8] ; 24
+ mova m9, [cq+32* 9] ; 25
+ mova m10, [cq+32*10] ; 26
+ mova m11, [cq+32*11] ; 27
+ mova m12, [cq+32*12] ; 28
+ mova m13, [cq+32*13] ; 29
+ mova m14, [cq+32*14] ; 30
+ mova m15, [cq+32*15] ; 31
+ lea dstq, [r4+32]
+ call m(idct_16x8_internal_12bpc).pass2_main
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
+
%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
mova m%4, [%2]
paddsw m%3, m%1, m%4
diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx_init_tmpl.c
index 3bce9da..d643592 100644
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -275,6 +275,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
#endif
if (bpc > 10) return;