github.com/videolan/dav1d.git

author    Matthias Dressel <code@deadcode.eu>  2021-11-30 04:54:07 +0300
committer Matthias Dressel <code@deadcode.eu>  2021-12-04 07:04:37 +0300
commit    23e8405c2e28a381174266f5f09eb197643691a3 (patch)
tree      4fbb09888fed761f144be0f58da4d0a68d0cb50c /src/x86
parent    7b99b0e17fbd86f0847684798b182f9122545580 (diff)
x86/itx: Add 8x16 12bpc AVX2 transforms
inv_txfm_add_8x16_adst_adst_0_12bpc_c: 4440.4  inv_txfm_add_8x16_adst_adst_0_12bpc_avx2: 354.3
inv_txfm_add_8x16_adst_adst_1_12bpc_c: 4437.3  inv_txfm_add_8x16_adst_adst_1_12bpc_avx2: 354.3
inv_txfm_add_8x16_adst_adst_2_12bpc_c: 4438.8  inv_txfm_add_8x16_adst_adst_2_12bpc_avx2: 442.6
inv_txfm_add_8x16_adst_dct_0_12bpc_c: 4507.3  inv_txfm_add_8x16_adst_dct_0_12bpc_avx2: 310.0
inv_txfm_add_8x16_adst_dct_1_12bpc_c: 4500.3  inv_txfm_add_8x16_adst_dct_1_12bpc_avx2: 310.0
inv_txfm_add_8x16_adst_dct_2_12bpc_c: 4516.1  inv_txfm_add_8x16_adst_dct_2_12bpc_avx2: 399.5
inv_txfm_add_8x16_adst_flipadst_0_12bpc_c: 4457.3  inv_txfm_add_8x16_adst_flipadst_0_12bpc_avx2: 355.6
inv_txfm_add_8x16_adst_flipadst_1_12bpc_c: 4441.3  inv_txfm_add_8x16_adst_flipadst_1_12bpc_avx2: 355.6
inv_txfm_add_8x16_adst_flipadst_2_12bpc_c: 4448.9  inv_txfm_add_8x16_adst_flipadst_2_12bpc_avx2: 445.5
inv_txfm_add_8x16_adst_identity_0_12bpc_c: 3204.0  inv_txfm_add_8x16_adst_identity_0_12bpc_avx2: 173.1
inv_txfm_add_8x16_adst_identity_1_12bpc_c: 3207.1  inv_txfm_add_8x16_adst_identity_1_12bpc_avx2: 173.6
inv_txfm_add_8x16_adst_identity_2_12bpc_c: 3210.4  inv_txfm_add_8x16_adst_identity_2_12bpc_avx2: 261.2
inv_txfm_add_8x16_dct_adst_0_12bpc_c: 4484.2  inv_txfm_add_8x16_dct_adst_0_12bpc_avx2: 334.0
inv_txfm_add_8x16_dct_adst_1_12bpc_c: 4503.8  inv_txfm_add_8x16_dct_adst_1_12bpc_avx2: 334.6
inv_txfm_add_8x16_dct_adst_2_12bpc_c: 4490.7  inv_txfm_add_8x16_dct_adst_2_12bpc_avx2: 395.6
inv_txfm_add_8x16_dct_dct_0_12bpc_c: 419.9  inv_txfm_add_8x16_dct_dct_0_12bpc_avx2: 37.6
inv_txfm_add_8x16_dct_dct_1_12bpc_c: 4482.6  inv_txfm_add_8x16_dct_dct_1_12bpc_avx2: 284.6
inv_txfm_add_8x16_dct_dct_2_12bpc_c: 4468.7  inv_txfm_add_8x16_dct_dct_2_12bpc_avx2: 348.3
inv_txfm_add_8x16_dct_flipadst_0_12bpc_c: 4468.4  inv_txfm_add_8x16_dct_flipadst_0_12bpc_avx2: 333.6
inv_txfm_add_8x16_dct_flipadst_1_12bpc_c: 4463.5  inv_txfm_add_8x16_dct_flipadst_1_12bpc_avx2: 333.5
inv_txfm_add_8x16_dct_flipadst_2_12bpc_c: 4459.4  inv_txfm_add_8x16_dct_flipadst_2_12bpc_avx2: 397.4
inv_txfm_add_8x16_dct_identity_0_12bpc_c: 3237.1  inv_txfm_add_8x16_dct_identity_0_12bpc_avx2: 149.6
inv_txfm_add_8x16_dct_identity_1_12bpc_c: 3229.9  inv_txfm_add_8x16_dct_identity_1_12bpc_avx2: 148.6
inv_txfm_add_8x16_dct_identity_2_12bpc_c: 3225.6  inv_txfm_add_8x16_dct_identity_2_12bpc_avx2: 211.3
inv_txfm_add_8x16_flipadst_adst_0_12bpc_c: 4532.1  inv_txfm_add_8x16_flipadst_adst_0_12bpc_avx2: 356.2
inv_txfm_add_8x16_flipadst_adst_1_12bpc_c: 4527.6  inv_txfm_add_8x16_flipadst_adst_1_12bpc_avx2: 356.1
inv_txfm_add_8x16_flipadst_adst_2_12bpc_c: 4532.5  inv_txfm_add_8x16_flipadst_adst_2_12bpc_avx2: 440.0
inv_txfm_add_8x16_flipadst_dct_0_12bpc_c: 4571.6  inv_txfm_add_8x16_flipadst_dct_0_12bpc_avx2: 310.3
inv_txfm_add_8x16_flipadst_dct_1_12bpc_c: 4554.5  inv_txfm_add_8x16_flipadst_dct_1_12bpc_avx2: 309.7
inv_txfm_add_8x16_flipadst_dct_2_12bpc_c: 4554.3  inv_txfm_add_8x16_flipadst_dct_2_12bpc_avx2: 399.9
inv_txfm_add_8x16_flipadst_flipadst_0_12bpc_c: 4497.2  inv_txfm_add_8x16_flipadst_flipadst_0_12bpc_avx2: 355.9
inv_txfm_add_8x16_flipadst_flipadst_1_12bpc_c: 4486.2  inv_txfm_add_8x16_flipadst_flipadst_1_12bpc_avx2: 355.6
inv_txfm_add_8x16_flipadst_flipadst_2_12bpc_c: 4493.4  inv_txfm_add_8x16_flipadst_flipadst_2_12bpc_avx2: 446.0
inv_txfm_add_8x16_flipadst_identity_0_12bpc_c: 3265.7  inv_txfm_add_8x16_flipadst_identity_0_12bpc_avx2: 173.8
inv_txfm_add_8x16_flipadst_identity_1_12bpc_c: 3270.8  inv_txfm_add_8x16_flipadst_identity_1_12bpc_avx2: 173.5
inv_txfm_add_8x16_flipadst_identity_2_12bpc_c: 3271.8  inv_txfm_add_8x16_flipadst_identity_2_12bpc_avx2: 261.6
inv_txfm_add_8x16_identity_adst_0_12bpc_c: 3295.3  inv_txfm_add_8x16_identity_adst_0_12bpc_avx2: 302.5
inv_txfm_add_8x16_identity_adst_1_12bpc_c: 3303.1  inv_txfm_add_8x16_identity_adst_1_12bpc_avx2: 303.0
inv_txfm_add_8x16_identity_adst_2_12bpc_c: 3304.6  inv_txfm_add_8x16_identity_adst_2_12bpc_avx2: 303.1
inv_txfm_add_8x16_identity_dct_0_12bpc_c: 3298.9  inv_txfm_add_8x16_identity_dct_0_12bpc_avx2: 257.8
inv_txfm_add_8x16_identity_dct_1_12bpc_c: 3308.1  inv_txfm_add_8x16_identity_dct_1_12bpc_avx2: 259.2
inv_txfm_add_8x16_identity_dct_2_12bpc_c: 3306.6  inv_txfm_add_8x16_identity_dct_2_12bpc_avx2: 259.2
inv_txfm_add_8x16_identity_flipadst_0_12bpc_c: 3294.7  inv_txfm_add_8x16_identity_flipadst_0_12bpc_avx2: 302.2
inv_txfm_add_8x16_identity_flipadst_1_12bpc_c: 3292.5  inv_txfm_add_8x16_identity_flipadst_1_12bpc_avx2: 302.2
inv_txfm_add_8x16_identity_flipadst_2_12bpc_c: 3275.4  inv_txfm_add_8x16_identity_flipadst_2_12bpc_avx2: 303.3
inv_txfm_add_8x16_identity_identity_0_12bpc_c: 2044.6  inv_txfm_add_8x16_identity_identity_0_12bpc_avx2: 116.2
inv_txfm_add_8x16_identity_identity_1_12bpc_c: 2059.9  inv_txfm_add_8x16_identity_identity_1_12bpc_avx2: 117.0
inv_txfm_add_8x16_identity_identity_2_12bpc_c: 2048.4  inv_txfm_add_8x16_identity_identity_2_12bpc_avx2: 116.2
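For scale: adst_adst_0 drops from 4440.4 to 354.3, i.e. 4440.4 / 354.3 ≈ 12.5x, and across the table the AVX2 versions come in roughly 10x to 22x under the C reference. The dct_dct_0 row is cheap in both columns because the eob == 0 case takes the dc-only shortcut visible in the first asm hunk below.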
Diffstat (limited to 'src/x86')
-rw-r--r--  src/x86/itx16_avx2.asm   372
-rw-r--r--  src/x86/itx_init_tmpl.c    1
2 files changed, 301 insertions, 72 deletions
diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm
index fc8960a..7a94e48 100644
--- a/src/x86/itx16_avx2.asm
+++ b/src/x86/itx16_avx2.asm
@@ -1,5 +1,6 @@
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
@@ -2729,8 +2730,8 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m7, [pixel_12bpc_max]
jmp m(iidentity_8x8_internal_10bpc).pass2_main
-%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
- INV_TXFM_FN %1, %2, %3, 8x16
+%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 8x16, %4
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
@@ -2738,7 +2739,7 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
add r6d, 2048
sar r6d, 12
imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+ jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly
%endif
%endmacro
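
The dct_dct branch of this macro is the dc-only shortcut: when eob is 0 there is a single nonzero coefficient, and the whole 8x16 transform collapses to scaling that one value. A minimal C sketch of the arithmetic, assuming the usual reading of the constants (2896 ≈ 2^12/sqrt(2), so this is the rectangular-transform rescale plus the pre-multiply the shared dconly tail expects; not dav1d's actual C code):

    static int dc_only_8x16(int dc)
    {
        /* imul r6d, [cq], 2896 ; add r6d, 2048 ; sar r6d, 12 */
        dc = (dc * 2896 + 2048) >> 12;  /* dc *= 1/sqrt(2), rounded */
        /* imul r6d, 2896 -- final rounding and the add-to-dest
         * happen in m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly */
        return dc * 2896;
    }

This is also why dct_dct_0 is the benchmark outlier above (37.6 vs 284.6 for the eob classes that run the full transform).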
@@ -2749,10 +2750,11 @@ INV_TXFM_8X16_FN dct, flipadst
cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
- vpbroadcastd m14, [pd_2896]
- vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
cmp eobd, 43
jl .fast
add cq, 32
@@ -2942,10 +2944,11 @@ INV_TXFM_8X16_FN adst, identity, 35
cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
- vpbroadcastd m14, [pd_2896]
- vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
cmp eobd, 43
jl .fast
add cq, 32
@@ -3007,10 +3010,11 @@ INV_TXFM_8X16_FN flipadst, identity, 35
cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
- vpbroadcastd m14, [pd_2896]
- vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
cmp eobd, 43
jl .fast
add cq, 32
@@ -3078,6 +3082,7 @@ INV_TXFM_8X16_FN identity, identity
%endmacro
cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+.pass1:
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 0]
pmulld m8, m15, [cq+32* 1]
@@ -3111,37 +3116,41 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
- packssdw m7, m15
+ packssdw m13, m7, m15
vpbroadcastd m8, [pw_1697x16]
- REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
+ vpbroadcastd m7, [pixel_10bpc_max]
+ vpbroadcastd m12, [pw_2048]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
punpckhwd m9, m0, m1
punpcklwd m0, m1
- punpckhwd m1, m6, m7
- punpcklwd m6, m7
- punpckhwd m7, m4, m5
+ punpckhwd m1, m6, m13
+ punpcklwd m6, m13
+ punpckhwd m13, m4, m5
punpcklwd m4, m5
punpcklwd m5, m2, m3
punpckhwd m2, m3
- vpbroadcastd m12, [pw_2048]
punpckhdq m3, m0, m5
punpckldq m0, m5
punpckhdq m11, m9, m2
punpckldq m9, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
- punpckldq m6, m7, m1
- punpckhdq m7, m1
+ punpckldq m6, m13, m1
+ punpckhdq m13, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m8, m9, m6
punpckhqdq m9, m6
- punpcklqdq m10, m11, m7
- punpckhqdq m11, m7
+ punpcklqdq m10, m11, m13
+ punpckhqdq m11, m13
pmulhrsw m0, m12
pmulhrsw m1, m12
- vpbroadcastd m7, [pixel_10bpc_max]
call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
pmulhrsw m0, m12, m2
pmulhrsw m1, m12, m3
@@ -3153,6 +3162,212 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
pmulhrsw m0, m12, m10
pmulhrsw m1, m12, m11
call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ ret
+
+INV_TXFM_8X16_FN dct, dct, 0, 12
+INV_TXFM_8X16_FN dct, identity, 35, 12
+INV_TXFM_8X16_FN dct, adst, 0, 12
+INV_TXFM_8X16_FN dct, flipadst, 0, 12
+
+cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*10], m2
+ mova [cq+32*12], m4
+ mova [cq+32*14], m6
+ pmaxsd m0, m12, [cq+32* 1]
+ pmaxsd m4, m12, m1
+ pmaxsd m1, m12, [cq+32* 3]
+ pmaxsd m2, m12, [cq+32* 5]
+ pmaxsd m6, m12, m5
+ pmaxsd m5, m12, m3
+ pmaxsd m3, m12, [cq+32* 7]
+ pmaxsd m7, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 2]
+ pmaxsd m2, m12, [cq+32* 4]
+ pmaxsd m3, m12, [cq+32* 6]
+ pmaxsd m4, m12, [cq+32* 8]
+ pmaxsd m5, m12, [cq+32*10]
+ pmaxsd m6, m12, [cq+32*12]
+ pmaxsd m7, m12, [cq+32*14]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ vpbroadcastd m11, [pd_8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+.end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ mova [cq+32* 3], m3
+ mova [cq+32* 4], m4
+ mova [cq+32* 5], m5
+ mova [cq+32* 6], m6
+ mova [cq+32* 7], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, m12
+ mova m5, m13
+ mova m6, m14
+ mova m7, m15
+ jmp m(idct_8x8_internal_12bpc).transpose_8x8
+
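A note on the .transpose helper just added: the 8x16 block occupies all sixteen YMM registers, so it is transposed as two independent 8x8 int32 sub-blocks, staging data through the coefficient buffer at cq because the existing 8x8 helper needs the full register file. A hedged C model (names illustrative):

    /* modeled after m(idct_8x16_internal_12bpc).transpose */
    static void transpose_8x8(int32_t b[8][8]);  /* existing 8x8 helper */

    static void transpose_8x16(int32_t lo[8][8], int32_t hi[8][8])
    {
        transpose_8x8(lo);  /* m0-m7; results parked at cq+32*0..7 */
        transpose_8x8(hi);  /* m8-m15, moved into m0-m7 first      */
    }
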
+INV_TXFM_8X16_FN adst, dct, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12
+INV_TXFM_8X16_FN adst, flipadst, 0, 12
+INV_TXFM_8X16_FN adst, identity, 35, 12
+
+cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .pass2_main
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_end:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp m(idct_8x16_internal_12bpc).end
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*15], m7
+ pmaxsd m0, m13, [cq+32* 2] ; 2
+ pmaxsd m3, m13, m1 ; 9
+ pmaxsd m1, m13, m5 ; 13
+ pmaxsd m4, m13, m2 ; 10
+ pmaxsd m2, m13, [cq+32* 6] ; 6
+ pmaxsd m5, m13, [cq+32* 5] ; 5
+ pmaxsd m6, m13, m6 ; 14
+ pmaxsd m7, m13, [cq+32* 1] ; 1
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m1, m13, [cq+32*15] ; 15
+ pmaxsd m2, m13, [cq+32* 4] ; 4
+ pmaxsd m3, m13, [cq+32*11] ; 11
+ pmaxsd m4, m13, [cq+32* 8] ; 8
+ pmaxsd m5, m13, [cq+32* 7] ; 7
+ pmaxsd m6, m13, [cq+32*12] ; 12
+ pmaxsd m7, m13, [cq+32* 3] ; 3
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_34816]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_34815
+ pslld m15, 3 ; pd_8
+ ret
+
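One detail in .pass2_main worth spelling out: its last four instructions manufacture three constants out of registers that are already loaded (per the asm comments, m15 still holds pd_2896) instead of issuing three more broadcasts. The arithmetic, as a hedged C illustration:

    static void derive_constants(void)
    {
        int m15 = 2896;       /* pd_2896, still live                     */
        int m14 = 34816;      /* pd_34816, broadcast once                */
        m15 >>= 11;           /* psrld m15, 11 -> 2896 >> 11 == 1 (pd_1) */
        int m13 = m14 - m15;  /* psubd -> 34815 (pd_34815)               */
        m15 <<= 3;            /* pslld m15, 3 -> 8 (pd_8)                */
        (void)m13;            /* three constants, zero extra loads       */
    }
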
+INV_TXFM_8X16_FN flipadst, dct, 0, 12
+INV_TXFM_8X16_FN flipadst, adst, 0, 12
+INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
+INV_TXFM_8X16_FN flipadst, identity, 35, 12
+
+cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call m(iadst_8x16_internal_12bpc).pass2_main
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_8x16_internal_12bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct, 0, 12
+INV_TXFM_8X16_FN identity, adst, 0, 12
+INV_TXFM_8X16_FN identity, flipadst, 0, 12
+INV_TXFM_8X16_FN identity, identity, 0, 12
+
+cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x16_internal_10bpc).pass1
+.pass2:
+ mova [cq], m7
+ vpbroadcastd m7, [clip_18b_min]
+ REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmaxsd m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [clip_18b_max]
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pminsd m15, [cq]
+ mova [cq], m7
+ vpbroadcastd m7, [pd_11586]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmulld m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [pd_2048]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq]
+ REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m7, [pixel_12bpc_max]
+ vpbroadcastd m12, [pw_16384]
+ call m(iidentity_8x16_internal_10bpc).pass2_end
RET
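
The identity pass2 above has no register to spare: coefficients fill m0-m15, so each broadcast constant (the clip min/max, pd_11586, pd_2048) temporarily evicts one data register into the already-consumed coefficient buffer. A hedged C rendering of one such round (layout and names illustrative):

    #include <stdint.h>
    #include <string.h>

    /* one clamp round: apply a broadcast minimum to 16 live vectors */
    static void clamp_min_16(int32_t v[16][8], int32_t cq[8], int32_t min)
    {
        memcpy(cq, v[7], sizeof(v[7]));           /* mova [cq], m7 (spill) */
        for (int i = 0; i < 16; i++) {            /* vpbroadcastd m7, ...  */
            if (i == 7) continue;                 /* REPX {pmaxsd x, m7}   */
            for (int j = 0; j < 8; j++)
                if (v[i][j] < min) v[i][j] = min;
        }
        for (int j = 0; j < 8; j++)               /* pmaxsd m7, [cq]:      */
            v[7][j] = cq[j] > min ? cq[j] : min;  /* clamp the spilled row */
    }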
%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
@@ -3671,6 +3886,29 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
call m(idct_8x16_internal_10bpc).main_evenhalf
psrld m11, 11 ; pd_1
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .pass1_rotations
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [pw_2048]
+.end:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call .write_16x4_start
+.end2:
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m10
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m10
+ call .write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
mova m14, [r6-32*4]
mova m13, [r6-32*3]
mova m12, [r6-32*2]
@@ -3694,25 +3932,7 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
paddd m6, m8 ; out6
psubd m8, m7, [r6+32*3] ; out8
paddd m7, [r6+32*3] ; out7
- REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
- m8, m9, m10, m11, m12, m13, m14, m15
- jmp tx2q
-.pass2:
- call .transpose
- call m(idct_16x8_internal_8bpc).main
- vpbroadcastd m10, [pw_2048]
-.end:
- pmulhrsw m0, m10
- pmulhrsw m1, m10
- pmulhrsw m2, m10
- pmulhrsw m3, m10
- call .write_16x4_start
- pmulhrsw m0, m4, m10
- pmulhrsw m1, m5, m10
- pmulhrsw m2, m6, m10
- pmulhrsw m3, m7, m10
- call .write_16x4_zero
- RET
+ ret
ALIGN function_align
.transpose:
lea r6, [deint_shuf+128]
@@ -3792,22 +4012,7 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m14, [pd_6144]
psrld m15, 11 ; pd_1
psubd m13, m14, m15 ; pd_6143
- paddd m0, m15
- psubd m1, m15, m1
- paddd m2, m15
- psubd m3, m15, m3
- paddd m4, m14
- psubd m5, m13, m5
- paddd m6, m14
- psubd m7, m13, m7
- paddd m8, m14, m9
- psubd m9, m13, m10
- paddd m10, m14, m11
- psubd m11, m13, m12
- paddd m12, m15, [r6-32*1]
- psubd m13, m15, [r6-32*2]
- paddd m14, m15, [r6-32*3]
- psubd m15, [r6-32*4]
+ call .pass1_rotations
.pass1_end:
REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
@@ -3831,6 +4036,25 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
ALIGN function_align
+.pass1_rotations:
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+ ret
+ALIGN function_align
.main:
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 2]
@@ -4006,23 +4230,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m14, [pd_6144]
psrld m15, 11
psubd m13, m14, m15
- psubd m8, m13, m7
- paddd m7, m14, m9
- paddd m9, m14, m6
- psubd m6, m13, m10
- psubd m10, m13, m5
- paddd m5, m14, m11
- paddd m11, m14, m4
- psubd m4, m13, m12
- psubd m12, m15, m3
- paddd m3, m15, [r6-32*1]
- paddd m13, m15, m2
- psubd m2, m15, [r6-32*2]
- psubd m14, m15, m1
- mova m1, m15
- paddd m15, m0
- psubd m0, m1, [r6-32*4]
- paddd m1, [r6-32*3]
+ call .pass1_rotations
jmp m(iadst_16x8_internal_10bpc).pass1_end
.pass2:
call m(idct_16x8_internal_10bpc).transpose
@@ -4046,6 +4254,26 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
pmulhrsw m3, m12, m10
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
+ALIGN function_align
+.pass1_rotations:
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ ret
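This hunk repeats the refactoring applied to idct and iadst above: the rotation tail of the 16x8 pass1 moves behind a ret so the new 8x16 12bpc pass2, which needs the same output ordering at a different final shift, can call it rather than duplicate it. In hedged C terms:

    static void pass1_rotations(int32_t *m);      /* the extracted tail */

    static void idct_16x8_10bpc_pass1(int32_t *m)
    {
        /* ... butterflies ... */
        pass1_rotations(m);
        /* REPX {psrad x, 1}, then jmp tx2q */
    }

    static void idct_8x16_12bpc_pass2(int32_t *m)
    {
        /* ... butterflies with 20-bit clipping ... */
        pass1_rotations(m);                       /* shared, not copied */
        /* REPX {psrad x, 4}, then pack and store */
    }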
INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx_init_tmpl.c
index 1c95b62..7ffbde9 100644
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -270,6 +270,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
#endif
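
On the C side the single added line is the whole integration: it points the sixteen 8x16 tx-type entries at 12 bpc to the new AVX2 symbols. A hedged sketch of roughly what the assign_itx16_bpc_fn expansion amounts to (the real macro is defined earlier in itx_init_tmpl.c; names shown here are assumptions from dav1d's usual naming scheme):

    c->itxfm_add[RTX_8X16][DCT_DCT] =
        dav1d_inv_txfm_add_dct_dct_8x16_12bpc_avx2;
    c->itxfm_add[RTX_8X16][IDTX] =
        dav1d_inv_txfm_add_identity_identity_8x16_12bpc_avx2;
    /* ...and the remaining 14 type1/type2 pairs, same pattern */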