diff options
author | Matthias Dressel <code@deadcode.eu> | 2021-11-30 04:54:07 +0300 |
---|---|---|
committer | Matthias Dressel <code@deadcode.eu> | 2021-12-04 07:04:37 +0300 |
commit | 23e8405c2e28a381174266f5f09eb197643691a3 (patch) | |
tree | 4fbb09888fed761f144be0f58da4d0a68d0cb50c /src/x86 | |
parent | 7b99b0e17fbd86f0847684798b182f9122545580 (diff) |
x86/itx: Add 8x16 12bpc AVX2 transforms
inv_txfm_add_8x16_adst_adst_0_12bpc_c: 4440.4
inv_txfm_add_8x16_adst_adst_0_12bpc_avx2: 354.3
inv_txfm_add_8x16_adst_adst_1_12bpc_c: 4437.3
inv_txfm_add_8x16_adst_adst_1_12bpc_avx2: 354.3
inv_txfm_add_8x16_adst_adst_2_12bpc_c: 4438.8
inv_txfm_add_8x16_adst_adst_2_12bpc_avx2: 442.6
inv_txfm_add_8x16_adst_dct_0_12bpc_c: 4507.3
inv_txfm_add_8x16_adst_dct_0_12bpc_avx2: 310.0
inv_txfm_add_8x16_adst_dct_1_12bpc_c: 4500.3
inv_txfm_add_8x16_adst_dct_1_12bpc_avx2: 310.0
inv_txfm_add_8x16_adst_dct_2_12bpc_c: 4516.1
inv_txfm_add_8x16_adst_dct_2_12bpc_avx2: 399.5
inv_txfm_add_8x16_adst_flipadst_0_12bpc_c: 4457.3
inv_txfm_add_8x16_adst_flipadst_0_12bpc_avx2: 355.6
inv_txfm_add_8x16_adst_flipadst_1_12bpc_c: 4441.3
inv_txfm_add_8x16_adst_flipadst_1_12bpc_avx2: 355.6
inv_txfm_add_8x16_adst_flipadst_2_12bpc_c: 4448.9
inv_txfm_add_8x16_adst_flipadst_2_12bpc_avx2: 445.5
inv_txfm_add_8x16_adst_identity_0_12bpc_c: 3204.0
inv_txfm_add_8x16_adst_identity_0_12bpc_avx2: 173.1
inv_txfm_add_8x16_adst_identity_1_12bpc_c: 3207.1
inv_txfm_add_8x16_adst_identity_1_12bpc_avx2: 173.6
inv_txfm_add_8x16_adst_identity_2_12bpc_c: 3210.4
inv_txfm_add_8x16_adst_identity_2_12bpc_avx2: 261.2
inv_txfm_add_8x16_dct_adst_0_12bpc_c: 4484.2
inv_txfm_add_8x16_dct_adst_0_12bpc_avx2: 334.0
inv_txfm_add_8x16_dct_adst_1_12bpc_c: 4503.8
inv_txfm_add_8x16_dct_adst_1_12bpc_avx2: 334.6
inv_txfm_add_8x16_dct_adst_2_12bpc_c: 4490.7
inv_txfm_add_8x16_dct_adst_2_12bpc_avx2: 395.6
inv_txfm_add_8x16_dct_dct_0_12bpc_c: 419.9
inv_txfm_add_8x16_dct_dct_0_12bpc_avx2: 37.6
inv_txfm_add_8x16_dct_dct_1_12bpc_c: 4482.6
inv_txfm_add_8x16_dct_dct_1_12bpc_avx2: 284.6
inv_txfm_add_8x16_dct_dct_2_12bpc_c: 4468.7
inv_txfm_add_8x16_dct_dct_2_12bpc_avx2: 348.3
inv_txfm_add_8x16_dct_flipadst_0_12bpc_c: 4468.4
inv_txfm_add_8x16_dct_flipadst_0_12bpc_avx2: 333.6
inv_txfm_add_8x16_dct_flipadst_1_12bpc_c: 4463.5
inv_txfm_add_8x16_dct_flipadst_1_12bpc_avx2: 333.5
inv_txfm_add_8x16_dct_flipadst_2_12bpc_c: 4459.4
inv_txfm_add_8x16_dct_flipadst_2_12bpc_avx2: 397.4
inv_txfm_add_8x16_dct_identity_0_12bpc_c: 3237.1
inv_txfm_add_8x16_dct_identity_0_12bpc_avx2: 149.6
inv_txfm_add_8x16_dct_identity_1_12bpc_c: 3229.9
inv_txfm_add_8x16_dct_identity_1_12bpc_avx2: 148.6
inv_txfm_add_8x16_dct_identity_2_12bpc_c: 3225.6
inv_txfm_add_8x16_dct_identity_2_12bpc_avx2: 211.3
inv_txfm_add_8x16_flipadst_adst_0_12bpc_c: 4532.1
inv_txfm_add_8x16_flipadst_adst_0_12bpc_avx2: 356.2
inv_txfm_add_8x16_flipadst_adst_1_12bpc_c: 4527.6
inv_txfm_add_8x16_flipadst_adst_1_12bpc_avx2: 356.1
inv_txfm_add_8x16_flipadst_adst_2_12bpc_c: 4532.5
inv_txfm_add_8x16_flipadst_adst_2_12bpc_avx2: 440.0
inv_txfm_add_8x16_flipadst_dct_0_12bpc_c: 4571.6
inv_txfm_add_8x16_flipadst_dct_0_12bpc_avx2: 310.3
inv_txfm_add_8x16_flipadst_dct_1_12bpc_c: 4554.5
inv_txfm_add_8x16_flipadst_dct_1_12bpc_avx2: 309.7
inv_txfm_add_8x16_flipadst_dct_2_12bpc_c: 4554.3
inv_txfm_add_8x16_flipadst_dct_2_12bpc_avx2: 399.9
inv_txfm_add_8x16_flipadst_flipadst_0_12bpc_c: 4497.2
inv_txfm_add_8x16_flipadst_flipadst_0_12bpc_avx2: 355.9
inv_txfm_add_8x16_flipadst_flipadst_1_12bpc_c: 4486.2
inv_txfm_add_8x16_flipadst_flipadst_1_12bpc_avx2: 355.6
inv_txfm_add_8x16_flipadst_flipadst_2_12bpc_c: 4493.4
inv_txfm_add_8x16_flipadst_flipadst_2_12bpc_avx2: 446.0
inv_txfm_add_8x16_flipadst_identity_0_12bpc_c: 3265.7
inv_txfm_add_8x16_flipadst_identity_0_12bpc_avx2: 173.8
inv_txfm_add_8x16_flipadst_identity_1_12bpc_c: 3270.8
inv_txfm_add_8x16_flipadst_identity_1_12bpc_avx2: 173.5
inv_txfm_add_8x16_flipadst_identity_2_12bpc_c: 3271.8
inv_txfm_add_8x16_flipadst_identity_2_12bpc_avx2: 261.6
inv_txfm_add_8x16_identity_adst_0_12bpc_c: 3295.3
inv_txfm_add_8x16_identity_adst_0_12bpc_avx2: 302.5
inv_txfm_add_8x16_identity_adst_1_12bpc_c: 3303.1
inv_txfm_add_8x16_identity_adst_1_12bpc_avx2: 303.0
inv_txfm_add_8x16_identity_adst_2_12bpc_c: 3304.6
inv_txfm_add_8x16_identity_adst_2_12bpc_avx2: 303.1
inv_txfm_add_8x16_identity_dct_0_12bpc_c: 3298.9
inv_txfm_add_8x16_identity_dct_0_12bpc_avx2: 257.8
inv_txfm_add_8x16_identity_dct_1_12bpc_c: 3308.1
inv_txfm_add_8x16_identity_dct_1_12bpc_avx2: 259.2
inv_txfm_add_8x16_identity_dct_2_12bpc_c: 3306.6
inv_txfm_add_8x16_identity_dct_2_12bpc_avx2: 259.2
inv_txfm_add_8x16_identity_flipadst_0_12bpc_c: 3294.7
inv_txfm_add_8x16_identity_flipadst_0_12bpc_avx2: 302.2
inv_txfm_add_8x16_identity_flipadst_1_12bpc_c: 3292.5
inv_txfm_add_8x16_identity_flipadst_1_12bpc_avx2: 302.2
inv_txfm_add_8x16_identity_flipadst_2_12bpc_c: 3275.4
inv_txfm_add_8x16_identity_flipadst_2_12bpc_avx2: 303.3
inv_txfm_add_8x16_identity_identity_0_12bpc_c: 2044.6
inv_txfm_add_8x16_identity_identity_0_12bpc_avx2: 116.2
inv_txfm_add_8x16_identity_identity_1_12bpc_c: 2059.9
inv_txfm_add_8x16_identity_identity_1_12bpc_avx2: 117.0
inv_txfm_add_8x16_identity_identity_2_12bpc_c: 2048.4
inv_txfm_add_8x16_identity_identity_2_12bpc_avx2: 116.2
Diffstat (limited to 'src/x86')
-rw-r--r-- | src/x86/itx16_avx2.asm | 372 | ||||
-rw-r--r-- | src/x86/itx_init_tmpl.c | 1 |
2 files changed, 301 insertions, 72 deletions
diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm index fc8960a..7a94e48 100644 --- a/src/x86/itx16_avx2.asm +++ b/src/x86/itx16_avx2.asm @@ -1,5 +1,6 @@ ; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC +; Copyright © 2021, Matthias Dressel ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without @@ -2729,8 +2730,8 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m7, [pixel_12bpc_max] jmp m(iidentity_8x8_internal_10bpc).pass2_main -%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset - INV_TXFM_FN %1, %2, %3, 8x16 +%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -2738,7 +2739,7 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 add r6d, 2048 sar r6d, 12 imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly + jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly %endif %endmacro @@ -2749,10 +2750,11 @@ INV_TXFM_8X16_FN dct, flipadst cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp - vpbroadcastd m14, [pd_2896] - vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 @@ -2942,10 +2944,11 @@ INV_TXFM_8X16_FN adst, identity, 35 cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp - vpbroadcastd m14, [pd_2896] - vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 @@ -3007,10 +3010,11 @@ INV_TXFM_8X16_FN flipadst, identity, 35 cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp - vpbroadcastd m14, [pd_2896] - vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 @@ -3078,6 +3082,7 @@ INV_TXFM_8X16_FN identity, identity %endmacro cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 0] pmulld m8, m15, [cq+32* 1] @@ -3111,37 +3116,41 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 - packssdw m7, m15 + packssdw m13, m7, m15 vpbroadcastd m8, [pw_1697x16] - REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 + vpbroadcastd m7, [pixel_10bpc_max] + vpbroadcastd m12, [pw_2048] + call .pass2_end + RET +ALIGN function_align +.pass2_end: punpckhwd m9, m0, m1 punpcklwd m0, m1 - punpckhwd m1, m6, m7 - punpcklwd m6, m7 - punpckhwd m7, m4, m5 + punpckhwd m1, m6, m13 + punpcklwd m6, m13 + punpckhwd m13, m4, m5 punpcklwd m4, m5 punpcklwd m5, m2, m3 punpckhwd m2, m3 - vpbroadcastd m12, [pw_2048] punpckhdq m3, m0, m5 punpckldq m0, m5 punpckhdq m11, m9, m2 punpckldq m9, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 - punpckldq m6, m7, m1 - punpckhdq m7, m1 + punpckldq m6, m13, m1 + punpckhdq m13, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m8, m9, m6 punpckhqdq m9, m6 - punpcklqdq m10, m11, m7 - punpckhqdq m11, m7 + punpcklqdq m10, m11, m13 + punpckhqdq m11, m13 pmulhrsw m0, m12 pmulhrsw m1, m12 - vpbroadcastd m7, [pixel_10bpc_max] call m(iidentity_8x8_internal_10bpc).write_2x8x2_start pmulhrsw m0, m12, m2 pmulhrsw m1, m12, m3 @@ -3153,6 +3162,212 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 pmulhrsw m0, m12, m10 pmulhrsw m1, m12, m11 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero + ret + +INV_TXFM_8X16_FN dct, dct, 0, 12 +INV_TXFM_8X16_FN dct, identity, 35, 12 +INV_TXFM_8X16_FN dct, adst, 0, 12 +INV_TXFM_8X16_FN dct, flipadst, 0, 12 + +cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_8x16_internal_10bpc).pass1 +.pass2: + lea r6, [rsp+32*4] + call .transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*10], m2 + mova [cq+32*12], m4 + mova [cq+32*14], m6 + pmaxsd m0, m12, [cq+32* 1] + pmaxsd m4, m12, m1 + pmaxsd m1, m12, [cq+32* 3] + pmaxsd m2, m12, [cq+32* 5] + pmaxsd m6, m12, m5 + pmaxsd m5, m12, m3 + pmaxsd m3, m12, [cq+32* 7] + pmaxsd m7, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 2] + pmaxsd m2, m12, [cq+32* 4] + pmaxsd m3, m12, [cq+32* 6] + pmaxsd m4, m12, [cq+32* 8] + pmaxsd m5, m12, [cq+32*10] + pmaxsd m6, m12, [cq+32*12] + pmaxsd m7, m12, [cq+32*14] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + vpbroadcastd m11, [pd_8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_16x8_internal_10bpc).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 +.end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m2, q3120 + vpermq m1, m3, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +ALIGN function_align +.transpose: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + call m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + mova [cq+32* 3], m3 + mova [cq+32* 4], m4 + mova [cq+32* 5], m5 + mova [cq+32* 6], m6 + mova [cq+32* 7], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, m12 + mova m5, m13 + mova m6, m14 + mova m7, m15 + jmp m(idct_8x8_internal_12bpc).transpose_8x8 + +INV_TXFM_8X16_FN adst, dct, 0, 12 +INV_TXFM_8X16_FN adst, adst, 0, 12 +INV_TXFM_8X16_FN adst, flipadst, 0, 12 +INV_TXFM_8X16_FN adst, identity, 35, 12 + +cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x16_internal_10bpc).pass1 +.pass2: + lea r6, [rsp+32*4] + call .pass2_main + call m(iadst_16x8_internal_10bpc).pass1_rotations +.pass2_end: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp m(idct_8x16_internal_12bpc).end +ALIGN function_align +.pass2_main: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*15], m7 + pmaxsd m0, m13, [cq+32* 2] ; 2 + pmaxsd m3, m13, m1 ; 9 + pmaxsd m1, m13, m5 ; 13 + pmaxsd m4, m13, m2 ; 10 + pmaxsd m2, m13, [cq+32* 6] ; 6 + pmaxsd m5, m13, [cq+32* 5] ; 5 + pmaxsd m6, m13, m6 ; 14 + pmaxsd m7, m13, [cq+32* 1] ; 1 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m12, [pd_2048] + vpbroadcastd m15, [pd_2896] + call m(iadst_16x8_internal_10bpc).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m1, m13, [cq+32*15] ; 15 + pmaxsd m2, m13, [cq+32* 4] ; 4 + pmaxsd m3, m13, [cq+32*11] ; 11 + pmaxsd m4, m13, [cq+32* 8] ; 8 + pmaxsd m5, m13, [cq+32* 7] ; 7 + pmaxsd m6, m13, [cq+32*12] ; 12 + pmaxsd m7, m13, [cq+32* 3] ; 3 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_16x8_internal_10bpc).main_part2 + vpbroadcastd m14, [pd_34816] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_34815 + pslld m15, 3 ; pd_8 + ret + +INV_TXFM_8X16_FN flipadst, dct, 0, 12 +INV_TXFM_8X16_FN flipadst, adst, 0, 12 +INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 +INV_TXFM_8X16_FN flipadst, identity, 35, 12 + +cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_8x16_internal_10bpc).pass1 +.pass2: + lea r6, [rsp+32*4] + call m(iadst_8x16_internal_12bpc).pass2_main + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + jmp m(iadst_8x16_internal_12bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct, 0, 12 +INV_TXFM_8X16_FN identity, adst, 0, 12 +INV_TXFM_8X16_FN identity, flipadst, 0, 12 +INV_TXFM_8X16_FN identity, identity, 0, 12 + +cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_8x16_internal_10bpc).pass1 +.pass2: + mova [cq], m7 + vpbroadcastd m7, [clip_18b_min] + REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmaxsd m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [clip_18b_max] + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pminsd m15, [cq] + mova [cq], m7 + vpbroadcastd m7, [pd_11586] + REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmulld m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [pd_2048] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + paddd m15, [cq] + REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m13, m7, m15 + vpbroadcastd m7, [pixel_12bpc_max] + vpbroadcastd m12, [pw_16384] + call m(iidentity_8x16_internal_10bpc).pass2_end RET %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth @@ -3671,6 +3886,29 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 call m(idct_8x16_internal_10bpc).main_evenhalf psrld m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call .pass1_rotations + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [pw_2048] +.end: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start +.end2: + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: mova m14, [r6-32*4] mova m13, [r6-32*3] mova m12, [r6-32*2] @@ -3694,25 +3932,7 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 paddd m6, m8 ; out6 psubd m8, m7, [r6+32*3] ; out8 paddd m7, [r6+32*3] ; out7 - REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ - m8, m9, m10, m11, m12, m13, m14, m15 - jmp tx2q -.pass2: - call .transpose - call m(idct_16x8_internal_8bpc).main - vpbroadcastd m10, [pw_2048] -.end: - pmulhrsw m0, m10 - pmulhrsw m1, m10 - pmulhrsw m2, m10 - pmulhrsw m3, m10 - call .write_16x4_start - pmulhrsw m0, m4, m10 - pmulhrsw m1, m5, m10 - pmulhrsw m2, m6, m10 - pmulhrsw m3, m7, m10 - call .write_16x4_zero - RET + ret ALIGN function_align .transpose: lea r6, [deint_shuf+128] @@ -3792,22 +4012,7 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m14, [pd_6144] psrld m15, 11 ; pd_1 psubd m13, m14, m15 ; pd_6143 - paddd m0, m15 - psubd m1, m15, m1 - paddd m2, m15 - psubd m3, m15, m3 - paddd m4, m14 - psubd m5, m13, m5 - paddd m6, m14 - psubd m7, m13, m7 - paddd m8, m14, m9 - psubd m9, m13, m10 - paddd m10, m14, m11 - psubd m11, m13, m12 - paddd m12, m15, [r6-32*1] - psubd m13, m15, [r6-32*2] - paddd m14, m15, [r6-32*3] - psubd m15, [r6-32*4] + call .pass1_rotations .pass1_end: REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 @@ -3831,6 +4036,25 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align +.pass1_rotations: + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] + ret +ALIGN function_align .main: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 2] @@ -4006,23 +4230,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m14, [pd_6144] psrld m15, 11 psubd m13, m14, m15 - psubd m8, m13, m7 - paddd m7, m14, m9 - paddd m9, m14, m6 - psubd m6, m13, m10 - psubd m10, m13, m5 - paddd m5, m14, m11 - paddd m11, m14, m4 - psubd m4, m13, m12 - psubd m12, m15, m3 - paddd m3, m15, [r6-32*1] - paddd m13, m15, m2 - psubd m2, m15, [r6-32*2] - psubd m14, m15, m1 - mova m1, m15 - paddd m15, m0 - psubd m0, m1, [r6-32*4] - paddd m1, [r6-32*3] + call .pass1_rotations jmp m(iadst_16x8_internal_10bpc).pass1_end .pass2: call m(idct_16x8_internal_10bpc).transpose @@ -4046,6 +4254,26 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 pmulhrsw m3, m12, m10 call m(idct_16x8_internal_10bpc).write_16x4_zero RET +ALIGN function_align +.pass1_rotations: + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + ret INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx_init_tmpl.c index 1c95b62..7ffbde9 100644 --- a/src/x86/itx_init_tmpl.c +++ b/src/x86/itx_init_tmpl.c @@ -270,6 +270,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, assign_itx16_bpc_fn(R, 4, 16, 12, avx2); assign_itx16_bpc_fn(R, 8, 4, 12, avx2); assign_itx16_bpc_fn( , 8, 8, 12, avx2); + assign_itx16_bpc_fn(R, 8, 16, 12, avx2); assign_itx16_bpc_fn(R, 16, 4, 12, avx2); #endif |