From eaedb95de65fb5612297b148a306398240e8ad6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 11 Apr 2020 13:27:37 +0300 Subject: arm64: itx: Add NEON implementation of itx for 10 bpc Add an element size specifier to the existing individual transform functions for 8 bpc, naming them e.g. inv_dct_8h_x8_neon, to clarify that they operate on input vectors of 8h, and make the symbols public, to let the 10 bpc case call them from a different object file. The same convention is used in the new itx16.S, like inv_dct_4s_x8_neon. Make the existing itx.S compiled regardless of whether 8 bpc support is enabled. For builds with 8 bpc support disabled, this does include the unused frontend functions though, but this is hopefully tolerable to avoid having to split the file into a sharable file for transforms and a separate one for frontends. This only implements the 10 bpc case, as that case can use transforms operating on 16 bit coefficients in the second pass. Relative speedup vs C for a few functions: Cortex A53 A72 A73 inv_txfm_add_4x4_dct_dct_0_10bpc_neon: 4.14 4.06 4.49 inv_txfm_add_4x4_dct_dct_1_10bpc_neon: 6.51 6.49 6.42 inv_txfm_add_8x8_dct_dct_0_10bpc_neon: 5.02 4.63 6.23 inv_txfm_add_8x8_dct_dct_1_10bpc_neon: 8.54 7.13 11.96 inv_txfm_add_16x16_dct_dct_0_10bpc_neon: 5.52 6.60 8.03 inv_txfm_add_16x16_dct_dct_1_10bpc_neon: 11.27 9.62 12.22 inv_txfm_add_16x16_dct_dct_2_10bpc_neon: 9.60 6.97 8.59 inv_txfm_add_32x32_dct_dct_0_10bpc_neon: 2.60 3.48 3.19 inv_txfm_add_32x32_dct_dct_1_10bpc_neon: 14.65 12.64 16.86 inv_txfm_add_32x32_dct_dct_2_10bpc_neon: 11.57 8.80 12.68 inv_txfm_add_32x32_dct_dct_3_10bpc_neon: 8.79 8.00 9.21 inv_txfm_add_32x32_dct_dct_4_10bpc_neon: 7.58 6.21 7.80 inv_txfm_add_64x64_dct_dct_0_10bpc_neon: 2.41 2.85 2.75 inv_txfm_add_64x64_dct_dct_1_10bpc_neon: 12.91 10.27 12.24 inv_txfm_add_64x64_dct_dct_2_10bpc_neon: 10.96 7.97 10.31 inv_txfm_add_64x64_dct_dct_3_10bpc_neon: 8.95 7.42 9.55 inv_txfm_add_64x64_dct_dct_4_10bpc_neon: 7.97 6.12 7.82 --- src/arm/64/itx.S | 116 +- src/arm/64/itx16.S | 3514 +++++++++++++++++++++++++++++++++++++++++++++++ src/arm/64/util.S | 12 + src/arm/itx_init_tmpl.c | 4 +- 4 files changed, 3587 insertions(+), 59 deletions(-) create mode 100644 src/arm/64/itx16.S (limited to 'src/arm') diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index e8141e8..29dec19 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -449,14 +449,14 @@ endfunc sqsub \r2\sz, v3\sz, v7\sz .endm -function inv_dct_4x4_neon +function inv_dct_4h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .4h ret endfunc -function inv_dct_8x4_neon +function inv_dct_8h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .8h @@ -489,12 +489,12 @@ endfunc rshrn \o3\().4h, \o3\().4s, #12 .endm -function inv_adst_4x4_neon +function inv_adst_4h_x4_neon, export=1 iadst_4x4 v16, v17, v18, v19 ret endfunc -function inv_flipadst_4x4_neon +function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 v19, v18, v17, v16 ret endfunc @@ -555,17 +555,17 @@ endfunc rshrn2 \o3\().8h, v5.4s, #12 .endm -function inv_adst_8x4_neon +function inv_adst_8h_x4_neon, export=1 iadst_8x4 v16, v17, v18, v19 ret endfunc -function inv_flipadst_8x4_neon +function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 v19, v18, v17, v16 ret endfunc -function inv_identity_4x4_neon +function inv_identity_4h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.4h, v16.4h, v0.h[0] @@ -579,7 +579,7 @@ function inv_identity_4x4_neon ret 
endfunc -function inv_identity_8x4_neon +function inv_identity_8h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.8h, v16.8h, v0.h[0] @@ -684,8 +684,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 b L(itx_4x4_end) 1: .endif - adr x4, inv_\txfm1\()_4x4_neon - adr x5, inv_\txfm2\()_4x4_neon + adr x4, inv_\txfm1\()_4h_x4_neon + adr x5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm @@ -741,14 +741,14 @@ def_fn_4x4 identity, flipadst mov \r6\szb, v6\szb // out6 .endm -function inv_dct_8x8_neon +function inv_dct_8h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b ret endfunc -function inv_dct_4x8_neon +function inv_dct_4h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b @@ -822,27 +822,27 @@ endfunc sqneg \o5\()\sz, v3\sz // out5 .endm -function inv_adst_8x8_neon +function inv_adst_8h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h ret endfunc -function inv_flipadst_8x8_neon +function inv_flipadst_8h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h ret endfunc -function inv_adst_4x8_neon +function inv_adst_4h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h ret endfunc -function inv_flipadst_4x8_neon +function inv_flipadst_4h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h ret endfunc -function inv_identity_8x8_neon +function inv_identity_8h_x8_neon, export=1 sqshl v16.8h, v16.8h, #1 sqshl v17.8h, v17.8h, #1 sqshl v18.8h, v18.8h, #1 @@ -854,7 +854,7 @@ function inv_identity_8x8_neon ret endfunc -function inv_identity_4x8_neon +function inv_identity_4h_x8_neon, export=1 sqshl v16.4h, v16.4h, #1 sqshl v17.4h, v17.4h, #1 sqshl v18.4h, v18.4h, #1 @@ -911,11 +911,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif - adr x5, inv_\txfm2\()_8x8_neon + adr x5, inv_\txfm2\()_8h_x8_neon .ifc \txfm1, identity b inv_txfm_identity_add_8x8_neon .else - adr x4, inv_\txfm1\()_8x8_neon + adr x4, inv_\txfm1\()_8h_x8_neon b inv_txfm_add_8x8_neon .endif endfunc @@ -998,8 +998,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif - adr x4, inv_\txfm1\()_\h\()x\w\()_neon - adr x5, inv_\txfm2\()_\w\()x\h\()_neon + adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon + adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon b inv_txfm_add_\w\()x\h\()_neon endfunc .endm @@ -1110,14 +1110,14 @@ def_fns_48 8, 4 mov v22\szb, v3\szb .endm -function inv_dct_8x16_neon +function inv_dct_8h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .8h, .16b ret endfunc -function inv_dct_4x16_neon +function inv_dct_4h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .4h, .8b @@ -1294,27 +1294,27 @@ endfunc sqneg \o9\sz, v7\sz // out9 .endm -function inv_adst_8x16_neon +function inv_adst_8h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b ret endfunc -function inv_flipadst_8x16_neon +function inv_flipadst_8h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b ret endfunc -function inv_adst_4x16_neon +function inv_adst_4h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b ret endfunc 
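The renames above give the shared 8 bpc transform helpers an element-size suffix (`4h`/`8h`) and export them so the 10 bpc frontends in itx16.S can reuse them for their 16-bit second pass. As a rough scalar illustration of that two-pass split described in the commit message (32-bit first pass, shared 16-bit second pass), here is a sketch in C; the names, the narrowing shift and the final shift are schematic placeholders, not the actual dav1d code:

```c
/* Rough scalar model (not dav1d source) of the 10 bpc two-pass layout:
 * pass 1 on 32-bit coefficients (the new *_4s_* transforms), narrow to
 * 16 bits, pass 2 with the shared 16-bit transforms (the exported
 * *_4h_*/*_8h_* symbols), then add to pixels clipped to bitdepth_max.
 * Shift amounts vary per transform size in the real code; the values
 * below are only illustrative, and stride is in pixels for simplicity. */
#include <stddef.h>
#include <stdint.h>

typedef void (*row_txfm_i32)(int32_t *row, int n);  /* e.g. inv_dct_4s_x8_neon */
typedef void (*col_txfm_i16)(int16_t *col, int n);  /* e.g. inv_dct_8h_x8_neon */

static uint16_t clip_px(int v, int bitdepth_max) {
    return (uint16_t)(v < 0 ? 0 : v > bitdepth_max ? bitdepth_max : v);
}

static void itxfm_add_model(uint16_t *dst, ptrdiff_t stride, int32_t *coeff,
                            int w, int h, int bitdepth_max,
                            row_txfm_i32 row_fn, col_txfm_i16 col_fn)
{
    int16_t tmp[64 * 64];
    for (int y = 0; y < h; y++) {                /* pass 1: 32-bit rows */
        row_fn(&coeff[y * w], w);
        for (int x = 0; x < w; x++)              /* narrow with rounding,   */
            tmp[x * h + y] =                     /* transposing into tmp    */
                (int16_t)((coeff[y * w + x] + 1) >> 1);
    }
    for (int x = 0; x < w; x++)                  /* pass 2: 16-bit columns */
        col_fn(&tmp[x * h], h);
    for (int y = 0; y < h; y++)                  /* add to dst, clip to    */
        for (int x = 0; x < w; x++)              /* [0, bitdepth_max]      */
            dst[y * stride + x] = clip_px(dst[y * stride + x] +
                                          ((tmp[x * h + y] + 8) >> 4),
                                          bitdepth_max);
}
```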
-function inv_flipadst_4x16_neon +function inv_flipadst_4h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b ret endfunc -function inv_identity_8x16_neon +function inv_identity_8h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1325,7 +1325,7 @@ function inv_identity_8x16_neon ret endfunc -function inv_identity_4x16_neon +function inv_identity_4h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1465,9 +1465,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 adr x9, inv_txfm_horz_identity_16x8_neon .else adr x9, inv_txfm_horz_16x8_neon - adr x4, inv_\txfm1\()_8x16_neon + adr x4, inv_\txfm1\()_8h_x16_neon .endif - adr x5, inv_\txfm2\()_8x16_neon + adr x5, inv_\txfm2\()_8h_x16_neon mov x13, #\eob_half b inv_txfm_add_16x16_neon endfunc @@ -1634,12 +1634,12 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 idct_dc \w, \h, 1 .endif .if \w == 4 - adr x4, inv_\txfm1\()_8x\w\()_neon - adr x5, inv_\txfm2\()_4x\h\()_neon + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else - adr x4, inv_\txfm1\()_4x\w\()_neon - adr x5, inv_\txfm2\()_8x\h\()_neon + adr x4, inv_\txfm1\()_4h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon @@ -1816,8 +1816,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif - adr x4, inv_\txfm1\()_8x\w\()_neon - adr x5, inv_\txfm2\()_8x\h\()_neon + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half .endif @@ -1851,7 +1851,7 @@ def_fn_816 \w, \h, identity, flipadst, 64 def_fns_816 8, 16 def_fns_816 16, 8 -function inv_dct32_odd_8x16_neon +function inv_dct32_odd_8h_x16_neon, export=1 movrel x16, idct_coeffs, 2*16 ld1 {v0.8h, v1.8h}, [x16] sub x16, x16, #2*16 @@ -2029,7 +2029,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 @@ -2059,7 +2059,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 .macro store2 r0, r1, shift @@ -2105,7 +2105,7 @@ function inv_txfm_add_vert_dct_8x32_neon .endr sub x7, x7, x8, lsl #4 - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 @@ -2118,7 +2118,7 @@ function inv_txfm_add_vert_dct_8x32_neon .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon neg x9, x8 mov x10, x6 @@ -2384,7 +2384,7 @@ function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 sub sp, sp, #1024 movrel x13, eob_16x32 ldrh w12, [x13], #2 - adr x4, inv_dct_8x16_neon + adr x4, 
inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, sp, #(\i*16*2) @@ -2432,7 +2432,7 @@ function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 mov x15, x30 sub sp, sp, #1024 - adr x5, inv_dct_8x16_neon + adr x5, inv_dct_8h_x16_neon .irp i, 0, 8 add x6, sp, #(\i*32*2) @@ -2493,7 +2493,7 @@ function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 sub w9, w9, #8 add x2, x2, #2*8 - bl inv_dct_8x8_neon + bl inv_dct_8h_x8_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23 srshr v\i\().8h, v\i\().8h, #2 @@ -2550,7 +2550,7 @@ function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 .endr add w9, w9, #8 - bl inv_dct_8x8_neon + bl inv_dct_8h_x8_neon cmp w9, #32 @@ -2755,7 +2755,7 @@ endfunc .endm .macro def_dct64_func suffix, clear=0, scale=0 -function inv_txfm_dct\suffix\()_8x64_neon +function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 mov x14, x30 mov x6, sp lsl x8, x8, #2 @@ -2768,7 +2768,7 @@ function inv_txfm_dct\suffix\()_8x64_neon add x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon store16 x6 @@ -2781,7 +2781,7 @@ function inv_txfm_dct\suffix\()_8x64_neon sub x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon add x10, x6, #16*15 sub x6, x6, #16*16 @@ -3043,7 +3043,7 @@ function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-2 // shift - bl inv_txfm_dct_clear_8x64_neon + bl inv_txfm_dct_clear_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 @@ -3068,7 +3068,7 @@ function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr @@ -3097,7 +3097,7 @@ function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-1 // shift - bl inv_txfm_dct_clear_scale_8x64_neon + bl inv_txfm_dct_clear_scale_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 @@ -3171,7 +3171,7 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr @@ -3200,7 +3200,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 add x7, x2, #(\i*2) mov x8, #16*2 mov x12, #-2 // shift - bl inv_txfm_dct_clear_8x64_neon + bl inv_txfm_dct_clear_8h_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 8 @@ -3222,7 +3222,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 b.gt 2b 3: - adr x5, inv_dct_8x16_neon + adr x5, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) @@ -3245,7 +3245,7 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 movrel x13, eob_16x32 ldrh w12, [x13], #2 - adr x4, inv_dct_8x16_neon + adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, x5, #(\i*16*2) .if \i > 0 @@ -3276,7 +3276,7 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S new file mode 100644 index 0000000..5e4d63d --- /dev/null +++ b/src/arm/64/itx16.S @@ -0,0 +1,3514 @@ 
+/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, +// int bitdepth_max); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + 
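Several entries in the coefficient tables above are stored pre-multiplied by `8*(1<<16)`. A minimal scalar sketch of why that works with `sqrdmulh` on 32-bit lanes follows; this is an illustration of the scaling trick, not dav1d source:

```c
/* Scalar model of AArch64 SQRDMULH on a 32-bit lane:
 *   (2*a*b + 2^31) >> 32, saturated.
 * With b = c*8*(1<<16) == c << 19 this becomes a rounded (a*c) >> 12,
 * i.e. the same scaling that the mul_mla/mul_mls + srshr #12 pairs
 * apply, but without needing a widening multiply. */
#include <stdint.h>

static int32_t sqrdmulh_s32(int32_t a, int32_t b)
{
    int64_t r = (2 * (int64_t)a * b + (1LL << 31)) >> 32;
    if (r > INT32_MAX) r = INT32_MAX;   /* saturate like the NEON op */
    if (r < INT32_MIN) r = INT32_MIN;
    return (int32_t)r;
}

/* For in-range inputs (no saturation):
 *   sqrdmulh_s32(x, 2896*8*(1<<16)) == (x*2896 + 2048) >> 12 */
```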
+const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro mul_mla d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mla \d\().4s, \s1\().4s, \c1 +.endm + +.macro mul_mls d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mls \d\().4s, \s1\().4s, \c1 +.endm + +.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , , \dst, \src + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src + load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src + load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src + load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src + load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src + load_add_store , , , , , v31.8h, v30.8h, \dst, \src + load_add_store , , , , , , v31.8h, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store 
v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits + load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits + load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits + load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] +.endif +.ifnb \shift + srshr \shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src + load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src + load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src + load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src + load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src + load_add_store4 , , , , , , , , v30.d, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 , , , , v3.8h, v22.8h, 
v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src + load_add_store4 , , , , , , , , v22.d, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v0.2s, w16 + sqrdmulh v20.4s, v16.4s, v0.s[0] + str wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v20.4s, v20.4s, v0.s[0] +.endif +.if \shift > 0 + sqrshrn v16.4h, v20.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift +.else + sqxtn v16.4h, v20.4s + sqxtn2 v16.8h, v20.4s +.endif + sqrdmulh v16.8h, v16.8h, v0.h[1] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + subs w4, w4, #4 + ld1 {v1.d}[1], [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sub x0, x0, x1, lsl #2 + sqadd v1.8h, v1.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h}, [x0], x1 + subs w4, w4, #4 + ld1 {v1.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v2.8h}, [x0], x1 + sqadd v1.8h, v1.8h, v16.8h + ld1 {v3.8h}, [x0], x1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sub x0, x0, x1, lsl #2 + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + st1 {v0.8h}, [x0], x1 + smin v2.8h, v2.8h, v31.8h + st1 {v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h}, [x0], x1 + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, x1, lsl #1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + st1 {v0.8h, v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w32_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x1, x1, #64 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, #64 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v16.8h + sqadd v6.8h, v6.8h, v16.8h + sqadd 
v7.8h, v7.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, v5.8h, v30.8h + smax v6.8h, v6.8h, v30.8h + smax v7.8h, v7.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + smin v6.8h, v6.8h, v31.8h + smin v7.8h, v7.8h, v31.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4s, v16.4s, v17.4s + sub v21.4s, v18.4s, v19.4s + sub v20.4s, v16.4s, v21.4s + sshr v20.4s, v20.4s, #1 + sub v18.4s, v20.4s, v17.4s + sub v17.4s, v20.4s, v19.4s + add v19.4s, v21.4s, v18.4s + sub v16.4s, v16.4s, v17.4s +.endm + +.macro idct_4 r0, r1, r2, r3 + mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] + mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] + srshr v6.4s, v6.4s, #12 + srshr v7.4s, v4.4s, #12 + srshr v2.4s, v2.4s, #12 + srshr v3.4s, v3.4s, #12 + sqadd \r0\().4s, v2.4s, v6.4s + sqsub \r3\().4s, v2.4s, v6.4s + sqadd \r1\().4s, v3.4s, v7.4s + sqsub \r2\().4s, v3.4s, v7.4s +.endm + +function inv_dct_4s_x4_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] + idct_4 v16, v17, v18, v19 + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.4s}, [x16] + + sub v3.4s, v16.4s, v18.4s + mul v4.4s, v16.4s, v0.s[0] + mla v4.4s, v18.4s, v0.s[1] + mla v4.4s, v19.4s, v0.s[2] + mul v7.4s, v17.4s, v0.s[3] + add v3.4s, v3.4s, v19.4s + mul v5.4s, v16.4s, v0.s[2] + mls v5.4s, v18.4s, v0.s[0] + mls v5.4s, v19.4s, v0.s[1] + + add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[3] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + srshr \o0\().4s, \o0\().4s, #12 + srshr \o2\().4s, \o2\().4s, #12 + srshr \o1\().4s, \o1\().4s, #12 + srshr \o3\().4s, \o3\().4s, #12 +.endm + +function inv_adst_4s_x4_neon + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x4_neon + movz w16, #(5793-4096)*8, lsl #16 + dup v0.2s, w16 + sqrdmulh v4.4s, v16.4s, v0.s[0] + sqrdmulh v5.4s, v17.4s, v0.s[0] + sqrdmulh v6.4s, v18.4s, v0.s[0] + sqrdmulh v7.4s, v19.4s, v0.s[0] + sqadd v16.4s, v16.4s, v4.4s + sqadd v17.4s, v17.4s, v5.4s + sqadd v18.4s, v18.4s, v6.4s + sqadd v19.4s, v19.4s, v7.4s + ret +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + mov x15, x30 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + sshr v16.4s, v16.4s, #2 + sshr v17.4s, v17.4s, #2 + sshr v18.4s, v18.4s, #2 + sshr v19.4s, v19.4s, #2 + + iwht4 + + st1 {v30.4s, v31.4s}, [x2], #32 + transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.d}[0], [x0], x1 + sqxtn v16.4h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqxtn2 v16.8h, v17.4s + ld1 {v1.d}[0], [x0], x1 + sqxtn v18.4h, v18.4s + ld1 {v1.d}[1], [x0], x1 + sqxtn2 v18.8h, v19.4s + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + blr x4 + + st1 {v30.4s, v31.4s}, [x2], #32 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 
{v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.d}[0], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x0, x0, x1, lsl #2 + sqadd v16.8h, v16.8h, v0.8h + sqadd v18.8h, v18.8h, v1.8h + smax v16.8h, v16.8h, v30.8h + smax v18.8h, v18.8h, v30.8h + smin v16.8h, v16.8h, v31.8h + st1 {v16.d}[0], [x0], x1 + smin v18.8h, v18.8h, v31.8h + st1 {v16.d}[1], [x0], x1 + st1 {v18.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + + br x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v4.2s, w16 + str wzr, [x2] + sqrdmulh v16.4s, v16.4s, v4.s[0] + ld1 {v0.d}[0], [x0], x1 + sqxtn v20.4h, v16.4s + sqxtn2 v20.8h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqrdmulh v20.8h, v20.8h, v4.h[1] + ld1 {v1.d}[0], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.d}[1], [x0], x1 + srshr v18.8h, v20.8h, #4 + movi v30.8h, #0 + b L(itx_4x4_end) +1: +.endif + adr x4, inv_\txfm1\()_4s_x4_neon + movrel x5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4 \r0, \r2, \r4, \r6 + + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a + mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a + mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a + srshr \r1\().4s, v2.4s, #12 // t4a + srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r3\().4s, v6.4s, #12 // t5a + srshr \r5\().4s, v7.4s, #12 // taa + + sqadd v2.4s, \r1\().4s, \r3\().4s // t4 + sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a + sqadd v3.4s, \r7\().4s, \r5\().4s // t7 + sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a + + mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 + mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 + srshr v4.4s, v4.4s, #12 // t5 + srshr v5.4s, v6.4s, #12 // t6 + + sqsub \r7\().4s, \r0\().4s, v3.4s // out7 + sqadd \r0\().4s, \r0\().4s, v3.4s // out0 + sqadd \r1\().4s, \r2\().4s, v5.4s // out1 + sqsub v6.4s, \r2\().4s, v5.4s // out6 + sqadd \r2\().4s, \r4\().4s, v4.4s // out2 + sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r3\().4s, \r6\().4s, v2.4s // out3 + sqsub \r4\().4s, \r6\().4s, v2.4s // out4 + mov \r6\().16b, v6.16b // out6 +.endm + +function inv_dct_4s_x8_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 + movrel x16, iadst8_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v23, v16, v0.s[0], v0.s[1] + mul_mls v4, v23, v16, v0.s[1], v0.s[0] + mul_mla v6, v21, v18, v0.s[2], v0.s[3] + srshr v16.4s, v2.4s, #12 // t0a + srshr v23.4s, v4.4s, #12 // t1a + mul_mls v2, v21, v18, v0.s[3], v0.s[2] + mul_mla v4, v19, v20, v1.s[0], v1.s[1] + srshr v18.4s, v6.4s, #12 // t2a + srshr v21.4s, v2.4s, #12 // t3a + mul_mls v6, v19, v20, v1.s[1], v1.s[0] + mul_mla v2, v17, v22, v1.s[2], 
v1.s[3] + srshr v20.4s, v4.4s, #12 // t4a + srshr v19.4s, v6.4s, #12 // t5a + mul_mls v4, v17, v22, v1.s[3], v1.s[2] + srshr v22.4s, v2.4s, #12 // t6a + srshr v17.4s, v4.4s, #12 // t7a + + ld1 {v0.4s}, [x16] + + sqadd v2.4s, v16.4s, v20.4s // t0 + sqsub v3.4s, v16.4s, v20.4s // t4 + sqadd v4.4s, v23.4s, v19.4s // t1 + sqsub v5.4s, v23.4s, v19.4s // t5 + sqadd v6.4s, v18.4s, v22.4s // t2 + sqsub v7.4s, v18.4s, v22.4s // t6 + sqadd v18.4s, v21.4s, v17.4s // t3 + sqsub v19.4s, v21.4s, v17.4s // t7 + + mul_mla v16, v3, v5, v0.s[3], v0.s[2] + mul_mls v20, v3, v5, v0.s[2], v0.s[3] + mul_mls v22, v19, v7, v0.s[3], v0.s[2] + + srshr v3.4s, v16.4s, #12 // t4a + srshr v5.4s, v20.4s, #12 // t5a + + mul_mla v16, v19, v7, v0.s[2], v0.s[3] + + srshr v7.4s, v22.4s, #12 // t6a + srshr v19.4s, v16.4s, #12 // t7a + + sqadd \o0\().4s, v2.4s, v6.4s // out0 + sqsub v2.4s, v2.4s, v6.4s // t2 + sqadd \o7\().4s, v4.4s, v18.4s // out7 + sqsub v4.4s, v4.4s, v18.4s // t3 + sqneg \o7\().4s, \o7\().4s // out7 + + sqadd \o1\().4s, v3.4s, v7.4s // out1 + sqsub v3.4s, v3.4s, v7.4s // t6 + sqadd \o6\().4s, v5.4s, v19.4s // out6 + sqsub v5.4s, v5.4s, v19.4s // t7 + sqneg \o1\().4s, \o1\().4s // out1 + + mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) + mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) + mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) + srshr v2.4s, v18.4s, #12 // out3 + mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) + srshr v3.4s, v20.4s, #12 // out5 + srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) + srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) + + sqneg \o3\().4s, v2.4s // out3 + sqneg \o5\().4s, v3.4s // out5 +.endm + +function inv_adst_4s_x8_neon + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +function inv_flipadst_4s_x8_neon + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x8_neon + sqshl v16.4s, v16.4s, #1 + sqshl v17.4s, v17.4s, #1 + sqshl v18.4s, v18.4s, #1 + sqshl v19.4s, v19.4s, #1 + sqshl v20.4s, v20.4s, #1 + sqshl v21.4s, v21.4s, #1 + sqshl v22.4s, v22.4s, #1 + sqshl v23.4s, v23.4s, #1 + ret +endfunc + +function inv_txfm_add_8x8_neon + movi v31.4s, #0 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + + blr x5 + + load_add_store_8x8 x0, x7 + br x15 +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + movrel 
x5, X(inv_\txfm2\()_8h_x8_neon) + mov w13, #\eob_half + adr x4, inv_\txfm1\()_4s_x8_neon + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + sqxtn v20.4h, v20.4s + sqxtn v21.4h, v21.4s + sqxtn v22.4h, v22.4s + sqxtn v23.4h, v23.4s + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + br x15 +endfunc + +function inv_txfm_add_4x8_neon + movz w16, #2896*8, lsl #16 + movi v31.4s, #0 + dup v30.2s, w16 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v20.4h, v16.4s + sqxtn v21.4h, v17.4s + sqxtn v22.4h, v18.4s + sqxtn v23.4h, v19.4s + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f + +1: +.irp i, v20, v21, v22, v23 + movi \i\().4h, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x8 x0, x7 + br x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov w13, #\eob_half +.endif + movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +function inv_dct_4s_x16_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + + ld1 
{v0.4s, v1.4s}, [x16] + sub x16, x16, #32 + + mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a + mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a + srshr v17.4s, v2.4s, #12 // t8a + srshr v31.4s, v4.4s, #12 // t15a + mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a + mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + srshr v23.4s, v6.4s, #12 // t9a + srshr v25.4s, v2.4s, #12 // t14a + mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a + mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a + srshr v21.4s, v4.4s, #12 // t10a + srshr v27.4s, v6.4s, #12 // t13a + mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + srshr v19.4s, v2.4s, #12 // t11a + srshr v29.4s, v4.4s, #12 // t12a + + ld1 {v0.4s}, [x16] + + sqsub v2.4s, v17.4s, v23.4s // t9 + sqadd v17.4s, v17.4s, v23.4s // t8 + sqsub v3.4s, v31.4s, v25.4s // t14 + sqadd v31.4s, v31.4s, v25.4s // t15 + sqsub v23.4s, v19.4s, v21.4s // t10 + sqadd v19.4s, v19.4s, v21.4s // t11 + sqadd v25.4s, v29.4s, v27.4s // t12 + sqsub v29.4s, v29.4s, v27.4s // t13 + + mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a + mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a + srshr v21.4s, v4.4s, #12 // t9a + srshr v27.4s, v6.4s, #12 // t14a + + mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a + srshr v29.4s, v4.4s, #12 // t13a + neg v6.4s, v6.4s + srshr v23.4s, v6.4s, #12 // t10a + + sqsub v2.4s, v17.4s, v19.4s // t11a + sqadd v17.4s, v17.4s, v19.4s // t8a + sqsub v3.4s, v31.4s, v25.4s // t12a + sqadd v31.4s, v31.4s, v25.4s // t15a + sqadd v19.4s, v21.4s, v23.4s // t9 + sqsub v21.4s, v21.4s, v23.4s // t10 + sqsub v25.4s, v27.4s, v29.4s // t13 + sqadd v27.4s, v27.4s, v29.4s // t14 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 + mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a + + srshr v4.4s, v4.4s, #12 // t11 + srshr v5.4s, v6.4s, #12 // t12 + mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a + srshr v2.4s, v2.4s, #12 // t10a + srshr v3.4s, v6.4s, #12 // t13a + + sqadd v6.4s, v16.4s, v31.4s // out0 + sqsub v31.4s, v16.4s, v31.4s // out15 + mov v16.16b, v6.16b + sqadd v23.4s, v30.4s, v17.4s // out7 + sqsub v7.4s, v30.4s, v17.4s // out8 + sqadd v17.4s, v18.4s, v27.4s // out1 + sqsub v30.4s, v18.4s, v27.4s // out14 + sqadd v18.4s, v20.4s, v3.4s // out2 + sqsub v29.4s, v20.4s, v3.4s // out13 + sqadd v3.4s, v28.4s, v19.4s // out6 + sqsub v25.4s, v28.4s, v19.4s // out9 + sqadd v19.4s, v22.4s, v5.4s // out3 + sqsub v28.4s, v22.4s, v5.4s // out12 + sqadd v20.4s, v24.4s, v4.4s // out4 + sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v21.4s, v26.4s, v2.4s // out5 + sqsub v26.4s, v26.4s, v2.4s // out10 + mov v24.16b, v7.16b + mov v22.16b, v3.16b + + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel x16, iadst16_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 + mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 + srshr v16.4s, v2.4s, #12 // t0 + srshr v31.4s, v4.4s, #12 // t1 + mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 + mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 + srshr v18.4s, v6.4s, #12 // t2 + srshr v29.4s, v2.4s, #12 // t3 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 + mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 + srshr v20.4s, v4.4s, #12 // t4 + srshr v27.4s, v6.4s, #12 // t5 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 + ld1 {v0.4s, v1.4s}, [x16] 
+ movrel x16, idct_coeffs + mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 + srshr v22.4s, v2.4s, #12 // t6 + srshr v25.4s, v4.4s, #12 // t7 + mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 + mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 + srshr v23.4s, v6.4s, #12 // t8 + srshr v24.4s, v2.4s, #12 // t9 + mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 + mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 + srshr v21.4s, v4.4s, #12 // t10 + srshr v26.4s, v6.4s, #12 // t11 + mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 + mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 + srshr v19.4s, v2.4s, #12 // t12 + srshr v28.4s, v4.4s, #12 // t13 + mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 + srshr v17.4s, v6.4s, #12 // t14 + srshr v30.4s, v2.4s, #12 // t15 + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v23.4s // t8a + sqadd v16.4s, v16.4s, v23.4s // t0a + sqsub v3.4s, v31.4s, v24.4s // t9a + sqadd v31.4s, v31.4s, v24.4s // t1a + sqadd v23.4s, v18.4s, v21.4s // t2a + sqsub v18.4s, v18.4s, v21.4s // t10a + sqadd v24.4s, v29.4s, v26.4s // t3a + sqsub v29.4s, v29.4s, v26.4s // t11a + sqadd v21.4s, v20.4s, v19.4s // t4a + sqsub v20.4s, v20.4s, v19.4s // t12a + sqadd v26.4s, v27.4s, v28.4s // t5a + sqsub v27.4s, v27.4s, v28.4s // t13a + sqadd v19.4s, v22.4s, v17.4s // t6a + sqsub v22.4s, v22.4s, v17.4s // t14a + sqadd v28.4s, v25.4s, v30.4s // t7a + sqsub v25.4s, v25.4s, v30.4s // t15a + + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 + mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 + mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 + srshr v17.4s, v4.4s, #12 // t8 + srshr v30.4s, v6.4s, #12 // t9 + mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 + srshr v18.4s, v2.4s, #12 // t10 + srshr v29.4s, v4.4s, #12 // t11 + mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 + srshr v27.4s, v6.4s, #12 // t12 + srshr v20.4s, v2.4s, #12 // t13 + mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 + srshr v25.4s, v4.4s, #12 // t14 + srshr v22.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t4 + sqadd v16.4s, v16.4s, v21.4s // t0 + sqsub v3.4s, v31.4s, v26.4s // t5 + sqadd v31.4s, v31.4s, v26.4s // t1 + sqadd v21.4s, v23.4s, v19.4s // t2 + sqsub v23.4s, v23.4s, v19.4s // t6 + sqadd v26.4s, v24.4s, v28.4s // t3 + sqsub v24.4s, v24.4s, v28.4s // t7 + sqadd v19.4s, v17.4s, v27.4s // t8a + sqsub v17.4s, v17.4s, v27.4s // t12a + sqadd v28.4s, v30.4s, v20.4s // t9a + sqsub v30.4s, v30.4s, v20.4s // t13a + sqadd v27.4s, v18.4s, v25.4s // t10a + sqsub v18.4s, v18.4s, v25.4s // t14a + sqadd v20.4s, v29.4s, v22.4s // t11a + sqsub v29.4s, v29.4s, v22.4s // t15a + + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a + mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a + mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a + srshr v22.4s, v4.4s, #12 // t4a + srshr v25.4s, v6.4s, #12 // t5a + mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a + mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 + srshr v24.4s, v2.4s, #12 // t6a + srshr v23.4s, v4.4s, #12 // t7a + mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 + mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 + srshr v17.4s, v6.4s, #12 // t12 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 + srshr v29.4s, v2.4s, #12 // t13 + srshr v30.4s, v4.4s, #12 // t14 + srshr v18.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t2a +.ifc \o0, v16 + sqadd \o0\().4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 +.else 
+ sqadd v4.4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 + mov \o0\().16b, v4.16b +.endif + sqneg \o15\().4s, \o15\().4s // out15 + + sqsub v3.4s, v29.4s, v18.4s // t15a + sqadd \o13\().4s, v29.4s, v18.4s // out13 + sqadd \o2\().4s, v17.4s, v30.4s // out2 + sqsub v26.4s, v17.4s, v30.4s // t14a + sqneg \o13\().4s, \o13\().4s // out13 + + sqadd \o1\().4s, v19.4s, v27.4s // out1 + sqsub v27.4s, v19.4s, v27.4s // t10 + sqadd \o14\().4s, v28.4s, v20.4s // out14 + sqsub v20.4s, v28.4s, v20.4s // t11 + sqneg \o1\().4s, \o1\().4s // out1 + + sqadd \o3\().4s, v22.4s, v24.4s // out3 + sqsub v22.4s, v22.4s, v24.4s // t6 + sqadd \o12\().4s, v25.4s, v23.4s // out12 + sqsub v23.4s, v25.4s, v23.4s // t7 + sqneg \o3\().4s, \o3\().4s // out3 + + mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) + mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) + mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) + + srshr v24.4s, v24.4s, #12 // out8 + srshr v4.4s, v4.4s, #12 // out7 + srshr v5.4s, v6.4s, #12 // out5 + mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) + mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) + srshr v26.4s, v6.4s, #12 // out10 + + mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) + mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) + mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) + + srshr \o4\().4s, v2.4s, #12 // out4 + srshr v6.4s, v6.4s, #12 // out11 + srshr v7.4s, v21.4s, #12 // out9 + srshr \o6\().4s, v22.4s, #12 // out6 + +.ifc \o8, v23 + mov \o8\().16b, v24.16b + mov \o10\().16b, v26.16b +.endif + + sqneg \o7\().4s, v4.4s // out7 + sqneg \o5\().4s, v5.4s // out5 + sqneg \o11\().4s, v6.4s // out11 + sqneg \o9\().4s, v7.4s // out9 +.endm + +function inv_adst_4s_x16_neon + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + ret +endfunc + +function inv_flipadst_4s_x16_neon + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x16_neon + movz w16, #2*(5793-4096)*8, lsl #16 + dup v0.2s, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4s, v\i\().4s, v0.s[0] + sqadd v\i\().4s, v\i\().4s, v\i\().4s + sqadd v\i\().4s, v\i\().4s, v2.4s +.endr + ret +endfunc + +.macro identity_4x16_shift1 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + srshr v3.4s, v3.4s, #1 + sqadd \i, \i, v3.4s +.endr +.endm + +.macro identity_4x16 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, v3.4s +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + mov x14, x30 + movi v7.4s, #0 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + blr x4 + sqrshrn v16.4h, v16.4s, #\shift + sqrshrn v17.4h, v17.4s, #\shift + sqrshrn v18.4h, v18.4s, #\shift + 
sqrshrn v19.4h, v19.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift + sqrshrn2 v17.8h, v21.4s, #\shift + sqrshrn2 v18.8h, v22.4s, #\shift + sqrshrn2 v19.8h, v23.4s, #\shift + sqrshrn v20.4h, v24.4s, #\shift + sqrshrn v21.4h, v25.4s, #\shift + sqrshrn v22.4h, v26.4s, #\shift + sqrshrn v23.4h, v27.4s, #\shift + sqrshrn2 v20.8h, v28.4s, #\shift + sqrshrn2 v21.8h, v29.4s, #\shift + sqrshrn2 v22.8h, v30.4s, #\shift + sqrshrn2 v23.8h, v31.4s, #\shift + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 + +.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h + st1 {\i}, [x6], #16 +.endr + + br x14 +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_8x16_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + br x14 +endfunc + +function inv_txfm_add_16x16_neon + mov x15, x30 + sub sp, sp, #512 + ldrh w12, [x13], #2 +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*16*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + br x15 +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + adr x4, inv_\txfm1\()_4s_x16_neon + movrel x5, X(inv_\txfm2\()_8h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_16x16 +.else + movrel x13, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_16x16_identity +.else + movrel x13, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct, 36 +def_fn_16x16 identity, identity, 36 +def_fn_16x16 dct, adst, 36 +def_fn_16x16 dct, flipadst, 36 +def_fn_16x16 dct, identity, 8 +def_fn_16x16 adst, dct, 36 +def_fn_16x16 adst, adst, 36 +def_fn_16x16 adst, flipadst, 36 +def_fn_16x16 flipadst, dct, 36 +def_fn_16x16 flipadst, adst, 36 +def_fn_16x16 flipadst, flipadst, 36 +def_fn_16x16 identity, dct, 8 + +function inv_txfm_add_16x4_neon + mov x15, x30 + movi v4.4s, #0 + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], #16 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + sqrshrn v16.4h, v24.4s, #1 + sqrshrn v17.4h, v25.4s, #1 + sqrshrn v18.4h, v26.4s, #1 + sqrshrn v19.4h, v27.4s, #1 + sqrshrn2 v16.8h, v28.4s, #1 + sqrshrn2 v17.8h, v29.4s, #1 + sqrshrn2 v18.8h, v30.4s, #1 + sqrshrn2 v19.8h, v31.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + 
add x6, x0, #16 + load_add_store_8x4 x6, x7 + + br x15 +endfunc + +function inv_txfm_add_4x16_neon + ldrh w12, [x13, #4] + mov x15, x30 + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v28.4h, v16.4s, #1 + rshrn v29.4h, v17.4s, #1 + rshrn v30.4h, v18.4s, #1 + rshrn v31.4h, v19.4s, #1 + transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 + + b 2f +1: +.irp i, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v24.4h, v16.4s, #1 + rshrn v25.4h, v17.4s, #1 + rshrn v26.4h, v18.4s, #1 + rshrn v27.4h, v19.4s, #1 + transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 + + b 2f +1: +.irp i, v24.4h, v25.4h, v26.4h, v27.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v20.4h, v16.4s, #1 + rshrn v21.4h, v17.4s, #1 + rshrn v22.4h, v18.4s, #1 + rshrn v23.4h, v19.4s, #1 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f +1: +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + movi \i, #0 +.endr +2: + + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v2.4s}, [x2], x11 +.endr + blr x4 + rshrn v16.4h, v16.4s, #1 + rshrn v17.4h, v17.4s, #1 + rshrn v18.4h, v18.4s, #1 + rshrn v19.4h, v19.4s, #1 + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x16 x0, x6 + + br x15 +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_4x16 +.else + movrel x13, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_4x16_identity2 +.else + movrel x13, eob_4x16 +.endif +.endif +.else + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct +def_fn_416 \w, \h, identity, identity +def_fn_416 \w, \h, dct, adst +def_fn_416 \w, \h, dct, flipadst +def_fn_416 \w, \h, dct, identity +def_fn_416 \w, \h, adst, dct +def_fn_416 \w, \h, adst, adst +def_fn_416 \w, \h, adst, flipadst +def_fn_416 \w, \h, flipadst, dct +def_fn_416 \w, \h, flipadst, adst +def_fn_416 \w, \h, flipadst, flipadst +def_fn_416 \w, \h, identity, dct +def_fn_416 \w, \h, adst, identity +def_fn_416 \w, \h, flipadst, identity +def_fn_416 \w, \h, identity, adst +def_fn_416 \w, \h, identity, flipadst +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +function inv_txfm_add_16x8_neon + mov x15, x30 + stp d8, d9, [sp, #-0x40]! 
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + mov x11, #32 + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + sqrshrn v12.4h, v24.4s, #1 + sqrshrn v13.4h, v25.4s, #1 + sqrshrn v14.4h, v26.4s, #1 + sqrshrn v15.4h, v27.4s, #1 + sqrshrn2 v12.8h, v28.4s, #1 + sqrshrn2 v13.8h, v29.4s, #1 + sqrshrn2 v14.8h, v30.4s, #1 + sqrshrn2 v15.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 + + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + movi v4.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + sqrshrn v8.4h, v24.4s, #1 + sqrshrn v9.4h, v25.4s, #1 + sqrshrn v10.4h, v26.4s, #1 + sqrshrn v11.4h, v27.4s, #1 + sqrshrn2 v8.8h, v28.4s, #1 + sqrshrn2 v9.8h, v29.4s, #1 + sqrshrn2 v10.8h, v30.4s, #1 + sqrshrn2 v11.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + + mov v16.16b, v8.16b + mov v17.16b, v9.16b + mov v18.16b, v10.16b + mov v19.16b, v11.16b + mov v20.16b, v12.16b + mov v21.16b, v13.16b + mov v22.16b, v14.16b + mov v23.16b, v15.16b + + blr x5 + + add x0, x0, #16 + load_add_store_8x8 x0, x7 + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x15 +endfunc + +function inv_txfm_add_8x16_neon + mov x15, x30 + stp d8, d9, [sp, #-0x20]! 
+ stp d10, d11, [sp, #0x10] + ldrh w12, [x13, #4] + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 + sqrshrn2 v28.8h, v20.4s, #1 + sqrshrn2 v29.8h, v21.4s, #1 + sqrshrn2 v30.8h, v22.4s, #1 + sqrshrn2 v31.8h, v23.4s, #1 + transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h + movi \i, #0 +.endr + +2: + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + blr x5 + + load_add_store_8x16 x0, x6 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x20 + + br x15 +endfunc + +const eob_8x16 + .short 10, 43, 75, 128 +endconst + +const eob_8x16_identity1 + .short 4, 64, 96, 128 +endconst + +const eob_8x16_identity2 + .short 4, 8, 12, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.if \w == 8 +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_8x16 +.else + movrel x13, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_8x16_identity2 +.else + movrel x13, eob_8x16 
+.endif +.endif +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4s_x16_neon + movrel x16, idct_coeffs, 4*16 + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a + mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a + mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a + srshr v16.4s, v2.4s, #12 // t16a + srshr v31.4s, v4.4s, #12 // t31a + mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a + mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a + srshr v24.4s, v6.4s, #12 // t17a + srshr v23.4s, v2.4s, #12 // t30a + mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a + mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a + srshr v20.4s, v4.4s, #12 // t18a + srshr v27.4s, v6.4s, #12 // t29a + mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #4*24 + mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a + srshr v28.4s, v2.4s, #12 // t19a + srshr v19.4s, v4.4s, #12 // t28a + mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a + mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a + srshr v18.4s, v6.4s, #12 // t20a + srshr v29.4s, v2.4s, #12 // t27a + mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a + mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a + srshr v26.4s, v4.4s, #12 // t21a + srshr v21.4s, v6.4s, #12 // t26a + mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a + mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a + srshr v22.4s, v2.4s, #12 // t22a + srshr v25.4s, v4.4s, #12 // t25a + mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a + srshr v30.4s, v6.4s, #12 // t23a + srshr v17.4s, v2.4s, #12 // t24a + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v24.4s // t17 + sqadd v16.4s, v16.4s, v24.4s // t16 + sqsub v3.4s, v31.4s, v23.4s // t30 + sqadd v31.4s, v31.4s, v23.4s // t31 + sqsub v24.4s, v28.4s, v20.4s // t18 + sqadd v28.4s, v28.4s, v20.4s // t19 + sqadd v23.4s, v18.4s, v26.4s // t20 + sqsub v18.4s, v18.4s, v26.4s // t21 + sqsub v20.4s, v30.4s, v22.4s // t22 + sqadd v30.4s, v30.4s, v22.4s // t23 + sqadd v26.4s, v17.4s, v25.4s // t24 + sqsub v17.4s, v17.4s, v25.4s // t25 + sqsub v22.4s, v29.4s, v21.4s // t26 + sqadd v29.4s, v29.4s, v21.4s // t27 + sqadd v25.4s, v19.4s, v27.4s // t28 + sqsub v19.4s, v19.4s, v27.4s // t29 + + mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a + mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a + mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a + srshr v21.4s, v4.4s, #12 // t17a + srshr v27.4s, v6.4s, #12 // t30a + neg v2.4s, v2.4s // -> t18a + mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a + srshr v19.4s, v2.4s, #12 // t18a + srshr v24.4s, v4.4s, #12 // t29a + mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a + mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + srshr v22.4s, v6.4s, #12 // t21a + srshr v18.4s, v2.4s, #12 // t26a + neg v4.4s, v4.4s // -> t22a + mul_mls v6, v17, v20, v1.s[2], 
v1.s[3] // -> t25a + srshr v17.4s, v4.4s, #12 // t22a + srshr v20.4s, v6.4s, #12 // t25a + + sqsub v2.4s, v27.4s, v24.4s // t29 + sqadd v27.4s, v27.4s, v24.4s // t30 + sqsub v3.4s, v21.4s, v19.4s // t18 + sqadd v21.4s, v21.4s, v19.4s // t17 + sqsub v24.4s, v16.4s, v28.4s // t19a + sqadd v16.4s, v16.4s, v28.4s // t16a + sqsub v19.4s, v30.4s, v23.4s // t20a + sqadd v30.4s, v30.4s, v23.4s // t23a + sqsub v28.4s, v17.4s, v22.4s // t21 + sqadd v17.4s, v17.4s, v22.4s // t22 + sqadd v23.4s, v26.4s, v29.4s // t24a + sqsub v26.4s, v26.4s, v29.4s // t27a + sqadd v22.4s, v20.4s, v18.4s // t25 + sqsub v20.4s, v20.4s, v18.4s // t26 + sqsub v29.4s, v31.4s, v25.4s // t28a + sqadd v31.4s, v31.4s, v25.4s // t31a + + mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a + mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a + mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 + srshr v18.4s, v4.4s, #12 // t18a + srshr v25.4s, v6.4s, #12 // t29a + mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 + srshr v29.4s, v2.4s, #12 // t19 + srshr v24.4s, v4.4s, #12 // t28 + neg v6.4s, v6.4s // -> t20 + mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 + mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + srshr v26.4s, v6.4s, #12 // t20 + srshr v19.4s, v2.4s, #12 // t27 + neg v4.4s, v4.4s // -> t21a + mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a + srshr v20.4s, v4.4s, #12 // t21a + srshr v28.4s, v6.4s, #12 // t26a + + sqsub v2.4s, v16.4s, v30.4s // t23 + sqadd v16.4s, v16.4s, v30.4s // t16 = out16 + sqsub v3.4s, v31.4s, v23.4s // t24 + sqadd v31.4s, v31.4s, v23.4s // t31 = out31 + sqsub v23.4s, v21.4s, v17.4s // t22a + sqadd v17.4s, v21.4s, v17.4s // t17a = out17 + sqadd v30.4s, v27.4s, v22.4s // t30a = out30 + sqsub v21.4s, v27.4s, v22.4s // t25a + sqsub v27.4s, v18.4s, v20.4s // t21 + sqadd v18.4s, v18.4s, v20.4s // t18 = out18 + sqadd v4.4s, v29.4s, v26.4s // t19a = out19 + sqsub v26.4s, v29.4s, v26.4s // t20a + sqadd v29.4s, v25.4s, v28.4s // t29 = out29 + sqsub v25.4s, v25.4s, v28.4s // t26 + sqadd v28.4s, v24.4s, v19.4s // t28a = out28 + sqsub v24.4s, v24.4s, v19.4s // t27a + mov v19.16b, v4.16b // out19 + + mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 + mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 + srshr v20.4s, v4.4s, #12 // t20 + srshr v22.4s, v6.4s, #12 // t27 + + mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a + mov v27.16b, v22.16b // t27 + srshr v26.4s, v4.4s, #12 // t26a + + mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 + mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + srshr v21.4s, v6.4s, #12 // t21a + srshr v22.4s, v24.4s, #12 // t22 + srshr v25.4s, v4.4s, #12 // t25 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a + srshr v23.4s, v4.4s, #12 // t23a + srshr v24.4s, v6.4s, #12 // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + mov x14, x30 + movi v7.4s, #0 + lsl x8, x8, #1 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_4s_x16_neon + transpose_4x4s v16, v17, 
v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 + transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 + +.macro store1 r0, r1, r2, r3 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + st1 {\r2}, [x6], #16 + st1 {\r3}, [x6], #16 +.endm + store1 v16.4s, v20.4s, v24.4s, v28.4s + store1 v17.4s, v21.4s, v25.4s, v29.4s + store1 v18.4s, v22.4s, v26.4s, v30.4s + store1 v19.4s, v23.4s, v27.4s, v31.4s +.purgem store1 + sub x6, x6, #64*4 + + movi v7.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in v0.s[1] + scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_4s_x16_neon + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 + transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 +.macro store2 r0, r1, r2, r3, shift + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] + sqsub v4.4s, v0.4s, \r0 + sqadd v0.4s, v0.4s, \r0 + sqsub v5.4s, v1.4s, \r1 + sqadd v1.4s, v1.4s, \r1 + sqsub v6.4s, v2.4s, \r2 + sqadd v2.4s, v2.4s, \r2 + sqsub v7.4s, v3.4s, \r3 + sqadd v3.4s, v3.4s, \r3 + sqrshrn v0.4h, v0.4s, #\shift + sqrshrn2 v0.8h, v1.4s, #\shift + sqrshrn v1.4h, v2.4s, #\shift + sqrshrn2 v1.8h, v3.4s, #\shift + sqrshrn v2.4h, v7.4s, #\shift + sqrshrn2 v2.8h, v6.4s, #\shift + sqrshrn v3.4h, v5.4s, #\shift + sqrshrn2 v3.8h, v4.4s, #\shift + st1 {v0.8h, v1.8h}, [x6], #32 + rev64 v2.8h, v2.8h + rev64 v3.8h, v3.8h + st1 {v2.8h, v3.8h}, [x6], #32 +.endm + + store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift + store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift + store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift + store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift +.purgem store2 + br x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + + bl X(inv_dct_8h_x16_neon) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl X(inv_dct32_odd_8h_x16_neon) + + neg x9, x8 + mov x10, x6 + movi v0.8h, #0 + mvni v1.8h, #0xfc, lsl #8 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8h}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8h}, [x10], x1 + \op v5.8h, v5.8h, \r0 + ld1 {v7.8h}, [x7], \stride + ld1 {v4.8h}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + sqadd v5.8h, v5.8h, v2.8h + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + smax v2.8h, v5.8h, v0.8h + ld1 {v5.8h}, [x7], \stride + sqadd v6.8h, v6.8h, v3.8h + smin v2.8h, v2.8h, v1.8h + srshr v7.8h, v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8h}, [x6], x1 + ld1 {v2.8h}, [x10], x1 + smax v3.8h, v6.8h, v0.8h + sqadd v7.8h, v7.8h, v4.8h + smin v3.8h, v3.8h, v1.8h + srshr v5.8h, v5.8h, #4 + st1 {v3.8h}, [x6], x1 + smax v4.8h, v7.8h, v0.8h 
+ sqadd v5.8h, v5.8h, v2.8h + smin v4.8h, v4.8h, v1.8h + st1 {v4.8h}, [x6], x1 + smax v2.8h, v5.8h, v0.8h + smin v2.8h, v2.8h, v1.8h + st1 {v2.8h}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + br x14 +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + .short 10, 43, 75, 107, 139, 171, 203, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + movi v0.8h, #0 + movi v1.8h, #0 + movrel x13, eob_32x32, 2 + + mov x8, #4*32 +1: + mov w9, #0 + movrel x12, eob_32x32, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc + +.macro shift_16_regs op, shift +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movz w16, #2896*8, lsl #16 + movz w17, #2*(5793-4096)*8, lsl #16 + movi v0.4s, #0 + movi v1.4s, #0 + movrel x13, eob_16x32\hshort, 2 + + mov x8, #4*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + dup v2.2s, w16 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + mov v2.s[1], w17 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, 
v31 + +.if \w == 16 + // 16x32 + identity_4x16_shift1 v2.s[1] +.else + // 32x16 + shift_16_regs sqshl, 1 + identity_4x16 v2.s[1] +.endif + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #16 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movi v0.4s, #0 + movi v1.4s, #0 + // Working on 8x8 blocks, read every other entry from eob_8x32 + movrel x13, eob_8x32, 2 + + mov w8, #4*\h +1: + // Working on 8x8 blocks, read every other entry from eob_8x32 + ldrh w12, [x13], #4 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + +.if \w == 8 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn2 v16.8h, v17.4s, #1 + sqrshrn v17.4h, v18.4s, #1 + sqrshrn2 v17.8h, v19.4s, #1 + sqrshrn v18.4h, v20.4s, #1 + sqrshrn2 v18.8h, v21.4s, #1 + sqrshrn v19.4h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + sqrshrn v20.4h, v24.4s, #1 + sqrshrn2 v20.8h, v25.4s, #1 + sqrshrn v21.4h, v26.4s, #1 + sqrshrn2 v21.8h, v27.4s, #1 + sqrshrn v22.4h, v28.4s, #1 + sqrshrn2 v22.8h, v29.4s, #1 + sqrshrn v23.4h, v30.4s, #1 + sqrshrn2 v23.8h, v31.4s, #1 +.else + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #4*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_dct_32x4_neon +.endr 
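+        // The loop above runs the horizontal 32-point DCT on 4-row slices of 32-bit
+        // coefficients, comparing the eob in w3 against the eob_32x32 thresholds; once
+        // the remaining slices can only be zero it branches to 1:/2: below, which clear
+        // the rest of the 16-bit temp buffer before the vertical pass at 3:.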
+ b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_4s_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #4*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + movrel x13, eob_16x32 + movrel x5, X(inv_dct_8h_x16_neon) + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + mov x8, #4*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.4s, #0 + mov x8, #4*32 + mov w9, #32 + mov x6, sp + mov x7, x2 +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().4s}, [x7] + st1 {v28.4s}, [x7], x8 +.endr + ldrh w12, [x13], #2 + sub w9, w9, #4 + sub x7, x7, x8, lsl #3 + add x7, x7, #4*4 + + bl inv_dct_4s_x8_neon + + sqrshrn v16.4h, v16.4s, #2 + sqrshrn v17.4h, v17.4s, #2 + sqrshrn v18.4h, v18.4s, #2 + sqrshrn v19.4h, v19.4s, #2 + sqrshrn2 v16.8h, v20.4s, #2 + sqrshrn2 v17.8h, v21.4s, #2 + sqrshrn2 v18.8h, v22.4s, #2 + sqrshrn2 v19.8h, v23.4s, #2 + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + cmp w3, w12 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #4 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + +.irp i, 0, 4 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + cmp w3, #10 + b.lt 1f +.endif + mov x8, #8*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + +2: + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9, lsl #1 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr 
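+        // Vertical pass: each iteration loads an 8-column strip of 16-bit coefficients
+        // from the temp buffer, runs the shared 8-point DCT from itx.S on it and
+        // accumulates the clipped result into the 10 bpc destination.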
+ add w9, w9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + br x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.4s, v1.4s}, [x17], #32 + + sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a + sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a + sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a + sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a + sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a + sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a + sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a + sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a + + ld1 {v0.4s}, [x17], #16 + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t33 + sqsub v26.4s, v19.4s, v18.4s // t34 + sqadd v27.4s, v19.4s, v18.4s // t35 + sqadd v28.4s, v20.4s, v21.4s // t60 + sqsub v29.4s, v20.4s, v21.4s // t61 + sqsub v30.4s, v23.4s, v22.4s // t62 + sqadd v31.4s, v23.4s, v22.4s // t63 + + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a + mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + neg v2.4s, v2.4s // t34a + mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a + srshr v26.4s, v2.4s, #12 // t34a + mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a + srshr v29.4s, v4.4s, #12 // t61a + srshr v25.4s, v6.4s, #12 // t33a + srshr v30.4s, v2.4s, #12 // t62a + + sqadd v16.4s, v24.4s, v27.4s // t32a + sqsub v19.4s, v24.4s, v27.4s // t35a + sqadd v17.4s, v25.4s, v26.4s // t33 + sqsub v18.4s, v25.4s, v26.4s // t34 + sqsub v20.4s, v31.4s, v28.4s // t60a + sqadd v23.4s, v31.4s, v28.4s // t63a + sqsub v21.4s, v30.4s, v29.4s // t61 + sqadd v22.4s, v30.4s, v29.4s // t62 + + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a + mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 + srshr v21.4s, v2.4s, #12 // t61a + srshr v18.4s, v4.4s, #12 // t34a + mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 + srshr v20.4s, v6.4s, #12 // t60 + srshr v19.4s, v2.4s, #12 // t35 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #4*4*0] // t32a + ldr q17, [x9, #4*4*8] // t39a + ldr q18, [x9, #4*4*0] // t63a + ldr q19, [x6, #4*4*8] // t56a + ldr q20, [x6, #4*4*16] // t40a + ldr q21, [x9, #4*4*24] // t47a + ldr q22, [x9, #4*4*16] // t55a + ldr q23, [x6, #4*4*24] // t48a + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t39 + sqadd v26.4s, v18.4s, v19.4s // t63 + sqsub v27.4s, v18.4s, v19.4s // t56 + sqsub v28.4s, v21.4s, v20.4s // t40 + sqadd v29.4s, v21.4s, v20.4s // t47 + sqadd v30.4s, v23.4s, v22.4s // t48 + sqsub v31.4s, v23.4s, v22.4s // t55 + + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a + mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a + srshr v25.4s, v2.4s, #12 // t56a + srshr v27.4s, v4.4s, #12 // t39a + neg v6.4s, v6.4s // t40a + mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a + srshr v31.4s, v6.4s, #12 // t40a + srshr v28.4s, v2.4s, #12 // t55a + + sqadd v16.4s, v24.4s, v29.4s // t32a + sqsub v19.4s, v24.4s, v29.4s // t47a + sqadd v17.4s, v27.4s, v31.4s // t39 + sqsub v18.4s, v27.4s, v31.4s 
// t40 + sqsub v20.4s, v26.4s, v30.4s // t48a + sqadd v23.4s, v26.4s, v30.4s // t63a + sqsub v21.4s, v25.4s, v28.4s // t55 + sqadd v22.4s, v25.4s, v28.4s // t56 + + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a + mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 + srshr v18.4s, v2.4s, #12 // t40a + srshr v21.4s, v4.4s, #12 // t55a + mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 + srshr v19.4s, v6.4s, #12 // t47 + srshr v20.4s, v2.4s, #12 // t48 + + str q16, [x6, #4*4*0] // t32a + str q17, [x9, #4*4*0] // t39 + str q18, [x6, #4*4*8] // t40a + str q19, [x9, #4*4*8] // t47 + str q20, [x6, #4*4*16] // t48 + str q21, [x9, #4*4*16] // t55a + str q22, [x6, #4*4*24] // t56 + str q23, [x9, #4*4*24] // t63a + + add x6, x6, #4*4 + sub x9, x9, #4*4 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movz16dup_if reg, gpr, val, cond +.if \cond + movz \gpr, \val, lsl #16 + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4s_x64_neon + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_4s_x16_neon + + store16 x6 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_4s_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.4s}, [x6], #16 + ld1 {v3.4s}, [x6], #16 + sqadd v6.4s, v2.4s, \r0 + sqsub \r0, v2.4s, \r0 + ld1 {v4.4s}, [x6], #16 + sqadd v7.4s, v3.4s, \r1 + sqsub \r1, v3.4s, \r1 + ld1 {v5.4s}, [x6], #16 + sqadd v2.4s, v4.4s, \r2 + sub x6, x6, #16*4 + sqsub \r2, v4.4s, \r2 + st1 {v6.4s}, [x6], #16 + st1 {\r0}, [x10], x9 + sqadd v3.4s, v5.4s, \r3 + sqsub \r3, v5.4s, \r3 + st1 {v7.4s}, [x6], #16 + st1 {\r1}, [x10], x9 + st1 {v2.4s}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.4s}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.4s, v30.4s, v29.4s, v28.4s + store_addsub v27.4s, v26.4s, v25.4s, v24.4s + store_addsub v23.4s, v22.4s, v21.4s, v20.4s + store_addsub v19.4s, v18.4s, v17.4s, v16.4s +.purgem store_addsub + + add x6, x6, #4*4*16 + + movrel x17, idct64_coeffs + 
movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.4s}, [x7] // in1 (offset 0) + ld1 {v17.4s}, [x9] // in31 (offset 15) + ld1 {v18.4s}, [x10] // in17 (offset 8) + ld1 {v19.4s}, [x11] // in15 (offset 7) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.4s}, [x10] // in7 (offset 3) + ld1 {v17.4s}, [x11] // in25 (offset 12) + ld1 {v18.4s}, [x9] // in23 (offset 11) + ld1 {v19.4s}, [x7] // in9 (offset 4) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, [x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #4*4*32 + add x9, x6, #4*4*7 + + bl inv_dct64_step2_neon + + br x14 +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x4_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #4*4*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-4*4*4 + + dup v7.4s, w12 +1: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.4s, \src0, \src1 + sqadd v0.4s, \src0, \src1 + sqsub v3.4s, \src2, \src3 + srshl v1.4s, v1.4s, v7.4s + sqadd v2.4s, \src2, \src3 + srshl v3.4s, v3.4s, v7.4s + srshl v0.4s, v0.4s, v7.4s + srshl v2.4s, v2.4s, v7.4s + sqxtn v3.4h, v3.4s + sqxtn2 v3.8h, v1.4s + sqxtn v0.4h, v0.4s + sqxtn2 v0.8h, v2.4s + rev64 v3.8h, v3.8h + st1 {v0.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.4s, v31.4s, v20.4s, v27.4s + store_addsub v17.4s, v30.4s, v21.4s, v26.4s + store_addsub v18.4s, v29.4s, v22.4s, v25.4s + store_addsub v19.4s, v28.4s, v23.4s, v24.4s +.purgem store_addsub + sub x6, x6, x10, lsl #2 + sub x9, x9, x10, 
lsl #2 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + br x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8h}, [x6], x1 + ld1 {v1.8h}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8h}, [x6] + sqsub \src0, \src0, \src1 + ld1 {v3.8h}, [x9] + sqadd v5.8h, \src2, \src3 + sqsub \src2, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr \src0, \src0, #4 + sqadd v0.8h, v0.8h, v4.8h + srshr \src2, \src2, #4 + sqadd v1.8h, v1.8h, \src0 + sqadd v2.8h, v2.8h, v5.8h + smax v0.8h, v0.8h, v6.8h + sqadd v3.8h, v3.8h, \src2 + smax v1.8h, v1.8h, v6.8h + smin v0.8h, v0.8h, v7.8h + smax v2.8h, v2.8h, v6.8h + smin v1.8h, v1.8h, v7.8h + st1 {v0.8h}, [x6], x1 + smax v3.8h, v3.8h, v6.8h + smin v2.8h, v2.8h, v7.8h + st1 {v1.8h}, [x9], x10 + smin v3.8h, v3.8h, v7.8h + st1 {v2.8h}, [x6], x1 + st1 {v3.8h}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + br x14 +endfunc + +.macro sub_sp space +#ifdef _WIN32 +.if \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +.else + sub sp, sp, #\space +.endif +#endif +.endm + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b 
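+        // 3: below is the vertical pass of the 64x32 transform: each of the eight
+        // 8-column strips of the 16-bit temp buffer goes through the 32-point column
+        // DCT (inv_txfm_add_vert_dct_8x32_neon) and is added to the destination.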
+ +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*4*4 + add x4, sp, #64*4*4 + + movrel x13, eob_16x32 + +.irp i, 0, 4, 8, 12 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + movrel x5, X(inv_dct_8h_x16_neon) +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_4s_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + br x15 +endfunc diff --git a/src/arm/64/util.S b/src/arm/64/util.S index 3332c85..fc0e0d0 100644 --- a/src/arm/64/util.S +++ b/src/arm/64/util.S @@ -170,6 +170,18 @@ trn2 \r3\().2s, \t5\().2s, \t7\().2s .endm +.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4s, \r0\().4s, \r1\().4s + trn2 \t5\().4s, \r0\().4s, \r1\().4s + trn1 \t6\().4s, \r2\().4s, \r3\().4s + trn2 \t7\().4s, \r2\().4s, \r3\().4s + + trn1 \r0\().2d, \t4\().2d, \t6\().2d + trn2 \r2\().2d, \t4\().2d, \t6\().2d + trn1 \r1\().2d, \t5\().2d, \t7\().2d + trn2 \r3\().2d, \t5\().2d, \t7\().2d +.endm + .macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h diff --git a/src/arm/itx_init_tmpl.c b/src/arm/itx_init_tmpl.c index 5d4de82..c73becd 100644 --- a/src/arm/itx_init_tmpl.c +++ 
b/src/arm/itx_init_tmpl.c
@@ -117,7 +117,9 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+    if (bpc > 10) return;
+
+#if ARCH_AARCH64
     assign_itx17_fn( , 4, 4, neon);
     assign_itx16_fn(R, 4, 8, neon);
     assign_itx16_fn(R, 4, 16, neon);
-- 
cgit v1.2.3
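
The sketch below is a rough scalar model of the arithmetic conventions the new 10 bpc code relies on, written in C purely for illustration: round_narrow(), scale_sqrt2() and add_block_10bpc() are made-up names, not dav1d functions, and the model is a simplified assumption-level summary rather than the actual implementation. It mirrors the sqrshrn rounding/narrowing applied after the 32-bit first pass, the 2896/4096 (roughly 1/sqrt(2)) scaling that scale_input performs via sqrdmulh for the rectangular sizes, and the final srshr #4 / add / clamp-to-[0, 1023] step used when writing 10 bpc pixels.

#include <stddef.h>
#include <stdint.h>

#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

/* Rounding narrow from the 32-bit first pass to 16 bits, as sqrshrn does
 * (shift is 1, 2 or 4 depending on the transform size; shift >= 1 assumed). */
static int16_t round_narrow(int32_t v, int shift) {
    int32_t r = (v + (1 << (shift - 1))) >> shift;
    return (int16_t)CLIP(r, INT16_MIN, INT16_MAX);
}

/* The scale_input step for rectangular sizes: sqrdmulh by 2896*8 << 16 computes
 * (v * 2896 + 2048) >> 12, i.e. roughly v / sqrt(2), with rounding. */
static int32_t scale_sqrt2(int32_t v) {
    return (int32_t)(((int64_t)v * 2896 + 2048) >> 12);
}

/* Final add to the destination: round the 16-bit column-transform output by 4,
 * add the pixel and clamp to the 10 bpc range, as srshr/sqadd/smax/smin do. */
static void add_block_10bpc(uint16_t *dst, ptrdiff_t stride,
                            const int16_t *tmp, int w, int h)
{
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            int v = dst[y * stride + x] + ((tmp[y * w + x] + 8) >> 4);
            dst[y * stride + x] = (uint16_t)CLIP(v, 0, 1023);
        }
}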