diff options
author | Martin Storsjö <martin@martin.st> | 2020-04-28 00:17:04 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2020-05-10 08:51:42 +0300 |
commit | b6b1394b06ea2cce03c9c97c77510cb8f2a207e2 (patch) | |
tree | 8cd849f6780dd9ea7477329d51450744d83b22fd /src/arm | |
parent | 208a2abd16bb4132018810765e9982a457f62fa0 (diff) |
arm64: itx: Minor optimizations for the 8x32 functions
This gives a speedup of a couple of cycles.
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/itx.S | 18 |
1 file changed, 8 insertions, 10 deletions
```diff
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index 0c91379..4be4c8d 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -2317,6 +2317,7 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
 
         mov w8, #2*\h
 1:
+        ldrh w12, [x13], #2
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
         ld1 {\i}, [x2]
         st1 {v0.8h}, [x2], x8
@@ -2329,14 +2330,13 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
 
         transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
 
+        cmp w3, w12
 .if \w == 8
         load_add_store_8x8 x0, x7, shiftbits=2
 .else
         load_add_store_8x8 x0, x7, shiftbits=3
 .endif
 
-        ldrh w12, [x13], #2
-        cmp w3, w12
         b.lt 9f
 .if \w == 8
         sub x2, x2, x8, lsl #3
@@ -2509,16 +2509,15 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
         mov x8, #2*32
         mov w9, #32
         mov x6, sp
-        mov x7, x2
 1:
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        ld1 {v\i\().8h}, [x7]
-        st1 {v28.8h}, [x7], x8
+        ld1 {v\i\().8h}, [x2]
+        st1 {v28.8h}, [x2], x8
 .endr
         ldrh w12, [x13], #2
+        sub x2, x2, x8, lsl #3
         sub w9, w9, #8
-        sub x7, x7, x8, lsl #3
-        add x7, x7, #2*8
+        add x2, x2, #2*8
 
         bl inv_dct_8x8_neon
 
@@ -2528,10 +2527,9 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
 
         transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 
+        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
         cmp w3, w12
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        st1 {v\i\().8h}, [x6], #16
-.endr
+        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
 
         b.ge 1b
         cbz w9, 3f
```