Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Martin Storsjö <martin@martin.st> 2020-04-28 00:17:04 +0300
committer Martin Storsjö <martin@martin.st> 2020-05-10 08:51:42 +0300
commit b6b1394b06ea2cce03c9c97c77510cb8f2a207e2 (patch)
tree 8cd849f6780dd9ea7477329d51450744d83b22fd
parent 208a2abd16bb4132018810765e9982a457f62fa0 (diff)
arm64: itx: Minor optimizations for the 8x32 functions
This gives a couple cycles speedup.
-rw-r--r-- src/arm/64/itx.S 18
1 file changed, 8 insertions, 10 deletions
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index 0c91379..4be4c8d 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -2317,6 +2317,7 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
mov w8, #2*\h
1:
+ ldrh w12, [x13], #2
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x2]
st1 {v0.8h}, [x2], x8
@@ -2329,14 +2330,13 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ cmp w3, w12
.if \w == 8
load_add_store_8x8 x0, x7, shiftbits=2
.else
load_add_store_8x8 x0, x7, shiftbits=3
.endif
- ldrh w12, [x13], #2
- cmp w3, w12
b.lt 9f
.if \w == 8
sub x2, x2, x8, lsl #3
@@ -2509,16 +2509,15 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
mov x8, #2*32
mov w9, #32
mov x6, sp
- mov x7, x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x7]
- st1 {v28.8h}, [x7], x8
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
.endr
ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3
sub w9, w9, #8
- sub x7, x7, x8, lsl #3
- add x7, x7, #2*8
+ add x2, x2, #2*8
bl inv_dct_8x8_neon
@@ -2528,10 +2527,9 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
cmp w3, w12
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- st1 {v\i\().8h}, [x6], #16
-.endr
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
b.ge 1b
cbz w9, 3f