Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Storsjö <martin@martin.st>2020-02-01 15:33:58 +0300
committerJanne Grunau <janne-vlc@jannau.net>2020-02-11 00:27:54 +0300
commitb1167ce169f004f90bcc4a9e8841ffb90fe4abf1 (patch)
tree40f7663eec69f8b7de9e9ece8ca651b766e59858
parent0bad117eb0f97594a938f17ba05d3ca89ba81a9f (diff)
arm64: mc: Use two regs for alternating output rows for w4/8 in avg/w_avg/mask
It was already done this way for w32/64. Not doing it for w16 as it didn't help there (and instead gave a small slowdown due to the two setup instructions). This gives a small speedup on in-order cores like A53.

Before:            Cortex A53    A72    A73
avg_w4_8bpc_neon:        60.9   25.6   29.0
avg_w8_8bpc_neon:       143.6   52.8   64.0
After:
avg_w4_8bpc_neon:        56.7   26.7   28.5
avg_w8_8bpc_neon:       137.2   54.5   64.4
-rw-r--r--src/arm/64/mc.S30
1 file changed, 18 insertions, 12 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 9b27a56..5a7f771 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -85,38 +85,44 @@ function \type\()_8bpc_neon, export=1
\type v4, v0, v1, v2, v3
sub x7, x7, w4, uxtw
br x7
+40:
+ add x7, x0, x1
+ lsl x1, x1, #1
4:
cmp w5, #4
st1 {v4.s}[0], [x0], x1
- st1 {v4.s}[1], [x0], x1
+ st1 {v4.s}[1], [x7], x1
st1 {v4.s}[2], [x0], x1
- st1 {v4.s}[3], [x0], x1
+ st1 {v4.s}[3], [x7], x1
b.eq 0f
\type v5, v0, v1, v2, v3
cmp w5, #8
st1 {v5.s}[0], [x0], x1
- st1 {v5.s}[1], [x0], x1
+ st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
- st1 {v5.s}[3], [x0], x1
+ st1 {v5.s}[3], [x7], x1
b.eq 0f
\type v4, v0, v1, v2, v3
st1 {v4.s}[0], [x0], x1
- st1 {v4.s}[1], [x0], x1
+ st1 {v4.s}[1], [x7], x1
\type v5, v0, v1, v2, v3
st1 {v4.s}[2], [x0], x1
- st1 {v4.s}[3], [x0], x1
+ st1 {v4.s}[3], [x7], x1
st1 {v5.s}[0], [x0], x1
- st1 {v5.s}[1], [x0], x1
+ st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
- st1 {v5.s}[3], [x0], x1
+ st1 {v5.s}[3], [x7], x1
ret
+80:
+ add x7, x0, x1
+ lsl x1, x1, #1
8:
st1 {v4.d}[0], [x0], x1
\type v5, v0, v1, v2, v3
- st1 {v4.d}[1], [x0], x1
+ st1 {v4.d}[1], [x7], x1
st1 {v5.d}[0], [x0], x1
subs w5, w5, #4
- st1 {v5.d}[1], [x0], x1
+ st1 {v5.d}[1], [x7], x1
b.le 0f
\type v4, v0, v1, v2, v3
b 8b
@@ -185,8 +191,8 @@ L(\type\()_tbl):
.hword L(\type\()_tbl) - 640b
.hword L(\type\()_tbl) - 320b
.hword L(\type\()_tbl) - 16b
- .hword L(\type\()_tbl) - 8b
- .hword L(\type\()_tbl) - 4b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
endfunc
.endm