diff options
author | Martin Storsjö <martin@martin.st> | 2020-02-01 15:33:58 +0300 |
---|---|---|
committer | Janne Grunau <janne-vlc@jannau.net> | 2020-02-11 00:27:54 +0300 |
commit | b1167ce169f004f90bcc4a9e8841ffb90fe4abf1 (patch) | |
tree | 40f7663eec69f8b7de9e9ece8ca651b766e59858 | |
parent | 0bad117eb0f97594a938f17ba05d3ca89ba81a9f (diff) |
arm64: mc: Use two regs for alternating output rows for w4/8 in avg/w_avg/mask
It was already done this way for w32/64. Not doing it for w16 as it
didn't help there (and instead gave a small slowdown due to the two
setup instructions).
This gives a small speedup on in-order cores like A53.
Before:              Cortex A53     A72     A73
avg_w4_8bpc_neon:          60.9    25.6    29.0
avg_w8_8bpc_neon:         143.6    52.8    64.0
After:
avg_w4_8bpc_neon:          56.7    26.7    28.5
avg_w8_8bpc_neon:         137.2    54.5    64.4
-rw-r--r-- | src/arm/64/mc.S | 30 |
1 files changed, 18 insertions, 12 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 9b27a56..5a7f771 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -85,38 +85,44 @@ function \type\()_8bpc_neon, export=1 \type v4, v0, v1, v2, v3 sub x7, x7, w4, uxtw br x7 +40: + add x7, x0, x1 + lsl x1, x1, #1 4: cmp w5, #4 st1 {v4.s}[0], [x0], x1 - st1 {v4.s}[1], [x0], x1 + st1 {v4.s}[1], [x7], x1 st1 {v4.s}[2], [x0], x1 - st1 {v4.s}[3], [x0], x1 + st1 {v4.s}[3], [x7], x1 b.eq 0f \type v5, v0, v1, v2, v3 cmp w5, #8 st1 {v5.s}[0], [x0], x1 - st1 {v5.s}[1], [x0], x1 + st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 - st1 {v5.s}[3], [x0], x1 + st1 {v5.s}[3], [x7], x1 b.eq 0f \type v4, v0, v1, v2, v3 st1 {v4.s}[0], [x0], x1 - st1 {v4.s}[1], [x0], x1 + st1 {v4.s}[1], [x7], x1 \type v5, v0, v1, v2, v3 st1 {v4.s}[2], [x0], x1 - st1 {v4.s}[3], [x0], x1 + st1 {v4.s}[3], [x7], x1 st1 {v5.s}[0], [x0], x1 - st1 {v5.s}[1], [x0], x1 + st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 - st1 {v5.s}[3], [x0], x1 + st1 {v5.s}[3], [x7], x1 ret +80: + add x7, x0, x1 + lsl x1, x1, #1 8: st1 {v4.d}[0], [x0], x1 \type v5, v0, v1, v2, v3 - st1 {v4.d}[1], [x0], x1 + st1 {v4.d}[1], [x7], x1 st1 {v5.d}[0], [x0], x1 subs w5, w5, #4 - st1 {v5.d}[1], [x0], x1 + st1 {v5.d}[1], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 8b @@ -185,8 +191,8 @@ L(\type\()_tbl): .hword L(\type\()_tbl) - 640b .hword L(\type\()_tbl) - 320b .hword L(\type\()_tbl) - 16b - .hword L(\type\()_tbl) - 8b - .hword L(\type\()_tbl) - 4b + .hword L(\type\()_tbl) - 80b + .hword L(\type\()_tbl) - 40b endfunc .endm |