arm64: mc: Use two regs for alternating output rows for w4/8 in avg/w_avg/mask

It was already done this way for w32/64. Not doing it for w16 as it
didn't help there (and instead gave a small slowdown due to the two
setup instructions).

This gives a small speedup on in-order cores like A53.

Before:           Cortex A53     A72     A73
avg_w4_8bpc_neon:       60.9    25.6    29.0
avg_w8_8bpc_neon:      143.6    52.8    64.0
After:
avg_w4_8bpc_neon:       56.7    26.7    28.5
avg_w8_8bpc_neon:      137.2    54.5    64.4
author: Martin Storsjö <martin@martin.st> 2020-02-01 15:33:58 +0300
committer: Janne Grunau <janne-vlc@jannau.net> 2020-02-11 00:27:54 +0300
commit: b1167ce169f004f90bcc4a9e8841ffb90fe4abf1 (patch)
tree: 40f7663eec69f8b7de9e9ece8ca651b766e59858
parent: 0bad117eb0f97594a938f17ba05d3ca89ba81a9f (diff)
1 files changed, 18 insertions, 12 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 9b27a56..5a7f771 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -85,38 +85,44 @@ function \type\()_8bpc_neon, export=1
         \type           v4,  v0,  v1,  v2,  v3
         sub             x7,  x7,  w4, uxtw
         br              x7
+40:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
 4:
         cmp             w5,  #4
         st1             {v4.s}[0],  [x0], x1
-        st1             {v4.s}[1],  [x0], x1
+        st1             {v4.s}[1],  [x7], x1
         st1             {v4.s}[2],  [x0], x1
-        st1             {v4.s}[3],  [x0], x1
+        st1             {v4.s}[3],  [x7], x1
         b.eq            0f
         \type           v5,  v0,  v1,  v2,  v3
         cmp             w5,  #8
         st1             {v5.s}[0],  [x0], x1
-        st1             {v5.s}[1],  [x0], x1
+        st1             {v5.s}[1],  [x7], x1
         st1             {v5.s}[2],  [x0], x1
-        st1             {v5.s}[3],  [x0], x1
+        st1             {v5.s}[3],  [x7], x1
         b.eq            0f
         \type           v4,  v0,  v1,  v2,  v3
         st1             {v4.s}[0],  [x0], x1
-        st1             {v4.s}[1],  [x0], x1
+        st1             {v4.s}[1],  [x7], x1
         \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.s}[2],  [x0], x1
-        st1             {v4.s}[3],  [x0], x1
+        st1             {v4.s}[3],  [x7], x1
         st1             {v5.s}[0],  [x0], x1
-        st1             {v5.s}[1],  [x0], x1
+        st1             {v5.s}[1],  [x7], x1
         st1             {v5.s}[2],  [x0], x1
-        st1             {v5.s}[3],  [x0], x1
+        st1             {v5.s}[3],  [x7], x1
         ret
+80:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
 8:
         st1             {v4.d}[0],  [x0], x1
         \type           v5,  v0,  v1,  v2,  v3
-        st1             {v4.d}[1],  [x0], x1
+        st1             {v4.d}[1],  [x7], x1
         st1             {v5.d}[0],  [x0], x1
         subs            w5,  w5,  #4
-        st1             {v5.d}[1],  [x0], x1
+        st1             {v5.d}[1],  [x7], x1
         b.le            0f
         \type           v4,  v0,  v1,  v2,  v3
         b               8b
@@ -185,8 +191,8 @@ L(\type\()_tbl):
         .hword L(\type\()_tbl) -  640b
         .hword L(\type\()_tbl) -  320b
         .hword L(\type\()_tbl) -   16b
-        .hword L(\type\()_tbl) -    8b
-        .hword L(\type\()_tbl) -    4b
+        .hword L(\type\()_tbl) -   80b
+        .hword L(\type\()_tbl) -   40b
 endfunc
 .endm
author	Martin Storsjö <martin@martin.st>	2020-02-01 15:33:58 +0300
committer	Janne Grunau <janne-vlc@jannau.net>	2020-02-11 00:27:54 +0300
commit	b1167ce169f004f90bcc4a9e8841ffb90fe4abf1 (patch)
tree	40f7663eec69f8b7de9e9ece8ca651b766e59858
parent	0bad117eb0f97594a938f17ba05d3ca89ba81a9f (diff)