From 0bad117eb0f97594a938f17ba05d3ca89ba81a9f Mon Sep 17 00:00:00 2001
From: Martin Storsjö
Date: Sat, 1 Feb 2020 14:33:49 +0200
Subject: arm64: mc: Simplify avg/w_avg/mask by always using the w16 macro

This shortens the source by 40 lines, and gives a significant speedup
on A53, a small speedup on A72 and a very minor slowdown for avg/w_avg
on A73.

Before:               Cortex A53     A72     A73
avg_w4_8bpc_neon:           67.4    26.1    25.4
avg_w8_8bpc_neon:          158.7    56.3    59.1
avg_w16_8bpc_neon:         382.9   154.1   160.7
w_avg_w4_8bpc_neon:         99.9    43.6    39.4
w_avg_w8_8bpc_neon:        253.2    98.3    99.0
w_avg_w16_8bpc_neon:       543.1   285.0   301.8
mask_w4_8bpc_neon:         110.6    51.4    45.1
mask_w8_8bpc_neon:         295.0   129.9   114.0
mask_w16_8bpc_neon:        654.6   365.8   369.7
After:
avg_w4_8bpc_neon:           60.8    26.3    29.0
avg_w8_8bpc_neon:          142.8    52.9    64.1
avg_w16_8bpc_neon:         378.2   153.4   160.8
w_avg_w4_8bpc_neon:         78.7    41.0    40.9
w_avg_w8_8bpc_neon:        190.6    90.1   105.1
w_avg_w16_8bpc_neon:       531.1   279.3   301.4
mask_w4_8bpc_neon:          86.6    47.2    44.9
mask_w8_8bpc_neon:         222.0   114.3   114.9
mask_w16_8bpc_neon:        639.5   356.0   369.8
---
 src/arm/64/mc.S | 135 ++++++++++++++++++++------------------------------------
 1 file changed, 48 insertions(+), 87 deletions(-)

(limited to 'src/arm')

diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index ea10fc7..9b27a56 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -29,14 +29,7 @@
 #include "src/arm/asm.S"
 #include "util.S"
 
-.macro avg dst, t0, t1
-        ld1             {\t0\().8h},  [x2],  16
-        ld1             {\t1\().8h},  [x3],  16
-        add             \t0\().8h,  \t0\().8h,  \t1\().8h
-        sqrshrun        \dst\().8b,  \t0\().8h,  #5
-.endm
-
-.macro avg16 dst, t0, t1, t2, t3
+.macro avg dst, t0, t1, t2, t3
         ld1             {\t0\().8h,\t1\().8h},  [x2],  32
         ld1             {\t2\().8h,\t3\().8h},  [x3],  32
         add             \t0\().8h,  \t0\().8h,  \t2\().8h
@@ -45,16 +38,7 @@
         sqrshrun2       \dst\().16b,  \t1\().8h,  #5
 .endm
 
-.macro w_avg dst, t0, t1
-        ld1             {\t0\().8h},  [x2],  16
-        ld1             {\t1\().8h},  [x3],  16
-        sub             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqdmulh         \t0\().8h,  \t0\().8h,  v30.8h
-        add             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqrshrun        \dst\().8b,  \t0\().8h,  #4
-.endm
-
-.macro w_avg16 dst, t0, t1, t2, t3
+.macro w_avg dst, t0, t1, t2, t3
         ld1             {\t0\().8h,\t1\().8h},  [x2],  32
         ld1             {\t2\().8h,\t3\().8h},  [x3],  32
         sub             \t0\().8h,  \t2\().8h,  \t0\().8h
@@ -67,19 +51,7 @@
         sqrshrun2       \dst\().16b,  \t1\().8h,  #4
 .endm
 
-.macro mask dst, t0, t1
-        ld1             {v30.8b},  [x6],  8
-        ld1             {\t0\().8h},  [x2],  16
-        mul             v30.8b,  v30.8b,  v31.8b
-        ld1             {\t1\().8h},  [x3],  16
-        shll            v30.8h,  v30.8b,  #8
-        sub             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqdmulh         \t0\().8h,  \t0\().8h,  v30.8h
-        add             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqrshrun        \dst\().8b,  \t0\().8h,  #4
-.endm
-
-.macro mask16 dst, t0, t1, t2, t3
+.macro mask dst, t0, t1, t2, t3
         ld1             {v30.16b},  [x6],  16
         ld1             {\t0\().8h,\t1\().8h},  [x2],  32
         mul             v30.16b,  v30.16b,  v31.16b
@@ -109,113 +81,102 @@ function \type\()_8bpc_neon, export=1
 .endif
         adr             x7,  L(\type\()_tbl)
         sub             w4,  w4,  #24
-        \type           v4,  v0,  v1
         ldrh            w4,  [x7, x4, lsl #1]
-        \type           v5,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         sub             x7,  x7,  w4,  uxtw
         br              x7
 4:
         cmp             w5,  #4
         st1             {v4.s}[0],  [x0],  x1
         st1             {v4.s}[1],  [x0],  x1
-        st1             {v5.s}[0],  [x0],  x1
-        st1             {v5.s}[1],  [x0],  x1
+        st1             {v4.s}[2],  [x0],  x1
+        st1             {v4.s}[3],  [x0],  x1
         b.eq            0f
-        \type           v6,  v0,  v1
-        \type           v7,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
         cmp             w5,  #8
-        st1             {v6.s}[0],  [x0],  x1
-        st1             {v6.s}[1],  [x0],  x1
-        st1             {v7.s}[0],  [x0],  x1
-        st1             {v7.s}[1],  [x0],  x1
+        st1             {v5.s}[0],  [x0],  x1
+        st1             {v5.s}[1],  [x0],  x1
+        st1             {v5.s}[2],  [x0],  x1
+        st1             {v5.s}[3],  [x0],  x1
         b.eq            0f
-        \type           v4,  v0,  v1
-        \type           v5,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         st1             {v4.s}[0],  [x0],  x1
         st1             {v4.s}[1],  [x0],  x1
-        \type           v6,  v0,  v1
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.s}[2],  [x0],  x1
+        st1             {v4.s}[3],  [x0],  x1
         st1             {v5.s}[0],  [x0],  x1
         st1             {v5.s}[1],  [x0],  x1
-        \type           v7,  v2,  v3
-        st1             {v6.s}[0],  [x0],  x1
-        st1             {v6.s}[1],  [x0],  x1
-        st1             {v7.s}[0],  [x0],  x1
-        st1             {v7.s}[1],  [x0],  x1
+        st1             {v5.s}[2],  [x0],  x1
+        st1             {v5.s}[3],  [x0],  x1
         ret
 8:
-        st1             {v4.8b},  [x0],  x1
-        \type           v6,  v0,  v1
-        st1             {v5.8b},  [x0],  x1
-        \type           v7,  v0,  v1
-        st1             {v6.8b},  [x0],  x1
+        st1             {v4.d}[0],  [x0],  x1
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.d}[1],  [x0],  x1
+        st1             {v5.d}[0],  [x0],  x1
         subs            w5,  w5,  #4
-        st1             {v7.8b},  [x0],  x1
+        st1             {v5.d}[1],  [x0],  x1
         b.le            0f
-        \type           v4,  v0,  v1
-        \type           v5,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               8b
-160:
-        trn1            v4.2d,  v4.2d,  v5.2d
 16:
-        \type\()16      v5,  v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.16b},  [x0],  x1
-        \type\()16      v6,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
         st1             {v5.16b},  [x0],  x1
-        \type\()16      v7,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
         st1             {v6.16b},  [x0],  x1
         subs            w5,  w5,  #4
         st1             {v7.16b},  [x0],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               16b
 320:
-        trn1            v4.2d,  v4.2d,  v5.2d
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 32:
-        \type\()16      v5,  v0,  v1,  v2,  v3
-        \type\()16      v6,  v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b},  [x0],  x1
-        \type\()16      v7,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
         subs            w5,  w5,  #2
         st1             {v6.16b,v7.16b},  [x7],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               32b
 640:
-        trn1            v4.2d,  v4.2d,  v5.2d
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 64:
-        \type\()16      v5,  v0,  v1,  v2,  v3
-        \type\()16      v6,  v0,  v1,  v2,  v3
-        \type\()16      v7,  v0,  v1,  v2,  v3
-        \type\()16      v16, v0,  v1,  v2,  v3
-        \type\()16      v17, v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
+        \type           v16, v0,  v1,  v2,  v3
+        \type           v17, v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b,v6.16b,v7.16b},  [x0],  x1
-        \type\()16      v18, v0,  v1,  v2,  v3
-        \type\()16      v19, v0,  v1,  v2,  v3
+        \type           v18, v0,  v1,  v2,  v3
+        \type           v19, v0,  v1,  v2,  v3
         subs            w5,  w5,  #2
         st1             {v16.16b,v17.16b,v18.16b,v19.16b},  [x7],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               64b
 1280:
-        trn1            v4.2d,  v4.2d,  v5.2d
         add             x7,  x0,  #64
 128:
-        \type\()16      v5,  v0,  v1,  v2,  v3
-        \type\()16      v6,  v0,  v1,  v2,  v3
-        \type\()16      v7,  v0,  v1,  v2,  v3
-        \type\()16      v16, v0,  v1,  v2,  v3
-        \type\()16      v17, v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
+        \type           v16, v0,  v1,  v2,  v3
+        \type           v17, v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b,v6.16b,v7.16b},  [x0],  x1
-        \type\()16      v18, v0,  v1,  v2,  v3
-        \type\()16      v19, v0,  v1,  v2,  v3
+        \type           v18, v0,  v1,  v2,  v3
+        \type           v19, v0,  v1,  v2,  v3
         subs            w5,  w5,  #1
         st1             {v16.16b,v17.16b,v18.16b,v19.16b},  [x7],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               128b
 0:
         ret
@@ -223,7 +184,7 @@ L(\type\()_tbl):
         .hword L(\type\()_tbl) - 1280b
         .hword L(\type\()_tbl) -  640b
         .hword L(\type\()_tbl) -  320b
-        .hword L(\type\()_tbl) -  160b
+        .hword L(\type\()_tbl) -   16b
         .hword L(\type\()_tbl) -    8b
         .hword L(\type\()_tbl) -    4b
 endfunc
--
cgit v1.2.3
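For readers of the patch, the per-pixel arithmetic the three (now unified) macros perform can be summarized in plain C. The sketch below is derived from the instruction sequences in the diff (add + sqrshrun #5 for avg; sub/sqdmulh/add + sqrshrun #4 for w_avg and mask), assuming dav1d's 8 bpc intermediate format with 4 extra bits of precision. The function names, the clip helper, and the argument mapping (x2/x3 as the tmp1/tmp2 intermediate buffers, w6 as the weight or mask pointer) are illustrative assumptions, not dav1d's actual reference code; the exact w_avg/mask weighting also depends on constant setup outside these hunks.

#include <stdint.h>

/* Illustrative helper: the clamp to [0, 255] that sqrshrun performs when
 * narrowing to unsigned bytes (the rounding term is written out below). */
static inline uint8_t clip_pixel(int v) {
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* avg: add the two 16-bit intermediates, round and shift by 5
 * (add + sqrshrun #5 in the macro above). */
static void avg_ref(uint8_t *dst, const int16_t *tmp1, const int16_t *tmp2, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = clip_pixel((tmp1[i] + tmp2[i] + 16) >> 5);
}

/* w_avg: 4-bit weighted blend; the asm folds the weight into a sqdmulh
 * constant (v30) set up outside this hunk, which is equivalent to this
 * formula up to intermediate truncation. */
static void w_avg_ref(uint8_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                      int n, int weight)
{
    for (int i = 0; i < n; i++)
        dst[i] = clip_pixel((tmp1[i] * weight + tmp2[i] * (16 - weight) + 128) >> 8);
}

/* mask: per-pixel 6-bit blend factor m[i] in 0..64, loaded from x6 in the asm
 * and scaled into a sqdmulh constant via the mul/shll pair. */
static void mask_ref(uint8_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                     const uint8_t *m, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = clip_pixel((tmp1[i] * m[i] + tmp2[i] * (64 - m[i]) + 512) >> 10);
}

Each invocation of the unified macro now produces 16 output pixels in one vector register; the w4 and w8 paths simply store 4- or 8-byte lanes of that register (st1 {v4.s}[0..3] and st1 {v4.d}[0..1]), which is what lets the patch drop the separate narrow macros, the trn1 merges, and the 160: table entry.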