author    | Jonathan Wright <jonathan.wright@arm.com> | 2021-11-30 17:28:20 +0300
committer | Martin Storsjö <martin@martin.st>         | 2021-12-03 19:33:18 +0300
commit    | 19ff99ea8417b3c9a62ff4c1bd14fffceb26ad73 (patch)
tree      | e127531d8b379e6f7a0d42a7a6faf652e3de4047 /src/arm
parent    | 4e41273896efb37dbd6b2ec12acf41894eb3e119 (diff)
AArch64 Neon: Replace XTN, XTN2 pairs with single UZP1
It is often necessary to narrow the elements in a pair of Neon
vectors to half the current width before combining the results. This
is usually achieved with a pair of XTN/XTN2 instructions. However, it
is possible to achieve the same outcome with a single 'unzip' (UZP1)
instruction.
This patch changes all sequential AArch64 Neon XTN, XTN2 instruction
pairs to use a single UZP1 instruction.
Change-Id: I2a9fad3082d2cf363b1edce9ef0b8d547ec6c41a
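
As a quick illustration of the equivalence (a standalone sketch, not taken
from the patch; the register numbers are arbitrary): XTN keeps the low
16 bits of each 32-bit source lane, and when the same registers are viewed
with a .8h arrangement those low halves occupy the even-numbered lanes,
which are exactly the lanes UZP1 selects.

        // Two-instruction form: narrow v0.4s and v1.4s into v2.8h.
        xtn             v2.4h,  v0.4s   // low 16 bits of each v0 lane -> v2.h[0-3]
        xtn2            v2.8h,  v1.4s   // low 16 bits of each v1 lane -> v2.h[4-7]

        // Single-instruction form: reinterpret the sources as .8h; the
        // even-numbered .8h lanes are the low halves of the .4s lanes,
        // and UZP1 gathers the even lanes of both sources into one vector.
        uzp1            v2.8h,  v0.8h,  v1.8h

Besides saving an instruction, the UZP1 form has no serial dependency: XTN2
must merge its result into the register already written by XTN, whereas UZP1
reads both sources and writes the whole destination at once.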
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/mc.S   | 12
-rw-r--r-- | src/arm/64/mc16.S | 84
2 files changed, 32 insertions, 64 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index f8d143b..a3105fe 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -252,8 +252,7 @@ function w_mask_\type\()_8bpc_neon, export=1
         sqrshrun        v22.8b,  v20.8h,  #4
         sqrshrun        v23.8b,  v21.8h,  #4
 .if \type == 444
-        xtn             v18.8b,  v18.8h
-        xtn2            v18.16b, v19.8h
+        uzp1            v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
         sub             v18.16b, v1.16b,  v18.16b
         st1             {v18.16b}, [x6],  #16
 .elseif \type == 422
@@ -298,8 +297,7 @@ function w_mask_\type\()_8bpc_neon, export=1
         sqrshrun        v22.8b,  v20.8h,  #4
         sqrshrun        v23.8b,  v21.8h,  #4
 .if \type == 444
-        xtn             v18.8b,  v18.8h
-        xtn2            v18.16b, v19.8h
+        uzp1            v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
         sub             v18.16b, v1.16b,  v18.16b
         st1             {v18.16b}, [x6],  #16
 .elseif \type == 422
@@ -373,10 +371,8 @@ function w_mask_\type\()_8bpc_neon, export=1
         sqrshrun        v26.8b,  v26.8h,  #4
         sqrshrun        v27.8b,  v27.8h,  #4
 .if \type == 444
-        xtn             v20.8b,  v20.8h
-        xtn2            v20.16b, v21.8h
-        xtn             v21.8b,  v22.8h
-        xtn2            v21.16b, v23.8h
+        uzp1            v20.16b, v20.16b, v21.16b // Same as xtn, xtn2
+        uzp1            v21.16b, v22.16b, v23.16b // Ditto
         sub             v20.16b, v1.16b,  v20.16b
         sub             v21.16b, v1.16b,  v21.16b
         st1             {v20.16b}, [x6],  #16
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index 66a8505..d528ee0 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -65,10 +65,8 @@
         saddw2          \t0\().4s, \t0\().4s, \t2\().8h
         saddw           \d1\().4s, \d1\().4s, \t3\().4h
         saddw2          \t1\().4s, \t1\().4s, \t3\().8h
-        xtn             \d0\().4h, \d0\().4s
-        xtn2            \d0\().8h, \t0\().4s
-        xtn             \d1\().4h, \d1\().4s
-        xtn2            \d1\().8h, \t1\().4s
+        uzp1            \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+        uzp1            \d1\().8h, \d1\().8h, \t1\().8h // Ditto
         srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
         srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
         add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
@@ -106,10 +104,8 @@
         saddw2          \t0\().4s, \t0\().4s, \t2\().8h
         saddw           \d1\().4s, \d1\().4s, \t3\().4h
         saddw2          \t1\().4s, \t1\().4s, \t3\().8h
-        xtn             \d0\().4h, \d0\().4s
-        xtn2            \d0\().8h, \t0\().4s
-        xtn             \d1\().4h, \d1\().4s
-        xtn2            \d1\().8h, \t1\().4s
+        uzp1            \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+        uzp1            \d1\().8h, \d1\().8h, \t1\().8h // Ditto
         srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
         srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
         add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
@@ -320,8 +316,7 @@ function w_mask_\type\()_16bpc_neon, export=1
         umin            v4.8h,   v4.8h,   v31.8h // iclip_pixel
         umin            v5.8h,   v5.8h,   v31.8h
 .if \type == 444
-        xtn             v20.8b,  v20.8h  // 64 - m
-        xtn2            v20.16b, v21.8h
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
         sub             v20.16b, v1.16b,  v20.16b // m
         st1             {v20.16b}, [x6],  #16
 .elseif \type == 422
@@ -386,8 +381,7 @@ function w_mask_\type\()_16bpc_neon, export=1
         umin            v4.8h,   v4.8h,   v31.8h // iclip_pixel
         umin            v5.8h,   v5.8h,   v31.8h
 .if \type == 444
-        xtn             v20.8b,  v20.8h  // 64 - m
-        xtn2            v20.16b, v21.8h
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
         sub             v20.16b, v1.16b,  v20.16b // m
         st1             {v20.16b}, [x6],  #16
 .elseif \type == 422
@@ -505,10 +499,8 @@ function w_mask_\type\()_16bpc_neon, export=1
         umin            v6.8h,   v6.8h,   v31.8h // iclip_pixel
         umin            v7.8h,   v7.8h,   v31.8h
 .if \type == 444
-        xtn             v20.8b,  v20.8h  // 64 - m
-        xtn2            v20.16b, v21.8h
-        xtn             v21.8b,  v22.8h
-        xtn2            v21.16b, v23.8h
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
+        uzp1            v21.16b, v22.16b, v23.16b
         sub             v20.16b, v1.16b,  v20.16b // m
         sub             v21.16b, v1.16b,  v21.16b
         st1             {v20.16b}, [x6],  #16
@@ -1425,11 +1417,9 @@ endfunc
 .endif
 .endm
 .macro xtn_h r0, r1, r2, r3
-        xtn             \r0\().4h, \r0\().4s
-        xtn2            \r0\().8h, \r1\().4s
+        uzp1            \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
 .ifnb \r2
-        xtn             \r2\().4h, \r2\().4s
-        xtn2            \r2\().8h, \r3\().4s
+        uzp1            \r2\().8h, \r2\().8h, \r3\().8h // Ditto
 .endif
 .endm
 .macro srshl_s shift, r0, r1, r2, r3
@@ -1659,8 +1649,7 @@ L(\type\()_8tap_h):
         srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
         umin            v16.8h,  v16.8h,  v31.8h
 .else
-        xtn             v16.4h,  v16.4s
-        xtn2            v16.8h,  v20.4s
+        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
         sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
 .endif
         st1             {v16.d}[0], [\dst], \d_strd
@@ -1720,10 +1709,8 @@ L(\type\()_8tap_h):
         umin            v18.8h,  v18.8h,  v31.8h
         umin            v22.8h,  v22.8h,  v31.8h
 .else
-        xtn             v18.4h,  v18.4s
-        xtn2            v18.8h,  v19.4s
-        xtn             v22.4h,  v22.4s
-        xtn2            v22.8h,  v23.4s
+        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
+        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
         sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
         sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
 .endif
@@ -2411,8 +2398,7 @@ L(\type\()_8tap_filter_4):
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53),
         // and conserves register space (no need to clobber v8-v15).
-        xtn             v16.4h,  v24.4s
-        xtn2            v16.8h,  v25.4s
+        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2

         bl              L(\type\()_8tap_filter_8)
         mov             v17.16b, v23.16b
@@ -2511,8 +2497,7 @@ L(\type\()_8tap_filter_4):
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53),
         // and conserves register space (no need to clobber v8-v15).
-        xtn             v16.4h,  v24.4s
-        xtn2            v16.8h,  v25.4s
+        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2

         bl              L(\type\()_8tap_filter_8)
         mov             v17.16b, v23.16b
@@ -2623,10 +2608,8 @@ L(\type\()_8tap_filter_8):
         srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
-        xtn             v23.4h,  v25.4s
-        xtn2            v23.8h,  v26.4s
-        xtn             v24.4h,  v27.4s
-        xtn2            v24.8h,  v28.4s
+        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
+        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
         ret

 L(\type\()_8tap_hv_tbl):
@@ -3133,8 +3116,7 @@ L(\type\()_bilin_hv):
 .ifc \type, put
         urshl           v4.4s,   v4.4s,   v30.4s
         urshl           v5.4s,   v5.4s,   v30.4s
-        xtn             v4.4h,   v4.4s
-        xtn2            v4.8h,   v5.4s
+        uzp1            v4.8h,   v4.8h,   v5.8h // Same as xtn, xtn2
 .else
         rshrn           v4.4h,   v4.4s,   #4
         rshrn2          v4.8h,   v5.4s,   #4
@@ -3197,10 +3179,8 @@ L(\type\()_bilin_hv):
         urshl           v5.4s,   v5.4s,   v30.4s
         urshl           v6.4s,   v6.4s,   v30.4s
         urshl           v7.4s,   v7.4s,   v30.4s
-        xtn             v4.4h,   v4.4s
-        xtn2            v4.8h,   v5.4s
-        xtn             v5.4h,   v6.4s
-        xtn2            v5.8h,   v7.4s
+        uzp1            v4.8h,   v4.8h,   v5.8h // Same as xtn, xtn2
+        uzp1            v5.8h,   v6.8h,   v7.8h // Ditto
 .else
         rshrn           v4.4h,   v4.4s,   #4
         rshrn2          v4.8h,   v5.4s,   #4
@@ -3367,32 +3347,24 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
 .endif

         bl              warp_filter_horz_neon
-        xtn             v24.4h,  v16.4s
-        xtn2            v24.8h,  v17.4s
+        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
         bl              warp_filter_horz_neon
-        xtn             v25.4h,  v16.4s
-        xtn2            v25.8h,  v17.4s
+        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v26.4h,  v16.4s
-        xtn2            v26.8h,  v17.4s
+        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v27.4h,  v16.4s
-        xtn2            v27.8h,  v17.4s
+        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v28.4h,  v16.4s
-        xtn2            v28.8h,  v17.4s
+        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v29.4h,  v16.4s
-        xtn2            v29.8h,  v17.4s
+        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v30.4h,  v16.4s
-        xtn2            v30.8h,  v17.4s
+        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto

 1:
         add             w14, w6, #512
         bl              warp_filter_horz_neon
-        xtn             v31.4h,  v16.4s
-        xtn2            v31.8h,  v17.4s
+        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2

         load_filter_row d0, w14, w9
         load_filter_row d1, w14, w9