
github.com/videolan/dav1d.git
author     Jonathan Wright <jonathan.wright@arm.com>  2021-11-30 17:28:20 +0300
committer  Martin Storsjö <martin@martin.st>          2021-12-03 19:33:18 +0300
commit     19ff99ea8417b3c9a62ff4c1bd14fffceb26ad73 (patch)
tree       e127531d8b379e6f7a0d42a7a6faf652e3de4047 /src/arm
parent     4e41273896efb37dbd6b2ec12acf41894eb3e119 (diff)
AArch64 Neon: Replace XTN, XTN2 pairs with single UZP1
It is often necessary to narrow the elements in a pair of Neon vectors to half their current width before combining the results into a single vector. This is usually achieved with a pair of XTN/XTN2 instructions. However, the same outcome can be achieved with a single 'unzip' (UZP1) instruction. This patch changes all sequential AArch64 Neon XTN, XTN2 instruction pairs to use a single UZP1 instruction.

Change-Id: I2a9fad3082d2cf363b1edce9ef0b8d547ec6c41a
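For readers unfamiliar with the trick, here is a minimal standalone sketch of the equivalence (not part of the patch; the v0/v1/v2 register choices are illustrative). XTN writes the truncated low half of each source element into the low half of the destination and XTN2 fills the high half, while UZP1 on the byte-arranged views selects the even-indexed bytes of the two sources, which in AArch64's little-endian element ordering are exactly those truncated low bytes:

    // Two-instruction form: narrow v0.8h and v1.8h into one byte vector.
    xtn     v2.8b,  v0.8h               // v2[0..7]  = low byte of each v0 lane
    xtn2    v2.16b, v1.8h               // v2[8..15] = low byte of each v1 lane

    // Single-instruction form: even-indexed bytes of v0:v1, i.e. the
    // low byte of every halfword element.
    uzp1    v2.16b, v0.16b, v1.16b

    // The same pattern one width up (.4s words narrowed to .8h):
    //   xtn  v2.4h, v0.4s ; xtn2 v2.8h, v1.4s
    // becomes
    //   uzp1 v2.8h, v0.8h, v1.8h

Note that the destination may alias one of the sources (as in the patch, where e.g. v18 serves as both a source and the destination), since UZP1 reads both inputs before writing its result.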
Diffstat (limited to 'src/arm')
-rw-r--r--  src/arm/64/mc.S   | 12
-rw-r--r--  src/arm/64/mc16.S | 84
2 files changed, 32 insertions(+), 64 deletions(-)
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index f8d143b..a3105fe 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -252,8 +252,7 @@ function w_mask_\type\()_8bpc_neon, export=1
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
- xtn v18.8b, v18.8h
- xtn2 v18.16b, v19.8h
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
@@ -298,8 +297,7 @@ function w_mask_\type\()_8bpc_neon, export=1
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
- xtn v18.8b, v18.8h
- xtn2 v18.16b, v19.8h
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
@@ -373,10 +371,8 @@ function w_mask_\type\()_8bpc_neon, export=1
sqrshrun v26.8b, v26.8h, #4
sqrshrun v27.8b, v27.8h, #4
.if \type == 444
- xtn v20.8b, v20.8h
- xtn2 v20.16b, v21.8h
- xtn v21.8b, v22.8h
- xtn2 v21.16b, v23.8h
+ uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2
+ uzp1 v21.16b, v22.16b, v23.16b // Ditto
sub v20.16b, v1.16b, v20.16b
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index 66a8505..d528ee0 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -65,10 +65,8 @@
saddw2 \t0\().4s, \t0\().4s, \t2\().8h
saddw \d1\().4s, \d1\().4s, \t3\().4h
saddw2 \t1\().4s, \t1\().4s, \t3\().8h
- xtn \d0\().4h, \d0\().4s
- xtn2 \d0\().8h, \t0\().4s
- xtn \d1\().4h, \d1\().4s
- xtn2 \d1\().8h, \t1\().4s
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
@@ -106,10 +104,8 @@
saddw2 \t0\().4s, \t0\().4s, \t2\().8h
saddw \d1\().4s, \d1\().4s, \t3\().4h
saddw2 \t1\().4s, \t1\().4s, \t3\().8h
- xtn \d0\().4h, \d0\().4s
- xtn2 \d0\().8h, \t0\().4s
- xtn \d1\().4h, \d1\().4s
- xtn2 \d1\().8h, \t1\().4s
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
@@ -320,8 +316,7 @@ function w_mask_\type\()_16bpc_neon, export=1
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
.if \type == 444
- xtn v20.8b, v20.8h // 64 - m
- xtn2 v20.16b, v21.8h
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
sub v20.16b, v1.16b, v20.16b // m
st1 {v20.16b}, [x6], #16
.elseif \type == 422
@@ -386,8 +381,7 @@ function w_mask_\type\()_16bpc_neon, export=1
umin v4.8h, v4.8h, v31.8h // iclip_pixel
umin v5.8h, v5.8h, v31.8h
.if \type == 444
- xtn v20.8b, v20.8h // 64 - m
- xtn2 v20.16b, v21.8h
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
sub v20.16b, v1.16b, v20.16b // m
st1 {v20.16b}, [x6], #16
.elseif \type == 422
@@ -505,10 +499,8 @@ function w_mask_\type\()_16bpc_neon, export=1
umin v6.8h, v6.8h, v31.8h // iclip_pixel
umin v7.8h, v7.8h, v31.8h
.if \type == 444
- xtn v20.8b, v20.8h // 64 - m
- xtn2 v20.16b, v21.8h
- xtn v21.8b, v22.8h
- xtn2 v21.16b, v23.8h
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ uzp1 v21.16b, v22.16b, v23.16b
sub v20.16b, v1.16b, v20.16b // m
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
@@ -1425,11 +1417,9 @@ endfunc
.endif
.endm
.macro xtn_h r0, r1, r2, r3
- xtn \r0\().4h, \r0\().4s
- xtn2 \r0\().8h, \r1\().4s
+ uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
.ifnb \r2
- xtn \r2\().4h, \r2\().4s
- xtn2 \r2\().8h, \r3\().4s
+ uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
.endif
.endm
.macro srshl_s shift, r0, r1, r2, r3
@@ -1659,8 +1649,7 @@ L(\type\()_8tap_h):
srshl v16.8h, v16.8h, v29.8h // -intermediate_bits
umin v16.8h, v16.8h, v31.8h
.else
- xtn v16.4h, v16.4s
- xtn2 v16.8h, v20.4s
+ uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
sub v16.8h, v16.8h, v28.8h // PREP_BIAS
.endif
st1 {v16.d}[0], [\dst], \d_strd
@@ -1720,10 +1709,8 @@ L(\type\()_8tap_h):
umin v18.8h, v18.8h, v31.8h
umin v22.8h, v22.8h, v31.8h
.else
- xtn v18.4h, v18.4s
- xtn2 v18.8h, v19.4s
- xtn v22.4h, v22.4s
- xtn2 v22.8h, v23.4s
+ uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
+ uzp1 v22.8h, v22.8h, v23.8h // Ditto
sub v18.8h, v18.8h, v28.8h // PREP_BIAS
sub v22.8h, v22.8h, v28.8h // PREP_BIAS
.endif
@@ -2411,8 +2398,7 @@ L(\type\()_8tap_filter_4):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
- xtn v16.4h, v24.4s
- xtn2 v16.8h, v25.4s
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
bl L(\type\()_8tap_filter_8)
mov v17.16b, v23.16b
@@ -2511,8 +2497,7 @@ L(\type\()_8tap_filter_4):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
- xtn v16.4h, v24.4s
- xtn2 v16.8h, v25.4s
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
bl L(\type\()_8tap_filter_8)
mov v17.16b, v23.16b
@@ -2623,10 +2608,8 @@ L(\type\()_8tap_filter_8):
srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
- xtn v23.4h, v25.4s
- xtn2 v23.8h, v26.4s
- xtn v24.4h, v27.4s
- xtn2 v24.8h, v28.4s
+ uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2
+ uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
L(\type\()_8tap_hv_tbl):
@@ -3133,8 +3116,7 @@ L(\type\()_bilin_hv):
.ifc \type, put
urshl v4.4s, v4.4s, v30.4s
urshl v5.4s, v5.4s, v30.4s
- xtn v4.4h, v4.4s
- xtn2 v4.8h, v5.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
.else
rshrn v4.4h, v4.4s, #4
rshrn2 v4.8h, v5.4s, #4
@@ -3197,10 +3179,8 @@ L(\type\()_bilin_hv):
urshl v5.4s, v5.4s, v30.4s
urshl v6.4s, v6.4s, v30.4s
urshl v7.4s, v7.4s, v30.4s
- xtn v4.4h, v4.4s
- xtn2 v4.8h, v5.4s
- xtn v5.4h, v6.4s
- xtn2 v5.8h, v7.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+ uzp1 v5.8h, v6.8h, v7.8h // Ditto
.else
rshrn v4.4h, v4.4s, #4
rshrn2 v4.8h, v5.4s, #4
@@ -3367,32 +3347,24 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
.endif
bl warp_filter_horz_neon
- xtn v24.4h, v16.4s
- xtn2 v24.8h, v17.4s
+ uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2
bl warp_filter_horz_neon
- xtn v25.4h, v16.4s
- xtn2 v25.8h, v17.4s
+ uzp1 v25.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
- xtn v26.4h, v16.4s
- xtn2 v26.8h, v17.4s
+ uzp1 v26.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
- xtn v27.4h, v16.4s
- xtn2 v27.8h, v17.4s
+ uzp1 v27.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
- xtn v28.4h, v16.4s
- xtn2 v28.8h, v17.4s
+ uzp1 v28.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
- xtn v29.4h, v16.4s
- xtn2 v29.8h, v17.4s
+ uzp1 v29.8h, v16.8h, v17.8h // Ditto
bl warp_filter_horz_neon
- xtn v30.4h, v16.4s
- xtn2 v30.8h, v17.4s
+ uzp1 v30.8h, v16.8h, v17.8h // Ditto
1:
add w14, w6, #512
bl warp_filter_horz_neon
- xtn v31.4h, v16.4s
- xtn2 v31.8h, v17.4s
+ uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2
load_filter_row d0, w14, w9
load_filter_row d1, w14, w9