author    | Jonathan Wright <jonathan.wright@arm.com> | 2021-11-30 17:28:20 +0300
committer | Martin Storsjö <martin@martin.st>         | 2021-12-03 19:33:18 +0300
commit    | 19ff99ea8417b3c9a62ff4c1bd14fffceb26ad73 (patch)
tree      | e127531d8b379e6f7a0d42a7a6faf652e3de4047 /src/arm
parent    | 4e41273896efb37dbd6b2ec12acf41894eb3e119 (diff)
AArch64 Neon: Replace XTN, XTN2 pairs with single UZP1
It is often necessary to narrow the elements in a pair of Neon
vectors to half the current width before combining the results. This
is usually achieved with a pair of XTN/XTN2 instructions. However, it
is possible to achieve the same outcome with a single 'unzip' (UZP1)
instruction.
This patch changes all sequential AArch64 Neon XTN, XTN2 instruction
pairs to use a single UZP1 instruction.
Change-Id: I2a9fad3082d2cf363b1edce9ef0b8d547ec6c41a
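
As a quick illustration of the equivalence (a standalone sketch, not taken
from the patch; the register numbers are arbitrary): XTN keeps the low
16 bits of each 32-bit source lane, and when the same registers are viewed
with a .8h arrangement those low halves occupy the even-numbered lanes,
which are exactly the lanes UZP1 selects.

        // Two-instruction form: narrow v0.4s and v1.4s into v2.8h.
        xtn             v2.4h,  v0.4s   // low 16 bits of each v0 lane -> v2.h[0-3]
        xtn2            v2.8h,  v1.4s   // low 16 bits of each v1 lane -> v2.h[4-7]

        // Single-instruction form: reinterpret the sources as .8h; the
        // even-numbered .8h lanes are the low halves of the .4s lanes,
        // and UZP1 gathers the even lanes of both sources into one vector.
        uzp1            v2.8h,  v0.8h,  v1.8h

Besides saving an instruction, the UZP1 form has no serial dependency: XTN2
must merge its result into the register already written by XTN, whereas UZP1
reads both sources and writes the whole destination at once.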
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/mc.S   | 12
-rw-r--r-- | src/arm/64/mc16.S | 84
2 files changed, 32 insertions, 64 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index f8d143b..a3105fe 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -252,8 +252,7 @@ function w_mask_\type\()_8bpc_neon, export=1
         sqrshrun        v22.8b,  v20.8h,  #4
         sqrshrun        v23.8b,  v21.8h,  #4
 .if \type == 444
-        xtn             v18.8b,  v18.8h
-        xtn2            v18.16b, v19.8h
+        uzp1            v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
         sub             v18.16b, v1.16b,  v18.16b
         st1             {v18.16b}, [x6],  #16
 .elseif \type == 422
@@ -298,8 +297,7 @@ function w_mask_\type\()_8bpc_neon, export=1
         sqrshrun        v22.8b,  v20.8h,  #4
         sqrshrun        v23.8b,  v21.8h,  #4
 .if \type == 444
-        xtn             v18.8b,  v18.8h
-        xtn2            v18.16b, v19.8h
+        uzp1            v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
         sub             v18.16b, v1.16b,  v18.16b
         st1             {v18.16b}, [x6],  #16
 .elseif \type == 422
@@ -373,10 +371,8 @@ function w_mask_\type\()_8bpc_neon, export=1
         sqrshrun        v26.8b,  v26.8h,  #4
         sqrshrun        v27.8b,  v27.8h,  #4
 .if \type == 444
-        xtn             v20.8b,  v20.8h
-        xtn2            v20.16b, v21.8h
-        xtn             v21.8b,  v22.8h
-        xtn2            v21.16b, v23.8h
+        uzp1            v20.16b, v20.16b, v21.16b // Same as xtn, xtn2
+        uzp1            v21.16b, v22.16b, v23.16b // Ditto
         sub             v20.16b, v1.16b,  v20.16b
         sub             v21.16b, v1.16b,  v21.16b
         st1             {v20.16b}, [x6],  #16
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index 66a8505..d528ee0 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -65,10 +65,8 @@
         saddw2          \t0\().4s, \t0\().4s, \t2\().8h
         saddw           \d1\().4s, \d1\().4s, \t3\().4h
         saddw2          \t1\().4s, \t1\().4s, \t3\().8h
-        xtn             \d0\().4h, \d0\().4s
-        xtn2            \d0\().8h, \t0\().4s
-        xtn             \d1\().4h, \d1\().4s
-        xtn2            \d1\().8h, \t1\().4s
+        uzp1            \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+        uzp1            \d1\().8h, \d1\().8h, \t1\().8h // Ditto
         srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
         srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
         add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
@@ -106,10 +104,8 @@
         saddw2          \t0\().4s, \t0\().4s, \t2\().8h
         saddw           \d1\().4s, \d1\().4s, \t3\().4h
         saddw2          \t1\().4s, \t1\().4s, \t3\().8h
-        xtn             \d0\().4h, \d0\().4s
-        xtn2            \d0\().8h, \t0\().4s
-        xtn             \d1\().4h, \d1\().4s
-        xtn2            \d1\().8h, \t1\().4s
+        uzp1            \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+        uzp1            \d1\().8h, \d1\().8h, \t1\().8h // Ditto
         srshl           \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
         srshl           \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
         add             \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
@@ -320,8 +316,7 @@ function w_mask_\type\()_16bpc_neon, export=1
         umin            v4.8h,   v4.8h,   v31.8h // iclip_pixel
         umin            v5.8h,   v5.8h,   v31.8h
 .if \type == 444
-        xtn             v20.8b,  v20.8h  // 64 - m
-        xtn2            v20.16b, v21.8h
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
         sub             v20.16b, v1.16b,  v20.16b // m
         st1             {v20.16b}, [x6],  #16
 .elseif \type == 422
@@ -386,8 +381,7 @@ function w_mask_\type\()_16bpc_neon, export=1
         umin            v4.8h,   v4.8h,   v31.8h // iclip_pixel
         umin            v5.8h,   v5.8h,   v31.8h
 .if \type == 444
-        xtn             v20.8b,  v20.8h  // 64 - m
-        xtn2            v20.16b, v21.8h
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
         sub             v20.16b, v1.16b,  v20.16b // m
         st1             {v20.16b}, [x6],  #16
 .elseif \type == 422
@@ -505,10 +499,8 @@ function w_mask_\type\()_16bpc_neon, export=1
         umin            v6.8h,   v6.8h,   v31.8h // iclip_pixel
         umin            v7.8h,   v7.8h,   v31.8h
 .if \type == 444
-        xtn             v20.8b,  v20.8h  // 64 - m
-        xtn2            v20.16b, v21.8h
-        xtn             v21.8b,  v22.8h
-        xtn2            v21.16b, v23.8h
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
+        uzp1            v21.16b, v22.16b, v23.16b
         sub             v20.16b, v1.16b,  v20.16b // m
         sub             v21.16b, v1.16b,  v21.16b
         st1             {v20.16b}, [x6],  #16
@@ -1425,11 +1417,9 @@ endfunc
 .endif
 .endm
 .macro xtn_h r0, r1, r2, r3
-        xtn             \r0\().4h, \r0\().4s
-        xtn2            \r0\().8h, \r1\().4s
+        uzp1            \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
 .ifnb \r2
-        xtn             \r2\().4h, \r2\().4s
-        xtn2            \r2\().8h, \r3\().4s
+        uzp1            \r2\().8h, \r2\().8h, \r3\().8h // Ditto
 .endif
 .endm
 .macro srshl_s shift, r0, r1, r2, r3
@@ -1659,8 +1649,7 @@ L(\type\()_8tap_h):
         srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
         umin            v16.8h,  v16.8h,  v31.8h
 .else
-        xtn             v16.4h,  v16.4s
-        xtn2            v16.8h,  v20.4s
+        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
         sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
 .endif
         st1             {v16.d}[0], [\dst], \d_strd
@@ -1720,10 +1709,8 @@ L(\type\()_8tap_h):
         umin            v18.8h,  v18.8h,  v31.8h
         umin            v22.8h,  v22.8h,  v31.8h
 .else
-        xtn             v18.4h,  v18.4s
-        xtn2            v18.8h,  v19.4s
-        xtn             v22.4h,  v22.4s
-        xtn2            v22.8h,  v23.4s
+        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
+        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
         sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
         sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
 .endif
@@ -2411,8 +2398,7 @@ L(\type\()_8tap_filter_4):
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53),
         // and conserves register space (no need to clobber v8-v15).
-        xtn             v16.4h,  v24.4s
-        xtn2            v16.8h,  v25.4s
+        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2

         bl              L(\type\()_8tap_filter_8)
         mov             v17.16b, v23.16b
@@ -2511,8 +2497,7 @@ L(\type\()_8tap_filter_4):
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53),
         // and conserves register space (no need to clobber v8-v15).
-        xtn             v16.4h,  v24.4s
-        xtn2            v16.8h,  v25.4s
+        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2

         bl              L(\type\()_8tap_filter_8)
         mov             v17.16b, v23.16b
@@ -2623,10 +2608,8 @@ L(\type\()_8tap_filter_8):
         srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
-        xtn             v23.4h,  v25.4s
-        xtn2            v23.8h,  v26.4s
-        xtn             v24.4h,  v27.4s
-        xtn2            v24.8h,  v28.4s
+        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
+        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
         ret

 L(\type\()_8tap_hv_tbl):
@@ -3133,8 +3116,7 @@ L(\type\()_bilin_hv):
 .ifc \type, put
         urshl           v4.4s,   v4.4s,   v30.4s
         urshl           v5.4s,   v5.4s,   v30.4s
-        xtn             v4.4h,   v4.4s
-        xtn2            v4.8h,   v5.4s
+        uzp1            v4.8h,   v4.8h,   v5.8h // Same as xtn, xtn2
 .else
         rshrn           v4.4h,   v4.4s,   #4
         rshrn2          v4.8h,   v5.4s,   #4
@@ -3197,10 +3179,8 @@ L(\type\()_bilin_hv):
         urshl           v5.4s,   v5.4s,   v30.4s
         urshl           v6.4s,   v6.4s,   v30.4s
         urshl           v7.4s,   v7.4s,   v30.4s
-        xtn             v4.4h,   v4.4s
-        xtn2            v4.8h,   v5.4s
-        xtn             v5.4h,   v6.4s
-        xtn2            v5.8h,   v7.4s
+        uzp1            v4.8h,   v4.8h,   v5.8h // Same as xtn, xtn2
+        uzp1            v5.8h,   v6.8h,   v7.8h // Ditto
 .else
         rshrn           v4.4h,   v4.4s,   #4
         rshrn2          v4.8h,   v5.4s,   #4
@@ -3367,32 +3347,24 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
 .endif

         bl              warp_filter_horz_neon
-        xtn             v24.4h,  v16.4s
-        xtn2            v24.8h,  v17.4s
+        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
         bl              warp_filter_horz_neon
-        xtn             v25.4h,  v16.4s
-        xtn2            v25.8h,  v17.4s
+        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v26.4h,  v16.4s
-        xtn2            v26.8h,  v17.4s
+        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v27.4h,  v16.4s
-        xtn2            v27.8h,  v17.4s
+        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v28.4h,  v16.4s
-        xtn2            v28.8h,  v17.4s
+        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v29.4h,  v16.4s
-        xtn2            v29.8h,  v17.4s
+        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
         bl              warp_filter_horz_neon
-        xtn             v30.4h,  v16.4s
-        xtn2            v30.8h,  v17.4s
+        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto

 1:
         add             w14, w6, #512
         bl              warp_filter_horz_neon
-        xtn             v31.4h,  v16.4s
-        xtn2            v31.8h,  v17.4s
+        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2

         load_filter_row d0, w14, w9
         load_filter_row d1, w14, w9