arm64: mc: Use more intuitive lane specifications for loads/stores

For loads and stores that transfer a full or half register (as opposed to lanewise loads/stores), the lane specification itself doesn't matter, only its total size. This doesn't change the generated code, but makes it more readable.
author: Martin Storsjö <martin@martin.st> 2020-03-03 15:49:33 +0300
committer: Martin Storsjö <martin@martin.st> 2020-03-04 12:43:25 +0300
commit: 83c627165ae5991ac664f5d4d2c6aa7a772ee9a8 (patch)
tree: b52b9096ca3f0f5eb4c262d6630a67b2a237e4dc /src/arm
parent: f4dac1a30b3893d0ff555d8d87a0be7c4b69866a (diff)
1 files changed, 16 insertions, 16 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 4392a2c..7166f32 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -434,7 +434,7 @@ function blend_8bpc_neon, export=1
         lsl             w1,  w1,  #1
         br              x6
 4:
-        ld1             {v2.d}[0],   [x5],  #8
+        ld1             {v2.8b},     [x5],  #8
         ld1             {v1.d}[0],   [x2],  #8
         ld1             {v0.s}[0],   [x0]
         subs            w4,  w4,  #2
@@ -448,8 +448,8 @@ function blend_8bpc_neon, export=1
         b.gt            4b
         ret
 8:
-        ld1             {v2.2d},   [x5],  #16
-        ld1             {v1.2d},   [x2],  #16
+        ld1             {v2.16b},  [x5],  #16
+        ld1             {v1.16b},  [x2],  #16
         ld1             {v0.d}[0],   [x0]
         ld1             {v0.d}[1],   [x8]
         sub             v3.16b,  v4.16b,  v2.16b
@@ -465,13 +465,13 @@ function blend_8bpc_neon, export=1
         b.gt            8b
         ret
 16:
-        ld1             {v1.2d,   v2.2d},   [x5],  #32
-        ld1             {v5.2d,   v6.2d},   [x2],  #32
-        ld1             {v0.2d},   [x0]
+        ld1             {v1.16b,  v2.16b},  [x5],  #32
+        ld1             {v5.16b,  v6.16b},  [x2],  #32
+        ld1             {v0.16b},  [x0]
         subs            w4,  w4,  #2
         sub             v7.16b,  v4.16b,  v1.16b
         sub             v20.16b, v4.16b,  v2.16b
-        ld1             {v3.2d},   [x8]
+        ld1             {v3.16b},  [x8]
         umull           v16.8h,  v5.8b,   v1.8b
         umlal           v16.8h,  v0.8b,   v7.8b
         umull2          v17.8h,  v5.16b,  v1.16b
@@ -484,16 +484,16 @@ function blend_8bpc_neon, export=1
         rshrn2          v18.16b, v17.8h,  #6
         rshrn           v19.8b,  v21.8h,  #6
         rshrn2          v19.16b, v22.8h,  #6
-        st1             {v18.2d},  [x0],  x1
-        st1             {v19.2d},  [x8],  x1
+        st1             {v18.16b}, [x0],  x1
+        st1             {v19.16b}, [x8],  x1
         b.gt            16b
         ret
 32:
-        ld1             {v0.2d,   v1.2d,   v2.2d,   v3.2d},   [x5],  #64
-        ld1             {v16.2d,  v17.2d,  v18.2d,  v19.2d},  [x2],  #64
-        ld1             {v20.2d,  v21.2d},  [x0]
+        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
+        ld1             {v20.16b, v21.16b}, [x0]
         subs            w4,  w4,  #2
-        ld1             {v22.2d,  v23.2d},  [x8]
+        ld1             {v22.16b, v23.16b}, [x8]
         sub             v5.16b,  v4.16b,  v0.16b
         sub             v6.16b,  v4.16b,  v1.16b
         sub             v30.16b, v4.16b,  v2.16b
@@ -522,8 +522,8 @@ function blend_8bpc_neon, export=1
         rshrn2          v27.16b, v1.8h,   #6
         rshrn           v28.8b,  v29.8h,  #6
         rshrn2          v28.16b, v21.8h,  #6
-        st1             {v24.2d, v25.2d}, [x0],  x1
-        st1             {v27.2d, v28.2d}, [x8],  x1
+        st1             {v24.16b, v25.16b}, [x0],  x1
+        st1             {v27.16b, v28.16b}, [x8],  x1
         b.gt            32b
         ret
 L(blend_tbl):
@@ -563,7 +563,7 @@ function blend_h_8bpc_neon, export=1
         ret
 4:
         ld2r            {v0.8b,   v1.8b},   [x5],  #2
-        ld1             {v2.2s},   [x2],  #8
+        ld1             {v2.8b},   [x2],  #8
         subs            w4,  w4,  #2
         ext             v0.8b,   v0.8b,   v1.8b,   #4
         ld1             {v3.s}[0],   [x0]
author	Martin Storsjö <martin@martin.st>	2020-03-03 15:49:33 +0300
committer	Martin Storsjö <martin@martin.st>	2020-03-04 12:43:25 +0300
commit	83c627165ae5991ac664f5d4d2c6aa7a772ee9a8 (patch)
tree	b52b9096ca3f0f5eb4c262d6630a67b2a237e4dc /src/arm
parent	f4dac1a30b3893d0ff555d8d87a0be7c4b69866a (diff)