arm32: mc16: Fix column alignment in the warp function

author: Martin Storsjö <martin@martin.st> 2020-12-01 16:35:42 +0300
committer: Martin Storsjö <martin@martin.st> 2020-12-16 13:44:46 +0300
commit: f3197c1a126f4911d1b028f05da78d1f58ded836 (patch)
tree: 6a5a147cb135cbd6fd50a7b44c8a2bebd95ddf0a
parent: 9257a961c0ca9775d5928b6bd7c6f61b20dcaa42 (diff)
1 file changed, 18 insertions, 18 deletions
diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S
index 4a10d69..cc4e52e 100644
--- a/src/arm/32/mc16.S
+++ b/src/arm/32/mc16.S
@@ -2604,8 +2604,8 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
 
         ldrd            r8,  r9,  [r4]
         sxth            r7,  r8
-        asr             r8,  r8, #16
-        asr             r4,  r9, #16
+        asr             r8,  r8,  #16
+        asr             r4,  r9,  #16
         sxth            r9,  r9
         mov             r10, #8
         sub             r2,  r2,  r3, lsl #1
@@ -2665,26 +2665,26 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
 
         // This ordering of vmull/vmlal is highly beneficial for
         // Cortex A8/A9/A53 here, but harmful for Cortex A7.
-        vmull.s16       q0,  d16,  d2
-        vmlal.s16       q0,  d18,  d4
-        vmlal.s16       q0,  d20,  d6
-        vmlal.s16       q0,  d22,  d8
-        vmlal.s16       q0,  d24,  d10
-        vmlal.s16       q0,  d26,  d12
-        vmull.s16       q1,  d17,  d3
-        vmlal.s16       q1,  d19,  d5
-        vmlal.s16       q1,  d21,  d7
-        vmlal.s16       q1,  d23,  d9
-        vmlal.s16       q1,  d25,  d11
-        vmlal.s16       q1,  d27,  d13
+        vmull.s16       q0,  d16, d2
+        vmlal.s16       q0,  d18, d4
+        vmlal.s16       q0,  d20, d6
+        vmlal.s16       q0,  d22, d8
+        vmlal.s16       q0,  d24, d10
+        vmlal.s16       q0,  d26, d12
+        vmull.s16       q1,  d17, d3
+        vmlal.s16       q1,  d19, d5
+        vmlal.s16       q1,  d21, d7
+        vmlal.s16       q1,  d23, d9
+        vmlal.s16       q1,  d25, d11
+        vmlal.s16       q1,  d27, d13
 
         vmovl.s8        q2,  d14
         vmovl.s8        q3,  d15
 
-        vmlal.s16       q0,  d28,  d4
-        vmlal.s16       q0,  d30,  d6
-        vmlal.s16       q1,  d29,  d5
-        vmlal.s16       q1,  d31,  d7
+        vmlal.s16       q0,  d28, d4
+        vmlal.s16       q0,  d30, d6
+        vmlal.s16       q1,  d29, d5
+        vmlal.s16       q1,  d31, d7
 
 .ifb \t
         ldr             lr,  [sp, #4]   // -(7 + intermediate_bits)
author	Martin Storsjö <martin@martin.st>	2020-12-01 16:35:42 +0300
committer	Martin Storsjö <martin@martin.st>	2020-12-16 13:44:46 +0300
commit	f3197c1a126f4911d1b028f05da78d1f58ded836 (patch)
tree	6a5a147cb135cbd6fd50a7b44c8a2bebd95ddf0a
parent	9257a961c0ca9775d5928b6bd7c6f61b20dcaa42 (diff)