diff options
author | Lukas Geiger <lukas.geiger94@gmail.com> | 2020-10-16 03:29:35 +0300 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2020-10-16 03:29:57 +0300 |
commit | 034c0e2fc805e8bea53d47351da429d7f57bccf2 (patch) | |
tree | 3638dcca64e9b3ea76e65854b2e00cb4c50d4fb3 | |
parent | e59c55d78f1a041e6a14771254f8e6280804b430 (diff) |
Use movi NEON instruction to zero out registers
Currently `dup` is used to zero out NEON registers in the packing and AArch64 kernel code. According to the [Cortex A72 optimization guide](https://developer.arm.com/documentation/uan0016/a/), which applies to the Raspberry Pi 4, `dup` has an execution latency of 8 cycles and a throughput of 1 when copying from a general-purpose register to a NEON register.
This PR changes the code to use `movi` which has a latency of 3 cycles and a throughput of 2. This is also used in [LLVM for zeroing out registers](https://github.com/llvm/llvm-project/blob/master/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll), but please let me know if I am missing something here.
I briefly benchmarked this code on a Pixel phone but didn't see any measurable difference, which I think is expected: on the A76 architecture used there, `dup` only has a latency of 3 cycles, so this PR won't have a large effect anyway.
Closes https://github.com/google/ruy/pull/203
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/ruy/pull/203 from lgeiger:movi-to-zero-neon-register 106c13e330117fdc9cb4a52c1cef7bcce8836017
PiperOrigin-RevId: 337416443
-rw-r--r-- | ruy/kernel_arm64.cc | 18 | ||||
-rw-r--r-- | ruy/pack_arm.cc | 48 |
2 files changed, 33 insertions, 33 deletions
diff --git a/ruy/kernel_arm64.cc b/ruy/kernel_arm64.cc index 3fce17e..ea6bcb2 100644 --- a/ruy/kernel_arm64.cc +++ b/ruy/kernel_arm64.cc @@ -133,7 +133,7 @@ void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) { // No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING // optimization for this kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -1194,7 +1194,7 @@ void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) { // No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING // optimization for this kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -1862,7 +1862,7 @@ void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) { // \---------------------/ \-----------------------------------------/ // int32 accumulators 4x4 block asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -3028,7 +3028,7 @@ void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) { // unused, and v8 -- v15 are used for loading parameters used for the // post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -4459,7 +4459,7 @@ void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) { // unused, and v8 -- v15 are used for loading parameters used for the // post-accumulation part of the kernel. 
asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -5151,7 +5151,7 @@ void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) { // v4 -- v7 are unused, and v8 -- v15 are used for loading parameters used for // the post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -6493,7 +6493,7 @@ void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) { // unused, and v8 -- v15 are used for floading parameters used for the // post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -7134,7 +7134,7 @@ void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) { // v4 -- v7 are unused, and v8 -- v15 are used for floading parameters used // for the post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -7627,7 +7627,7 @@ void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) { // v4 -- v7 are unused, and v8 -- v15 are used for floading parameters used // for the post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off diff --git a/ruy/pack_arm.cc b/ruy/pack_arm.cc index 1673c90..c337986 100644 --- a/ruy/pack_arm.cc +++ b/ruy/pack_arm.cc @@ -46,10 +46,10 @@ void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1, // w1 will be the number of rows already loaded. 
"mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // Let w2 be `rows` rounded down to multiple of 16. "ands w2, %w[rows], #-16\n" // If there are no full blocks of 16 rows to process, jump to the @@ -641,10 +641,10 @@ void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1, // w1 will be the number of rows already loaded. "mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // Let w2 be `rows` rounded down to multiple of 16. "ands w2, %w[rows], #-16\n" // If there are no full blocks of 16 rows to process, jump to the @@ -856,10 +856,10 @@ void Pack8bitColMajorForNeonDotprodA55ish( // w1 will be the number of rows already loaded. "mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // Let w2 be `rows` rounded down to multiple of 16. "ands w2, %w[rows], #-16\n" @@ -1102,10 +1102,10 @@ void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1, // w1 will be the number of rows already loaded. "mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // 4x partially unrolled code processing blocks of 64 rows. // Read the original loop below first, it has more comments. 
@@ -1818,10 +1818,10 @@ void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1, "beq 4f\n" // Zero out a 4x4 block in registers, which we'll partially overwrite // with any remaining rows. - "dup v0.16b, wzr\n" - "dup v1.16b, wzr\n" - "dup v2.16b, wzr\n" - "dup v3.16b, wzr\n" + "movi v0.16b, #0\n" + "movi v1.16b, #0\n" + "movi v2.16b, #0\n" + "movi v3.16b, #0\n" #define RUY_LOAD_ONE_ROW(R) \ "cmp w2, #" #R "\n" \ "beq 5f\n" \ @@ -2161,10 +2161,10 @@ void PackFloatColMajorForNeonA55ish(const float* src_ptr0, "ands w2, %w[rows], #3\n" "beq 4f\n" - "dup v0.16b, wzr\n" - "dup v1.16b, wzr\n" - "dup v2.16b, wzr\n" - "dup v3.16b, wzr\n" + "movi v0.16b, #0\n" + "movi v1.16b, #0\n" + "movi v2.16b, #0\n" + "movi v3.16b, #0\n" #define RUY_LOAD_ONE_ROW(R) \ "cmp w2, #" #R "\n" \ "beq 5f\n" \ |