diff options
author | Lukas Geiger <lukas.geiger94@gmail.com> | 2020-10-16 03:29:35 +0300 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2020-10-16 03:29:57 +0300 |
commit | 034c0e2fc805e8bea53d47351da429d7f57bccf2 (patch) | |
tree | 3638dcca64e9b3ea76e65854b2e00cb4c50d4fb3 | |
parent | e59c55d78f1a041e6a14771254f8e6280804b430 (diff) |
Use movi NEON instruction to zero out registers
Currently `dup` is used to zero out NEON registers in the packing and AArch64 kernel code. According to the [Cortex A72 optimization guide](https://developer.arm.com/documentation/uan0016/a/), which applies to the Raspberry Pi 4, `dup` has an execution latency of 8 cycles and a throughput of 1 when copying from a general-purpose register to a NEON register.
This PR changes the code to use `movi` which has a latency of 3 cycles and a throughput of 2. This is also used in [LLVM for zeroing out registers](https://github.com/llvm/llvm-project/blob/master/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll), but please let me know if I am missing something here.
I briefly benchmarked this code on a Pixel phone but didn't see any measurable difference, which I think is expected: on the A76 architecture used there, `dup` only has a latency of 3 cycles, so this PR won't have a large effect anyway.
Closes https://github.com/google/ruy/pull/203
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/ruy/pull/203 from lgeiger:movi-to-zero-neon-register 106c13e330117fdc9cb4a52c1cef7bcce8836017
PiperOrigin-RevId: 337416443
-rw-r--r-- | ruy/kernel_arm64.cc | 18 | ||||
-rw-r--r-- | ruy/pack_arm.cc | 48 |
2 files changed, 33 insertions, 33 deletions
diff --git a/ruy/kernel_arm64.cc b/ruy/kernel_arm64.cc index 3fce17e..ea6bcb2 100644 --- a/ruy/kernel_arm64.cc +++ b/ruy/kernel_arm64.cc @@ -133,7 +133,7 @@ void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) { // No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING // optimization for this kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -1194,7 +1194,7 @@ void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) { // No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING // optimization for this kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -1862,7 +1862,7 @@ void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) { // \---------------------/ \-----------------------------------------/ // int32 accumulators 4x4 block asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -3028,7 +3028,7 @@ void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) { // unused, and v8 -- v15 are used for loading parameters used for the // post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -4459,7 +4459,7 @@ void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) { // unused, and v8 -- v15 are used for loading parameters used for the // post-accumulation part of the kernel. 
asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -5151,7 +5151,7 @@ void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) { // v4 -- v7 are unused, and v8 -- v15 are used for loading parameters used for // the post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -6493,7 +6493,7 @@ void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) { // unused, and v8 -- v15 are used for floading parameters used for the // post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -7134,7 +7134,7 @@ void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) { // v4 -- v7 are unused, and v8 -- v15 are used for floading parameters used // for the post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off @@ -7627,7 +7627,7 @@ void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) { // v4 -- v7 are unused, and v8 -- v15 are used for floading parameters used // for the post-accumulation part of the kernel. asm volatile( -#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n" +#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n" // clang-format off diff --git a/ruy/pack_arm.cc b/ruy/pack_arm.cc index 1673c90..c337986 100644 --- a/ruy/pack_arm.cc +++ b/ruy/pack_arm.cc @@ -46,10 +46,10 @@ void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1, // w1 will be the number of rows already loaded. 
"mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // Let w2 be `rows` rounded down to multiple of 16. "ands w2, %w[rows], #-16\n" // If there are no full blocks of 16 rows to process, jump to the @@ -641,10 +641,10 @@ void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1, // w1 will be the number of rows already loaded. "mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // Let w2 be `rows` rounded down to multiple of 16. "ands w2, %w[rows], #-16\n" // If there are no full blocks of 16 rows to process, jump to the @@ -856,10 +856,10 @@ void Pack8bitColMajorForNeonDotprodA55ish( // w1 will be the number of rows already loaded. "mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // Let w2 be `rows` rounded down to multiple of 16. "ands w2, %w[rows], #-16\n" @@ -1102,10 +1102,10 @@ void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1, // w1 will be the number of rows already loaded. "mov w1, #0\n" // v28--v32 will be used to accumulate the sums - "dup v28.4s, wzr\n" - "dup v29.4s, wzr\n" - "dup v30.4s, wzr\n" - "dup v31.4s, wzr\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" // 4x partially unrolled code processing blocks of 64 rows. // Read the original loop below first, it has more comments. 
@@ -1818,10 +1818,10 @@ void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1, "beq 4f\n" // Zero out a 4x4 block in registers, which we'll partially overwrite // with any remaining rows. - "dup v0.16b, wzr\n" - "dup v1.16b, wzr\n" - "dup v2.16b, wzr\n" - "dup v3.16b, wzr\n" + "movi v0.16b, #0\n" + "movi v1.16b, #0\n" + "movi v2.16b, #0\n" + "movi v3.16b, #0\n" #define RUY_LOAD_ONE_ROW(R) \ "cmp w2, #" #R "\n" \ "beq 5f\n" \ @@ -2161,10 +2161,10 @@ void PackFloatColMajorForNeonA55ish(const float* src_ptr0, "ands w2, %w[rows], #3\n" "beq 4f\n" - "dup v0.16b, wzr\n" - "dup v1.16b, wzr\n" - "dup v2.16b, wzr\n" - "dup v3.16b, wzr\n" + "movi v0.16b, #0\n" + "movi v1.16b, #0\n" + "movi v2.16b, #0\n" + "movi v3.16b, #0\n" #define RUY_LOAD_ONE_ROW(R) \ "cmp w2, #" #R "\n" \ "beq 5f\n" \ |