github.com/google/ruy.git
author     Lukas Geiger <lukas.geiger94@gmail.com>        2020-10-16 03:29:35 +0300
committer  Copybara-Service <copybara-worker@google.com>  2020-10-16 03:29:57 +0300
commit     034c0e2fc805e8bea53d47351da429d7f57bccf2 (patch)
tree       3638dcca64e9b3ea76e65854b2e00cb4c50d4fb3
parent     e59c55d78f1a041e6a14771254f8e6280804b430 (diff)
Use movi NEON instruction to zero out registers
Currently `dup` is used to zero out NEON registers in the packing and AArch64 kernel code. According to the [Cortex-A72 optimization guide](https://developer.arm.com/documentation/uan0016/a/) (the Cortex-A72 is the core used in the Raspberry Pi 4), `dup` has an execution latency of 8 cycles and a throughput of 1 when copying from a general-purpose register to a NEON register. This PR changes the code to use `movi`, which has a latency of 3 cycles and a throughput of 2. `movi` is also what [LLVM uses for zeroing out registers](https://github.com/llvm/llvm-project/blob/master/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll), but please let me know if I am missing something here.

I briefly benchmarked this code on a Pixel phone but didn't see any measurable difference, which I think is expected: on that phone's Cortex-A76, `dup` only has a latency of 3 cycles, so this PR won't have a large effect there anyway.

Closes https://github.com/google/ruy/pull/203

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/ruy/pull/203 from lgeiger:movi-to-zero-neon-register 106c13e330117fdc9cb4a52c1cef7bcce8836017
PiperOrigin-RevId: 337416443
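
As a quick illustration of the change, here is a minimal, hypothetical sketch (a standalone snippet, not code from ruy; it assumes an AArch64 toolchain where Clang/GCC accept NEON register names in inline-asm clobber lists) contrasting the old and new ways of zeroing a 128-bit vector register:

```cpp
// Hypothetical standalone snippet (not part of ruy) contrasting the two ways
// of zeroing a 128-bit NEON register on AArch64.
#if defined(__aarch64__)

inline void ZeroV16WithDup() {
  // Old form: broadcast the zero GPR wzr into all four 32-bit lanes of v16.
  // Per the Cortex-A72 guide cited above, this GPR-to-NEON dup has a latency
  // of 8 cycles and a throughput of 1.
  asm volatile("dup v16.4s, wzr\n" ::: "v16");
}

inline void ZeroV16WithMovi() {
  // New form: materialize the immediate #0 directly in v16, without reading
  // a general-purpose register (3-cycle latency, throughput of 2 on A72).
  asm volatile("movi v16.4s, #0\n" ::: "v16");
}

#endif  // defined(__aarch64__)
```

Since the instructions are written as `asm volatile` templates, compiling either function for an AArch64 target should show the corresponding instruction verbatim in the generated assembly.
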
-rw-r--r--  ruy/kernel_arm64.cc  18
-rw-r--r--  ruy/pack_arm.cc      48
2 files changed, 33 insertions, 33 deletions
diff --git a/ruy/kernel_arm64.cc b/ruy/kernel_arm64.cc
index 3fce17e..ea6bcb2 100644
--- a/ruy/kernel_arm64.cc
+++ b/ruy/kernel_arm64.cc
@@ -133,7 +133,7 @@ void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) {
// No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING
// optimization for this kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -1194,7 +1194,7 @@ void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) {
// No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING
// optimization for this kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -1862,7 +1862,7 @@ void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) {
// \---------------------/ \-----------------------------------------/
// int32 accumulators 4x4 block
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -3028,7 +3028,7 @@ void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) {
// unused, and v8 -- v15 are used for loading parameters used for the
// post-accumulation part of the kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -4459,7 +4459,7 @@ void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) {
// unused, and v8 -- v15 are used for loading parameters used for the
// post-accumulation part of the kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -5151,7 +5151,7 @@ void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) {
// v4 -- v7 are unused, and v8 -- v15 are used for loading parameters used for
// the post-accumulation part of the kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -6493,7 +6493,7 @@ void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) {
// unused, and v8 -- v15 are used for floading parameters used for the
// post-accumulation part of the kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -7134,7 +7134,7 @@ void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) {
// v4 -- v7 are unused, and v8 -- v15 are used for floading parameters used
// for the post-accumulation part of the kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
@@ -7627,7 +7627,7 @@ void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) {
// v4 -- v7 are unused, and v8 -- v15 are used for floading parameters used
// for the post-accumulation part of the kernel.
asm volatile(
-#define RUY_MAKE_ZERO(reg) "dup " #reg ".4s, wzr\n"
+#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
// clang-format off
diff --git a/ruy/pack_arm.cc b/ruy/pack_arm.cc
index 1673c90..c337986 100644
--- a/ruy/pack_arm.cc
+++ b/ruy/pack_arm.cc
@@ -46,10 +46,10 @@ void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1,
// w1 will be the number of rows already loaded.
"mov w1, #0\n"
// v28--v32 will be used to accumulate the sums
- "dup v28.4s, wzr\n"
- "dup v29.4s, wzr\n"
- "dup v30.4s, wzr\n"
- "dup v31.4s, wzr\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
// Let w2 be `rows` rounded down to multiple of 16.
"ands w2, %w[rows], #-16\n"
// If there are no full blocks of 16 rows to process, jump to the
@@ -641,10 +641,10 @@ void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1,
// w1 will be the number of rows already loaded.
"mov w1, #0\n"
// v28--v32 will be used to accumulate the sums
- "dup v28.4s, wzr\n"
- "dup v29.4s, wzr\n"
- "dup v30.4s, wzr\n"
- "dup v31.4s, wzr\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
// Let w2 be `rows` rounded down to multiple of 16.
"ands w2, %w[rows], #-16\n"
// If there are no full blocks of 16 rows to process, jump to the
@@ -856,10 +856,10 @@ void Pack8bitColMajorForNeonDotprodA55ish(
// w1 will be the number of rows already loaded.
"mov w1, #0\n"
// v28--v32 will be used to accumulate the sums
- "dup v28.4s, wzr\n"
- "dup v29.4s, wzr\n"
- "dup v30.4s, wzr\n"
- "dup v31.4s, wzr\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
// Let w2 be `rows` rounded down to multiple of 16.
"ands w2, %w[rows], #-16\n"
@@ -1102,10 +1102,10 @@ void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1,
// w1 will be the number of rows already loaded.
"mov w1, #0\n"
// v28--v32 will be used to accumulate the sums
- "dup v28.4s, wzr\n"
- "dup v29.4s, wzr\n"
- "dup v30.4s, wzr\n"
- "dup v31.4s, wzr\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
// 4x partially unrolled code processing blocks of 64 rows.
// Read the original loop below first, it has more comments.
@@ -1818,10 +1818,10 @@ void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1,
"beq 4f\n"
// Zero out a 4x4 block in registers, which we'll partially overwrite
// with any remaining rows.
- "dup v0.16b, wzr\n"
- "dup v1.16b, wzr\n"
- "dup v2.16b, wzr\n"
- "dup v3.16b, wzr\n"
+ "movi v0.16b, #0\n"
+ "movi v1.16b, #0\n"
+ "movi v2.16b, #0\n"
+ "movi v3.16b, #0\n"
#define RUY_LOAD_ONE_ROW(R) \
"cmp w2, #" #R "\n" \
"beq 5f\n" \
@@ -2161,10 +2161,10 @@ void PackFloatColMajorForNeonA55ish(const float* src_ptr0,
"ands w2, %w[rows], #3\n"
"beq 4f\n"
- "dup v0.16b, wzr\n"
- "dup v1.16b, wzr\n"
- "dup v2.16b, wzr\n"
- "dup v3.16b, wzr\n"
+ "movi v0.16b, #0\n"
+ "movi v1.16b, #0\n"
+ "movi v2.16b, #0\n"
+ "movi v3.16b, #0\n"
#define RUY_LOAD_ONE_ROW(R) \
"cmp w2, #" #R "\n" \
"beq 5f\n" \