github.com/google/ruy.git
author     Benoit Jacob <benoitjacob@google.com>          2020-08-19 19:54:11 +0300
committer  Copybara-Service <copybara-worker@google.com>  2020-08-19 19:55:51 +0300
commit     59c2de870307b80a59795c3768c954d80159c838 (patch)
tree       900e65b40336543359c56d5e7c1f99bd3130076b
parent     4f6a37b9f115f86b378f288028be4b568204bad5 (diff)

Rename kOutOfOrder -> kGeneric, kInOrder -> kA55ish,
KernelXxxOutOfOrder -> KernelXxx, KernelXxxInOrder -> KernelXxxA55ish

PiperOrigin-RevId: 327452323
-rw-r--r--  ruy/context_test.cc    4
-rw-r--r--  ruy/cpuinfo.cc         4
-rw-r--r--  ruy/cpuinfo.h          2
-rw-r--r--  ruy/kernel_arm.h      58
-rw-r--r--  ruy/kernel_arm32.cc   15
-rw-r--r--  ruy/kernel_arm64.cc   43
-rw-r--r--  ruy/pack_arm.cc       88
-rw-r--r--  ruy/pack_arm.h       108
-rw-r--r--  ruy/test.h             6
-rw-r--r--  ruy/tune.cc            3
-rw-r--r--  ruy/tune.h            26
-rw-r--r--  ruy/tune_test.cc       2
12 files changed, 180 insertions(+), 179 deletions(-)
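The user-visible part of this rename is the pair of Tuning enum values set through ruy::Context, as exercised in context_test.cc below. A minimal sketch of the new spelling, assuming the "ruy/context.h" and "ruy/tune.h" header paths and the public ruy namespace:

#include "ruy/context.h"  // ruy::Context (assumed header path)
#include "ruy/tune.h"     // ruy::Tuning (assumed header path)

// Pin the tuning explicitly instead of relying on Tuning::kAuto resolution.
void ConfigureContext(ruy::Context* context) {
  // Generic code path (formerly Tuning::kOutOfOrder).
  context->set_explicit_tuning(ruy::Tuning::kGeneric);
  // Or, on Cortex-A53/A55-class in-order cores (formerly Tuning::kInOrder):
  // context->set_explicit_tuning(ruy::Tuning::kA55ish);
}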
diff --git a/ruy/context_test.cc b/ruy/context_test.cc
index 8037b3c..4e69e65 100644
--- a/ruy/context_test.cc
+++ b/ruy/context_test.cc
@@ -30,9 +30,9 @@ TEST(ContextTest, ContextClassSanity) {
EXPECT_EQ(&context.thread_pool(), context.mutable_thread_pool());
EXPECT_NE(context.mutable_thread_pool(), nullptr);
EXPECT_EQ(context.max_num_threads(), 1);
- context.set_explicit_tuning(Tuning::kOutOfOrder);
+ context.set_explicit_tuning(Tuning::kGeneric);
context.set_max_num_threads(2);
- EXPECT_EQ(context.explicit_tuning(), Tuning::kOutOfOrder);
+ EXPECT_EQ(context.explicit_tuning(), Tuning::kGeneric);
EXPECT_EQ(context.max_num_threads(), 2);
}
diff --git a/ruy/cpuinfo.cc b/ruy/cpuinfo.cc
index b0c53ad..a4e71ef 100644
--- a/ruy/cpuinfo.cc
+++ b/ruy/cpuinfo.cc
@@ -118,7 +118,7 @@ bool CpuInfo::AvxVnni() {
return EnsureInitialized() && cpuinfo_has_x86_avx512vnni();
}
-bool CpuInfo::CurrentCpuIsInOrder() {
+bool CpuInfo::CurrentCpuIsA55ish() {
if (!EnsureInitialized()) {
return false;
}
@@ -150,7 +150,7 @@ bool CpuInfo::Avx() { return false; }
bool CpuInfo::Avx2Fma() { return false; }
bool CpuInfo::Avx512() { return false; }
bool CpuInfo::AvxVnni() { return false; }
-bool CpuInfo::CurrentCpuIsInOrder() { return false; }
+bool CpuInfo::CurrentCpuIsA55ish() { return false; }
#endif
diff --git a/ruy/cpuinfo.h b/ruy/cpuinfo.h
index f8948e2..e45fa51 100644
--- a/ruy/cpuinfo.h
+++ b/ruy/cpuinfo.h
@@ -38,7 +38,7 @@ class CpuInfo final {
// Common features
const CpuCacheParams& CacheParams();
- bool CurrentCpuIsInOrder();
+ bool CurrentCpuIsA55ish();
private:
enum class InitStatus {
diff --git a/ruy/kernel_arm.h b/ruy/kernel_arm.h
index c4a4dcc..76cfc82 100644
--- a/ruy/kernel_arm.h
+++ b/ruy/kernel_arm.h
@@ -39,16 +39,16 @@ RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kNeon)
RUY_INHERIT_KERNEL(Path::kNeon, Path::kNeonDotprod)
#if RUY_PLATFORM_NEON_64
-void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params);
-void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params);
+void Kernel8bitNeon(const KernelParams8bit<4, 4>& params);
+void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params);
#elif RUY_PLATFORM_NEON_32
-void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params);
-void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 2>& params);
+void Kernel8bitNeon(const KernelParams8bit<4, 2>& params);
+void Kernel8bitNeon1Col(const KernelParams8bit<4, 2>& params);
#endif
-void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params);
-void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params);
-void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params);
-void Kernel8bitNeonDotprodInOrder(const KernelParams8bit<8, 8>& params);
+void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params);
+void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params);
+void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params);
+void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params);
#if RUY_PLATFORM_NEON_64
template <typename DstScalar>
@@ -66,13 +66,13 @@ struct Kernel<Path::kNeon, std::int8_t, std::int8_t, std::int32_t, DstScalar> {
end_col, dst, &params);
if (dst->layout.cols == 1 &&
mul_params.channel_dimension() == ChannelDimension::kRow) {
- Kernel8bitNeonOutOfOrder1Col(params);
+ Kernel8bitNeon1Col(params);
return;
}
- if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- Kernel8bitNeonInOrder(params);
+ if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ Kernel8bitNeonA55ish(params);
} else {
- Kernel8bitNeonOutOfOrder(params);
+ Kernel8bitNeon(params);
}
}
};
@@ -94,10 +94,10 @@ struct Kernel<Path::kNeon, std::int8_t, std::int8_t, std::int32_t, DstScalar> {
end_col, dst, &params);
if (dst->layout.cols == 1 &&
mul_params.channel_dimension() == ChannelDimension::kRow) {
- Kernel8bitNeonOutOfOrder1Col(params);
+ Kernel8bitNeon1Col(params);
return;
}
- Kernel8bitNeonOutOfOrder(params);
+ Kernel8bitNeon(params);
}
};
#endif
@@ -118,20 +118,20 @@ struct Kernel<Path::kNeonDotprod, std::int8_t, std::int8_t, std::int32_t, DstSca
end_col, dst, &params);
if (dst->layout.cols == 1 &&
mul_params.channel_dimension() == ChannelDimension::kRow) {
- Kernel8bitNeonDotprodOutOfOrder1Col(params);
- } else if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- Kernel8bitNeonDotprodInOrder(params);
+ Kernel8bitNeonDotprod1Col(params);
+ } else if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ Kernel8bitNeonDotprodA55ish(params);
} else {
- Kernel8bitNeonDotprodOutOfOrder(params);
+ Kernel8bitNeonDotprod(params);
}
}
};
#endif
-void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params);
-void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params);
-void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params);
-void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params);
+void KernelFloatNeon(const KernelParamsFloat<8, 8>& params);
+void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params);
+void KernelFloat32Neon(const KernelParamsFloat<8, 4>& params);
+void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params);
#if RUY_PLATFORM_NEON_64
// A Float kernel for ARM64 Neon.
@@ -148,10 +148,10 @@ struct Kernel<Path::kNeon, float, float, float, float> {
KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
end_col, dst, &params);
- if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- KernelFloatNeonInOrder(params);
+ if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ KernelFloatNeonA55ish(params);
} else {
- KernelFloatNeonOutOfOrder(params);
+ KernelFloatNeon(params);
}
}
};
@@ -174,7 +174,7 @@ struct Kernel<Path::kNeon, float, float, float, float> {
MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
end_col, dst, &params);
- KernelFloat32NeonOutOfOrder(params);
+ KernelFloat32Neon(params);
}
};
#endif
@@ -197,10 +197,10 @@ struct Kernel<Path::kNeonDotprod, float, float, float, float> {
KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
end_col, dst, &params);
- if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- KernelFloatNeonDotprodInOrder(params);
+ if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ KernelFloatNeonDotprodA55ish(params);
} else {
- KernelFloatNeonOutOfOrder(params);
+ KernelFloatNeon(params);
}
}
};
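The dispatch shape these kernels keep is unchanged by the rename: the A55-ish variant remains the expected branch, and everything else falls back to the generic kernel. A condensed sketch of that branch, using a hypothetical free function in place of the Kernel<...>::Run methods above, and assuming these declarations live in namespace ruy as in kernel_arm.h (ARM builds only):

#include "ruy/kernel_arm.h"  // KernelFloatNeon*, KernelParamsFloat (ARM builds only)
#include "ruy/tune.h"        // ruy::Tuning

// Hypothetical wrapper for illustration; the real dispatch is in the
// Kernel<Path::kNeon, float, float, float, float>::Run specialization above.
void DispatchFloatKernel(ruy::Tuning tuning,
                         const ruy::KernelParamsFloat<8, 8>& params) {
  if (__builtin_expect(tuning == ruy::Tuning::kA55ish, true)) {
    ruy::KernelFloatNeonA55ish(params);  // tuned for in-order Cortex-A53/A55 cores
  } else {
    ruy::KernelFloatNeon(params);        // generic kernel (formerly "OutOfOrder")
  }
}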
diff --git a/ruy/kernel_arm32.cc b/ruy/kernel_arm32.cc
index 1213c2c..4ab58b2 100644
--- a/ruy/kernel_arm32.cc
+++ b/ruy/kernel_arm32.cc
@@ -78,10 +78,9 @@ void CheckOffsetsInKernelParamsFloat32(const Params&) {
// Just like Float 64 version, except accumulate in to 8x4 block to only
// use 16 128-bit NEON registers. This is a "first pass" kernel and not
// tuned. It is meant to run on out-of-order CPUs like the Krait 400 or A9.
-void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
+void KernelFloat32Neon(const KernelParamsFloat<8, 4>& params) {
CheckOffsetsInKernelParamsFloat32(params);
- profiler::ScopeLabel label(
- "Kernel (kNeon, optimized for out-of-order cores)");
+ profiler::ScopeLabel label("Kernel (kNeon)");
const float* lhs_ptr = params.lhs_base_ptr;
const float* rhs_ptr = params.rhs_base_ptr;
@@ -625,9 +624,8 @@ void CheckOffsetsInKernelParams8bit(const Params&) {
// Fast-int8 kernel, ported from ARM 64 version.
// Relevant target CPUs for this kernel include Krait 400 and A9,
// since these are 32-bit, out-of-order CPUs.
-void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params) {
- profiler::ScopeLabel label(
- "Kernel (kNeon, optimized for out-of-order cores)");
+void Kernel8bitNeon(const KernelParams8bit<4, 2>& params) {
+ profiler::ScopeLabel label("Kernel (kNeon)");
CheckOffsetsInKernelParams8bit(params);
@@ -1626,9 +1624,8 @@ void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params) {
// Fast-int8 true "GEMV" kernel (RHS has 1 column). We assume the RHS
// is still packed as if it has two columns
-void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 2>& params) {
- profiler::ScopeLabel label(
- "Kernel (kNeon, optimized for out-of-order cores)");
+void Kernel8bitNeon1Col(const KernelParams8bit<4, 2>& params) {
+ profiler::ScopeLabel label("Kernel (kNeon)");
CheckOffsetsInKernelParams8bit(params);
diff --git a/ruy/kernel_arm64.cc b/ruy/kernel_arm64.cc
index a5034b1..3fce17e 100644
--- a/ruy/kernel_arm64.cc
+++ b/ruy/kernel_arm64.cc
@@ -96,9 +96,8 @@ void CheckOffsetsInKernelParams8bit(const Params&) {
//
// Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75,
// since these are 64-bit, out-of-order and without dotprod support.
-void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params) {
- profiler::ScopeLabel label(
- "Kernel (kNeon, optimized for out-of-order cores)");
+void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) {
+ profiler::ScopeLabel label("Kernel (kNeon)");
CheckOffsetsInKernelParams8bit(params);
const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
@@ -1151,13 +1150,12 @@ void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params) {
"v26", "v27", "v28", "v29", "v30", "v31");
}
-// Similar to existing Kernel8bitNeonOutOfOrder but specialized for the case of
+// Similar to existing Kernel8bitNeon but specialized for the case of
// RHS cols == 1.
// Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75,
// since these are 64-bit, out-of-order and without dotprod support.
-void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params) {
- profiler::ScopeLabel label(
- "Kernel (kNeon, optimized for out-of-order cores)");
+void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) {
+ profiler::ScopeLabel label("Kernel (kNeon)");
CheckOffsetsInKernelParams8bit(params);
@@ -1820,7 +1818,7 @@ void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params) {
"v13", "v14", "v15", "v16", "v17", "v18", "v19");
}
-// Variant of the above Kernel8bitNeonOutOfOrder, tuned for in-order CPUs.
+// Variant of the above Kernel8bitNeon, tuned for A55-ish CPUs.
// Specifically here, the relevant in-order CPUs are ARM Cortex-A53 and
// the original Cortex-A55, since these are 64-bit and do not support dotprod.
//
@@ -1829,7 +1827,7 @@ void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params) {
// contribution of gemmlowp kernels tuned for Cortex-A53, with very helpful
// comments. Specifically, see this comment about tuning for Cortex-A53:
// https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4215
-void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params) {
+void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) {
profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)");
CheckOffsetsInKernelParams8bit(params);
@@ -2984,9 +2982,8 @@ void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params) {
//
// Relevant target CPUs for this kernel include ARM Cortex-A76,
// since these are 64-bit, out-of-order and with dotprod support.
-void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) {
- profiler::ScopeLabel label(
- "Kernel (kNeonDotprod, optimized for out-of-order cores)");
+void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) {
+ profiler::ScopeLabel label("Kernel (kNeonDotprod)");
CheckOffsetsInKernelParams8bit(params);
@@ -4414,9 +4411,8 @@ void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) {
// RHS cols == 1.
// Relevant target CPUs for this kernel include ARM Cortex-A76,
// since these are 64-bit, out-of-order and with dotprod support.
-void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params) {
- profiler::ScopeLabel label(
- "Kernel (kNeonDotprod, optimized for out-of-order cores)");
+void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) {
+ profiler::ScopeLabel label("Kernel (kNeonDotprod)");
CheckOffsetsInKernelParams8bit(params);
@@ -5102,7 +5098,7 @@ void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params) {
"v13", "v14", "v15", "v16", "v17");
}
-// Variant of the above Kernel8bitNeonDotprodOutOfOrder, tuned for in-order
+// Variant of the above Kernel8bitNeonDotprod, tuned for in-order
// CPUs. Specifically here, the relevant in-order CPUs are ARM Cortex-A55r1,
// since these are 64-bit and support dotprod.
//
@@ -5111,7 +5107,7 @@ void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params) {
// contribution of gemmlowp kernels tuned for Cortex-A55r1, with very helpful
// comments. Specifically, see this comment about tuning for Cortex-A55r1:
// https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4412
-void Kernel8bitNeonDotprodInOrder(const KernelParams8bit<8, 8>& params) {
+void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) {
profiler::ScopeLabel label(
"Kernel (kNeonDotprod, optimized for in-order cores)");
@@ -6454,10 +6450,9 @@ void CheckOffsetsInKernelParamsFloat(const Params&) {
// width instead of the wider 12x8 that the register space permits and that
// the aforementioned gemmlowp kernel uses. Ruy likes powers of two for now
// and we don't have evidence that going beyond 8x8 is needed.
-void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params) {
+void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) {
CheckOffsetsInKernelParamsFloat(params);
- profiler::ScopeLabel label(
- "Kernel (kNeon, optimized for out-of-order cores)");
+ profiler::ScopeLabel label("Kernel (kNeon)");
const float* lhs_col_ptr = params.lhs_base_ptr;
const float* rhs_col_ptr = params.rhs_base_ptr;
@@ -7086,7 +7081,7 @@ void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params) {
"v26", "v27", "v28", "v29", "v30", "v31");
}
-// Variant of KernelFloatNeonOutOfOrder tuned for in-order CPUs that do not
+// Variant of KernelFloatNeon tuned for in-order CPUs that do not
// support dotprod (while dotprod by itself is not relevant to floating-point,
// this additional bit of information that we have about the target happens to
// be useful here).
@@ -7099,7 +7094,7 @@ void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params) {
// which was contributed by David Mansell with very helpful
// comments. Specifically, see this comment about tuning for Cortex-A53:
// https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4215
-void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params) {
+void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) {
profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)");
CheckOffsetsInKernelParamsFloat(params);
@@ -7579,7 +7574,7 @@ void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params) {
"v26", "v27", "v28", "v29", "v30", "v31");
}
-// Variant of KernelFloatNeonInOrder tuned for in-order CPUs that do
+// Variant of KernelFloatNeonA55ish tuned for in-order CPUs that do
// support dotprod (while dotprod by itself is not relevant to floating-point,
// this additional bit of information that we have about the target happens to
// be useful here).
@@ -7591,7 +7586,7 @@ void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params) {
// which was contributed by David Mansell with very helpful
// comments. Specifically, see this comment about tuning for Cortex-A55r1:
// https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4412
-void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params) {
+void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) {
profiler::ScopeLabel label(
"Kernel (kNeonDotprod, optimized for in-order cores)");
diff --git a/ruy/pack_arm.cc b/ruy/pack_arm.cc
index f29b214..1673c90 100644
--- a/ruy/pack_arm.cc
+++ b/ruy/pack_arm.cc
@@ -31,12 +31,13 @@ namespace ruy {
#if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
-void Pack8bitColMajorForNeonOutOfOrder(
- const void* src_ptr0, const void* src_ptr1, const void* src_ptr2,
- const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr,
- std::int32_t* sums_ptr, int input_xor) {
- profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)");
+void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1,
+ const void* src_ptr2, const void* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows, int src_zero_point,
+ std::int8_t* packed_ptr, std::int32_t* sums_ptr,
+ int input_xor) {
+ profiler::ScopeLabel label("Pack (kNeon)");
asm volatile(
// clang-format off
// v26 will be the vector to XOR input values with to perform
@@ -246,11 +247,10 @@ void CheckOffsetsInPackParams8bit(const Params&) {
static_assert(offsetof(Params, input_xor) == RUY_OFFSET_INPUT_XOR, "");
}
-// Packing code for out-of-order ARMv7 CPUs like the Krait 400 or A9.
-// No attempt made at making this code efficient on in-order cores yet.
-void Pack8bitColMajorForNeonOutOfOrder4Cols(const PackParams8bit& params) {
+// No attempt made at making this code efficient on A55-ish cores yet.
+void Pack8bitColMajorForNeon4Cols(const PackParams8bit& params) {
CheckOffsetsInPackParams8bit(params);
- profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)");
+ profiler::ScopeLabel label("Pack (kNeon)");
const void* src_ptr0 = params.src_ptr0;
const void* src_ptr1 = params.src_ptr1;
const void* src_ptr2 = params.src_ptr2;
@@ -473,9 +473,9 @@ void Pack8bitColMajorForNeonOutOfOrder4Cols(const PackParams8bit& params) {
// No attempt made at making this code efficient on in-order cores yet.
// This version differs from the above in that we only handle two columns
// at a time.
-void Pack8bitColMajorForNeonOutOfOrder2Cols(const PackParams8bit& params) {
+void Pack8bitColMajorForNeon2Cols(const PackParams8bit& params) {
CheckOffsetsInPackParams8bit(params);
- profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)");
+ profiler::ScopeLabel label("Pack (kNeon)");
const void* src_ptr0 = params.src_ptr0;
const void* src_ptr1 = params.src_ptr1;
const int src_inc0 = params.src_inc0;
@@ -626,12 +626,12 @@ void Pack8bitColMajorForNeonOutOfOrder2Cols(const PackParams8bit& params) {
#if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
-void Pack8bitColMajorForNeonInOrder(const void* src_ptr0, const void* src_ptr1,
- const void* src_ptr2, const void* src_ptr3,
- int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows,
- int src_zero_point, std::int8_t* packed_ptr,
- std::int32_t* sums_ptr, int input_xor) {
+void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1,
+ const void* src_ptr2, const void* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows,
+ int src_zero_point, std::int8_t* packed_ptr,
+ std::int32_t* sums_ptr, int input_xor) {
profiler::ScopeLabel label("Pack (kNeon, optimized for in-order cores)");
asm volatile(
// clang-format off
@@ -837,7 +837,7 @@ void Pack8bitColMajorForNeonInOrder(const void* src_ptr0, const void* src_ptr1,
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
-void Pack8bitColMajorForNeonDotprodInOrder(
+void Pack8bitColMajorForNeonDotprodA55ish(
const void* src_ptr0, const void* src_ptr1, const void* src_ptr2,
const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr,
@@ -1083,13 +1083,13 @@ void Pack8bitColMajorForNeonDotprodInOrder(
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
-void Pack8bitColMajorForNeonDotprodOutOfOrder(
- const void* src_ptr0, const void* src_ptr1, const void* src_ptr2,
- const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr,
- std::int32_t* sums_ptr, int input_xor) {
- profiler::ScopeLabel label(
- "Pack (kNeonDotprod, optimized for out-of-order cores)");
+void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1,
+ const void* src_ptr2, const void* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows,
+ int src_zero_point, std::int8_t* packed_ptr,
+ std::int32_t* sums_ptr, int input_xor) {
+ profiler::ScopeLabel label("Pack (kNeonDotprod)");
asm volatile(
// clang-format off
// v26 will be the vector to XOR input values with to perform
@@ -1735,11 +1735,11 @@ void Pack8bitRowMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1,
"v27", "v28", "v29", "v30", "v31");
}
-void PackFloatColMajorForNeonOutOfOrder(
- const float* src_ptr0, const float* src_ptr1, const float* src_ptr2,
- const float* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows, float* packed_ptr) {
- profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)");
+void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1,
+ const float* src_ptr2, const float* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows, float* packed_ptr) {
+ profiler::ScopeLabel label("Pack (kNeon)");
asm volatile(
// clang-format off
// w1 will be the number of rows already loaded.
@@ -1882,13 +1882,11 @@ void PackFloatColMajorForNeonOutOfOrder(
#endif
#if RUY_PLATFORM_NEON_32 && RUY_OPT(ASM)
-void PackFloatColMajorForNeonOutOfOrder(const float* src_ptr0,
- const float* src_ptr1,
- const float* src_ptr2,
- const float* src_ptr3, int src_inc,
- int src_rows, float* packed_ptr,
- int output_stride) {
- profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)");
+void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1,
+ const float* src_ptr2, const float* src_ptr3,
+ int src_inc, int src_rows, float* packed_ptr,
+ int output_stride) {
+ profiler::ScopeLabel label("Pack (kNeon)");
asm volatile(
// clang-format off
"mov r1, #0\n"
@@ -2066,12 +2064,12 @@ void PackFloatColMajorForNeonOutOfOrder(const float* src_ptr0,
#endif // (RUY_PLATFORM_NEON_32
#if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
-void PackFloatColMajorForNeonInOrder(const float* src_ptr0,
- const float* src_ptr1,
- const float* src_ptr2,
- const float* src_ptr3, int src_inc0,
- int src_inc1, int src_inc2, int src_inc3,
- int src_rows, float* packed_ptr) {
+void PackFloatColMajorForNeonA55ish(const float* src_ptr0,
+ const float* src_ptr1,
+ const float* src_ptr2,
+ const float* src_ptr3, int src_inc0,
+ int src_inc1, int src_inc2, int src_inc3,
+ int src_rows, float* packed_ptr) {
profiler::ScopeLabel label("Pack (kNeon, optimized for in-order cores)");
asm volatile(
@@ -2290,7 +2288,7 @@ void Pack8bitRowMajorForNeon(const std::uint8_t* src_ptr, int src_stride,
// it's working at, this seems like a fair compromise. If one wanted to
// maximize performance at the cost of more code complexity/size, one could
// have code handling 16 columns at a time (maybe limited to
- // Tuning::kOutOfOrder), then 8, then 4 to minimize the amount of slow
+ // Tuning::kGeneric), then 8, then 4 to minimize the amount of slow
// leftovers.
//
// Load 8 sums in sums0, sums1.
diff --git a/ruy/pack_arm.h b/ruy/pack_arm.h
index 12d5cab..ba8964d 100644
--- a/ruy/pack_arm.h
+++ b/ruy/pack_arm.h
@@ -62,23 +62,25 @@ void Pack8bitRowMajorForNeon(const std::uint8_t* src_ptr, int src_stride,
#if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
-void Pack8bitColMajorForNeonOutOfOrder(
- const void* src_ptr0, const void* src_ptr1, const void* src_ptr2,
- const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr,
- std::int32_t* sums_ptr, int input_xor);
-void Pack8bitColMajorForNeonInOrder(const void* src_ptr0, const void* src_ptr1,
+void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1,
+ const void* src_ptr2, const void* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows, int src_zero_point,
+ std::int8_t* packed_ptr, std::int32_t* sums_ptr,
+ int input_xor);
+void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1,
+ const void* src_ptr2, const void* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows,
+ int src_zero_point, std::int8_t* packed_ptr,
+ std::int32_t* sums_ptr, int input_xor);
+void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1,
const void* src_ptr2, const void* src_ptr3,
int src_inc0, int src_inc1, int src_inc2,
int src_inc3, int src_rows,
int src_zero_point, std::int8_t* packed_ptr,
std::int32_t* sums_ptr, int input_xor);
-void Pack8bitColMajorForNeonDotprodOutOfOrder(
- const void* src_ptr0, const void* src_ptr1, const void* src_ptr2,
- const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr,
- std::int32_t* sums_ptr, int input_xor);
-void Pack8bitColMajorForNeonDotprodInOrder(
+void Pack8bitColMajorForNeonDotprodA55ish(
const void* src_ptr0, const void* src_ptr1, const void* src_ptr2,
const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr,
@@ -130,8 +132,8 @@ inline void MakePackParams8bit(const void* src_ptr0, const void* src_ptr1,
params->input_xor = input_xor;
}
-void Pack8bitColMajorForNeonOutOfOrder4Cols(const PackParams8bit& params);
-void Pack8bitColMajorForNeonOutOfOrder2Cols(const PackParams8bit& params);
+void Pack8bitColMajorForNeon4Cols(const PackParams8bit& params);
+void Pack8bitColMajorForNeon2Cols(const PackParams8bit& params);
#endif // (RUY_PLATFORM_NEON_32 && RUY_OPT(ASM)
@@ -187,16 +189,16 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 4>, Scalar,
packed_matrix->data + packed_matrix->layout.stride * block_col;
std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
#if RUY_PLATFORM_NEON_64
- if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- Pack8bitColMajorForNeonInOrder(
+ if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ Pack8bitColMajorForNeonA55ish(
src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
packed_ptr, sums_ptr, kInputXor);
} else {
- Pack8bitColMajorForNeonOutOfOrder(
- src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
- src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
- packed_ptr, sums_ptr, kInputXor);
+ Pack8bitColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
+ src_inc0, src_inc1, src_inc2, src_inc3,
+ src_matrix.layout.rows, src_matrix.zero_point,
+ packed_ptr, sums_ptr, kInputXor);
}
#else
(void)tuning;
@@ -208,7 +210,7 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 4>, Scalar,
packed_ptr, src_inc0, src_inc1, src_inc2, src_inc3,
src_matrix.layout.rows, src_matrix.zero_point,
kInputXor, &params);
- Pack8bitColMajorForNeonOutOfOrder4Cols(params);
+ Pack8bitColMajorForNeon4Cols(params);
#endif // RUY_PLATFORM_NEON_64
}
}
@@ -262,7 +264,7 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 2>, Scalar,
packed_ptr, src_inc0, src_inc1, -1, -1,
src_matrix.layout.rows, src_matrix.zero_point,
kInputXor, &params);
- Pack8bitColMajorForNeonOutOfOrder2Cols(params);
+ Pack8bitColMajorForNeon2Cols(params);
}
}
};
@@ -320,13 +322,13 @@ struct PackImpl<Path::kNeonDotprod, FixedKernelLayout<Order::kColMajor, 4, 8>,
packed_matrix->layout.stride * (block_col & ~7) +
((block_col & 4) * 4);
std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
- if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- Pack8bitColMajorForNeonDotprodInOrder(
+ if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ Pack8bitColMajorForNeonDotprodA55ish(
src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
packed_ptr, sums_ptr, kInputXor);
} else {
- Pack8bitColMajorForNeonDotprodOutOfOrder(
+ Pack8bitColMajorForNeonDotprod(
src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point,
packed_ptr, sums_ptr, kInputXor);
@@ -337,24 +339,22 @@ struct PackImpl<Path::kNeonDotprod, FixedKernelLayout<Order::kColMajor, 4, 8>,
#endif // (RUY_PLATFORM_NEON_64&& RUY_OPT(ASM)
#if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
-void PackFloatColMajorForNeonOutOfOrder(
- const float* src_ptr0, const float* src_ptr1, const float* src_ptr2,
- const float* src_ptr3, int src_inc0, int src_inc1, int src_inc2,
- int src_inc3, int src_rows, float* packed_ptr);
-void PackFloatColMajorForNeonInOrder(const float* src_ptr0,
- const float* src_ptr1,
- const float* src_ptr2,
- const float* src_ptr3, int src_inc0,
- int src_inc1, int src_inc2, int src_inc3,
- int src_rows, float* packed_ptr);
+void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1,
+ const float* src_ptr2, const float* src_ptr3,
+ int src_inc0, int src_inc1, int src_inc2,
+ int src_inc3, int src_rows, float* packed_ptr);
+void PackFloatColMajorForNeonA55ish(const float* src_ptr0,
+ const float* src_ptr1,
+ const float* src_ptr2,
+ const float* src_ptr3, int src_inc0,
+ int src_inc1, int src_inc2, int src_inc3,
+ int src_rows, float* packed_ptr);
#elif RUY_PLATFORM_NEON_32 && RUY_OPT(ASM)
-void PackFloatColMajorForNeonOutOfOrder(const float* src_ptr0,
- const float* src_ptr1,
- const float* src_ptr2,
- const float* src_ptr3, int src_inc,
- int src_rows, float* packed_ptr,
- int stride);
+void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1,
+ const float* src_ptr2, const float* src_ptr3,
+ int src_inc, int src_rows, float* packed_ptr,
+ int stride);
#endif // (RUY_PLATFORM_NEON_64&& RUY_OPT(ASM)
#if (RUY_PLATFORM_NEON_32 || RUY_PLATFORM_NEON_64) && RUY_OPT(ASM)
@@ -400,14 +400,14 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
packed_matrix->layout.stride * (block_col & ~7) +
((block_col & 4));
#if RUY_PLATFORM_NEON_64
- if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
- PackFloatColMajorForNeonInOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
- src_inc0, src_inc1, src_inc2, src_inc3,
- src_matrix.layout.rows, packed_ptr);
+ if (__builtin_expect(tuning == Tuning::kA55ish, true)) {
+ PackFloatColMajorForNeonA55ish(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
+ src_inc0, src_inc1, src_inc2, src_inc3,
+ src_matrix.layout.rows, packed_ptr);
} else {
- PackFloatColMajorForNeonOutOfOrder(
- src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1,
- src_inc2, src_inc3, src_matrix.layout.rows, packed_ptr);
+ PackFloatColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
+ src_inc0, src_inc1, src_inc2, src_inc3,
+ src_matrix.layout.rows, packed_ptr);
}
#else
(void)tuning;
@@ -424,9 +424,9 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
src_inc += src_inc2 == 16 ? 4 : 0;
src_inc += src_inc3 == 16 ? 8 : 0;
const int kOutputStride = 32;
- PackFloatColMajorForNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
- src_inc, src_matrix.layout.rows,
- packed_ptr, kOutputStride);
+ PackFloatColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc,
+ src_matrix.layout.rows, packed_ptr,
+ kOutputStride);
#endif // RUY_PLATFORM_NEON_64
}
}
@@ -482,9 +482,9 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 4>, float,
src_inc += src_inc2 == 16 ? 4 : 0;
src_inc += src_inc3 == 16 ? 8 : 0;
const int kOutputStride = 16;
- PackFloatColMajorForNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3,
- src_inc, src_matrix.layout.rows,
- packed_ptr, kOutputStride);
+ PackFloatColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc,
+ src_matrix.layout.rows, packed_ptr,
+ kOutputStride);
}
}
};
diff --git a/ruy/test.h b/ruy/test.h
index fec2566..15e8bff 100644
--- a/ruy/test.h
+++ b/ruy/test.h
@@ -121,8 +121,8 @@ inline const char* TuningName(Tuning tuning) {
case Tuning::NAME: \
return #NAME;
switch (tuning) {
- RUY_SUBPATHNAME_CASE(kInOrder)
- RUY_SUBPATHNAME_CASE(kOutOfOrder)
+ RUY_SUBPATHNAME_CASE(kA55ish)
+ RUY_SUBPATHNAME_CASE(kGeneric)
default:
RUY_CHECK(false);
return nullptr;
@@ -1820,7 +1820,7 @@ inline std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) {
}
#if RUY_PLATFORM_ARM
if (path == Path::kNeon || path == Path::kNeonDotprod) {
- return {Tuning::kInOrder, Tuning::kOutOfOrder, Tuning::kAuto};
+ return {Tuning::kA55ish, Tuning::kGeneric, Tuning::kAuto};
}
#endif
(void)path;
diff --git a/ruy/tune.cc b/ruy/tune.cc
index 4d542e9..1f615bf 100644
--- a/ruy/tune.cc
+++ b/ruy/tune.cc
@@ -23,8 +23,7 @@ limitations under the License.
namespace ruy {
Tuning TuningResolver::ResolveNow(CpuInfo* cpuinfo) {
- return cpuinfo->CurrentCpuIsInOrder() ? Tuning::kInOrder
- : Tuning::kOutOfOrder;
+ return cpuinfo->CurrentCpuIsA55ish() ? Tuning::kA55ish : Tuning::kGeneric;
}
TuningResolver::TuningResolver()
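The renamed CpuInfo query feeds the one-line resolution above. A simplified sketch of the overall flow, assuming (as tune_test.cc suggests) that an explicitly requested tuning bypasses CPU detection; the real TuningResolver additionally caches its result, as described in tune.h:

#include "ruy/cpuinfo.h"  // ruy::CpuInfo (assumed header path)
#include "ruy/tune.h"     // ruy::Tuning

// Sketch only: maps a requested Tuning to a concrete one.
ruy::Tuning ResolveTuning(ruy::Tuning requested, ruy::CpuInfo* cpuinfo) {
  if (requested != ruy::Tuning::kAuto) {
    return requested;  // explicit tuning wins (see tune_test.cc)
  }
  // kAuto: use the A55-ish kernels only on in-order Cortex-A53/A55-class cores.
  return cpuinfo->CurrentCpuIsA55ish() ? ruy::Tuning::kA55ish
                                       : ruy::Tuning::kGeneric;
}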
diff --git a/ruy/tune.h b/ruy/tune.h
index fcb276f..c9beed9 100644
--- a/ruy/tune.h
+++ b/ruy/tune.h
@@ -24,7 +24,7 @@ limitations under the License.
// layouts compared to Path::kNeon; but within each, different tunings
// will share that same layout.
//
-// # Tuning is for now only based on 1 bit: OutOfOrder / InOrder
+// # Tuning is for now only based on 1 bit: Generic / A55ish
//
// In practice, each of our asm code paths only needs one bit information to
// decide on tuning: whether the CPU is out-of-order or in-order.
@@ -37,7 +37,7 @@ limitations under the License.
//
// Because having tuned code paths is a compromise of efficiency gains
// versus implementation effort and code size, we are happy to stop at just this
-// single bit of information, OutOfOrder/InOrder, at least in the current CPU
+// single bit of information, Generic / A55ish, at least in the current CPU
// landscape. This could change in the future.
#ifndef RUY_RUY_TUNE_H_
#define RUY_RUY_TUNE_H_
@@ -54,10 +54,22 @@ enum class Tuning {
// user-visible parts (see Context). It's meant to be resolved to an
// actual tuning at some point by means of TuningResolver.
kAuto,
- // Target an out-order CPU. Example: ARM Cortex-A75.
- kOutOfOrder,
- // Target an in-order CPU. Example: ARM Cortex-A55.
- kInOrder
+ // Use code not tuned for any particular CPU, typically performing well
+ // on out-of-order cores that don't require as much tuning.
+ kGeneric,
+ // Use code tuned for "Cortex-A55-ish" CPUs, by which we mean mostly:
+ // A53, A55r0 (pre-dotprod), A55r1 (with dotprod). These CPUs have in common
+ // that they are in-order CPU cores with largely similar requirements of code
+ // tuning. The most important such requirement is to use only 64-bit loads
+ // to maximize dual-issuing.
+ //
+ // A55r1 differs from A55r0 and A53 in that it dual-issues 64-bit NEON loads
+ // whereas A55r0 and A53 require using non-NEON ARM 64-bit loads together with
+ // INS instructions to insert 64bit lanes into NEON registers. However, since
+ // A55r1 supports dotprod unlike A55r0 and A53, they are not using the same
+ // kernels in practice anyway, so there was no need to distinguish them with
+ // separate Tuning values.
+ kA55ish
};
// Why a TuningResolver class?
@@ -65,7 +77,7 @@ enum class Tuning {
// Ideally, this Library would offer a single function,
// Tuning GetCurrentCPUTuning();
//
-// However, determining information about the current CPU is not necessarily,
+// However, determining information about the current CPU is not necessarily
// cheap, so we currently cache that and only invalidate/reevaluate after
// a fixed amount of time. This need to store state is why this library
// has to expose a class, TuningResolver, not just a function.
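The comment above motivates why tuning resolution is a stateful class: querying the CPU is not free, so the resolved value is cached and only re-evaluated after a fixed interval (tune_test.cc below refers to a comment about 1 second). A rough sketch of that caching idea with invented names and an assumed refresh interval; this is not ruy's actual TuningResolver:

#include <chrono>

#include "ruy/cpuinfo.h"  // ruy::CpuInfo (assumed header path)
#include "ruy/tune.h"     // ruy::Tuning

// Illustrative cache around the Generic/A55ish decision; class name, fields,
// and the refresh interval are assumptions, not ruy's implementation.
class CachedTuningSketch {
 public:
  ruy::Tuning Resolve(ruy::CpuInfo* cpuinfo) {
    const auto now = std::chrono::steady_clock::now();
    if (now - last_refresh_ > std::chrono::seconds(1)) {  // assumed interval
      cached_ = cpuinfo->CurrentCpuIsA55ish() ? ruy::Tuning::kA55ish
                                              : ruy::Tuning::kGeneric;
      last_refresh_ = now;
    }
    return cached_;
  }

 private:
  ruy::Tuning cached_ = ruy::Tuning::kGeneric;
  // Default-constructed time_point is the epoch, so the first call refreshes.
  std::chrono::steady_clock::time_point last_refresh_{};
};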
diff --git a/ruy/tune_test.cc b/ruy/tune_test.cc
index dabe21a..c5f2342 100644
--- a/ruy/tune_test.cc
+++ b/ruy/tune_test.cc
@@ -36,7 +36,7 @@ TEST(TuneTest, TuneTest) {
tuning_resolver.SetTuning(Tuning::kAuto);
#ifdef RUY_IMPLEMENT_TUNING
- for (auto tuning : {Tuning::kOutOfOrder, Tuning::kInOrder}) {
+ for (auto tuning : {Tuning::kGeneric, Tuning::kA55ish}) {
tuning_resolver.SetTuning(tuning);
ASSERT_TRUE(tuning_resolver.Resolve(&cpuinfo) == tuning);
// See above comment about 1 second.