diff options
author | Benoit Jacob <benoitjacob@google.com> | 2020-08-19 19:54:11 +0300 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2020-08-19 19:55:51 +0300 |
commit | 59c2de870307b80a59795c3768c954d80159c838 (patch) | |
tree | 900e65b40336543359c56d5e7c1f99bd3130076b | |
parent | 4f6a37b9f115f86b378f288028be4b568204bad5 (diff) |
Rename kOutOfOrder -> kGeneric, kInOrder -> kA55ish,
KernelXxxOutOfOrder -> KernelXxx,
KernelXxxInOrder -> KernelXxxA55ish
PiperOrigin-RevId: 327452323
-rw-r--r-- | ruy/context_test.cc | 4 | ||||
-rw-r--r-- | ruy/cpuinfo.cc | 4 | ||||
-rw-r--r-- | ruy/cpuinfo.h | 2 | ||||
-rw-r--r-- | ruy/kernel_arm.h | 58 | ||||
-rw-r--r-- | ruy/kernel_arm32.cc | 15 | ||||
-rw-r--r-- | ruy/kernel_arm64.cc | 43 | ||||
-rw-r--r-- | ruy/pack_arm.cc | 88 | ||||
-rw-r--r-- | ruy/pack_arm.h | 108 | ||||
-rw-r--r-- | ruy/test.h | 6 | ||||
-rw-r--r-- | ruy/tune.cc | 3 | ||||
-rw-r--r-- | ruy/tune.h | 26 | ||||
-rw-r--r-- | ruy/tune_test.cc | 2 |
12 files changed, 180 insertions, 179 deletions
diff --git a/ruy/context_test.cc b/ruy/context_test.cc index 8037b3c..4e69e65 100644 --- a/ruy/context_test.cc +++ b/ruy/context_test.cc @@ -30,9 +30,9 @@ TEST(ContextTest, ContextClassSanity) { EXPECT_EQ(&context.thread_pool(), context.mutable_thread_pool()); EXPECT_NE(context.mutable_thread_pool(), nullptr); EXPECT_EQ(context.max_num_threads(), 1); - context.set_explicit_tuning(Tuning::kOutOfOrder); + context.set_explicit_tuning(Tuning::kGeneric); context.set_max_num_threads(2); - EXPECT_EQ(context.explicit_tuning(), Tuning::kOutOfOrder); + EXPECT_EQ(context.explicit_tuning(), Tuning::kGeneric); EXPECT_EQ(context.max_num_threads(), 2); } diff --git a/ruy/cpuinfo.cc b/ruy/cpuinfo.cc index b0c53ad..a4e71ef 100644 --- a/ruy/cpuinfo.cc +++ b/ruy/cpuinfo.cc @@ -118,7 +118,7 @@ bool CpuInfo::AvxVnni() { return EnsureInitialized() && cpuinfo_has_x86_avx512vnni(); } -bool CpuInfo::CurrentCpuIsInOrder() { +bool CpuInfo::CurrentCpuIsA55ish() { if (!EnsureInitialized()) { return false; } @@ -150,7 +150,7 @@ bool CpuInfo::Avx() { return false; } bool CpuInfo::Avx2Fma() { return false; } bool CpuInfo::Avx512() { return false; } bool CpuInfo::AvxVnni() { return false; } -bool CpuInfo::CurrentCpuIsInOrder() { return false; } +bool CpuInfo::CurrentCpuIsA55ish() { return false; } #endif diff --git a/ruy/cpuinfo.h b/ruy/cpuinfo.h index f8948e2..e45fa51 100644 --- a/ruy/cpuinfo.h +++ b/ruy/cpuinfo.h @@ -38,7 +38,7 @@ class CpuInfo final { // Common features const CpuCacheParams& CacheParams(); - bool CurrentCpuIsInOrder(); + bool CurrentCpuIsA55ish(); private: enum class InitStatus { diff --git a/ruy/kernel_arm.h b/ruy/kernel_arm.h index c4a4dcc..76cfc82 100644 --- a/ruy/kernel_arm.h +++ b/ruy/kernel_arm.h @@ -39,16 +39,16 @@ RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kNeon) RUY_INHERIT_KERNEL(Path::kNeon, Path::kNeonDotprod) #if RUY_PLATFORM_NEON_64 -void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params); -void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 
4>& params); +void Kernel8bitNeon(const KernelParams8bit<4, 4>& params); +void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params); #elif RUY_PLATFORM_NEON_32 -void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params); -void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 2>& params); +void Kernel8bitNeon(const KernelParams8bit<4, 2>& params); +void Kernel8bitNeon1Col(const KernelParams8bit<4, 2>& params); #endif -void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params); -void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params); -void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params); -void Kernel8bitNeonDotprodInOrder(const KernelParams8bit<8, 8>& params); +void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params); +void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params); +void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params); +void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params); #if RUY_PLATFORM_NEON_64 template <typename DstScalar> @@ -66,13 +66,13 @@ struct Kernel<Path::kNeon, std::int8_t, std::int8_t, std::int32_t, DstScalar> { end_col, dst, &params); if (dst->layout.cols == 1 && mul_params.channel_dimension() == ChannelDimension::kRow) { - Kernel8bitNeonOutOfOrder1Col(params); + Kernel8bitNeon1Col(params); return; } - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - Kernel8bitNeonInOrder(params); + if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + Kernel8bitNeonA55ish(params); } else { - Kernel8bitNeonOutOfOrder(params); + Kernel8bitNeon(params); } } }; @@ -94,10 +94,10 @@ struct Kernel<Path::kNeon, std::int8_t, std::int8_t, std::int32_t, DstScalar> { end_col, dst, &params); if (dst->layout.cols == 1 && mul_params.channel_dimension() == ChannelDimension::kRow) { - Kernel8bitNeonOutOfOrder1Col(params); + Kernel8bitNeon1Col(params); return; } - Kernel8bitNeonOutOfOrder(params); + Kernel8bitNeon(params); } }; #endif @@ -118,20 
+118,20 @@ struct Kernel<Path::kNeonDotprod, std::int8_t, std::int8_t, std::int32_t, DstSca end_col, dst, &params); if (dst->layout.cols == 1 && mul_params.channel_dimension() == ChannelDimension::kRow) { - Kernel8bitNeonDotprodOutOfOrder1Col(params); - } else if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - Kernel8bitNeonDotprodInOrder(params); + Kernel8bitNeonDotprod1Col(params); + } else if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + Kernel8bitNeonDotprodA55ish(params); } else { - Kernel8bitNeonDotprodOutOfOrder(params); + Kernel8bitNeonDotprod(params); } } }; #endif -void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params); -void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params); -void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params); -void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params); +void KernelFloatNeon(const KernelParamsFloat<8, 8>& params); +void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params); +void KernelFloat32Neon(const KernelParamsFloat<8, 4>& params); +void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params); #if RUY_PLATFORM_NEON_64 // A Float kernel for ARM64 Neon. 
@@ -148,10 +148,10 @@ struct Kernel<Path::kNeon, float, float, float, float> { KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params; MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row, end_col, dst, &params); - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - KernelFloatNeonInOrder(params); + if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + KernelFloatNeonA55ish(params); } else { - KernelFloatNeonOutOfOrder(params); + KernelFloatNeon(params); } } }; @@ -174,7 +174,7 @@ struct Kernel<Path::kNeon, float, float, float, float> { MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row, end_col, dst, &params); - KernelFloat32NeonOutOfOrder(params); + KernelFloat32Neon(params); } }; #endif @@ -197,10 +197,10 @@ struct Kernel<Path::kNeonDotprod, float, float, float, float> { KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params; MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row, end_col, dst, &params); - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - KernelFloatNeonDotprodInOrder(params); + if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + KernelFloatNeonDotprodA55ish(params); } else { - KernelFloatNeonOutOfOrder(params); + KernelFloatNeon(params); } } }; diff --git a/ruy/kernel_arm32.cc b/ruy/kernel_arm32.cc index 1213c2c..4ab58b2 100644 --- a/ruy/kernel_arm32.cc +++ b/ruy/kernel_arm32.cc @@ -78,10 +78,9 @@ void CheckOffsetsInKernelParamsFloat32(const Params&) { // Just like Float 64 version, except accumulate in to 8x4 block to only // use 16 128-bit NEON registers. This is a "first pass" kernel and not // tuned. It is meant to run on out-of-order CPUs like the Krait 400 or A9. 
-void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) { +void KernelFloat32Neon(const KernelParamsFloat<8, 4>& params) { CheckOffsetsInKernelParamsFloat32(params); - profiler::ScopeLabel label( - "Kernel (kNeon, optimized for out-of-order cores)"); + profiler::ScopeLabel label("Kernel (kNeon)"); const float* lhs_ptr = params.lhs_base_ptr; const float* rhs_ptr = params.rhs_base_ptr; @@ -625,9 +624,8 @@ void CheckOffsetsInKernelParams8bit(const Params&) { // Fast-int8 kernel, ported from ARM 64 version. // Relevant target CPUs for this kernel include Krait 400 and A9, // since these are 32-bit, out-of-order CPUs. -void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params) { - profiler::ScopeLabel label( - "Kernel (kNeon, optimized for out-of-order cores)"); +void Kernel8bitNeon(const KernelParams8bit<4, 2>& params) { + profiler::ScopeLabel label("Kernel (kNeon)"); CheckOffsetsInKernelParams8bit(params); @@ -1626,9 +1624,8 @@ void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params) { // Fast-int8 true "GEMV" kernel (RHS has 1 column). We assume the RHS // is still packed as if it has two columns -void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 2>& params) { - profiler::ScopeLabel label( - "Kernel (kNeon, optimized for out-of-order cores)"); +void Kernel8bitNeon1Col(const KernelParams8bit<4, 2>& params) { + profiler::ScopeLabel label("Kernel (kNeon)"); CheckOffsetsInKernelParams8bit(params); diff --git a/ruy/kernel_arm64.cc b/ruy/kernel_arm64.cc index a5034b1..3fce17e 100644 --- a/ruy/kernel_arm64.cc +++ b/ruy/kernel_arm64.cc @@ -96,9 +96,8 @@ void CheckOffsetsInKernelParams8bit(const Params&) { // // Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75, // since these are 64-bit, out-of-order and without dotprod support. 
-void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params) { - profiler::ScopeLabel label( - "Kernel (kNeon, optimized for out-of-order cores)"); +void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) { + profiler::ScopeLabel label("Kernel (kNeon)"); CheckOffsetsInKernelParams8bit(params); const std::int8_t* lhs_col_ptr = params.lhs_base_ptr; @@ -1151,13 +1150,12 @@ void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params) { "v26", "v27", "v28", "v29", "v30", "v31"); } -// Similar to existing Kernel8bitNeonOutOfOrder but specialized for the case of +// Similar to existing Kernel8bitNeon but specialized for the case of // RHS cols == 1. // Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75, // since these are 64-bit, out-of-order and without dotprod support. -void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params) { - profiler::ScopeLabel label( - "Kernel (kNeon, optimized for out-of-order cores)"); +void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) { + profiler::ScopeLabel label("Kernel (kNeon)"); CheckOffsetsInKernelParams8bit(params); @@ -1820,7 +1818,7 @@ void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params) { "v13", "v14", "v15", "v16", "v17", "v18", "v19"); } -// Variant of the above Kernel8bitNeonOutOfOrder, tuned for in-order CPUs. +// Variant of the above Kernel8bitNeon, tuned for A55-ish CPUs. // Specifically here, the relevant in-order CPUs are ARM Cortex-A53 and // the original Cortex-A55, since these are 64-bit and do not support dotprod. // @@ -1829,7 +1827,7 @@ void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 4>& params) { // contribution of gemmlowp kernels tuned for Cortex-A53, with very helpful // comments. 
Specifically, see this comment about tuning for Cortex-A53: // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4215 -void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params) { +void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) { profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)"); CheckOffsetsInKernelParams8bit(params); @@ -2984,9 +2982,8 @@ void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params) { // // Relevant target CPUs for this kernel include ARM Cortex-A76, // since these are 64-bit, out-of-order and with dotprod support. -void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) { - profiler::ScopeLabel label( - "Kernel (kNeonDotprod, optimized for out-of-order cores)"); +void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) { + profiler::ScopeLabel label("Kernel (kNeonDotprod)"); CheckOffsetsInKernelParams8bit(params); @@ -4414,9 +4411,8 @@ void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) { // RHS cols == 1. // Relevant target CPUs for this kernel include ARM Cortex-A76, // since these are 64-bit, out-of-order and with dotprod support. -void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params) { - profiler::ScopeLabel label( - "Kernel (kNeonDotprod, optimized for out-of-order cores)"); +void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) { + profiler::ScopeLabel label("Kernel (kNeonDotprod)"); CheckOffsetsInKernelParams8bit(params); @@ -5102,7 +5098,7 @@ void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params) { "v13", "v14", "v15", "v16", "v17"); } -// Variant of the above Kernel8bitNeonDotprodOutOfOrder, tuned for in-order +// Variant of the above Kernel8bitNeonDotprod, tuned for in-order // CPUs. Specifically here, the relevant in-order CPUs are ARM Cortex-A55r1, // since these are 64-bit and support dotprod. 
// @@ -5111,7 +5107,7 @@ void Kernel8bitNeonDotprodOutOfOrder1Col(const KernelParams8bit<8, 8>& params) { // contribution of gemmlowp kernels tuned for Cortex-A55r1, with very helpful // comments. Specifically, see this comment about tuning for Cortex-A55r1: // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4412 -void Kernel8bitNeonDotprodInOrder(const KernelParams8bit<8, 8>& params) { +void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) { profiler::ScopeLabel label( "Kernel (kNeonDotprod, optimized for in-order cores)"); @@ -6454,10 +6450,9 @@ void CheckOffsetsInKernelParamsFloat(const Params&) { // width instead of the wider 12x8 that the register space permits and that // the aforementioned gemmlowp kernel uses. Ruy likes powers of two for now // and we don't have evidence that going beyond 8x8 is needed. -void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params) { +void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) { CheckOffsetsInKernelParamsFloat(params); - profiler::ScopeLabel label( - "Kernel (kNeon, optimized for out-of-order cores)"); + profiler::ScopeLabel label("Kernel (kNeon)"); const float* lhs_col_ptr = params.lhs_base_ptr; const float* rhs_col_ptr = params.rhs_base_ptr; @@ -7086,7 +7081,7 @@ void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params) { "v26", "v27", "v28", "v29", "v30", "v31"); } -// Variant of KernelFloatNeonOutOfOrder tuned for in-order CPUs that do not +// Variant of KernelFloatNeon tuned for in-order CPUs that do not // support dotprod (while dotprod by itself is not relevant to floating-point, // this additional bit of information that we have about the target happens to // be useful here). @@ -7099,7 +7094,7 @@ void KernelFloatNeonOutOfOrder(const KernelParamsFloat<8, 8>& params) { // which was contributed by David Mansell with very helpful // comments. 
Specifically, see this comment about tuning for Cortex-A53: // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4215 -void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params) { +void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) { profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)"); CheckOffsetsInKernelParamsFloat(params); @@ -7579,7 +7574,7 @@ void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params) { "v26", "v27", "v28", "v29", "v30", "v31"); } -// Variant of KernelFloatNeonInOrder tuned for in-order CPUs that do +// Variant of KernelFloatNeonA55ish tuned for in-order CPUs that do // support dotprod (while dotprod by itself is not relevant to floating-point, // this additional bit of information that we have about the target happens to // be useful here). @@ -7591,7 +7586,7 @@ void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params) { // which was contributed by David Mansell with very helpful // comments. 
Specifically, see this comment about tuning for Cortex-A55r1: // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4412 -void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params) { +void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) { profiler::ScopeLabel label( "Kernel (kNeonDotprod, optimized for in-order cores)"); diff --git a/ruy/pack_arm.cc b/ruy/pack_arm.cc index f29b214..1673c90 100644 --- a/ruy/pack_arm.cc +++ b/ruy/pack_arm.cc @@ -31,12 +31,13 @@ namespace ruy { #if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM) -void Pack8bitColMajorForNeonOutOfOrder( - const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, - const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, - std::int32_t* sums_ptr, int input_xor) { - profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)"); +void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1, + const void* src_ptr2, const void* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, int src_zero_point, + std::int8_t* packed_ptr, std::int32_t* sums_ptr, + int input_xor) { + profiler::ScopeLabel label("Pack (kNeon)"); asm volatile( // clang-format off // v26 will be the vector to XOR input values with to perform @@ -246,11 +247,10 @@ void CheckOffsetsInPackParams8bit(const Params&) { static_assert(offsetof(Params, input_xor) == RUY_OFFSET_INPUT_XOR, ""); } -// Packing code for out-of-order ARMv7 CPUs like the Krait 400 or A9. -// No attempt made at making this code efficient on in-order cores yet. -void Pack8bitColMajorForNeonOutOfOrder4Cols(const PackParams8bit& params) { +// No attempt made at making this code efficient on A55-ish cores yet. 
+void Pack8bitColMajorForNeon4Cols(const PackParams8bit& params) { CheckOffsetsInPackParams8bit(params); - profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)"); + profiler::ScopeLabel label("Pack (kNeon)"); const void* src_ptr0 = params.src_ptr0; const void* src_ptr1 = params.src_ptr1; const void* src_ptr2 = params.src_ptr2; @@ -473,9 +473,9 @@ void Pack8bitColMajorForNeonOutOfOrder4Cols(const PackParams8bit& params) { // No attempt made at making this code efficient on in-order cores yet. // This version differs from the above in that we only handle two columns // at a time. -void Pack8bitColMajorForNeonOutOfOrder2Cols(const PackParams8bit& params) { +void Pack8bitColMajorForNeon2Cols(const PackParams8bit& params) { CheckOffsetsInPackParams8bit(params); - profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)"); + profiler::ScopeLabel label("Pack (kNeon)"); const void* src_ptr0 = params.src_ptr0; const void* src_ptr1 = params.src_ptr1; const int src_inc0 = params.src_inc0; @@ -626,12 +626,12 @@ void Pack8bitColMajorForNeonOutOfOrder2Cols(const PackParams8bit& params) { #if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM) -void Pack8bitColMajorForNeonInOrder(const void* src_ptr0, const void* src_ptr1, - const void* src_ptr2, const void* src_ptr3, - int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, - int src_zero_point, std::int8_t* packed_ptr, - std::int32_t* sums_ptr, int input_xor) { +void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1, + const void* src_ptr2, const void* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, + int src_zero_point, std::int8_t* packed_ptr, + std::int32_t* sums_ptr, int input_xor) { profiler::ScopeLabel label("Pack (kNeon, optimized for in-order cores)"); asm volatile( // clang-format off @@ -837,7 +837,7 @@ void Pack8bitColMajorForNeonInOrder(const void* src_ptr0, const void* src_ptr1, "v25", "v26", "v27", "v28", "v29", 
"v30", "v31"); } -void Pack8bitColMajorForNeonDotprodInOrder( +void Pack8bitColMajorForNeonDotprodA55ish( const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, @@ -1083,13 +1083,13 @@ void Pack8bitColMajorForNeonDotprodInOrder( "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -void Pack8bitColMajorForNeonDotprodOutOfOrder( - const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, - const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, - std::int32_t* sums_ptr, int input_xor) { - profiler::ScopeLabel label( - "Pack (kNeonDotprod, optimized for out-of-order cores)"); +void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1, + const void* src_ptr2, const void* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, + int src_zero_point, std::int8_t* packed_ptr, + std::int32_t* sums_ptr, int input_xor) { + profiler::ScopeLabel label("Pack (kNeonDotprod)"); asm volatile( // clang-format off // v26 will be the vector to XOR input values with to perform @@ -1735,11 +1735,11 @@ void Pack8bitRowMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1, "v27", "v28", "v29", "v30", "v31"); } -void PackFloatColMajorForNeonOutOfOrder( - const float* src_ptr0, const float* src_ptr1, const float* src_ptr2, - const float* src_ptr3, int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, float* packed_ptr) { - profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)"); +void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1, + const float* src_ptr2, const float* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, float* packed_ptr) { + profiler::ScopeLabel label("Pack (kNeon)"); 
asm volatile( // clang-format off // w1 will be the number of rows already loaded. @@ -1882,13 +1882,11 @@ void PackFloatColMajorForNeonOutOfOrder( #endif #if RUY_PLATFORM_NEON_32 && RUY_OPT(ASM) -void PackFloatColMajorForNeonOutOfOrder(const float* src_ptr0, - const float* src_ptr1, - const float* src_ptr2, - const float* src_ptr3, int src_inc, - int src_rows, float* packed_ptr, - int output_stride) { - profiler::ScopeLabel label("Pack (kNeon, optimized for out-of-order cores)"); +void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1, + const float* src_ptr2, const float* src_ptr3, + int src_inc, int src_rows, float* packed_ptr, + int output_stride) { + profiler::ScopeLabel label("Pack (kNeon)"); asm volatile( // clang-format off "mov r1, #0\n" @@ -2066,12 +2064,12 @@ void PackFloatColMajorForNeonOutOfOrder(const float* src_ptr0, #endif // (RUY_PLATFORM_NEON_32 #if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM) -void PackFloatColMajorForNeonInOrder(const float* src_ptr0, - const float* src_ptr1, - const float* src_ptr2, - const float* src_ptr3, int src_inc0, - int src_inc1, int src_inc2, int src_inc3, - int src_rows, float* packed_ptr) { +void PackFloatColMajorForNeonA55ish(const float* src_ptr0, + const float* src_ptr1, + const float* src_ptr2, + const float* src_ptr3, int src_inc0, + int src_inc1, int src_inc2, int src_inc3, + int src_rows, float* packed_ptr) { profiler::ScopeLabel label("Pack (kNeon, optimized for in-order cores)"); asm volatile( @@ -2290,7 +2288,7 @@ void Pack8bitRowMajorForNeon(const std::uint8_t* src_ptr, int src_stride, // it's working at, this seems like a fair compromise. If one wanted to // maximize performance at the cost of more code complexity/size, one could // have code handling 16 columns at a time (maybe limited to - // Tuning::kOutOfOrder), then 8, then 4 to minimize the amount of slow + // Tuning::kGeneric), then 8, then 4 to minimize the amount of slow // leftovers. // // Load 8 sums in sums0, sums1. 
diff --git a/ruy/pack_arm.h b/ruy/pack_arm.h index 12d5cab..ba8964d 100644 --- a/ruy/pack_arm.h +++ b/ruy/pack_arm.h @@ -62,23 +62,25 @@ void Pack8bitRowMajorForNeon(const std::uint8_t* src_ptr, int src_stride, #if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM) -void Pack8bitColMajorForNeonOutOfOrder( - const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, - const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, - std::int32_t* sums_ptr, int input_xor); -void Pack8bitColMajorForNeonInOrder(const void* src_ptr0, const void* src_ptr1, +void Pack8bitColMajorForNeon(const void* src_ptr0, const void* src_ptr1, + const void* src_ptr2, const void* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, int src_zero_point, + std::int8_t* packed_ptr, std::int32_t* sums_ptr, + int input_xor); +void Pack8bitColMajorForNeonA55ish(const void* src_ptr0, const void* src_ptr1, + const void* src_ptr2, const void* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, + int src_zero_point, std::int8_t* packed_ptr, + std::int32_t* sums_ptr, int input_xor); +void Pack8bitColMajorForNeonDotprod(const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, std::int32_t* sums_ptr, int input_xor); -void Pack8bitColMajorForNeonDotprodOutOfOrder( - const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, - const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, - std::int32_t* sums_ptr, int input_xor); -void Pack8bitColMajorForNeonDotprodInOrder( +void Pack8bitColMajorForNeonDotprodA55ish( const void* src_ptr0, const void* src_ptr1, const void* src_ptr2, const void* src_ptr3, int src_inc0, int src_inc1, int src_inc2, int 
src_inc3, int src_rows, int src_zero_point, std::int8_t* packed_ptr, @@ -130,8 +132,8 @@ inline void MakePackParams8bit(const void* src_ptr0, const void* src_ptr1, params->input_xor = input_xor; } -void Pack8bitColMajorForNeonOutOfOrder4Cols(const PackParams8bit& params); -void Pack8bitColMajorForNeonOutOfOrder2Cols(const PackParams8bit& params); +void Pack8bitColMajorForNeon4Cols(const PackParams8bit& params); +void Pack8bitColMajorForNeon2Cols(const PackParams8bit& params); #endif // (RUY_PLATFORM_NEON_32 && RUY_OPT(ASM) @@ -187,16 +189,16 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 4>, Scalar, packed_matrix->data + packed_matrix->layout.stride * block_col; std::int32_t* sums_ptr = sums ? sums + block_col : nullptr; #if RUY_PLATFORM_NEON_64 - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - Pack8bitColMajorForNeonInOrder( + if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + Pack8bitColMajorForNeonA55ish( src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1, src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point, packed_ptr, sums_ptr, kInputXor); } else { - Pack8bitColMajorForNeonOutOfOrder( - src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1, - src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point, - packed_ptr, sums_ptr, kInputXor); + Pack8bitColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3, + src_inc0, src_inc1, src_inc2, src_inc3, + src_matrix.layout.rows, src_matrix.zero_point, + packed_ptr, sums_ptr, kInputXor); } #else (void)tuning; @@ -208,7 +210,7 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 4>, Scalar, packed_ptr, src_inc0, src_inc1, src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point, kInputXor, &params); - Pack8bitColMajorForNeonOutOfOrder4Cols(params); + Pack8bitColMajorForNeon4Cols(params); #endif // RUY_PLATFORM_NEON_64 } } @@ -262,7 +264,7 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 16, 2>, Scalar, 
packed_ptr, src_inc0, src_inc1, -1, -1, src_matrix.layout.rows, src_matrix.zero_point, kInputXor, &params); - Pack8bitColMajorForNeonOutOfOrder2Cols(params); + Pack8bitColMajorForNeon2Cols(params); } } }; @@ -320,13 +322,13 @@ struct PackImpl<Path::kNeonDotprod, FixedKernelLayout<Order::kColMajor, 4, 8>, packed_matrix->layout.stride * (block_col & ~7) + ((block_col & 4) * 4); std::int32_t* sums_ptr = sums ? sums + block_col : nullptr; - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - Pack8bitColMajorForNeonDotprodInOrder( + if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + Pack8bitColMajorForNeonDotprodA55ish( src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1, src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point, packed_ptr, sums_ptr, kInputXor); } else { - Pack8bitColMajorForNeonDotprodOutOfOrder( + Pack8bitColMajorForNeonDotprod( src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1, src_inc2, src_inc3, src_matrix.layout.rows, src_matrix.zero_point, packed_ptr, sums_ptr, kInputXor); @@ -337,24 +339,22 @@ struct PackImpl<Path::kNeonDotprod, FixedKernelLayout<Order::kColMajor, 4, 8>, #endif // (RUY_PLATFORM_NEON_64&& RUY_OPT(ASM) #if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM) -void PackFloatColMajorForNeonOutOfOrder( - const float* src_ptr0, const float* src_ptr1, const float* src_ptr2, - const float* src_ptr3, int src_inc0, int src_inc1, int src_inc2, - int src_inc3, int src_rows, float* packed_ptr); -void PackFloatColMajorForNeonInOrder(const float* src_ptr0, - const float* src_ptr1, - const float* src_ptr2, - const float* src_ptr3, int src_inc0, - int src_inc1, int src_inc2, int src_inc3, - int src_rows, float* packed_ptr); +void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1, + const float* src_ptr2, const float* src_ptr3, + int src_inc0, int src_inc1, int src_inc2, + int src_inc3, int src_rows, float* packed_ptr); +void PackFloatColMajorForNeonA55ish(const float* src_ptr0, + const float* src_ptr1, + 
const float* src_ptr2, + const float* src_ptr3, int src_inc0, + int src_inc1, int src_inc2, int src_inc3, + int src_rows, float* packed_ptr); #elif RUY_PLATFORM_NEON_32 && RUY_OPT(ASM) -void PackFloatColMajorForNeonOutOfOrder(const float* src_ptr0, - const float* src_ptr1, - const float* src_ptr2, - const float* src_ptr3, int src_inc, - int src_rows, float* packed_ptr, - int stride); +void PackFloatColMajorForNeon(const float* src_ptr0, const float* src_ptr1, + const float* src_ptr2, const float* src_ptr3, + int src_inc, int src_rows, float* packed_ptr, + int stride); #endif // (RUY_PLATFORM_NEON_64&& RUY_OPT(ASM) #if (RUY_PLATFORM_NEON_32 || RUY_PLATFORM_NEON_64) && RUY_OPT(ASM) @@ -400,14 +400,14 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float, packed_matrix->layout.stride * (block_col & ~7) + ((block_col & 4)); #if RUY_PLATFORM_NEON_64 - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { - PackFloatColMajorForNeonInOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, - src_inc0, src_inc1, src_inc2, src_inc3, - src_matrix.layout.rows, packed_ptr); + if (__builtin_expect(tuning == Tuning::kA55ish, true)) { + PackFloatColMajorForNeonA55ish(src_ptr0, src_ptr1, src_ptr2, src_ptr3, + src_inc0, src_inc1, src_inc2, src_inc3, + src_matrix.layout.rows, packed_ptr); } else { - PackFloatColMajorForNeonOutOfOrder( - src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0, src_inc1, - src_inc2, src_inc3, src_matrix.layout.rows, packed_ptr); + PackFloatColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3, + src_inc0, src_inc1, src_inc2, src_inc3, + src_matrix.layout.rows, packed_ptr); } #else (void)tuning; @@ -424,9 +424,9 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float, src_inc += src_inc2 == 16 ? 4 : 0; src_inc += src_inc3 == 16 ? 
8 : 0; const int kOutputStride = 32; - PackFloatColMajorForNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, - src_inc, src_matrix.layout.rows, - packed_ptr, kOutputStride); + PackFloatColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc, + src_matrix.layout.rows, packed_ptr, + kOutputStride); #endif // RUY_PLATFORM_NEON_64 } } @@ -482,9 +482,9 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 4>, float, src_inc += src_inc2 == 16 ? 4 : 0; src_inc += src_inc3 == 16 ? 8 : 0; const int kOutputStride = 16; - PackFloatColMajorForNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, - src_inc, src_matrix.layout.rows, - packed_ptr, kOutputStride); + PackFloatColMajorForNeon(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc, + src_matrix.layout.rows, packed_ptr, + kOutputStride); } } }; @@ -121,8 +121,8 @@ inline const char* TuningName(Tuning tuning) { case Tuning::NAME: \ return #NAME; switch (tuning) { - RUY_SUBPATHNAME_CASE(kInOrder) - RUY_SUBPATHNAME_CASE(kOutOfOrder) + RUY_SUBPATHNAME_CASE(kA55ish) + RUY_SUBPATHNAME_CASE(kGeneric) default: RUY_CHECK(false); return nullptr; @@ -1820,7 +1820,7 @@ inline std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) { } #if RUY_PLATFORM_ARM if (path == Path::kNeon || path == Path::kNeonDotprod) { - return {Tuning::kInOrder, Tuning::kOutOfOrder, Tuning::kAuto}; + return {Tuning::kA55ish, Tuning::kGeneric, Tuning::kAuto}; } #endif (void)path; diff --git a/ruy/tune.cc b/ruy/tune.cc index 4d542e9..1f615bf 100644 --- a/ruy/tune.cc +++ b/ruy/tune.cc @@ -23,8 +23,7 @@ limitations under the License. namespace ruy { Tuning TuningResolver::ResolveNow(CpuInfo* cpuinfo) { - return cpuinfo->CurrentCpuIsInOrder() ? Tuning::kInOrder - : Tuning::kOutOfOrder; + return cpuinfo->CurrentCpuIsA55ish() ? Tuning::kA55ish : Tuning::kGeneric; } TuningResolver::TuningResolver() @@ -24,7 +24,7 @@ limitations under the License. 
// layouts compared to Path::kNeon; but within each, different tunings // will share that same layout. // -// # Tuning is for now only based on 1 bit: OutOfOrder / InOrder +// # Tuning is for now only based on 1 bit: Generic / A55ish // // In practice, each of our asm code paths only needs one bit information to // decide on tuning: whether the CPU is out-of-order or in-order. @@ -37,7 +37,7 @@ limitations under the License. // // Because having tuned code paths is a compromise of efficiency gains // versus implementation effort and code size, we are happy to stop at just this -// single bit of information, OutOfOrder/InOrder, at least in the current CPU +// single bit of information, Generic / A55ish, at least in the current CPU // landscape. This could change in the future. #ifndef RUY_RUY_TUNE_H_ #define RUY_RUY_TUNE_H_ @@ -54,10 +54,22 @@ enum class Tuning { // user-visible parts (see Context). It's meant to be resolved to an // actual tuning at some point by means of TuningResolver. kAuto, - // Target an out-order CPU. Example: ARM Cortex-A75. - kOutOfOrder, - // Target an in-order CPU. Example: ARM Cortex-A55. - kInOrder + // Use code not tuned for any particular CPU, typically performing well + // on out-of-order cores that don't require as much tuning. + kGeneric, + // Use code tuned for "Cortex-A55-ish" CPUs, by which we mean mostly: + // A53, A55r0 (pre-dotprod), A55r1 (with dotprod). These CPUs have in common + // that they are in-order CPU cores with largely similar requirements of code + // tuning. The most important such requirement is to use only 64-bit loads + // to maximize dual-issuing. + // + // A55r1 differs from A55r0 and A53 in that it dual-issues 64-bit NEON loads + // whereas A55r0 and A53 require using non-NEON ARM 64-bit loads together with + // INS instructions to insert 64bit lanes into NEON registers. 
However, since + // A55r1 supports dotprod unlike A55r0 and A53, they are not using the same + // kernels in practice anyway, so there was no need to distinguish them with + // separate Tuning values. + kA55ish }; // Why a TuningResolver class? @@ -65,7 +77,7 @@ enum class Tuning { // Ideally, this Library would offer a single function, // Tuning GetCurrentCPUTuning(); // -// However, determining information about the current CPU is not necessarily, +// However, determining information about the current CPU is not necessarily // cheap, so we currently cache that and only invalidate/reevaluate after // a fixed amount of time. This need to store state is why this library // has to expose a class, TuningResolver, not just a function. diff --git a/ruy/tune_test.cc b/ruy/tune_test.cc index dabe21a..c5f2342 100644 --- a/ruy/tune_test.cc +++ b/ruy/tune_test.cc @@ -36,7 +36,7 @@ TEST(TuneTest, TuneTest) { tuning_resolver.SetTuning(Tuning::kAuto); #ifdef RUY_IMPLEMENT_TUNING - for (auto tuning : {Tuning::kOutOfOrder, Tuning::kInOrder}) { + for (auto tuning : {Tuning::kGeneric, Tuning::kA55ish}) { tuning_resolver.SetTuning(tuning); ASSERT_TRUE(tuning_resolver.Resolve(&cpuinfo) == tuning); // See above comment about 1 second. |