Diffstat (limited to 'ruy/test.h')
-rw-r--r-- | ruy/test.h | 2125
1 file changed, 2125 insertions, 0 deletions
diff --git a/ruy/test.h b/ruy/test.h
new file mode 100644
index 0000000..649a0d9
--- /dev/null
+++ b/ruy/test.h
@@ -0,0 +1,2125 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_TEST_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_TEST_H_
+
+#include <math.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>  // for memcpy, used by RepeatedBuffer::Init below
+#include <ctime>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <random>
+#include <set>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include "testing/base/public/gunit.h"  // IWYU pragma: export
+#include "ruy/matrix.h"  // IWYU pragma: export
+#include "ruy/platform.h"
+#include "ruy/pmu.h"
+#include "ruy/ruy.h"
+#include "ruy/ruy_advanced.h"
+#include "ruy/spec.h"  // IWYU pragma: export
+#include "ruy/time.h"
+
+#ifdef RUY_TEST_EXTERNAL_PATHS
+#define EIGEN_USE_THREADS
+#define EIGEN_USE_CUSTOM_THREAD_POOL
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/gemmlowp/public/gemmlowp.h"
+#include "third_party/lapack/blas.h"
+#endif
+
+#ifdef RUY_PROFILER
+#include "ruy/profiler/profiler.h"
+#endif
+
+namespace ruy {
+
+const float kClampRatio = 0.1f;
+
+enum class ExternalPath { kNone, kGemmlowp, kEigen, kEigenTensor, kOpenBlas };
+
+inline std::vector<std::string>* CoveredPaths() {
+  static std::vector<std::string> covered_paths;
+  return &covered_paths;
+}
+
+inline const char* PathName(Path path) {
+#define RUY_PATHNAME_CASE(NAME) \
+  case Path::NAME:              \
+    return #NAME;
+  switch (path) {
+    RUY_PATHNAME_CASE(kReference)
+    RUY_PATHNAME_CASE(kStandardCpp)
+#if RUY_PLATFORM(NEON)
+    RUY_PATHNAME_CASE(kNeon)
+    RUY_PATHNAME_CASE(kNeonDotprod)
+#elif RUY_PLATFORM(X86)
+    RUY_PATHNAME_CASE(kSse42)
+    RUY_PATHNAME_CASE(kAvx2)
+    RUY_PATHNAME_CASE(kAvx512)
+    RUY_PATHNAME_CASE(kAvxVnni)
+#endif
+    default:
+      RUY_CHECK(false);
+      return nullptr;
+  }
+#undef RUY_PATHNAME_CASE
+}
+
+inline const char* TuningName(Tuning tuning) {
+#define RUY_SUBPATHNAME_CASE(NAME) \
+  case Tuning::NAME:               \
+    return #NAME;
+  switch (tuning) {
+    RUY_SUBPATHNAME_CASE(kInOrder)
+    RUY_SUBPATHNAME_CASE(kOutOfOrder)
+    default:
+      RUY_CHECK(false);
+      return nullptr;
+  }
+#undef RUY_SUBPATHNAME_CASE
+}
+
+inline const char* PathName(ExternalPath path) {
+#define RUY_PATHNAME_CASE(NAME) \
+  case ExternalPath::NAME:      \
+    return #NAME;
+  switch (path) {
+    RUY_PATHNAME_CASE(kGemmlowp)
+    RUY_PATHNAME_CASE(kEigen)
+    RUY_PATHNAME_CASE(kEigenTensor)
+    RUY_PATHNAME_CASE(kOpenBlas)
+    default:
+      RUY_CHECK(false);
+      return nullptr;
+  }
+#undef RUY_PATHNAME_CASE
+}
+
+inline std::ostream& operator<<(std::ostream& stream, Path path) {
+  return stream << PathName(path);
+}
+
+inline std::ostream& operator<<(std::ostream& stream,
+                                ExternalPath external_path) {
+  return stream << PathName(external_path);
+}
+
+template <typename ContainerType>
+std::string Join(const ContainerType& container) {
+  if (container.empty()) {
+    return "<empty>";
+  }
+  std::ostringstream stream;
+  auto it = container.begin();
+  stream << *it++;
+  for (; it != container.end(); ++it) {
+    stream << ", ";
+    stream << *it;
+  }
+  return stream.str();
+}
+
+struct LogCoveredPathsOnDestruction final {
+  ~LogCoveredPathsOnDestruction() {
+    std::cerr << "Covered paths: " << Join(*CoveredPaths()) << std::endl;
+
+    // When testing on the ARM64 ChromiumOS emulator, make sure that we
+    // covered the dotprod path. We're getting such coverage at the moment
+    // thanks to using a sufficiently recent emulator, and we don't want to
+    // regress that.
+#if RUY_PLATFORM(ARM_64) && defined RUY_TESTING_ON_CHROMIUMOS
+    bool found_dotprod = false;
+    for (const std::string& covered_path : *CoveredPaths()) {
+      if (covered_path == "kNeonDotprod") {
+        found_dotprod = true;
+      }
+    }
+    if (!found_dotprod) {
+      std::cerr
+          << "Error: the kNeonDotprod path was not tested, but it should "
+             "have been. This coverage is currently required on ChromiumOS: "
+             "the emulator that we run tests in supports dot-product "
+             "instructions, and we care very much about not regressing that. "
+             "If this test ran in an emulator, please upgrade to a newer "
+             "emulator version. If it ran on an actual device and you need "
+             "to run ruy tests on devices without dot-product instructions, "
+             "get in touch with us.\n"
+          << std::endl;
+      abort();
+    }
+#endif
+  }
+  static void Singleton() { static LogCoveredPathsOnDestruction singleton; }
+};
+
+enum class RandomRange {
+  kGeneral,
+  kAvoidMinValue,
+  kOffCenterAvoidMinValue,
+  kReasonableSrcZeroPoint,
+  kReasonableDstZeroPoint,
+  kBias
+};
+
+template <typename Scalar,
+          bool IsFloatingPoint = std::is_floating_point<Scalar>::value>
+struct RandomRangeBounds {};
+
+template <typename Scalar>
+struct RandomRangeBounds<Scalar, true> {
+  static Scalar GetMinBound(RandomRange range) {
+    switch (range) {
+      case RandomRange::kGeneral:
+        return -1;
+      case RandomRange::kAvoidMinValue:
+        return -1;
+      case RandomRange::kOffCenterAvoidMinValue:
+        return -1;
+      case RandomRange::kReasonableSrcZeroPoint:
+        return 0;
+      case RandomRange::kReasonableDstZeroPoint:
+        return 0;
+      case RandomRange::kBias:
+        return -1;
+      default:
+        RUY_CHECK(false);
+        return 0;
+    }
+  }
+  static Scalar GetMaxBound(RandomRange range) {
+    switch (range) {
+      case RandomRange::kGeneral:
+        return 1;
+      case RandomRange::kAvoidMinValue:
+        return 1;
+      case RandomRange::kOffCenterAvoidMinValue:
+        return 1;
+      case RandomRange::kReasonableSrcZeroPoint:
+        return 0;
+      case RandomRange::kReasonableDstZeroPoint:
+        return 0;
+      case RandomRange::kBias:
+        return 1;
+      default:
+        RUY_CHECK(false);
+        return 0;
+    }
+  }
+};
+
+template <typename Scalar>
+Scalar WeightedSum(Scalar s1, float weight1, Scalar s2, float weight2) {
+  float sum = s1 * weight1 + s2 * weight2;
+  float clamped = std::min<float>(
+      std::numeric_limits<Scalar>::max(),
+      std::max<float>(std::numeric_limits<Scalar>::lowest(), sum));
+  return static_cast<Scalar>(clamped);
+}
+
+template <typename Scalar>
+Scalar Parametrized(float param) {
+  return WeightedSum(std::numeric_limits<Scalar>::max(), param,
+                     std::numeric_limits<Scalar>::lowest(), 1 - param);
+}
+
+template <typename Scalar>
+struct RandomRangeBounds<Scalar, false> {
+  static Scalar GetMinBound(RandomRange range) {
+    static constexpr double offcenteredness =
0.02; // Shift lower limit by about 5 for range of 255. + switch (range) { + case RandomRange::kGeneral: + return std::numeric_limits<Scalar>::lowest(); + case RandomRange::kAvoidMinValue: + return 1 + std::numeric_limits<Scalar>::lowest(); + case RandomRange::kOffCenterAvoidMinValue: + return 1 + std::numeric_limits<Scalar>::lowest() + + static_cast<Scalar>( + offcenteredness * std::numeric_limits<Scalar>::max() - + offcenteredness * + (std::numeric_limits<Scalar>::lowest() + 1)); + case RandomRange::kReasonableSrcZeroPoint: + return std::numeric_limits<Scalar>::lowest(); + case RandomRange::kReasonableDstZeroPoint: + return Parametrized<Scalar>(0.4); + case RandomRange::kBias: + return std::is_same<Scalar, std::int32_t>::value + ? static_cast<Scalar>(-10000) + : 0; + default: + RUY_CHECK(false); + return 0; + } + } + static Scalar GetMaxBound(RandomRange range) { + switch (range) { + case RandomRange::kGeneral: + return std::numeric_limits<Scalar>::max(); + case RandomRange::kAvoidMinValue: + return std::numeric_limits<Scalar>::max(); + case RandomRange::kOffCenterAvoidMinValue: + return std::numeric_limits<Scalar>::max(); + case RandomRange::kReasonableSrcZeroPoint: + return std::numeric_limits<Scalar>::max(); + case RandomRange::kReasonableDstZeroPoint: + return Parametrized<Scalar>(0.6); + case RandomRange::kBias: + return std::is_same<Scalar, std::int32_t>::value + ? static_cast<Scalar>(10000) + : 0; + default: + RUY_CHECK(false); + return 0; + } + } +}; + +inline std::default_random_engine& global_random_engine() { + static std::default_random_engine engine; + return engine; +} + +template <typename Scalar> +struct UniformRandomDistribution { + UniformRandomDistribution(RandomRange range) + : dist(RandomRangeBounds<Scalar>::GetMinBound(range), + RandomRangeBounds<Scalar>::GetMaxBound(range)) {} + Scalar Get() { return dist(global_random_engine()); } + // std::uniform_int_distribution is specified not to support char types, + // only short and wider types. MSVC actually generates an error on + // std::uniform_int_distribution<std::int8_t>. 
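+  // For example (a minimal usage sketch): with Scalar = std::int8_t, the
+  // StdDistType below is std::uniform_int_distribution<std::int32_t>
+  // constructed with int8 bounds, and Get() narrows each draw back to Scalar:
+  //   UniformRandomDistribution<std::int8_t> dist(RandomRange::kGeneral);
+  //   std::int8_t x = dist.Get();  // always within [-128, 127]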
+ using StdDistType = typename std::conditional< + std::is_floating_point<Scalar>::value, + std::uniform_real_distribution<Scalar>, + std::uniform_int_distribution<std::int32_t>>::type; + StdDistType dist; +}; + +template <typename Scalar> +void MakeRandomScalar(UniformRandomDistribution<Scalar>* uniform_dist, + Scalar* dst) { + *dst = uniform_dist->Get(); +} + +template <typename Scalar> +void MakeRandomVector(UniformRandomDistribution<Scalar>* uniform_dist, int size, + std::vector<Scalar>* dst) { + dst->resize(size); + for (auto& x : *dst) { + MakeRandomScalar(uniform_dist, &x); + } +} + +template <typename Scalar> +void MakeRandomScalar(RandomRange range, Scalar* dst) { + UniformRandomDistribution<Scalar> dist(range); + *dst = dist.Get(); + if (range == RandomRange::kReasonableDstZeroPoint || + range == RandomRange::kReasonableSrcZeroPoint) { + if (global_random_engine()() & 1) { + *dst = SymmetricZeroPoint<Scalar>(); + } + } +} + +template <typename Scalar> +void MakeRandomVector(RandomRange range, int size, std::vector<Scalar>* dst) { + UniformRandomDistribution<Scalar> dist(range); + dst->resize(size); + for (auto& x : *dst) { + MakeRandomScalar(&dist, &x); + } +} + +enum class LayoutStyle { kPackedLinear, kLinear }; + +inline void MakeLayout(int rows, int cols, Order order, + LayoutStyle layout_style, Layout* layout) { + layout->rows = rows; + layout->cols = cols; + layout->order = order; + + const int packed_stride = order == Order::kColMajor ? rows : cols; + + RUY_CHECK(layout_style == LayoutStyle::kPackedLinear || + layout_style == LayoutStyle::kLinear); + if (layout_style == LayoutStyle::kPackedLinear) { + layout->stride = packed_stride; + } else { + layout->stride = packed_stride + 1; + } +} + +template <typename Scalar> +struct StorageMatrix { + StorageMatrix() = default; + StorageMatrix(const StorageMatrix&) = delete; + void operator=(const StorageMatrix&) = delete; + std::vector<Scalar> data; + Matrix<Scalar> matrix; +}; + +template <typename Scalar> +void VerifyConsistentFields(const StorageMatrix<Scalar>& storage_matrix) { + if (storage_matrix.data.empty()) { + RUY_CHECK_EQ(storage_matrix.matrix.data.get(), nullptr); + RUY_CHECK_EQ(storage_matrix.matrix.layout.rows, 0); + RUY_CHECK_EQ(storage_matrix.matrix.layout.cols, 0); + } else { + RUY_CHECK_EQ(storage_matrix.matrix.data.get(), storage_matrix.data.data()); + RUY_CHECK_EQ(FlatSize(storage_matrix.matrix.layout), + storage_matrix.data.size()); + } +} + +template <typename Scalar> +void MakeRandom(int rows, int cols, Order order, Scalar zero_point, + LayoutStyle layout_style, RandomRange range, + StorageMatrix<Scalar>* storage_matrix) { + MakeLayout(rows, cols, order, layout_style, &storage_matrix->matrix.layout); + storage_matrix->matrix.zero_point = zero_point; + UniformRandomDistribution<Scalar> data_dist(range); + MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout), + &storage_matrix->data); + storage_matrix->matrix.data = storage_matrix->data.data(); + VerifyConsistentFields(*storage_matrix); +} + +template <typename Scalar> +struct TestResult { + void operator=(const TestResult&) = delete; + void operator=(const TestResult&&) = delete; + StorageMatrix<Scalar> storage_matrix; + Path path = Path::kNone; + Tuning tuning = Tuning::kAuto; + ExternalPath external_path = ExternalPath::kNone; + float latency; + float l1_refill_rate; + float l2_refill_rate; + float l3_refill_rate; + float l1tlb_refill_rate; + float l2tlb_refill_rate; + float mispred_rate; + float frontend_stall_rate; + float 
backend_stall_rate; + + // Per-path data for pre-packing. + // This is not used by external paths or by Path::kReference. + Allocator allocator; + PrepackedMatrix prepacked_lhs; + PrepackedMatrix prepacked_rhs; + bool use_prepacked_lhs = false; + bool use_prepacked_rhs = false; +}; + +template <typename Scalar> +std::string PathName(const TestResult<Scalar>& result) { + std::string pathname; + if (result.path != Path::kNone) { + pathname.assign(PathName(result.path)); + } else if (result.external_path != ExternalPath::kNone) { + pathname.assign(PathName(result.external_path)); + } else { + RUY_CHECK(false); + } + if (result.tuning != Tuning::kAuto) { + pathname.append("/"); + pathname.append(TuningName(result.tuning)); + } + return pathname; +} + +enum class ExpectedOutcome { kSuccess, kDeath }; + +template <typename tLhsScalar, typename tRhsScalar, typename SpecType> +struct TestSet final { + using LhsScalar = tLhsScalar; + using RhsScalar = tRhsScalar; + using AccumScalar = typename SpecType::AccumScalar; + using DstScalar = typename SpecType::DstScalar; + using Spec = SpecType; + using TestResultType = TestResult<DstScalar>; + + void Run() { + MakeZeroPoints(); + MakeLhsRhs(); + MakeSpec(); + MakeOtherParams(); + MakeResultPaths(); + MakePrepackedMatrices(); + Eval(); + Verify(); + } + + private: + void MakeZeroPoints(); + void MakeLhsRhs(); + void MakeSpec(); + void MakeResultPaths(); + void MakePrepackedMatrices(); + void MakeOtherParams(); + void EvalAndVerify(); + void Eval(); + void Verify(); + + void EvalResult(TestResultType* result); + void EvalRuy(TestResultType* result); + void DoMul(TestResultType* result); + void Benchmark(TestResultType* result); + void VerifyTestResults() const; + + public: + enum class LifeStage { + kInitial, + kHasZeroPoints, + kHasLhsRhs, + kHasSpec, + kHasOtherParams, + kHasResultPaths, + kHasPrepackedMatrices, + kEvaluated, + kFinal + }; + + ~TestSet() { + RUY_CHECK_EQ(life_stage, LifeStage::kFinal); + LogCoveredPathsOnDestruction::Singleton(); + } + + LifeStage life_stage = LifeStage::kInitial; + + int rows = 0; + int cols = 0; + int depth = 0; + Order lhs_order = Order::kRowMajor; + Order rhs_order = Order::kColMajor; + Order dst_order = Order::kColMajor; + LayoutStyle layout_style = LayoutStyle::kPackedLinear; + ExpectedOutcome expected_outcome = ExpectedOutcome::kSuccess; + + bool use_specified_zero_points = false; + LhsScalar lhs_zero_point = 0; + RhsScalar rhs_zero_point = 0; + DstScalar dst_zero_point = 0; + + std::vector<AccumScalar> per_channel_multiplier_fixedpoint; + std::vector<int> per_channel_multiplier_exponent; + + StorageMatrix<LhsScalar> lhs; + StorageMatrix<RhsScalar> rhs; + Spec spec; + std::vector<AccumScalar> bias_data; + std::vector<std::unique_ptr<TestResultType>> results; + + std::vector<Path> paths; + std::vector<ExternalPath> external_paths; + + bool benchmark = false; + bool perchannel = false; + int max_num_threads = 0; + bool benchmark_prepack_lhs = false; + bool benchmark_prepack_rhs = false; +}; + +inline PmuEvents& GlobalPmuEvents() { + static PmuEvents pmu; + return pmu; +} + +inline Context& GlobalContext() { + // Ensure that GlobalPmuEvents is constructed before we create any context. + // This ensures that pmu counters are opened before we create any worker + // thread, which is necessary to count events from worker threads. 
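+  // (This works because function-local statics are constructed on first use
+  // and destroyed in reverse order of construction: the GlobalPmuEvents()
+  // call below runs before `context` is constructed, so the PmuEvents
+  // singleton outlives the Context and its worker threads.)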
+ GlobalPmuEvents(); + + static Context context; + return context; +} + +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define RUY_TSAN +#endif +#if __has_feature(address_sanitizer) +#define RUY_ASAN +#endif +#endif // defined(__has_feature) + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::DoMul(TestResultType* result) { + Context* context = &GlobalContext(); + + if (!result->use_prepacked_lhs && !result->use_prepacked_rhs) { + Mul<kAllPaths>(lhs.matrix, rhs.matrix, spec, context, + &result->storage_matrix.matrix); + return; + } + + // If we prepacked an input matrix, null out its data pointer to check + // that we don't access any data through it. + Matrix<LhsScalar> null_data_lhs = lhs.matrix; + Matrix<RhsScalar> null_data_rhs = rhs.matrix; + if (result->use_prepacked_lhs) { + null_data_lhs.data = nullptr; + } + if (result->use_prepacked_rhs) { + null_data_rhs.data = nullptr; + } + + // Do the multiplication with pre-packed matrices. + PrepackedMatrix* prepacked_lhs_ptr = + result->use_prepacked_lhs ? &result->prepacked_lhs : nullptr; + PrepackedMatrix* prepacked_rhs_ptr = + result->use_prepacked_rhs ? &result->prepacked_rhs : nullptr; + MulWithPrepacked<kAllPaths>(null_data_lhs, null_data_rhs, spec, context, + &result->storage_matrix.matrix, prepacked_lhs_ptr, + prepacked_rhs_ptr); +} + +// When building for WAsm, ASSERT_DEATH is not defined. +#ifdef ASSERT_DEATH +#define RUY_ASSERT_DEATH(CONDITION, MESSAGE) ASSERT_DEATH(CONDITION, MESSAGE) +#else +#define RUY_ASSERT_DEATH(CONDITION, MESSAGE) +#endif + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::EvalRuy(TestResultType* result) { + GlobalContext().explicit_tuning = result->tuning; + if (max_num_threads) { + GlobalContext().max_num_threads = max_num_threads; + } else if (benchmark) { + GlobalContext().max_num_threads = 1; + } else { + GlobalContext().max_num_threads = 1 + global_random_engine()() % 8; + } + GlobalContext().SetRuntimeEnabledPaths(result->path); + if (expected_outcome == ExpectedOutcome::kSuccess) { + DoMul(result); + RUY_CHECK_EQ(GlobalContext().last_taken_path, result->path); + } else if (expected_outcome == ExpectedOutcome::kDeath) { + // TODO(benoitjacob) TSan and ASan seem to be breaking ASSERT_DEATH. + // Report a bug? +#if (!defined NDEBUG) && (!defined RUY_ASAN) && (!defined RUY_TSAN) + RUY_ASSERT_DEATH(DoMul(result), ""); +#endif + } else { + RUY_CHECK(false); + } + GlobalContext().explicit_tuning = Tuning::kAuto; + GlobalContext().max_num_threads = 1; +} + +#ifdef RUY_TEST_EXTERNAL_PATHS + +template <typename Scalar, gemmlowp::MapOrder tOrder> +void WrapGemmlowp(const Matrix<Scalar>& src, + gemmlowp::MatrixMap<const Scalar, tOrder>* dst) { + RUY_CHECK(src.layout.order == (tOrder == gemmlowp::MapOrder::ColMajor + ? Order::kColMajor + : Order::kRowMajor)); + *dst = gemmlowp::MatrixMap<const Scalar, tOrder>( + src.data.get(), src.layout.rows, src.layout.cols, src.layout.stride); +} + +template <typename Scalar, gemmlowp::MapOrder tOrder> +void WrapGemmlowpMutable(Matrix<Scalar>* src, + gemmlowp::MatrixMap<Scalar, tOrder>* dst) { + RUY_CHECK(src->layout.order == (tOrder == gemmlowp::MapOrder::ColMajor + ? 
Order::kColMajor + : Order::kRowMajor)); + *dst = gemmlowp::MatrixMap<Scalar, tOrder>( + src->data.get(), src->layout.rows, src->layout.cols, src->layout.stride); +} + +template <Order tOrder> +struct GemmlowpOrder {}; + +template <> +struct GemmlowpOrder<Order::kColMajor> { + static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::ColMajor; +}; + +template <> +struct GemmlowpOrder<Order::kRowMajor> { + static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::RowMajor; +}; + +inline gemmlowp::GemmContext& GlobalGemmlowpContext() { + static gemmlowp::GemmContext context; + return context; +} + +template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar, + typename RhsScalar, typename DstScalar, typename Spec> +void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, + const Spec& spec, int max_num_threads, + Matrix<DstScalar>* dst) { + static constexpr gemmlowp::MapOrder kGemmlowpLhsOrder = + GemmlowpOrder<LhsOrder>::kValue; + static constexpr gemmlowp::MapOrder kGemmlowpRhsOrder = + GemmlowpOrder<RhsOrder>::kValue; + static constexpr gemmlowp::MapOrder kGemmlowpDstOrder = + GemmlowpOrder<DstOrder>::kValue; + gemmlowp::MatrixMap<const LhsScalar, kGemmlowpLhsOrder> gemmlowp_lhs; + gemmlowp::MatrixMap<const RhsScalar, kGemmlowpRhsOrder> gemmlowp_rhs; + gemmlowp::MatrixMap<DstScalar, kGemmlowpDstOrder> gemmlowp_dst; + WrapGemmlowp(lhs, &gemmlowp_lhs); + WrapGemmlowp(rhs, &gemmlowp_rhs); + WrapGemmlowpMutable(dst, &gemmlowp_dst); + + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; + quantize_down_stage.result_offset_after_shift = dst->zero_point; + quantize_down_stage.result_fixedpoint_multiplier = spec.multiplier_fixedpoint; + quantize_down_stage.result_exponent = spec.multiplier_exponent; + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC< + gemmlowp::VectorShape::Col> + quantize_down_stage_pc; + quantize_down_stage_pc.result_offset_after_shift = dst->zero_point; + using ColVectorMap = + gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>; + quantize_down_stage_pc.result_fixedpoint_multiplier = + ColVectorMap(spec.multiplier_fixedpoint_perchannel, lhs.layout.rows); + quantize_down_stage_pc.result_exponent = + ColVectorMap(spec.multiplier_exponent_perchannel, lhs.layout.rows); + + gemmlowp::OutputStageClamp clamp_stage; + clamp_stage.min = spec.clamp_min; + clamp_stage.max = spec.clamp_max; + using OutputStageSaturatingCast = typename std::conditional< + std::is_same<DstScalar, std::uint8_t>::value, + gemmlowp::OutputStageSaturatingCastToUint8, + gemmlowp::OutputStageSaturatingCastToInt16>::type; + OutputStageSaturatingCast saturating_cast_stage; + + GlobalGemmlowpContext().set_max_num_threads(max_num_threads ? 
max_num_threads + : 1); + if (spec.bias) { + using ColVectorMap = + gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>; + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_add_stage; + bias_add_stage.bias_vector = ColVectorMap(spec.bias, dst->layout.rows); +#ifndef GEMMLOWP_SSE4 // gemmlowp perchannel stuff does not build on SSE + if (spec.multiplier_exponent_perchannel) { + const auto& output_pipeline = + std::make_tuple(bias_add_stage, quantize_down_stage_pc, clamp_stage, + saturating_cast_stage); + gemmlowp::GemmWithOutputPipeline< + LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, + -lhs.zero_point, -rhs.zero_point, output_pipeline); + } else // NOLINT[readability/braces] +#endif + { + const auto& output_pipeline = + std::make_tuple(bias_add_stage, quantize_down_stage, clamp_stage, + saturating_cast_stage); + gemmlowp::GemmWithOutputPipeline< + LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, + -lhs.zero_point, -rhs.zero_point, output_pipeline); + } + } else { +#ifndef GEMMLOWP_SSE4 // gemmlowp perchannel stuff does not build on SSE + if (spec.multiplier_exponent_perchannel) { + const auto& output_pipeline = std::make_tuple( + quantize_down_stage_pc, clamp_stage, saturating_cast_stage); + gemmlowp::GemmWithOutputPipeline< + LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, + -lhs.zero_point, -rhs.zero_point, output_pipeline); + } else // NOLINT[readability/braces] +#endif + { + const auto& output_pipeline = std::make_tuple( + quantize_down_stage, clamp_stage, saturating_cast_stage); + gemmlowp::GemmWithOutputPipeline< + LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst, + -lhs.zero_point, -rhs.zero_point, output_pipeline); + } + } +} + +inline constexpr int Mash(Order LhsOrder, Order RhsOrder, Order DstOrder) { + return (LhsOrder == Order::kRowMajor ? 4 : 0) + + (RhsOrder == Order::kRowMajor ? 2 : 0) + + (DstOrder == Order::kRowMajor ? 
1 : 0); +} + +template <typename LhsScalar, typename RhsScalar, typename DstScalar, + typename Spec> +void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, + const Spec& spec, int max_num_threads, + Matrix<DstScalar>* dst) { + int index = Mash(lhs.layout.order, rhs.layout.order, dst->layout.order); + switch (index) { +#define EVALGEMMLOWP_CASE3(LHS, RHS, DST) \ + case Mash(LHS, RHS, DST): \ + return EvalGemmlowp<LHS, RHS, DST>(lhs, rhs, spec, max_num_threads, dst); +#define EVALGEMMLOWP_CASE2(LHS, RHS) \ + EVALGEMMLOWP_CASE3(LHS, RHS, Order::kColMajor) \ + EVALGEMMLOWP_CASE3(LHS, RHS, Order::kRowMajor) +#define EVALGEMMLOWP_CASE1(LHS) \ + EVALGEMMLOWP_CASE2(LHS, Order::kColMajor) \ + EVALGEMMLOWP_CASE2(LHS, Order::kRowMajor) + + EVALGEMMLOWP_CASE1(Order::kColMajor) + EVALGEMMLOWP_CASE1(Order::kRowMajor) + +#undef EVALGEMMLOWP_CASE1 +#undef EVALGEMMLOWP_CASE2 +#undef EVALGEMMLOWP_CASE3 + + default: + RUY_CHECK(false); + } +} + +template <Order tOrder> +struct EigenOrder {}; + +template <> +struct EigenOrder<Order::kColMajor> { + static constexpr int kValue = Eigen::ColMajor; +}; + +template <> +struct EigenOrder<Order::kRowMajor> { + static constexpr int kValue = Eigen::RowMajor; +}; + +template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar, + typename RhsScalar, typename DstScalar, typename Spec> +void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, + const Spec& spec, int max_num_threads, Matrix<DstScalar>* dst) { + RUY_CHECK_EQ(lhs.zero_point, 0); + RUY_CHECK_EQ(rhs.zero_point, 0); + RUY_CHECK_EQ(dst->zero_point, 0); + RUY_CHECK_EQ(spec.multiplier_fixedpoint, 0); + RUY_CHECK_EQ(spec.multiplier_exponent, 0); + + static constexpr int kEigenLhsOrder = EigenOrder<LhsOrder>::kValue; + static constexpr int kEigenRhsOrder = EigenOrder<RhsOrder>::kValue; + static constexpr int kEigenDstOrder = EigenOrder<DstOrder>::kValue; + + using EigenLhsType = typename Eigen::Matrix<LhsScalar, Eigen::Dynamic, + Eigen::Dynamic, kEigenLhsOrder>:: + template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; + using EigenRhsType = typename Eigen::Matrix<RhsScalar, Eigen::Dynamic, + Eigen::Dynamic, kEigenRhsOrder>:: + template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; + using EigenDstType = typename Eigen::Matrix<DstScalar, Eigen::Dynamic, + Eigen::Dynamic, kEigenDstOrder>:: + template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; + using EigenBiasType = + typename Eigen::Matrix<DstScalar, Eigen::Dynamic, 1>::ConstMapType; + + EigenLhsType eigen_lhs(lhs.data.get(), lhs.layout.rows, lhs.layout.cols, + Eigen::OuterStride<Eigen::Dynamic>(lhs.layout.stride)); + EigenRhsType eigen_rhs(rhs.data.get(), rhs.layout.rows, rhs.layout.cols, + Eigen::OuterStride<Eigen::Dynamic>(rhs.layout.stride)); + EigenDstType eigen_dst( + dst->data.get(), dst->layout.rows, dst->layout.cols, + Eigen::OuterStride<Eigen::Dynamic>(dst->layout.stride)); + Eigen::setNbThreads(max_num_threads ? 
max_num_threads : 1); + + if (spec.bias) { + EigenBiasType eigen_bias(spec.bias, dst->layout.rows); + if (spec.clamp_max == std::numeric_limits<DstScalar>::infinity() && + spec.clamp_min == -std::numeric_limits<DstScalar>::infinity()) { + eigen_dst.noalias() = (eigen_lhs * eigen_rhs).colwise() + eigen_bias; + } else { + eigen_dst.noalias() = ((eigen_lhs * eigen_rhs).colwise() + eigen_bias) + .cwiseMin(spec.clamp_max) + .cwiseMax(spec.clamp_min); + } + } else { + if (spec.clamp_max == std::numeric_limits<DstScalar>::infinity() && + spec.clamp_min == -std::numeric_limits<DstScalar>::infinity()) { + eigen_dst.noalias() = eigen_lhs * eigen_rhs; + } else { + eigen_dst.noalias() = (eigen_lhs * eigen_rhs) + .cwiseMin(spec.clamp_max) + .cwiseMax(spec.clamp_min); + } + } +} + +template <typename LhsScalar, typename RhsScalar, typename DstScalar, + typename Spec> +void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, + const Spec& spec, int max_num_threads, Matrix<DstScalar>* dst) { + int index = Mash(lhs.layout.order, rhs.layout.order, dst->layout.order); + switch (index) { +#define EVALEIGEN_CASE3(LHS, RHS, DST) \ + case Mash(LHS, RHS, DST): \ + return EvalEigen<LHS, RHS, DST>(lhs, rhs, spec, max_num_threads, dst); +#define EVALEIGEN_CASE2(LHS, RHS) \ + EVALEIGEN_CASE3(LHS, RHS, Order::kColMajor) \ + EVALEIGEN_CASE3(LHS, RHS, Order::kRowMajor) +#define EVALEIGEN_CASE1(LHS) \ + EVALEIGEN_CASE2(LHS, Order::kColMajor) \ + EVALEIGEN_CASE2(LHS, Order::kRowMajor) + + EVALEIGEN_CASE1(Order::kColMajor) + EVALEIGEN_CASE1(Order::kRowMajor) + +#undef EVALEIGEN_CASE1 +#undef EVALEIGEN_CASE2 +#undef EVALEIGEN_CASE3 + + default: + RUY_CHECK(false); + } +} + +template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename Scalar, + typename Spec> +void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs, + const Spec& spec, int max_num_threads, + Matrix<Scalar>* dst) { + RUY_CHECK_EQ(lhs.zero_point, 0); + RUY_CHECK_EQ(rhs.zero_point, 0); + RUY_CHECK_EQ(dst->zero_point, 0); + RUY_CHECK_EQ(spec.multiplier_fixedpoint, 0); + RUY_CHECK_EQ(spec.multiplier_exponent, 0); + + // Eigen::TensorMap only supports packed layouts + RUY_CHECK(IsPacked(lhs.layout)); + RUY_CHECK(IsPacked(rhs.layout)); + RUY_CHECK(IsPacked(dst->layout)); + + using TensorLhsType = + Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>; + using TensorRhsType = + Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>; + using TensorDstType = + Eigen::TensorMap<Eigen::Tensor<Scalar, 2, Eigen::ColMajor>>; + using TensorBiasType = + Eigen::TensorMap<Eigen::Tensor<const Scalar, 1, Eigen::ColMajor>>; + + const bool tr = DstOrder == Order::kRowMajor; + const auto& contract_lhs = tr ? rhs : lhs; + const auto& contract_rhs = tr ? lhs : rhs; + + TensorLhsType tensor_lhs( + contract_lhs.data.get(), + LhsOrder == Order::kColMajor ? contract_lhs.layout.rows + : contract_lhs.layout.cols, + LhsOrder == Order::kColMajor ? contract_lhs.layout.cols + : contract_lhs.layout.rows); + TensorRhsType tensor_rhs( + contract_rhs.data.get(), + RhsOrder == Order::kColMajor ? contract_rhs.layout.rows + : contract_rhs.layout.cols, + RhsOrder == Order::kColMajor ? contract_rhs.layout.cols + : contract_rhs.layout.rows); + TensorDstType tensor_dst( + dst->data.get(), + DstOrder == Order::kColMajor ? dst->layout.rows : dst->layout.cols, + DstOrder == Order::kColMajor ? 
dst->layout.cols : dst->layout.rows);
+  using DimPair =
+      typename Eigen::Tensor<Scalar, 1, 0, Eigen::Index>::DimensionPair;
+  Eigen::array<DimPair, 1> contract_dims(
+      {DimPair((LhsOrder == Order::kColMajor) ? 1 : 0,
+               (RhsOrder == Order::kColMajor) ? 0 : 1)});
+  Eigen::array<int, 2> shuffle(DstOrder == Order::kColMajor ? 0 : 1,
+                               DstOrder == Order::kColMajor ? 1 : 0);
+  static Eigen::ThreadPool pool(max_num_threads ? max_num_threads : 1);
+  static Eigen::ThreadPoolDevice device(&pool, pool.NumThreads());
+  if (spec.bias) {
+    TensorBiasType tensor_bias(spec.bias, dst->layout.rows);
+    Eigen::array<int, 2> bias_2d_shape(tr ? 1 : dst->layout.rows,
+                                       tr ? dst->layout.rows : 1);
+    Eigen::array<int, 2> bcast(tr ? dst->layout.cols : 1,
+                               tr ? 1 : dst->layout.cols);
+    if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() &&
+        spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) {
+      tensor_dst.device(device) =
+          tensor_lhs.contract(tensor_rhs, contract_dims) +
+          tensor_bias.reshape(bias_2d_shape).broadcast(bcast);
+    } else {
+      tensor_dst.device(device) =
+          (tensor_lhs.contract(tensor_rhs, contract_dims) +
+           tensor_bias.reshape(bias_2d_shape).broadcast(bcast))
+              .cwiseMin(spec.clamp_max)
+              .cwiseMax(spec.clamp_min);
+    }
+  } else {
+    if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() &&
+        spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) {
+      tensor_dst.device(device) =
+          tensor_lhs.contract(tensor_rhs, contract_dims);
+    } else {
+      tensor_dst.device(device) =
+          tensor_lhs.contract(tensor_rhs, contract_dims)
+              .cwiseMin(spec.clamp_max)
+              .cwiseMax(spec.clamp_min);
+    }
+  }
+}
+
+template <typename Scalar, typename Spec>
+void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
+                     const Spec& spec, int max_num_threads,
+                     Matrix<Scalar>* dst) {
+  int index = Mash(lhs.layout.order, rhs.layout.order, dst->layout.order);
+  switch (index) {
+#define EVALEIGENTENSOR_CASE3(LHS, RHS, DST) \
+  case Mash(LHS, RHS, DST):                  \
+    return EvalEigenTensor<LHS, RHS, DST>(lhs, rhs, spec, max_num_threads, dst);
+#define EVALEIGENTENSOR_CASE2(LHS, RHS)             \
+  EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kColMajor) \
+  EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kRowMajor)
+#define EVALEIGENTENSOR_CASE1(LHS)             \
+  EVALEIGENTENSOR_CASE2(LHS, Order::kColMajor) \
+  EVALEIGENTENSOR_CASE2(LHS, Order::kRowMajor)
+
+    EVALEIGENTENSOR_CASE1(Order::kColMajor)
+    EVALEIGENTENSOR_CASE1(Order::kRowMajor)
+
+#undef EVALEIGENTENSOR_CASE1
+#undef EVALEIGENTENSOR_CASE2
+#undef EVALEIGENTENSOR_CASE3
+
+    default:
+      RUY_CHECK(false);
+  }
+}
+
+template <typename Scalar>
+struct GenericBlasGemm {};
+
+template <>
+struct GenericBlasGemm<lapack::doublereal> {
+  static void Run(char* transa, char* transb, lapack::integer* m,
+                  lapack::integer* n, lapack::integer* k,
+                  lapack::doublereal* alpha, lapack::doublereal* a,
+                  lapack::integer* lda, lapack::doublereal* b,
+                  lapack::integer* ldb, lapack::doublereal* beta,
+                  lapack::doublereal* c, lapack::integer* ldc) {
+    dgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+  }
+};
+
+template <>
+struct GenericBlasGemm<lapack::real> {
+  static void Run(char* transa, char* transb, lapack::integer* m,
+                  lapack::integer* n, lapack::integer* k, lapack::real* alpha,
+                  lapack::real* a, lapack::integer* lda, lapack::real* b,
+                  lapack::integer* ldb, lapack::real* beta, lapack::real* c,
+                  lapack::integer* ldc) {
+    sgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+  }
+};
+
+template <typename Scalar, typename Spec>
+void EvalOpenBlas(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
+                  const Spec&
spec, int max_num_threads, Matrix<Scalar>* dst) { + RUY_CHECK_EQ(lhs.zero_point, 0); + RUY_CHECK_EQ(rhs.zero_point, 0); + RUY_CHECK_EQ(dst->zero_point, 0); + RUY_CHECK_EQ(spec.multiplier_fixedpoint, 0); + RUY_CHECK_EQ(spec.multiplier_exponent, 0); + + Matrix<Scalar> gemm_lhs; + Matrix<Scalar> gemm_rhs; + Matrix<Scalar> gemm_dst; + gemm_dst = *dst; + + // Use Transpose to reduce to the all-column-major case. + // Notice that ruy::Matrix merely holds a pointer, does not own data, + // so Transpose is cheap -- no actual matrix data is being transposed here. + if (dst->layout.order == Order::kColMajor) { + gemm_lhs = lhs; + gemm_rhs = rhs; + } else { + gemm_lhs = rhs; + gemm_rhs = lhs; + Transpose(&gemm_lhs); + Transpose(&gemm_rhs); + Transpose(&gemm_dst); + } + bool transposed_lhs = false; + bool transposed_rhs = false; + + if (gemm_lhs.layout.order == Order::kRowMajor) { + Transpose(&gemm_lhs); + transposed_lhs = true; + } + if (gemm_rhs.layout.order == Order::kRowMajor) { + Transpose(&gemm_rhs); + transposed_rhs = true; + } + + RUY_CHECK_EQ(gemm_lhs.layout.order, Order::kColMajor); + RUY_CHECK_EQ(gemm_rhs.layout.order, Order::kColMajor); + RUY_CHECK_EQ(gemm_dst.layout.order, Order::kColMajor); + + char transa = transposed_lhs ? 'T' : 'N'; + char transb = transposed_rhs ? 'T' : 'N'; + int m = gemm_lhs.layout.rows; + int n = gemm_rhs.layout.cols; + int k = gemm_lhs.layout.cols; + float alpha = 1; + Scalar* a = gemm_lhs.data.get(); + int lda = gemm_lhs.layout.stride; + Scalar* b = gemm_rhs.data.get(); + int ldb = gemm_rhs.layout.stride; + float beta = 0; + Scalar* c = gemm_dst.data.get(); + int ldc = gemm_dst.layout.stride; + GenericBlasGemm<Scalar>::Run(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, + &ldb, &beta, c, &ldc); + + // BLAS does not allow us to express the bias-addition and clamping, so + // we use Eigen for that. + + using EigenDstType = + typename Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>:: + template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type; + using EigenBiasType = + typename Eigen::Matrix<Scalar, Eigen::Dynamic, 1>::ConstMapType; + + EigenDstType eigen_dst( + gemm_dst.data.get(), gemm_dst.layout.rows, gemm_dst.layout.cols, + Eigen::OuterStride<Eigen::Dynamic>(gemm_dst.layout.stride)); + Eigen::setNbThreads(max_num_threads ? 
max_num_threads : 1); + + if (spec.bias) { + EigenBiasType eigen_bias(spec.bias, dst->layout.rows); + if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() && + spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) { + eigen_dst.noalias() = eigen_dst.colwise() + eigen_bias; + } else { + eigen_dst.noalias() = (eigen_dst.colwise() + eigen_bias) + .cwiseMin(spec.clamp_max) + .cwiseMax(spec.clamp_min); + } + } else { + if (spec.clamp_max == std::numeric_limits<Scalar>::infinity() && + spec.clamp_min == -std::numeric_limits<Scalar>::infinity()) { + } else { + eigen_dst.noalias() = + eigen_dst.cwiseMin(spec.clamp_max).cwiseMax(spec.clamp_min); + } + } +} + +template <typename TestSetType> +struct SupportsGemmlowp { + static constexpr bool kValue = + std::is_same<typename TestSetType::LhsScalar, std::uint8_t>::value && + std::is_same<typename TestSetType::RhsScalar, std::uint8_t>::value; +}; + +template <typename TestSetType> +struct UsesSingleScalarType { + static constexpr bool kValue = + std::is_same<typename TestSetType::DstScalar, + typename TestSetType::LhsScalar>::value && + std::is_same<typename TestSetType::DstScalar, + typename TestSetType::RhsScalar>::value && + std::is_same<typename TestSetType::DstScalar, + typename TestSetType::AccumScalar>::value; +}; + +template <typename TestSetType, + bool IsFloatingPoint = + std::is_floating_point<typename TestSetType::AccumScalar>::value, + bool EnableGemmlowp = SupportsGemmlowp<TestSetType>::kValue, + bool SingleScalarType = UsesSingleScalarType<TestSetType>::kValue> +struct EvalExternalPathImpl { + using DstScalar = typename TestSetType::DstScalar; + static void Run(TestSetType*, TestResult<DstScalar>*) { RUY_CHECK(false); } +}; + +template <typename TestSetType> +struct EvalExternalPathImpl<TestSetType, true, false, true> { + using DstScalar = typename TestSetType::DstScalar; + static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) { + if (test_result->external_path == ExternalPath::kEigen) { + EvalEigen(test_set->lhs.matrix, test_set->rhs.matrix, test_set->spec, + test_set->max_num_threads, &test_result->storage_matrix.matrix); + } else if (test_result->external_path == ExternalPath::kEigenTensor) { + EvalEigenTensor(test_set->lhs.matrix, test_set->rhs.matrix, + test_set->spec, test_set->max_num_threads, + &test_result->storage_matrix.matrix); + } else if (test_result->external_path == ExternalPath::kOpenBlas) { + EvalOpenBlas(test_set->lhs.matrix, test_set->rhs.matrix, test_set->spec, + test_set->max_num_threads, + &test_result->storage_matrix.matrix); + } else { + RUY_CHECK(false); + } + } +}; + +template <typename TestSetType, bool SingleScalarType> +struct EvalExternalPathImpl<TestSetType, false, true, SingleScalarType> { + using DstScalar = typename TestSetType::DstScalar; + static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) { + if (test_result->external_path == ExternalPath::kGemmlowp) { + EvalGemmlowp(test_set->lhs.matrix, test_set->rhs.matrix, test_set->spec, + test_set->max_num_threads, + &test_result->storage_matrix.matrix); + } else { + RUY_CHECK(false); + } + } +}; + +template <typename TestSetType> +void EvalExternalPath( + TestSetType* test_set, + TestResult<typename TestSetType::DstScalar>* test_result) { + EvalExternalPathImpl<TestSetType>::Run(test_set, test_result); +} + +#endif // RUY_TEST_EXTERNAL_PATHS + +template <typename Scalar> +bool Agree(const Matrix<Scalar>& matrix1, const Matrix<Scalar>& matrix2, + int depth) { + RUY_CHECK_EQ(matrix1.layout.rows, 
matrix2.layout.rows); + RUY_CHECK_EQ(matrix1.layout.cols, matrix2.layout.cols); + RUY_CHECK_EQ(matrix1.zero_point, matrix2.zero_point); + const int size = matrix1.layout.rows * matrix1.layout.cols; + double tolerated_max_diff = 0; + double tolerated_mean_diff = 0; + if (std::is_floating_point<Scalar>::value) { + // TODO: replace hardcoded 100 by something more sensible, probably + // roughly sqrt(depth) based on central limit theorem. + double max_abs_val = 0; + for (int row = 0; row < matrix1.layout.rows; row++) { + for (int col = 0; col < matrix1.layout.cols; col++) { + max_abs_val = + std::max(max_abs_val, + std::abs(static_cast<double>(Element(matrix1, row, col)))); + max_abs_val = + std::max(max_abs_val, + std::abs(static_cast<double>(Element(matrix2, row, col)))); + } + } + tolerated_max_diff = max_abs_val * std::numeric_limits<Scalar>::epsilon() * + 64 * std::sqrt(static_cast<float>(depth)); + tolerated_mean_diff = tolerated_max_diff / std::sqrt(size); + } else if (RUY_OPT_ENABLED(RUY_OPT_NATIVE_ROUNDING)) { + tolerated_max_diff = 1; + // totally empirical + tolerated_mean_diff = std::min(1.0, 2.0 * std::pow(size, -0.2)); + } + double sum_diff = 0; + for (int row = 0; row < matrix1.layout.rows; row++) { + for (int col = 0; col < matrix1.layout.cols; col++) { + double elem1 = Element(matrix1, row, col); + double elem2 = Element(matrix2, row, col); + double diff = elem1 - elem2; + + sum_diff += diff; + // Test (std::abs(diff) > tolerated_max_diff), but also true if diff is + // NaN. + if (!(std::abs(diff) <= tolerated_max_diff)) { + return false; + } + } + } + double mean_diff = sum_diff / size; + if (std::abs(mean_diff) > tolerated_mean_diff) { + return false; + } + return true; +} + +template <typename Scalar> +bool Agree(const StorageMatrix<Scalar>& storage_matrix1, + const StorageMatrix<Scalar>& storage_matrix2, int depth) { + VerifyConsistentFields(storage_matrix1); + VerifyConsistentFields(storage_matrix2); + return Agree(storage_matrix1.matrix, storage_matrix2.matrix, depth); +} + +template <typename Scalar> +bool Agree(const TestResult<Scalar>& result1, const TestResult<Scalar>& result2, + int depth) { + return Agree(result1.storage_matrix, result2.storage_matrix, depth); +} + +struct Stats { + double median; + double mean; + double min; + double max; +}; + +inline std::string StatsAsString(const Stats& stats) { + char buf[256]; + snprintf(buf, sizeof(buf), "(median = %g, mean = %g, min = %g, max = %g)", + stats.median, stats.mean, stats.min, stats.max); + return std::string(buf); +} + +template <typename Scalar> +void GetMatrixStats(const Matrix<Scalar>& matrix, Stats* stats) { + double min = std::numeric_limits<double>::infinity(); + double max = -std::numeric_limits<double>::infinity(); + double sum = 0; + std::vector<double> allvals; + for (int row = 0; row < matrix.layout.rows; row++) { + for (int col = 0; col < matrix.layout.cols; col++) { + double val = Element(matrix, row, col); + min = std::min(min, val); + max = std::max(max, val); + sum += val; + allvals.push_back(val); + } + } + std::sort(allvals.begin(), allvals.end()); + stats->min = min; + stats->max = max; + stats->mean = sum / allvals.size(); + stats->median = allvals[allvals.size() / 2]; +} + +struct ErrorAnalysis { + Stats stats_good; + Stats stats_bad; + // The below is to help document departure from bit exactness. It's probably + // not going to be relevant to floating-point. 
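+  // For instance, if error_cols ends up holding only the last few column
+  // indices, the discrepancy is confined to the trailing block of columns
+  // rather than spread across the whole result matrix.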
+  std::set<int> error_rows;
+  std::set<int> error_cols;
+  int row_of_first_error = 0;
+  int col_of_first_error = 0;
+  double first_error_good_value = 0;
+  double first_error_bad_value = 0;
+};
+
+template <typename TestSetType>
+void AnalyzeTestError(const TestSetType& test_set, int first_bad_result_index,
+                      ErrorAnalysis* error_analysis) {
+  const auto& good_matrix = test_set.results[0]->storage_matrix.matrix;
+  const auto& bad_matrix =
+      test_set.results[first_bad_result_index]->storage_matrix.matrix;
+  GetMatrixStats(good_matrix, &error_analysis->stats_good);
+  GetMatrixStats(bad_matrix, &error_analysis->stats_bad);
+  bool found_first_error = false;
+  for (int row = 0; row < good_matrix.layout.rows; row++) {
+    for (int col = 0; col < good_matrix.layout.cols; col++) {
+      if (Element(good_matrix, row, col) != Element(bad_matrix, row, col)) {
+        if (!found_first_error) {
+          found_first_error = true;
+          error_analysis->row_of_first_error = row;
+          error_analysis->col_of_first_error = col;
+          error_analysis->first_error_good_value =
+              Element(good_matrix, row, col);
+          error_analysis->first_error_bad_value =
+              Element(bad_matrix, row, col);
+        }
+        error_analysis->error_rows.insert(row);
+        error_analysis->error_cols.insert(col);
+      }
+    }
+  }
+}
+
+template <typename TestSetType>
+void ComputeReasonableMultiplier(
+    const Matrix<typename TestSetType::LhsScalar>& lhs,
+    const Matrix<typename TestSetType::RhsScalar>& rhs, double* multiplier) {
+  using LhsScalar = typename TestSetType::LhsScalar;
+  using RhsScalar = typename TestSetType::RhsScalar;
+  using DstScalar = typename TestSetType::DstScalar;
+  if (std::is_floating_point<DstScalar>::value ||
+      std::is_same<DstScalar, std::int32_t>::value) {
+    *multiplier = 0;
+    return;
+  }
+  *multiplier = static_cast<double>(std::numeric_limits<DstScalar>::max()) /
+                (static_cast<double>(lhs.layout.cols) *
+                 std::numeric_limits<LhsScalar>::max() *
+                 std::numeric_limits<RhsScalar>::max());
+}
+
+inline void QuantizeMultiplier(double multiplier_double,
+                               std::int32_t* multiplier_fixedpoint,
+                               int* multiplier_exponent) {
+  RUY_CHECK_GE(multiplier_double, 0);
+  if (multiplier_double == 0.) {
+    *multiplier_fixedpoint = 0;
+    *multiplier_exponent = 0;
+    return;
+  }
+  const double q = std::frexp(multiplier_double, multiplier_exponent);
+  auto q_fixed = static_cast<std::int64_t>(std::round(q * (1ll << 31)));
+  RUY_CHECK_LE(q_fixed, (1ll << 31));
+  if (q_fixed == (1ll << 31)) {
+    q_fixed /= 2;
+    ++*multiplier_exponent;
+  }
+  RUY_CHECK_LE(q_fixed, std::numeric_limits<std::int32_t>::max());
+  *multiplier_fixedpoint = static_cast<std::int32_t>(q_fixed);
+}
+
+template <typename TestSetType>
+void SwitchMultiplierToPerChannel(TestSetType* test_set) {
+  test_set->per_channel_multiplier_fixedpoint.resize(test_set->rows);
+  test_set->per_channel_multiplier_exponent.resize(test_set->rows);
+  for (int i = 0; i < test_set->rows; i++) {
+    // multipliers typically range in [2^30 ; 2^31 - 1].
+    // Values in [0, 2^30 - 1] are normally unused, but harmless.
+    // Thus a good way to randomize multipliers is to subtract from them
+    // a random value smaller than 2^30 but still significant compared to it.
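+    // For example (illustrative numbers): with multiplier_fixedpoint near
+    // 2^30, subtracting a random value in [0, 2^26) perturbs it by at most
+    // about 6%, while the exponent below is shifted by a value in {-1,0,1,2}.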
+ std::int32_t nudged_multiplier = test_set->spec.multiplier_fixedpoint - + (global_random_engine()() % (1 << 26)); + int nudged_exponent = + test_set->spec.multiplier_exponent - 1 + (global_random_engine()() % 4); + test_set->per_channel_multiplier_fixedpoint[i] = nudged_multiplier; + test_set->per_channel_multiplier_exponent[i] = nudged_exponent; + } + test_set->spec.multiplier_fixedpoint_perchannel = + test_set->per_channel_multiplier_fixedpoint.data(); + test_set->spec.multiplier_exponent_perchannel = + test_set->per_channel_multiplier_exponent.data(); + test_set->spec.multiplier_fixedpoint = 0; + test_set->spec.multiplier_exponent = 0; +} + +template < + typename TestSetType, + bool IsApplicable = + std::is_same<typename TestSetType::AccumScalar, std::int32_t>::value && + !std::is_same<typename TestSetType::DstScalar, std::int32_t>::value> +struct MakeSpecMultiplierFieldsImpl {}; + +template <typename TestSetType> +struct MakeSpecMultiplierFieldsImpl<TestSetType, true> { + static void Run(TestSetType* test_set) { + double multiplier; + ComputeReasonableMultiplier<TestSetType>(test_set->lhs.matrix, + test_set->rhs.matrix, &multiplier); + QuantizeMultiplier(multiplier, &test_set->spec.multiplier_fixedpoint, + &test_set->spec.multiplier_exponent); + if (!test_set->benchmark) { + test_set->perchannel = global_random_engine()() & 1; + } + if (test_set->perchannel) { + SwitchMultiplierToPerChannel(test_set); + } + } +}; + +template <typename TestSetType> +struct MakeSpecMultiplierFieldsImpl<TestSetType, false> { + static void Run(TestSetType* test_set) { + test_set->spec.multiplier_fixedpoint = 0; + test_set->spec.multiplier_exponent = 0; + } +}; + +template <typename Spec> +void MakeSpecClampFields(Spec* spec) { + using AccumScalar = typename Spec::AccumScalar; + using DstScalar = typename Spec::DstScalar; + + if (std::is_same<AccumScalar, std::int32_t>::value) { + // Returning raw accumulators, clamping is not supported. + spec->clamp_min = std::numeric_limits<DstScalar>::lowest(); + spec->clamp_max = std::numeric_limits<DstScalar>::max(); + return; + } + + if (getenv("BENCHMARK_ONLY_MATMUL")) { + if (std::is_floating_point<DstScalar>::value) { + spec->clamp_min = -std::numeric_limits<DstScalar>::infinity(); + spec->clamp_max = std::numeric_limits<DstScalar>::infinity(); + } else { + spec->clamp_min = std::numeric_limits<DstScalar>::lowest(); + spec->clamp_max = std::numeric_limits<DstScalar>::max(); + } + return; + } + + spec->clamp_min = std::numeric_limits<DstScalar>::lowest() + 1; + spec->clamp_max = std::numeric_limits<DstScalar>::max() - 1; +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakeZeroPoints() { + RUY_CHECK_EQ(life_stage, LifeStage::kInitial); + if (!benchmark && !use_specified_zero_points) { + MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &lhs_zero_point); + MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &rhs_zero_point); + // If destination is std::int32_t, no dst_zero_point is necessary. 
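+    // (A raw std::int32_t destination receives unquantized accumulators, so
+    // there is no requantization step in which an offset could be applied.)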
+ if (std::is_same<DstScalar, std::int32_t>::value) { + dst_zero_point = 0; + } else { + MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point); + } + } + life_stage = LifeStage::kHasZeroPoints; +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakeLhsRhs() { + RUY_CHECK_EQ(life_stage, LifeStage::kHasZeroPoints); + MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style, + RandomRange::kOffCenterAvoidMinValue, &lhs); + MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style, + RandomRange::kGeneral, &rhs); + life_stage = LifeStage::kHasLhsRhs; +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakeSpec() { + RUY_CHECK_EQ(life_stage, LifeStage::kHasLhsRhs); + + if (!getenv("BENCHMARK_ONLY_MATMUL") && + (benchmark || (global_random_engine()() & 1))) { + MakeRandomVector(RandomRange::kBias, rows, &bias_data); + spec.bias = bias_data.data(); + } + if (lhs.matrix.zero_point == std::numeric_limits<LhsScalar>::lowest() && + rhs.matrix.zero_point == std::numeric_limits<RhsScalar>::lowest()) { + lhs.matrix.zero_point += 1; + } + MakeSpecMultiplierFieldsImpl<TestSet>::Run(this); + MakeSpecClampFields(&spec); + life_stage = LifeStage::kHasSpec; +} + +inline int GetIntEnvVarOrZero(const char* name) { + const char* val = getenv(name); + if (!val) { + return 0; + } + return std::stoi(val); +} + +inline float GetFloatEnvVarOrZero(const char* name) { + const char* val = getenv(name); + if (!val) { + return 0; + } + return std::stof(val); +} + +inline int GetHexIntEnvVarOrZero(const char* name) { + const char* val = getenv(name); + if (!val) { + return 0; + } + return std::stoi(val, nullptr, 16); +} + +inline bool GetBoolEnvVarOrFalse(const char* name) { + return static_cast<bool>(GetIntEnvVarOrZero(name)); +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakeOtherParams() { + RUY_CHECK_EQ(life_stage, LifeStage::kHasSpec); + if (max_num_threads == 0) { + max_num_threads = GetIntEnvVarOrZero("THREADS"); + } + life_stage = LifeStage::kHasOtherParams; +} + +inline std::vector<Path> PathsBitfieldAsVector(Path paths_bitfield) { + std::vector<Path> result; + std::uint32_t remaining_paths = static_cast<std::uint32_t>(paths_bitfield); + std::uint32_t test_bit = 1; + while (remaining_paths) { + if (remaining_paths & test_bit) { + result.push_back(static_cast<Path>(test_bit)); + } + remaining_paths &= ~test_bit; + test_bit <<= 1; + } + return result; +} + +inline std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) { + if (benchmark) { + return {Tuning::kAuto}; + } +#if RUY_PLATFORM(ARM) + if (path == Path::kNeon || path == Path::kNeonDotprod) { + return {Tuning::kInOrder, Tuning::kOutOfOrder, Tuning::kAuto}; + } +#endif + return {Tuning::kAuto}; +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakePrepackedMatrices() { + RUY_CHECK_EQ(life_stage, LifeStage::kHasResultPaths); + + // Prepacked matrices are Path-dependent, so create them for each test result. + for (auto& result : results) { + // If this result uses an external path, then skip this entirely. + if (result->path == Path::kNone) { + continue; + } + // Pre-packing doesn't make sense for Path::kReference. + // TODO(silvasean): Make Path::kReference an ExternalPath? 
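+    // (Path::kReference computes directly from the unpacked operands, so
+    // there is no packed representation to create for it.)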
+ if (result->path == Path::kReference) { + continue; + } + + // Determine whether we should create/use prepacked matrices. + if (benchmark) { + // For benchmarking, do as requested. + result->use_prepacked_lhs = benchmark_prepack_lhs; + result->use_prepacked_rhs = benchmark_prepack_rhs; + } else { + // When testing, randomly pre-pack sometimes. But don't do it too often. + result->use_prepacked_lhs = (global_random_engine()() & 7) == 0; + result->use_prepacked_rhs = (global_random_engine()() & 7) == 0; + } + + // Create the pre-packed matrices. + PrepackedMatrix* prepacked_lhs_ptr = + result->use_prepacked_lhs ? &result->prepacked_lhs : nullptr; + PrepackedMatrix* prepacked_rhs_ptr = + result->use_prepacked_rhs ? &result->prepacked_rhs : nullptr; + auto alloc_fn = [&result](std::size_t num_bytes) { + return result->allocator.AllocateBytes(num_bytes); + }; + // Use a dst with a null data pointer to check that the pre-packing + // invocation doesn't write into it. + Matrix<DstScalar> null_data_dst = result->storage_matrix.matrix; + null_data_dst.data = nullptr; + GlobalContext().SetRuntimeEnabledPaths(result->path); + PrePackForMul<kAllPaths>(lhs.matrix, rhs.matrix, spec, &GlobalContext(), + &null_data_dst, prepacked_lhs_ptr, + prepacked_rhs_ptr, alloc_fn); + RUY_CHECK_EQ(GlobalContext().last_taken_path, result->path); + } + + life_stage = LifeStage::kHasPrepackedMatrices; +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() { + RUY_CHECK_EQ(life_stage, LifeStage::kHasOtherParams); + + Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("PATHS")); + + if (paths_bitfield == Path::kNone) { + // Use a dummy Context just to perform the resolution of specific runtime + // enabled paths. + Context context; + paths_bitfield = context.GetRuntimeEnabledPaths(); + } + + // Trim bits that don't correspond to a compiled path, + // to allow specifying e.g. ffff to mean 'all paths' regardless of whether all + // those bits exist as actual paths. + paths_bitfield = paths_bitfield & kAllPaths; + RUY_CHECK_NE(paths_bitfield, Path::kNone); + paths = PathsBitfieldAsVector(paths_bitfield); + +#ifdef RUY_TEST_EXTERNAL_PATHS + + using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>; + + if (!GetBoolEnvVarOrFalse("NOEXT")) { + if (SupportsGemmlowp<TestSetType>::kValue) { +#ifdef GEMMLOWP_SSE4 + const bool gemmlowp_supported = !spec.multiplier_fixedpoint_perchannel; +#else + const bool gemmlowp_supported = true; +#endif + if (gemmlowp_supported) { + external_paths.push_back(ExternalPath::kGemmlowp); + } + } + if (UsesSingleScalarType<TestSetType>::kValue && + std::is_floating_point<AccumScalar>::value) { + external_paths.push_back(ExternalPath::kEigen); + if (layout_style == LayoutStyle::kPackedLinear) { + external_paths.push_back(ExternalPath::kEigenTensor); + } +// We link against a generic BLAS target that only maps to OpenBLAS on specific +// architectures. +#if RUY_PLATFORM(ARM_32) || RUY_PLATFORM(ARM_64) + // OpenBLAS multi-threading is disabled, so avoid mixing single-threaded + // and multi-threaded benchmark results. 
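+      // For example, a run with THREADS=4 exercises the ruy and Eigen paths
+      // with 4 threads but skips kOpenBlas entirely, so all numbers reported
+      // by that run are measured at the same thread count.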
+ if (max_num_threads == 1 && !getenv("NO_OPENBLAS")) { + external_paths.push_back(ExternalPath::kOpenBlas); + } +#endif + } + } + +#endif // RUY_TEST_EXTERNAL_PATHS + + for (Path path : paths) { + for (Tuning tuning : EnumerateTuningsForPath(path, benchmark)) { + results.emplace_back(new TestResultType); + TestResultType& result = *results.back(); + result.path = path; + result.tuning = tuning; + MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, + RandomRange::kGeneral, &result.storage_matrix); + } + } + + for (ExternalPath external_path : external_paths) { + results.emplace_back(new TestResultType); + TestResultType& result = *results.back(); + result.external_path = external_path; + MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, + RandomRange::kGeneral, &result.storage_matrix); + } + + life_stage = LifeStage::kHasResultPaths; +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::EvalResult( + TestResult<typename SpecType::DstScalar>* result) { + RUY_CHECK(result->path != Path::kNone || + result->external_path != ExternalPath::kNone); + if (result->path != Path::kNone) { + EvalRuy(result); + } else { +#ifdef RUY_TEST_EXTERNAL_PATHS + using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>; + EvalExternalPath(this, result); +#endif + } + const std::string& pathname = PathName(*result); + if (std::find(CoveredPaths()->begin(), CoveredPaths()->end(), pathname) == + CoveredPaths()->end()) { + CoveredPaths()->push_back(pathname); + } +} + +using f32 = float; +using f64 = double; +using u8 = std::uint8_t; +using i8 = std::int8_t; +using u16 = std::uint16_t; +using i16 = std::int16_t; +using u32 = std::uint32_t; +using i32 = std::int32_t; +using u64 = std::uint64_t; +using i64 = std::int64_t; + +template <typename Scalar> +const char* TypeName() { + return nullptr; +} + +#define RUY_TYPENAME(TYPE) \ + template <> \ + const char* TypeName<TYPE>() { \ + return #TYPE; \ + } + +RUY_TYPENAME(f32) +RUY_TYPENAME(f64) +RUY_TYPENAME(u8) +RUY_TYPENAME(i8) +RUY_TYPENAME(u16) +RUY_TYPENAME(i16) +RUY_TYPENAME(u32) +RUY_TYPENAME(i32) +RUY_TYPENAME(u64) +RUY_TYPENAME(i64) + +#undef RUY_TYPENAME + +template <typename Scalar> +const char* SymmetryName(const Matrix<Scalar>& matrix) { + if (matrix.zero_point == SymmetricZeroPoint<Scalar>()) { + return "symm"; + } else { + return "asymm"; + } +} + +template <typename Scalar> +int StorageSize(const Matrix<Scalar>& matrix) { + return sizeof(Scalar) * FlatSize(matrix.layout); +} + +// Helper that replicates a buffer and gives out pointers to the replicas. +// This is useful when one wants to traverse data so that it is cold in cache. +// By having a sufficiently large value of num_repeats, one can ensure that the +// working set covered by the replicas is greater than the cache size. 
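+// A minimal usage sketch (the sizes are illustrative only):
+//   RepeatedBuffer<float> cold_buf;
+//   cold_buf.Init(src.data(), src.size(), 128);  // 128 replicas of src
+//   float* replica = cold_buf.Next();  // cycles through distinct replicas
+// With a 1 MiB source and 128 replicas, the working set is 128 MiB, larger
+// than typical L3 caches, so each replica is cold by the time it comes up
+// again.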
+template <typename T> +class RepeatedBuffer { + public: + RepeatedBuffer() = default; + void Init(const T* elems, std::size_t num_elems, int num_repeats) { + buffers_.clear(); + allocator_.FreeAll(); + for (int i = 0; i < num_repeats; i++) { + T* p; + allocator_.Allocate(num_elems, &p); + memcpy(p, elems, num_elems * sizeof(T)); + buffers_.push_back(p); + } + } + T* Next() { + T* ret = buffers_[current_]; + current_ = (current_ + 1) % buffers_.size(); + return ret; + } + + private: + Allocator allocator_; + std::vector<T*> buffers_; + int current_ = 0; +}; + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( + TestResult<typename SpecType::DstScalar>* result) { + using DstScalar = typename SpecType::DstScalar; + + const bool cold = getenv("RUY_BENCHMARK_COLD"); + LhsScalar* orig_lhs_data = lhs.matrix.data.get(); + RhsScalar* orig_rhs_data = rhs.matrix.data.get(); + DstScalar* orig_dst_data = result->storage_matrix.matrix.data.get(); + void* orig_prepacked_lhs_data = result->prepacked_lhs.data; + void* orig_prepacked_rhs_data = result->prepacked_rhs.data; + + int num_matmul_sets = 0; + + RepeatedBuffer<LhsScalar> cold_lhs; + RepeatedBuffer<RhsScalar> cold_rhs; + RepeatedBuffer<DstScalar> cold_dst; + RepeatedBuffer<char> cold_prepacked_lhs; + RepeatedBuffer<char> cold_prepacked_rhs; + + if (cold) { + const int kWorkingSetSize = 100 << 20; + const int each_matmul_set_size = StorageSize(lhs.matrix) + + StorageSize(rhs.matrix) + + StorageSize(result->storage_matrix.matrix); + num_matmul_sets = + (kWorkingSetSize + each_matmul_set_size - 1) / each_matmul_set_size; + + cold_lhs.Init(lhs.matrix.data.get(), FlatSize(lhs.matrix.layout), + num_matmul_sets); + cold_rhs.Init(rhs.matrix.data.get(), FlatSize(rhs.matrix.layout), + num_matmul_sets); + cold_dst.Init(result->storage_matrix.matrix.data.get(), + FlatSize(result->storage_matrix.matrix.layout), + num_matmul_sets); + if (benchmark_prepack_lhs) { + cold_prepacked_lhs.Init(static_cast<char*>(result->prepacked_lhs.data), + result->prepacked_lhs.data_size, num_matmul_sets); + } + if (benchmark_prepack_rhs) { + cold_prepacked_rhs.Init(static_cast<char*>(result->prepacked_rhs.data), + result->prepacked_rhs.data_size, num_matmul_sets); + } + } + const bool record_pmu = GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU"); + int repeats = GetIntEnvVarOrZero("RUY_BENCHMARK_REPEATS"); + if (!repeats) { + repeats = 4; + } + float benchmark_min_secs = GetFloatEnvVarOrZero("RUY_BENCHMARK_MIN_SECS"); + if (!benchmark_min_secs) { + benchmark_min_secs = 0.5; + } +#ifdef RUY_PROFILER + { + const char* lhstype = TypeName<LhsScalar>(); + const char* lhssymm = SymmetryName(lhs.matrix); + const char* rhstype = TypeName<RhsScalar>(); + const char* rhssymm = SymmetryName(rhs.matrix); + + printf("Profiling path=%s shape=(%dx%dx%d) lhs=(%s,%s) rhs=(%s,%s)\n", + PathName(*result).c_str(), rows, depth, cols, lhstype, lhssymm, + rhstype, rhssymm); + ruy::profiler::ScopeProfile profile; +#endif + + float latency = std::numeric_limits<float>::infinity(); + float l1_refill_rate = std::numeric_limits<float>::infinity(); + float l2_refill_rate = std::numeric_limits<float>::infinity(); + float l3_refill_rate = std::numeric_limits<float>::infinity(); + float l1tlb_refill_rate = std::numeric_limits<float>::infinity(); + float l2tlb_refill_rate = std::numeric_limits<float>::infinity(); + float mispred_rate = std::numeric_limits<float>::infinity(); + float frontend_stall_rate = std::numeric_limits<float>::infinity(); 
+ float backend_stall_rate = std::numeric_limits<float>::infinity(); + + for (int repeat = 0; repeat < repeats; repeat++) { + auto& pmu_events = GlobalPmuEvents(); + if (record_pmu) { + pmu_events.StartRecording(); + } + TimePoint time_start = Now(); + TimePoint t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (ToFloatSeconds(t - time_start) < benchmark_min_secs) { + for (int i = 0; i < iters_at_a_time; i++) { + if (cold) { + lhs.matrix.data = cold_lhs.Next(); + rhs.matrix.data = cold_rhs.Next(); + result->storage_matrix.matrix.data = cold_dst.Next(); + if (benchmark_prepack_lhs) { + result->prepacked_lhs.data = cold_prepacked_lhs.Next(); + } + if (benchmark_prepack_rhs) { + result->prepacked_rhs.data = cold_prepacked_rhs.Next(); + } + } + EvalResult(result); + iters++; + } + iters_at_a_time *= 2; + t = Now(); + } + latency = std::min( + latency, static_cast<float>(ToFloatSeconds(t - time_start) / iters)); + if (record_pmu) { + pmu_events.StopRecording(); + const float normalization_factor = + 1.0f / (static_cast<float>(iters) * rows * cols * depth); + l1_refill_rate = std::min( + l1_refill_rate, pmu_events.L1RefillCount() * normalization_factor); + l2_refill_rate = std::min( + l2_refill_rate, pmu_events.L2RefillCount() * normalization_factor); + l3_refill_rate = std::min( + l3_refill_rate, pmu_events.L3RefillCount() * normalization_factor); + l1tlb_refill_rate = + std::min(l1tlb_refill_rate, + pmu_events.L1TLBRefillCount() * normalization_factor); + l2tlb_refill_rate = + std::min(l2tlb_refill_rate, + pmu_events.L2TLBRefillCount() * normalization_factor); + mispred_rate = + std::min(mispred_rate, pmu_events.BranchMispredictionCount() * + normalization_factor); + frontend_stall_rate = + std::min(frontend_stall_rate, + pmu_events.FrontendStallCount() * normalization_factor); + backend_stall_rate = + std::min(backend_stall_rate, + pmu_events.BackendStallCount() * normalization_factor); + } + } + result->latency = latency; + if (record_pmu) { + result->l1_refill_rate = l1_refill_rate; + result->l2_refill_rate = l2_refill_rate; + result->l3_refill_rate = l3_refill_rate; + result->l1tlb_refill_rate = l1tlb_refill_rate; + result->l2tlb_refill_rate = l2tlb_refill_rate; + result->mispred_rate = mispred_rate; + result->frontend_stall_rate = frontend_stall_rate; + result->backend_stall_rate = backend_stall_rate; + } + +#ifdef RUY_PROFILER + } + fflush(stdout); +#endif + + if (cold) { + lhs.matrix.data = orig_lhs_data; + rhs.matrix.data = orig_rhs_data; + memcpy(orig_dst_data, result->storage_matrix.matrix.data.get(), + StorageSize(result->storage_matrix.matrix)); + result->storage_matrix.matrix.data = orig_dst_data; + result->prepacked_lhs.data = orig_prepacked_lhs_data; + result->prepacked_rhs.data = orig_prepacked_rhs_data; + } +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::Eval() { + RUY_CHECK_EQ(life_stage, LifeStage::kHasPrepackedMatrices); + for (auto& result : results) { + if (benchmark) { + Benchmark(result.get()); + } else { + EvalResult(result.get()); + } + } + life_stage = LifeStage::kEvaluated; +} + +template <typename Scalar> +std::string DumpRegion(const Matrix<Scalar>& matrix, int center_row, + int center_col) { + static constexpr int kRadius = 20; + int first_row = std::max(0, center_row - kRadius); + int last_row = std::min(matrix.layout.rows - 1, center_row + kRadius); + int first_col = std::max(0, center_col - kRadius); + int last_col = std::min(matrix.layout.cols - 1, center_col + 
kRadius); + std::ostringstream stream; + for (int row = first_row; row <= last_row; row++) { + for (int col = first_col; col <= last_col; col++) { + stream << static_cast<double>(Element(matrix, row, col)) << " "; + } + stream << "\n"; + } + return stream.str(); +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::VerifyTestResults() const { + const int depth = lhs.matrix.layout.cols; + for (int i = 0; i < results.size() - 1; i++) { + if (!Agree(*results[i], *results[i + 1], depth)) { + std::string paths_in_agreement; + paths_in_agreement.append(PathName(*results[0])); + for (int j = 1; j <= i; j++) { + paths_in_agreement.append(", "); + paths_in_agreement.append(PathName(*results[j])); + } + ErrorAnalysis error_analysis; + AnalyzeTestError(*this, i + 1, &error_analysis); + std::cerr << "Error: path (" << PathName(*results[i + 1]) + << ") disagrees with the other paths (" << paths_in_agreement + << "), which agree with each other." << std::endl; + std::cerr << "Shape: rows = " << rows << ", cols = " << cols + << ", depth = " << depth << std::endl; + std::cerr << "Stats of the good result matrix: " + << StatsAsString(error_analysis.stats_good) << std::endl; + std::cerr << "Stats of the bad result matrix: " + << StatsAsString(error_analysis.stats_bad) << std::endl; + if (error_analysis.error_rows.size() < rows) { + std::cerr << "Rows containing errors: " + << Join(error_analysis.error_rows) << std::endl; + } else { + std::cerr << "Errors found in ALL rows." << std::endl; + } + if (error_analysis.error_cols.size() < cols) { + std::cerr << "Cols containing errors: " + << Join(error_analysis.error_cols) << std::endl; + } else { + std::cerr << "Errors found in ALL cols." << std::endl; + } + std::cerr << "The first error occurs at row " + << error_analysis.row_of_first_error << ", col " + << error_analysis.col_of_first_error << std::endl; + std::cerr << "Good value: " << error_analysis.first_error_good_value + << std::endl; + std::cerr << "Bad value : " << error_analysis.first_error_bad_value + << std::endl; + std::cerr << "Region of Good result matrix around first error:\n\n" + << DumpRegion(results[0]->storage_matrix.matrix, + error_analysis.row_of_first_error, + error_analysis.col_of_first_error) + << std::endl; + std::cerr << "Region of Bad result matrix around first error:\n\n" + << DumpRegion(results[i + 1]->storage_matrix.matrix, + error_analysis.row_of_first_error, + error_analysis.col_of_first_error) + << std::endl; + RUY_CHECK(false); + } + } +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::Verify() { + RUY_CHECK_EQ(life_stage, LifeStage::kEvaluated); + if (expected_outcome == ExpectedOutcome::kSuccess) { + VerifyTestResults(); + } + life_stage = LifeStage::kFinal; +} + +template <typename TestSetType> +void TestRCC(int rows, int depth, int cols, ExpectedOutcome expected_outcome) { + TestSetType test_set; + test_set.rows = rows; + test_set.depth = depth; + test_set.cols = cols; + test_set.lhs_order = Order::kRowMajor; + test_set.rhs_order = Order::kColMajor; + test_set.dst_order = Order::kColMajor; + test_set.layout_style = LayoutStyle::kPackedLinear; + test_set.expected_outcome = expected_outcome; + test_set.Run(); +} + +template <typename TestSetType> +void TestRCC(int rows, int depth, int cols) { + TestRCC<TestSetType>(rows, depth, cols, ExpectedOutcome::kSuccess); +} + +template <typename TestSetType> +void TestNonRCC(int rows, int depth, int 
cols,
+ ExpectedOutcome expected_outcome) {
+ TestSetType test_set;
+ test_set.rows = rows;
+ test_set.depth = depth;
+ test_set.cols = cols;
+ test_set.lhs_order = Order::kColMajor;
+ test_set.rhs_order = Order::kColMajor;
+ test_set.dst_order = Order::kColMajor;
+ test_set.layout_style = LayoutStyle::kPackedLinear;
+ test_set.expected_outcome = expected_outcome;
+ test_set.Run();
+}
+
+template <typename TestSetType>
+void TestLinearAllOrders(int rows, int depth, int cols,
+ ExpectedOutcome expected_outcome) {
+ const std::vector<Order> orders{Order::kColMajor, Order::kRowMajor};
+
+ for (Order lhs_order : orders) {
+ for (Order rhs_order : orders) {
+ for (Order dst_order : orders) {
+ TestSetType test_set;
+ test_set.rows = rows;
+ test_set.depth = depth;
+ test_set.cols = cols;
+ test_set.lhs_order = lhs_order;
+ test_set.rhs_order = rhs_order;
+ test_set.dst_order = dst_order;
+ test_set.layout_style = LayoutStyle::kLinear;
+ test_set.expected_outcome = expected_outcome;
+ test_set.Run();
+ }
+ }
+ }
+}
+
+template <typename TestSetType>
+void TestLinearAllOrders(int rows, int depth, int cols) {
+ TestLinearAllOrders<TestSetType>(rows, depth, cols,
+ ExpectedOutcome::kSuccess);
+}
+
+} // namespace ruy
+
+#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_TEST_H_
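+// Typical usage from a test target (a sketch, not part of this header: the
+// TestSet instantiation below is hypothetical, following the templates above
+// and the f32 alias and BasicSpec from ruy/spec.h):
+//   using TestSetType = ruy::TestSet<f32, f32, ruy::BasicSpec<f32, f32>>;
+//   ruy::TestRCC<TestSetType>(/*rows=*/12, /*depth=*/34, /*cols=*/56);
+//   ruy::TestLinearAllOrders<TestSetType>(12, 34, 56);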