Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/google/ruy.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'ruy/pmu.cc')
-rw-r--r-- ruy/pmu.cc 281
1 files changed, 281 insertions, 0 deletions
diff --git a/ruy/pmu.cc b/ruy/pmu.cc
new file mode 100644
index 0000000..1d87b1f
--- /dev/null
+++ b/ruy/pmu.cc
@@ -0,0 +1,281 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ruy/pmu.h"
+
+#include "ruy/check_macros.h"
+
+#ifdef __linux__
+#include <asm/unistd.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <cstdio>
+#endif
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+namespace ruy {
+
+// Linux-specific. Not ARM-specific.
+#ifdef __linux__
+// Wraps a single Linux perf event counter opened with perf_event_open(2),
+// using pid = 0 and cpu = -1.
+//
+// Usage: Start(), run the code to measure, Stop(), then Count() returns the
+// difference between the values read at Stop() and at Start().
+//
+// If the counter cannot be opened, a message is printed to stderr and the
+// object stays usable: Start()/Stop() skip the kernel calls and Count()
+// returns 0, rather than failing a check on the first read of an invalid
+// file descriptor.
+class PerfEvent {
+ public:
+  // `type` and `config` are stored into the perf_event_attr fields of the
+  // same names.
+  PerfEvent(std::uint32_t type, std::uint64_t config) {
+    perf_event_attr pe;
+    memset(&pe, 0, sizeof(pe));
+    pe.size = sizeof(pe);
+    pe.type = type;
+    pe.config = config;
+    // The counter is created disabled; Start() enables it.
+    pe.disabled = 1;
+    pe.exclude_kernel = 1;
+    pe.exclude_hv = 1;
+    pe.inherit = 1;
+    // Trailing arguments: pid = 0, cpu = -1, group_fd = -1, flags = 0.
+    // syscall() returns long; a file descriptor or -1 fits in an int.
+    fd_ = static_cast<int>(syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0));
+    if (fd_ == -1) {
+      fprintf(stderr, "perf_event_open failed for config 0x%lx\n",
+              static_cast<unsigned long>(config));
+      // abort();
+    }
+  }
+
+  // This object owns fd_; copying it would close the descriptor twice.
+  PerfEvent(const PerfEvent&) = delete;
+  PerfEvent& operator=(const PerfEvent&) = delete;
+
+  // Must not be destroyed between Start() and Stop().
+  ~PerfEvent() {
+    RUY_CHECK(!started_);
+    if (IsOpen()) {
+      close(fd_);
+    }
+  }
+
+  // Resets and enables the counter, then records its starting value.
+  void Start() {
+    RUY_CHECK(!started_);
+    started_ = true;
+    if (IsOpen()) {
+      ioctl(fd_, PERF_EVENT_IOC_RESET, 0);
+      ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
+    }
+    count_at_start_ = Read();
+  }
+
+  // Disables the counter, then records its final value.
+  void Stop() {
+    RUY_CHECK(started_);
+    started_ = false;
+    if (IsOpen()) {
+      ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
+    }
+    count_at_stop_ = Read();
+  }
+
+  // Returns the value read at Stop() minus the value read at Start(). Must
+  // not be called between Start() and Stop().
+  std::int64_t Count() const {
+    RUY_CHECK(!started_);
+    return count_at_stop_ - count_at_start_;
+  }
+
+ private:
+  // True if perf_event_open succeeded in the constructor.
+  bool IsOpen() const { return fd_ != -1; }
+
+  // Reads the current counter value. Returns 0 if the counter is not open.
+  std::int64_t Read() const {
+    if (!IsOpen()) {
+      return 0;
+    }
+    // Zero-initialized so that a read which transfers no data still yields a
+    // defined value.
+    std::int64_t count = 0;
+    RUY_CHECK_NE(read(fd_, &count, sizeof(count)), -1);
+    return count;
+  }
+
+  std::int64_t count_at_start_ = -1;
+  std::int64_t count_at_stop_ = -1;
+  bool started_ = false;
+  int fd_ = -1;
+};
+#else
+// Placeholder implementation to at least compile outside of linux.
+#define PERF_TYPE_RAW 0
+// No-op stand-in with the same member functions as the Linux PerfEvent:
+// the constructor ignores its arguments, Start() and Stop() do nothing, and
+// Count() is always 0.
+class PerfEvent {
+ public:
+  PerfEvent(std::uint32_t /*type*/, std::uint64_t /*config*/) {}
+  ~PerfEvent() {}
+
+  void Start() {}
+  void Stop() {}
+
+  std::int64_t Count() const {
+    return 0;
+  }
+};
+#endif
+
+// ARM-specific. Query ARM PMU counters as Linux perf events using
+// PERF_TYPE_RAW.
+// Event numbers used as the `config` value of PERF_TYPE_RAW perf events.
+// Only a few of these constants are referenced in this file; the others are
+// kept for reference, which is why -Wunused-const-variable is silenced
+// around the list.
+namespace arm_pmuv3 {
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-const-variable"
+
+// These event numbers are listed in the ARMv8 architecture reference manual.
+constexpr std::uint16_t L1I_CACHE_REFILL = 0x01;
+constexpr std::uint16_t L1I_TLB_REFILL = 0x02;
+constexpr std::uint16_t L1D_CACHE_REFILL = 0x03;
+constexpr std::uint16_t L1D_CACHE = 0x04;
+constexpr std::uint16_t L1D_TLB_REFILL = 0x05;
+constexpr std::uint16_t LD_RETIRED = 0x06;
+constexpr std::uint16_t ST_RETIRED = 0x07;
+constexpr std::uint16_t INST_RETIRED = 0x08;
+constexpr std::uint16_t EXC_TAKEN = 0x09;
+constexpr std::uint16_t EXC_RETURN = 0x0A;
+constexpr std::uint16_t CID_WRITE_RETIRED = 0x0B;
+constexpr std::uint16_t PC_WRITE_RETIRED = 0x0C;
+constexpr std::uint16_t BR_IMMED_RETIRED = 0x0D;
+constexpr std::uint16_t BR_RETURN_RETIRED = 0x0E;
+constexpr std::uint16_t UNALIGNED_LDST_RETIRED = 0x0F;
+constexpr std::uint16_t BR_MIS_PRED = 0x10;
+constexpr std::uint16_t CPU_CYCLES = 0x11;
+constexpr std::uint16_t BR_PRED = 0x12;
+constexpr std::uint16_t MEM_ACCESS = 0x13;
+constexpr std::uint16_t L1I_CACHE = 0x14;
+constexpr std::uint16_t L1D_CACHE_WB = 0x15;
+constexpr std::uint16_t L2D_CACHE = 0x16;
+constexpr std::uint16_t L2D_CACHE_REFILL = 0x17;
+constexpr std::uint16_t L2D_CACHE_WB = 0x18;
+constexpr std::uint16_t BUS_ACCESS = 0x19;
+constexpr std::uint16_t MEMORY_ERROR = 0x1A;
+constexpr std::uint16_t INST_SPEC = 0x1B;
+constexpr std::uint16_t TTBR_WRITE_RETIRED = 0x1C;
+constexpr std::uint16_t BUS_CYCLES = 0x1D;
+constexpr std::uint16_t CHAIN = 0x1E;
+constexpr std::uint16_t L1D_CACHE_ALLOCATE = 0x1F;
+constexpr std::uint16_t L2D_CACHE_ALLOCATE = 0x20;
+constexpr std::uint16_t BR_RETIRED = 0x21;
+constexpr std::uint16_t BR_MIS_PRED_RETIRED = 0x22;
+constexpr std::uint16_t STALL_FRONTEND = 0x23;
+constexpr std::uint16_t STALL_BACKEND = 0x24;
+constexpr std::uint16_t L1D_TLB = 0x25;
+constexpr std::uint16_t L1I_TLB = 0x26;
+constexpr std::uint16_t L2I_CACHE = 0x27;
+constexpr std::uint16_t L2I_CACHE_REFILL = 0x28;
+constexpr std::uint16_t L3D_CACHE_ALLOCATE = 0x29;
+constexpr std::uint16_t L3D_CACHE_REFILL = 0x2A;
+constexpr std::uint16_t L3D_CACHE = 0x2B;
+constexpr std::uint16_t L3D_CACHE_WB = 0x2C;
+constexpr std::uint16_t L2D_TLB_REFILL = 0x2D;
+constexpr std::uint16_t L2I_TLB_REFILL = 0x2E;
+constexpr std::uint16_t L2D_TLB = 0x2F;
+constexpr std::uint16_t L2I_TLB = 0x30;
+constexpr std::uint16_t LL_CACHE = 0x32;
+constexpr std::uint16_t LL_CACHE_MISS = 0x33;
+constexpr std::uint16_t DTLB_WALK = 0x34;
+constexpr std::uint16_t LL_CACHE_RD = 0x36;
+constexpr std::uint16_t LL_CACHE_MISS_RD = 0x37;
+
+// Additional implementation-defined events found by googling around.
+constexpr std::uint16_t L1D_CACHE_RD = 0x40;
+constexpr std::uint16_t L1D_CACHE_REFILL_RD = 0x42;
+constexpr std::uint16_t L1D_TLB_REFILL_RD = 0x4C;
+constexpr std::uint16_t L1D_TLB_RD = 0x4E;
+constexpr std::uint16_t L2D_CACHE_RD = 0x50;
+constexpr std::uint16_t L2D_CACHE_REFILL_RD = 0x52;
+constexpr std::uint16_t BUS_ACCESS_RD = 0x60;
+constexpr std::uint16_t MEM_ACCESS_RD = 0x66;
+constexpr std::uint16_t L3D_CACHE_RD = 0xA0;
+constexpr std::uint16_t L3D_CACHE_REFILL_RD = 0xA2;
+
+#pragma GCC diagnostic pop
+
+}  // namespace arm_pmuv3
+
+// Bundles the PerfEvent counters that PmuEvents starts, stops and reports.
+// Each counter is a PERF_TYPE_RAW event configured with one arm_pmuv3 event
+// number; the counters are constructed in member declaration order.
+class PmuEventsPrivate {
+ public:
+  PmuEventsPrivate() = default;
+
+ private:
+  // PmuEvents accesses the counters directly.
+  friend class PmuEvents;
+
+  PerfEvent l1d_cache_refill{PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_REFILL};
+  PerfEvent l2d_cache_refill{PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_REFILL};
+  PerfEvent l3d_cache_refill{PERF_TYPE_RAW, arm_pmuv3::L3D_CACHE_REFILL};
+  PerfEvent ll_cache_miss{PERF_TYPE_RAW, arm_pmuv3::LL_CACHE_MISS};
+  PerfEvent l1d_tlb_refill{PERF_TYPE_RAW, arm_pmuv3::L1D_TLB_REFILL};
+  PerfEvent l2d_tlb_refill{PERF_TYPE_RAW, arm_pmuv3::L2D_TLB_REFILL};
+  PerfEvent stall_frontend{PERF_TYPE_RAW, arm_pmuv3::STALL_FRONTEND};
+  PerfEvent stall_backend{PERF_TYPE_RAW, arm_pmuv3::STALL_BACKEND};
+  PerfEvent br_mis_pred{PERF_TYPE_RAW, arm_pmuv3::BR_MIS_PRED};
+};
+
+// Allocates the private counter set; the destructor deletes it.
+PmuEvents::PmuEvents()
+    : priv(new PmuEventsPrivate) {
+}
+// Deletes the counter set allocated by the constructor.
+PmuEvents::~PmuEvents() {
+  delete priv;
+}
+
+// Calls Start() on every counter held by `priv`, in the order listed below.
+void PmuEvents::StartRecording() {
+  PerfEvent* const counters[] = {
+      &priv->l1d_cache_refill, &priv->l2d_cache_refill,
+      &priv->l3d_cache_refill, &priv->ll_cache_miss,
+      &priv->l1d_tlb_refill,   &priv->l2d_tlb_refill,
+      &priv->stall_frontend,   &priv->stall_backend,
+      &priv->br_mis_pred,
+  };
+  for (PerfEvent* counter : counters) {
+    counter->Start();
+  }
+}
+
+// Calls Stop() on every counter held by `priv`, in the order listed below.
+void PmuEvents::StopRecording() {
+  PerfEvent* const counters[] = {
+      &priv->l1d_cache_refill, &priv->l2d_cache_refill,
+      &priv->l3d_cache_refill, &priv->ll_cache_miss,
+      &priv->l1d_tlb_refill,   &priv->l2d_tlb_refill,
+      &priv->stall_frontend,   &priv->stall_backend,
+      &priv->br_mis_pred,
+  };
+  for (PerfEvent* counter : counters) {
+    counter->Stop();
+  }
+}
+
+// Returns the Count() of the BR_MIS_PRED counter, converted to float.
+float PmuEvents::BranchMispredictionCount() const {
+  const std::int64_t count = priv->br_mis_pred.Count();
+  return static_cast<float>(count);
+}
+
+// Returns the Count() of the STALL_FRONTEND counter, converted to float.
+float PmuEvents::FrontendStallCount() const {
+  const std::int64_t count = priv->stall_frontend.Count();
+  return static_cast<float>(count);
+}
+
+// Returns the Count() of the STALL_BACKEND counter, converted to float.
+float PmuEvents::BackendStallCount() const {
+  const std::int64_t count = priv->stall_backend.Count();
+  return static_cast<float>(count);
+}
+
+// Returns the Count() of the L1D_CACHE_REFILL counter, converted to float.
+float PmuEvents::L1RefillCount() const {
+  const std::int64_t count = priv->l1d_cache_refill.Count();
+  return static_cast<float>(count);
+}
+
+// Returns the Count() of the L2D_CACHE_REFILL counter, converted to float.
+float PmuEvents::L2RefillCount() const {
+  const std::int64_t count = priv->l2d_cache_refill.Count();
+  return static_cast<float>(count);
+}
+
+// Returns the larger of the L3D_CACHE_REFILL and LL_CACHE_MISS counts,
+// converted to float.
+float PmuEvents::L3RefillCount() const {
+  // Rationale, carried over from the original author's note: some CPUs
+  // implement LL_CACHE_MISS[_RD] while others implement
+  // L3D_CACHE_REFILL[_RD]. It was observed that either one of the two
+  // counters is zero or the two roughly agree, so taking their max is a
+  // reasonable way to get a value that is more portable across CPUs.
+  //
+  // That observation came from experiments that also tested the _RD variants
+  // of these counters, so the max may not actually be needed with the default
+  // (non _RD) counters used here.
+  const std::int64_t l3d_refills = priv->l3d_cache_refill.Count();
+  const std::int64_t ll_misses = priv->ll_cache_miss.Count();
+  return static_cast<float>(std::max(l3d_refills, ll_misses));
+}
+
+// Returns the Count() of the L1D_TLB_REFILL counter, converted to float.
+float PmuEvents::L1TLBRefillCount() const {
+  const std::int64_t count = priv->l1d_tlb_refill.Count();
+  return static_cast<float>(count);
+}
+
+// Returns the Count() of the L2D_TLB_REFILL counter, converted to float.
+float PmuEvents::L2TLBRefillCount() const {
+  const std::int64_t count = priv->l2d_tlb_refill.Count();
+  return static_cast<float>(count);
+}
+
+} // namespace ruy