diff options
Diffstat (limited to 'ruy/pmu.cc')
-rw-r--r-- | ruy/pmu.cc | 281 |
1 files changed, 281 insertions, 0 deletions
diff --git a/ruy/pmu.cc b/ruy/pmu.cc new file mode 100644 index 0000000..1d87b1f --- /dev/null +++ b/ruy/pmu.cc @@ -0,0 +1,281 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "ruy/pmu.h" + +#include "ruy/check_macros.h" + +#ifdef __linux__ +#include <asm/unistd.h> +#include <linux/perf_event.h> +#include <sys/ioctl.h> +#include <syscall.h> +#include <unistd.h> + +#include <cstdio> +#endif + +#include <algorithm> +#include <cstdint> +#include <cstdlib> +#include <cstring> + +namespace ruy { + +// Linux-specific. Not ARM-specific. +#ifdef __linux__ +class PerfEvent { + public: + PerfEvent(std::uint32_t type, std::uint64_t config) { + perf_event_attr pe; + memset(&pe, 0, sizeof(pe)); + pe.size = sizeof(pe); + pe.type = type; + pe.config = config; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + pe.inherit = 1; + fd_ = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0); + if (fd_ == -1) { + fprintf(stderr, "perf_event_open failed for config 0x%lx\n", + static_cast<unsigned long>(config)); + // abort(); + } + } + + ~PerfEvent() { + RUY_CHECK(!started_); + close(fd_); + } + + void Start() { + RUY_CHECK(!started_); + started_ = true; + ioctl(fd_, PERF_EVENT_IOC_RESET, 0); + ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0); + count_at_start_ = Read(); + } + + void Stop() { + RUY_CHECK(started_); + started_ = false; + ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0); + count_at_stop_ = Read(); + } + + std::int64_t Count() const { + RUY_CHECK(!started_); + return count_at_stop_ - count_at_start_; + } + + private: + std::int64_t Read() const { + std::int64_t count; + RUY_CHECK_NE(read(fd_, &count, sizeof(count)), -1); + return count; + } + std::int64_t count_at_start_ = -1; + std::int64_t count_at_stop_ = -1; + bool started_ = false; + int fd_ = -1; +}; +#else +// Placeholder implementation to at least compile outside of linux. +#define PERF_TYPE_RAW 0 +class PerfEvent { + public: + PerfEvent(std::uint32_t, std::uint64_t) {} + ~PerfEvent() {} + void Start() {} + void Stop() {} + std::int64_t Count() const { return 0; } +}; +#endif + +// ARM-specific. Query ARM PMU counters as Linux perf events using +// PERF_TYPE_RAW. +namespace arm_pmuv3 { + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-const-variable" + +// These event numbers are listed in the ARMv8 architecture reference manual. +constexpr std::uint16_t L1I_CACHE_REFILL = 0x01; +constexpr std::uint16_t L1I_TLB_REFILL = 0x02; +constexpr std::uint16_t L1D_CACHE_REFILL = 0x03; +constexpr std::uint16_t L1D_CACHE = 0x04; +constexpr std::uint16_t L1D_TLB_REFILL = 0x05; +constexpr std::uint16_t LD_RETIRED = 0x06; +constexpr std::uint16_t ST_RETIRED = 0x07; +constexpr std::uint16_t INST_RETIRED = 0x08; +constexpr std::uint16_t EXC_TAKEN = 0x09; +constexpr std::uint16_t EXC_RETURN = 0x0A; +constexpr std::uint16_t CID_WRITE_RETIRED = 0x0B; +constexpr std::uint16_t PC_WRITE_RETIRED = 0x0C; +constexpr std::uint16_t BR_IMMED_RETIRED = 0x0D; +constexpr std::uint16_t BR_RETURN_RETIRED = 0x0E; +constexpr std::uint16_t UNALIGNED_LDST_RETIRED = 0x0F; +constexpr std::uint16_t BR_MIS_PRED = 0x10; +constexpr std::uint16_t CPU_CYCLES = 0x11; +constexpr std::uint16_t BR_PRED = 0x12; +constexpr std::uint16_t MEM_ACCESS = 0x13; +constexpr std::uint16_t L1I_CACHE = 0x14; +constexpr std::uint16_t L1D_CACHE_WB = 0x15; +constexpr std::uint16_t L2D_CACHE = 0x16; +constexpr std::uint16_t L2D_CACHE_REFILL = 0x17; +constexpr std::uint16_t L2D_CACHE_WB = 0x18; +constexpr std::uint16_t BUS_ACCESS = 0x19; +constexpr std::uint16_t MEMORY_ERROR = 0x1A; +constexpr std::uint16_t INST_SPEC = 0x1B; +constexpr std::uint16_t TTBR_WRITE_RETIRED = 0x1C; +constexpr std::uint16_t BUS_CYCLES = 0x1D; +constexpr std::uint16_t CHAIN = 0x1E; +constexpr std::uint16_t L1D_CACHE_ALLOCATE = 0x1F; +constexpr std::uint16_t L2D_CACHE_ALLOCATE = 0x20; +constexpr std::uint16_t BR_RETIRED = 0x21; +constexpr std::uint16_t BR_MIS_PRED_RETIRED = 0x22; +constexpr std::uint16_t STALL_FRONTEND = 0x23; +constexpr std::uint16_t STALL_BACKEND = 0x24; +constexpr std::uint16_t L1D_TLB = 0x25; +constexpr std::uint16_t L1I_TLB = 0x26; +constexpr std::uint16_t L2I_CACHE = 0x27; +constexpr std::uint16_t L2I_CACHE_REFILL = 0x28; +constexpr std::uint16_t L3D_CACHE_ALLOCATE = 0x29; +constexpr std::uint16_t L3D_CACHE_REFILL = 0x2A; +constexpr std::uint16_t L3D_CACHE = 0x2B; +constexpr std::uint16_t L3D_CACHE_WB = 0x2C; +constexpr std::uint16_t L2D_TLB_REFILL = 0x2D; +constexpr std::uint16_t L2I_TLB_REFILL = 0x2E; +constexpr std::uint16_t L2D_TLB = 0x2F; +constexpr std::uint16_t L2I_TLB = 0x30; +constexpr std::uint16_t LL_CACHE = 0x32; +constexpr std::uint16_t LL_CACHE_MISS = 0x33; +constexpr std::uint16_t DTLB_WALK = 0x34; +constexpr std::uint16_t LL_CACHE_RD = 0x36; +constexpr std::uint16_t LL_CACHE_MISS_RD = 0x37; + +// Additional implementation-defined events found by googling around. +constexpr std::uint16_t L1D_CACHE_RD = 0x40; +constexpr std::uint16_t L1D_CACHE_REFILL_RD = 0x42; +constexpr std::uint16_t L1D_TLB_REFILL_RD = 0x4C; +constexpr std::uint16_t L1D_TLB_RD = 0x4E; +constexpr std::uint16_t L2D_CACHE_RD = 0x50; +constexpr std::uint16_t L2D_CACHE_REFILL_RD = 0x52; +constexpr std::uint16_t BUS_ACCESS_RD = 0x60; +constexpr std::uint16_t MEM_ACCESS_RD = 0x66; +constexpr std::uint16_t L3D_CACHE_RD = 0xA0; +constexpr std::uint16_t L3D_CACHE_REFILL_RD = 0xA2; + +#pragma GCC diagnostic pop + +}; // namespace arm_pmuv3 + +class PmuEventsPrivate { + public: + PmuEventsPrivate() + : l1d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_REFILL), + l2d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_REFILL), + l3d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L3D_CACHE_REFILL), + ll_cache_miss(PERF_TYPE_RAW, arm_pmuv3::LL_CACHE_MISS), + l1d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_TLB_REFILL), + l2d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_TLB_REFILL), + stall_frontend(PERF_TYPE_RAW, arm_pmuv3::STALL_FRONTEND), + stall_backend(PERF_TYPE_RAW, arm_pmuv3::STALL_BACKEND), + br_mis_pred(PERF_TYPE_RAW, arm_pmuv3::BR_MIS_PRED) {} + + private: + friend class PmuEvents; + PerfEvent l1d_cache_refill; + PerfEvent l2d_cache_refill; + PerfEvent l3d_cache_refill; + PerfEvent ll_cache_miss; + PerfEvent l1d_tlb_refill; + PerfEvent l2d_tlb_refill; + PerfEvent stall_frontend; + PerfEvent stall_backend; + PerfEvent br_mis_pred; +}; + +PmuEvents::PmuEvents() : priv(new PmuEventsPrivate) {} +PmuEvents::~PmuEvents() { delete priv; } + +void PmuEvents::StartRecording() { + priv->l1d_cache_refill.Start(); + priv->l2d_cache_refill.Start(); + priv->l3d_cache_refill.Start(); + priv->ll_cache_miss.Start(); + priv->l1d_tlb_refill.Start(); + priv->l2d_tlb_refill.Start(); + priv->stall_frontend.Start(); + priv->stall_backend.Start(); + priv->br_mis_pred.Start(); +} + +void PmuEvents::StopRecording() { + priv->l1d_cache_refill.Stop(); + priv->l2d_cache_refill.Stop(); + priv->l3d_cache_refill.Stop(); + priv->ll_cache_miss.Stop(); + priv->l1d_tlb_refill.Stop(); + priv->l2d_tlb_refill.Stop(); + priv->stall_frontend.Stop(); + priv->stall_backend.Stop(); + priv->br_mis_pred.Stop(); +} + +float PmuEvents::BranchMispredictionCount() const { + return static_cast<float>(priv->br_mis_pred.Count()); +} + +float PmuEvents::FrontendStallCount() const { + return static_cast<float>(priv->stall_frontend.Count()); +} + +float PmuEvents::BackendStallCount() const { + return static_cast<float>(priv->stall_backend.Count()); +} + +float PmuEvents::L1RefillCount() const { + return static_cast<float>(priv->l1d_cache_refill.Count()); +} + +float PmuEvents::L2RefillCount() const { + return static_cast<float>(priv->l2d_cache_refill.Count()); +} + +float PmuEvents::L3RefillCount() const { + // Important: this was discovered in the context of the above experiments, + // which also tested the _RD variants of these counters. So it's possible that + // it's just not needed here with the default (non _RD) counters. + // + // Some CPUs implement LL_CACHE_MISS[_RD], some implement + // L3D_CACHE_REFILL[_RD]. It seems that either one of these two counters is + // zero, or they roughly both agree with each other. Therefore, taking the max + // of them is a reasonable way to get something more portable across various + // CPUs. + return static_cast<float>( + std::max(priv->l3d_cache_refill.Count(), priv->ll_cache_miss.Count())); +} + +float PmuEvents::L1TLBRefillCount() const { + return static_cast<float>(priv->l1d_tlb_refill.Count()); +} + +float PmuEvents::L2TLBRefillCount() const { + return static_cast<float>(priv->l2d_tlb_refill.Count()); +} + +} // namespace ruy |