diff options
author | Benoit Jacob <benoitjacob@google.com> | 2021-01-08 02:00:18 +0300 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2021-01-08 02:00:52 +0300 |
commit | 3f655fa1dd2513da8a271a89f068df2b30a4979f (patch) | |
tree | 6b7e0afe7a16b1c7a77359c380cecc5fd847acdb | |
parent | 878283640de7946a43053e8ebf4f15114fbc9156 (diff) |
Cosmetics: class-ify TrMulTask, in particular put the trailing _ where they belong.
Also remove a useless #include in context.h.
PiperOrigin-RevId: 350645020
-rw-r--r-- | ruy/context.h | 2 | ||||
-rw-r--r-- | ruy/trmul.cc | 96 |
2 files changed, 48 insertions, 50 deletions
diff --git a/ruy/context.h b/ruy/context.h index 5dc2c7a..3de0210 100644 --- a/ruy/context.h +++ b/ruy/context.h @@ -20,8 +20,6 @@ limitations under the License. #include <cstdint> -#include "ruy/performance_advisory.h" - namespace ruy { class Ctx; diff --git a/ruy/trmul.cc b/ruy/trmul.cc index 9038d7e..5a385c0 100644 --- a/ruy/trmul.cc +++ b/ruy/trmul.cc @@ -52,62 +52,62 @@ enum class PackingStatus : std::uint8_t { }; // TrMulTask is the task that a ruy thread runs to perform the TrMul operation. -struct TrMulTask final : Task { - TrMulTask(TrMulParams* params_, const BlockMap& block_map_, - std::atomic<int>* atomic_block_id_, int thread_id_, - bool need_atomics_, - SidePair<std::atomic<PackingStatus>*> packing_status_, - TuningResolver* tuning_resolver_, Allocator* local_allocator_, - CpuInfo* cpuinfo_) - : params(params_), - block_map(block_map_), - atomic_block_id(atomic_block_id_), - thread_id(thread_id_), - need_atomics(need_atomics_), - packing_status(packing_status_), - tuning_resolver(tuning_resolver_), - local_allocator(local_allocator_), - local_already_packed{nullptr, nullptr}, - cpuinfo(cpuinfo_) {} +class TrMulTask final : public Task { + public: + TrMulTask(TrMulParams* params, const BlockMap& block_map, + std::atomic<int>* atomic_block_id, int thread_id, bool need_atomics, + SidePair<std::atomic<PackingStatus>*> packing_status, + TuningResolver* tuning_resolver, Allocator* local_allocator, + CpuInfo* cpuinfo) + : params_(params), + block_map_(block_map), + atomic_block_id_(atomic_block_id), + thread_id_(thread_id), + need_atomics_(need_atomics), + packing_status_(packing_status), + tuning_resolver_(tuning_resolver), + local_allocator_(local_allocator), + local_already_packed_{nullptr, nullptr}, + cpuinfo_(cpuinfo) {} // Thread main function. This is one thread's share of the TrMul work. void Run() override { // Allocate and initialize `local_packed`. for (Side side : {Side::kLhs, Side::kRhs}) { - if (!params->is_prepacked[side]) { - const int size = NumBlocksPerSide(side, block_map); - local_allocator->Allocate(size, &local_already_packed[side]); - memset(local_already_packed[side], 0, size * sizeof(bool)); + if (!params_->is_prepacked[side]) { + const int size = NumBlocksPerSide(side, block_map_); + local_allocator_->Allocate(size, &local_already_packed_[side]); + memset(local_already_packed_[side], 0, size * sizeof(bool)); } } - const Tuning tuning = tuning_resolver->Resolve(cpuinfo); - const int num_blocks = NumBlocks(block_map); + const Tuning tuning = tuning_resolver_->Resolve(cpuinfo_); + const int num_blocks = NumBlocks(block_map_); // Each thread starts by initially reserving the block whose id // is the thread id. - int block_id = thread_id; + int block_id = thread_id_; // Loop until all blocks have been computed. while (block_id < num_blocks) { // Reserve the next block to handle, hiding the latency of this atomic op. const int next_block_id = - atomic_block_id->fetch_add(1, std::memory_order_relaxed); + atomic_block_id_->fetch_add(1, std::memory_order_relaxed); // Get coordinates of the current block to handle, in "block space". SidePair<int> block; - GetBlockByIndex(block_map, block_id, &block); + GetBlockByIndex(block_map_, block_id, &block); // Get coordinates of the current block to handle, in matrix space. SidePair<int> start, end; - GetBlockMatrixCoords(block_map, block, &start, &end); + GetBlockMatrixCoords(block_map_, block, &start, &end); // Maybe pack the current LHS/RHS block, if not already packed. EnsurePacked(block, start, end, tuning); // Actually do matrix multiplication work - params->RunKernel(tuning, start, end); + params_->RunKernel(tuning, start, end); // Move on to the next block as obtained by the atomic increment // at the start of this while loop iteration. block_id = next_block_id; } - local_allocator->FreeAll(); + local_allocator_->FreeAll(); } private: @@ -116,11 +116,11 @@ struct TrMulTask final : Task { // If the block was not started packing, packs it and returns true. // If the block was being packed by another thread, returns false. bool TryPack(Side side, int block, int start, int end, Tuning tuning) { - if (params->is_prepacked[side]) { + if (params_->is_prepacked[side]) { return true; } - if (!local_already_packed[side][block]) { - if (need_atomics) { + if (!local_already_packed_[side][block]) { + if (need_atomics_) { // Explanation of this compare_exchange_strong operation: // This atomically performs all of the following: // 1. Read `status` with "acquire" memory order. @@ -156,14 +156,14 @@ struct TrMulTask final : Task { // compare_exchange_strong isn't such a problem. But we don't really // know for sure, that would be interesting to experiment more with. PackingStatus exchanged_status = PackingStatus::kNotStarted; - std::atomic<PackingStatus>& status = packing_status[side][block]; + std::atomic<PackingStatus>& status = packing_status_[side][block]; if (status.compare_exchange_strong( exchanged_status, PackingStatus::kInProgress, std::memory_order_acq_rel, std::memory_order_acquire)) { // In this branch, the status was kNotStarted and we just atomically // changed it to kInProgress as we are about to handle the packing // ourselves. - params->RunPack(side, tuning, start, end); + params_->RunPack(side, tuning, start, end); status.store(PackingStatus::kFinished, std::memory_order_release); } else if (exchanged_status == PackingStatus::kInProgress) { // Another thread is currently packing this block. @@ -174,9 +174,9 @@ struct TrMulTask final : Task { } else { // Single-threaded case: no need for expensive atomics, // local_already_packed is the truth already. - params->RunPack(side, tuning, start, end); + params_->RunPack(side, tuning, start, end); } - local_already_packed[side][block] = true; + local_already_packed_[side][block] = true; } return true; } @@ -205,11 +205,11 @@ struct TrMulTask final : Task { const Side runahead_side = next_runahead_side; const int runahead_block = next_runahead_block[runahead_side]; next_runahead_side = OtherSide(next_runahead_side); - if (runahead_block >= NumBlocksPerSide(runahead_side, block_map)) { + if (runahead_block >= NumBlocksPerSide(runahead_side, block_map_)) { continue; } int runahead_block_start, runahead_block_end; - GetBlockMatrixCoords(runahead_side, block_map, runahead_block, + GetBlockMatrixCoords(runahead_side, block_map_, runahead_block, &runahead_block_start, &runahead_block_end); TryPack(runahead_side, runahead_block, runahead_block_start, runahead_block_end, tuning); @@ -218,19 +218,19 @@ struct TrMulTask final : Task { } } - TrMulParams* params; - const BlockMap& block_map; - std::atomic<int>* atomic_block_id; - int thread_id; - bool need_atomics; - SidePair<std::atomic<PackingStatus>*> packing_status; - TuningResolver* tuning_resolver; - Allocator* local_allocator; + TrMulParams* params_; + const BlockMap& block_map_; + std::atomic<int>* atomic_block_id_; + int thread_id_; + bool need_atomics_; + SidePair<std::atomic<PackingStatus>*> packing_status_; + TuningResolver* tuning_resolver_; + Allocator* local_allocator_; // Local indicators of packedness to avoid the overhead of atomic ops. - SidePair<bool*> local_already_packed; + SidePair<bool*> local_already_packed_; - CpuInfo* cpuinfo; + CpuInfo* cpuinfo_; }; int GetThreadCount(Ctx* ctx, int rows, int cols, int depth) { |