diff options
author | Benoit Jacob <benoitjacob@google.com> | 2020-03-28 04:58:51 +0300 |
---|---|---|
committer | Benoit Jacob <benoitjacob@google.com> | 2020-03-30 23:51:39 +0300 |
commit | f7ea583082c670103fb2cebd6035b944c71d64c4 (patch) | |
tree | 4a58b4b3a210fc78776e16cf19b46e671714ac4a /ruy/prepacked_cache.h | |
parent | 299a33a5c2affb88c75726c77be6dd4491418b17 (diff) |
Move ruy's code to a ruy/ subdirectory.
The motivation is that having source files in the repository root runs into a number of corner cases with copybara setups and with external CMake build systems, so enclosing all code in ruy/ avoids that while generally making our setup much more similar to that of other related projects (TensorFlow, IREE).
PiperOrigin-RevId: 303448881
Diffstat (limited to 'ruy/prepacked_cache.h')
-rw-r--r-- | ruy/prepacked_cache.h | 130 |
1 file changed, 130 insertions, 0 deletions
diff --git a/ruy/prepacked_cache.h b/ruy/prepacked_cache.h new file mode 100644 index 0000000..eedd7e4 --- /dev/null +++ b/ruy/prepacked_cache.h @@ -0,0 +1,130 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PREPACKED_CACHE_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PREPACKED_CACHE_H_ + +#include <cstddef> +#include <iostream> +#include <map> +#include <queue> +#include <vector> + +#include "ruy/allocator.h" +#include "ruy/matrix.h" +#include "ruy/time.h" + +namespace ruy { + +namespace detail { + +// Tracks a set of blocks allocated from the underlying system allocator. +class SystemBlockAllocator { + public: + void *Alloc(std::ptrdiff_t num_bytes) { + void *p = detail::SystemAlignedAlloc(num_bytes); + blocks_.push_back(p); + return p; + } + + void Free(void *block) { + for (auto it = blocks_.begin(); it != blocks_.end(); ++it) { + if (*it == block) { + detail::SystemAlignedFree(block); + blocks_.erase(it); + return; + } + } + RUY_DCHECK(false); // Trying to free pointer we did not allocate. 
+ } + + ~SystemBlockAllocator() { + for (void *block : blocks_) { + detail::SystemAlignedFree(block); + } + } + + private: + std::vector<void *> blocks_; +}; + +} // namespace detail + +enum CachePolicy { kNoCache, kCacheLHSOnNarrowMul }; + +// "Low effort" Least Recently Used Cache for Prepacked Matrices +// A cache mechanism for prepacked matrices that ejects oldest entries. +// The implementation is "low effort" in the following ways: +// - we just linearly search for the oldest entry when doing an ejection +// - the ejection policy is very simple: if the new size would be above the +// . threshold, we will eject entries until the size is below the threshold. +// Current use cases (RNNs with GEMV operations) indicate that ejection is rare +// and memory constraints are tight, so we devote no additional storage to the +// LRU mechanism and accept O(n) search to eject oldest entry. In practice, +// the number of total entries has not been shown to be large. +// This class is not thread safe. In Ruy, memory allocation for packed matrices +// is done in a single threaded context and the actual packing activity may +// be done in a multi-threaded context. +class PrepackedCache { + public: + static constexpr int kDefaultEjectionThresholdBytes = 1 << 28; + + using CacheKey = std::pair<void *, void *>; + + using MatrixWithTimeStamp = std::pair<PrepackedMatrix, TimePoint>; + + using CacheIterator = std::map<CacheKey, MatrixWithTimeStamp>::const_iterator; + + using AlignedAllocator = detail::AlignedAllocator; + + explicit PrepackedCache( + int32_t ejection_threshold = kDefaultEjectionThresholdBytes) + : ejection_threshold_(ejection_threshold), cache_size_(0) {} + + // Looks for an entry with `key`. If found, update its time stamp. + CacheIterator FindAndUpdate(const CacheKey &key); + + // Returns end iterator for internal cache. The iterator type is appropriate + // to use with `FindAndUpdate`. 
+ CacheIterator cend() const { return cache_.end(); } + + // Returns the total size (in bytes) of data held in this cache. + int TotalSize() const { return cache_size_; } + + // All calls to get current TimePoints go through here. + // TODO(b/145625614) Profile timestamps on relevant models to see if + // this level of granularity is sufficient. CoarseNow is cheap so + // it would be nice to keep it. + TimePoint CacheNow() const { return CoarseNow(); } + + // Performs the memory allocation for the `data` and `sums` members of a + // PrepackedMatrix. + void AllocatePrepackedMatrix(PrepackedMatrix *pmatrix); + + // Adds the PrepackedMatrix to the cache, possibly ejecting other values. + void Insert(const CacheKey &key, const PrepackedMatrix &matrix); + + private: + void EjectOne(); + void DoInsert(const CacheKey &key, const PrepackedMatrix &matrix); + detail::SystemBlockAllocator allocator_; + std::map<CacheKey, MatrixWithTimeStamp> cache_; + const int32_t ejection_threshold_; + size_t cache_size_; +}; + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PREPACKED_CACHE_H_ |