diff options
Diffstat (limited to 'ruy/tune.h')
-rw-r--r-- | ruy/tune.h | 163 |
1 files changed, 163 insertions, 0 deletions
diff --git a/ruy/tune.h b/ruy/tune.h new file mode 100644 index 0000000..e6a0ee8 --- /dev/null +++ b/ruy/tune.h @@ -0,0 +1,163 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Library doing minimal CPU detection to decide what to tune asm code for. +// +// # Tuning vs Path +// +// Tunings are merely local variations of optimized code paths, that are +// drop-in replacements for each other --- the input and output data layouts +// are identical. By contrast, what ruy calls a Path dictates its own +// data layouts. For example, Path::kNeonDotprod will use different +// layouts compared to Path::kNeon; but within each, different tunings +// will share that same layout. +// +// # Tuning is for now only based on 1 bit: OutOfOrder / InOrder +// +// In practice, each of our asm code paths only needs one bit information to +// decide on tuning: whether the CPU is out-of-order or in-order. +// That is because out-of-order CPUs are by definition relatively insensitive +// to small-scale asm details (which is what "tuning" is about); and for each +// asm code path, there tends to be one main in-order CPU architecture that +// we focus our tuning effort on. 
Examples: +// * For Path::kNeon, the main in-order CPU is Cortex-A53/A55 (pre-dotprod) +// * For Path::kNeonDotprod, the main in-order CPU is Cortex-A55r1 (dotprod) +// +// Because having tuned code paths is a compromise of efficiency gains +// versus implementation effort and code size, we are happy to stop at just this +// single bit of information, OutOfOrder/InOrder, at least in the current CPU +// landscape. This could change in the future. +// +// # Implementation notes and alternatives. +// +// The current implementation uses a nano-benchmark, see tune.cc. +// That is why it's quite expensive, making caching / +// statefulness necessary (see TuningResolver class comment). +// +// An interesting alternative, which was explained to us by Marat Dukhan +// (maratek@) after this was implemented, would be to use the +// getcpu(2) system call on Linux. This returns a +// numeric CPU identifier that could be mapped to a OutOfOrder/InOrder +// classification given additional information about the CPU. Such +// additional information could be obtained by the cpuinfo library, +// https://github.com/pytorch/cpuinfo +// which obtains this information mainly from parsing /proc/cpuinfo. +// Pros: +// * Would remove the need for the relatively expensive nano-benchmark +// (dozens of microseconds, which have to be reevaluated again several +// times per second). +// * Would conceivably be more reliable. +// Cons: +// * Linux-specific. +// * Modest binary size increase (Marat mentioned the cpuinfo lib is 20k). +// * Won't support exactly 100% of devices (nonstandard /proc/cpuinfo etc). +// +// We could also have both: +// * Maybe by trying getcpu first if supported, then falling back to a +// nano-benchmark. +// * Maybe using getcpu in conjunction with the nano-benchmark to cache +// per-CPU-id nano-benchmark results. 
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_TUNE_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_TUNE_H_

#include "ruy/opt_set.h"
#include "ruy/platform.h"
#include "ruy/time.h"

// Tuning is only implemented on NEON_64 at the moment (see assembly code
// in the nano-benchmark) and not on Apple (some Apple CPUs produce incorrect
// results on in-order-tuned kernels combining ARM and NEON load instructions
// and NEON `ins` instructions).
//
// When tuning is not implemented, we simply always use Tuning::kOutOfOrder.
#if RUY_OPT_ENABLED(RUY_OPT_TUNING) && RUY_PLATFORM(NEON_64) && \
    !RUY_PLATFORM(APPLE)
#define RUY_IMPLEMENT_TUNING
#endif

namespace ruy {

// The one-bit CPU classification that tuned kernels target. See the
// file-level comment: this is a local variation within a Path, not a Path.
enum class Tuning {
  // kAuto means please use auto-detection. It's the default in the
  // user-visible parts (see Context). It's meant to be resolved to an
  // actual tuning at some point by means of TuningResolver.
  kAuto,
  // Target an out-of-order CPU. Example: ARM Cortex-A75.
  kOutOfOrder,
  // Target an in-order CPU. Example: ARM Cortex-A55.
  kInOrder
};

// Why a TuningResolver class?
//
// Ideally, this library would offer a single function,
//   Tuning GetCurrentCPUTuning();
//
// However, determining information about the current CPU is not necessarily
// cheap, so we currently cache that and only invalidate/reevaluate after
// a fixed amount of time. This need to store state is why this library
// has to expose a class, TuningResolver, not just a function.
class TuningResolver {
 public:
  TuningResolver();

  // Allows the user to specify an explicit Tuning value, bypassing auto
  // detection; or to specify Tuning::kAuto, reverting to auto detection.
  void SetTuning(Tuning tuning) { unresolved_tuning_ = tuning; }

  // Get an actual tuning --- that is the function that this class wanted to
  // be. Returns the cached result when it is still fresh; otherwise
  // reevaluates (see ResolveNow and expiry_duration_).
  Tuning Resolve();

 private:
  // Non-copyable: holds cached, time-stamped per-CPU state.
  TuningResolver(const TuningResolver&) = delete;

  // TuneTool is a demo/tool used to tweak the tuning implementation to
  // specific devices. It needs to access some finer-granularity information
  // than just the Tuning returned by Resolve. Nothing else should need
  // access to that.
  friend class TuneTool;
  // Actually runs a nano-benchmark, producing a real number called 'ratio'
  // whose meaning is generally opaque / implementation defined. Typically,
  // this would be the ratio between the latencies of two different
  // pieces of asm code differing only by the ordering of instructions,
  // revealing whether the CPU cares about such ordering details.
  // An implementation may just return a dummy value if it is not based on
  // such nanobenchmarking / ratio evaluation.
  float EvalRatio();
  // Empirically determined threshold on ratio values delineating
  // out-of-order (ratios closer to 1) from in-order (ratios farther from 1).
  // An implementation may just return a dummy value if it is not based on
  // such nanobenchmarking / ratio evaluation.
  float ThresholdRatio();
  // Perform the tuning resolution now. That may typically use EvalRatio and
  // ThresholdRatio, but an implementation may use a different approach
  // instead.
  Tuning ResolveNow();

  // The tuning as specified by the user, before actual resolution happens,
  // i.e. before querying any specifics of the current CPU.
  // The default value kAuto means try to auto-detect. Other values mean
  // bypass auto-detect, use explicit value instead. See SetTuning().
  Tuning unresolved_tuning_ = Tuning::kAuto;
  // Cached last resolved tuning.
  Tuning last_resolved_tuning_ = Tuning::kAuto;
  // Timepoint of the cached last resolved tuning, for invalidation purposes.
  TimePoint last_resolved_timepoint_;
  // Cached last resolved tunings older than this age are invalid and will be
  // reevaluated on the next Resolve() call.
  const Duration expiry_duration_;
};

}  // namespace ruy

#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_TUNE_H_