1 files changed, 203 insertions, 0 deletions
diff --git a/ruy/profiler/instrumentation.h b/ruy/profiler/instrumentation.h
new file mode 100644
index 0000000..a9046d4
--- /dev/null
+++ b/ruy/profiler/instrumentation.h
@@ -0,0 +1,203 @@
+/* Copyright 2020 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PROFILER_INSTRUMENTATION_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PROFILER_INSTRUMENTATION_H_
+
+#ifdef RUY_PROFILER
+#include <cstdio>
+#include <mutex>
+#include <vector>
+#endif
+
+namespace ruy {
+namespace profiler {
+
+#ifdef RUY_PROFILER
+
+// A label is how a code scope is annotated to appear in profiles.
+// The stacks that are sampled by the profiler are stacks of such labels.
+// A label consists of a literal string, plus optional integer arguments.
+class Label {
+ public:
+  Label() {}
+  template <typename... Args>
+  explicit Label(Args... args) {
+    Set(args...);
+  }
+  void Set(const char* format) {
+    format_ = format;
+    args_count_ = 0;
+  }
+  template <typename... Args>
+  void Set(const char* format, Args... args) {
+    format_ = format;
+    args_count_ = sizeof...(args);
+    SetArgs(0, args...);
+  }
+
+  void operator=(const Label& other);
+
+  bool operator==(const Label& other) const;
+
+  std::string Formatted() const;
+  const char* format() const { return format_; }
+
+ private:
+  void SetArgs(int position, int arg0) { args_[position] = arg0; }
+
+  template <typename... Args>
+  void SetArgs(int position, int arg0, Args... args) {
+    SetArgs(position, arg0);
+    SetArgs(position + 1, args...);
+  }
+
+  static constexpr int kMaxArgs = 4;
+  const char* format_ = nullptr;
+  int args_count_ = 0;
+  int args_[kMaxArgs];
+};
+
+namespace detail {
+
+// Forward-declaration, see class ThreadStack below.
+class ThreadStack;
+
+bool& GlobalIsProfilerRunning();
+
+// Returns the global vector of pointers to all stacks, there being one stack
+// per thread executing instrumented code.
+std::vector<ThreadStack*>* GlobalAllThreadStacks();
+
+// Returns the mutex to be locked around any access to GlobalAllThreadStacks().
+std::mutex* GlobalsMutex();
+
+// Returns the thread-local stack, specific to the current thread.
+ThreadStack* ThreadLocalThreadStack();
+
+// This 'stack' is what may be more appropriately called a 'pseudostack':
+// It contains Label entries that are 'manually' entered by instrumentation
+// code. It's unrelated to real call stacks.
+struct Stack {
+  std::uint32_t id = 0;
+  static constexpr int kMaxSize = 64;
+  int size = 0;
+  Label labels[kMaxSize];
+};
+
+// Returns the buffer byte size required by CopyToSample.
+int GetBufferSize(const Stack& stack);
+
+// Copies this Stack into a byte buffer, called a 'sample'.
+void CopyToBuffer(const Stack& stack, char* dst);
+
+// Populates this Stack from an existing sample buffer, typically
+// produced by CopyToSample.
+void ReadFromBuffer(const char* src, Stack* stack);
+
+// ThreadStack is meant to be used as a thread-local singleton, assigning to
+// each thread a Stack object holding its pseudo-stack of profile labels,
+// plus a mutex allowing to synchronize accesses to this pseudo-stack between
+// this thread and a possible profiler thread sampling it.
+class ThreadStack {
+ public:
+  ThreadStack();
+  ~ThreadStack();
+
+  const Stack& stack() const { return stack_; }
+
+  // Returns the mutex to lock around any access to this stack. Each stack is
+  // accessed by potentially two threads: the thread that it belongs to
+  // (which calls Push and Pop) and the profiler thread during profiling
+  // (which calls CopyToSample).
+  std::mutex& Mutex() const { return mutex_; }
+
+  // Pushes a new label on the top of this Stack.
+  template <typename... Args>
+  void Push(Args... args) {
+    // This mutex locking is needed to guard against race conditions as both
+    // the current thread and the profiler thread may be concurrently accessing
+    // this stack. In addition to that, this mutex locking also serves the other
+    // purpose of acting as a barrier (of compiler code reordering, of runtime
+    // CPU instruction reordering, and of memory access reordering), which
+    // gives a measure of correctness to this profiler. The downside is some
+    // latency. As this lock will be uncontended most of the times, the cost
+    // should be roughly that of an sequentially-consistent atomic access,
+    // comparable to an access to the level of CPU data cache that is shared
+    // among all cores, typically 60 cycles on current ARM CPUs, plus side
+    // effects from barrier instructions.
+    std::lock_guard<std::mutex> lock(mutex_);
+    // Avoid overrunning the stack, even in 'release' builds. This profiling
+    // instrumentation code should not ship in release builds anyway, the
+    // overhead of this check is negligible, and overrunning a stack array would
+    // be bad.
+    if (stack_.size >= Stack::kMaxSize) {
+      abort();
+    }
+    stack_.labels[stack_.size++].Set(args...);
+  }
+
+  // Pops the top-most label from this Stack.
+  void Pop() {
+    // See the comment in Push about this lock. While it would be tempting to
+    // try to remove this lock and just atomically decrement size_ with a
+    // store-release, that would not necessarily be a substitute for all of the
+    // purposes that this lock serves, or if it was done carefully to serve all
+    // of the same purposes, then that wouldn't be faster than this (mostly
+    // uncontended) lock.
+    std::lock_guard<std::mutex> lock(mutex_);
+    stack_.size--;
+  }
+
+ private:
+  mutable std::mutex mutex_;
+  Stack stack_;
+};
+
+}  // namespace detail
+
+// RAII user-facing way to construct Labels associated with their life scope
+// and get them pushed to / popped from the current thread stack.
+class ScopeLabel {
+ public:
+  template <typename... Args>
+  ScopeLabel(Args... args) : thread_stack_(detail::ThreadLocalThreadStack()) {
+    thread_stack_->Push(args...);
+  }
+
+  ~ScopeLabel() { thread_stack_->Pop(); }
+
+ private:
+  detail::ThreadStack* thread_stack_;
+};
+
+#else  // no RUY_PROFILER
+
+class ScopeLabel {
+ public:
+  template <typename... Args>
+  explicit ScopeLabel(Args...) {}
+
+  // This destructor is needed to consistently silence clang's -Wunused-variable
+  // which seems to trigger semi-randomly.
+  ~ScopeLabel() {}
+};
+
+#endif
+
+}  // namespace profiler
+}  // namespace ruy
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_PROFILER_INSTRUMENTATION_H_