github.com/google/ruy.git
author     Benoit Jacob <benoitjacob@google.com>  2021-01-20 05:07:54 +0300
committer  Benoit Jacob <benoitjacob@google.com>  2021-01-20 06:37:35 +0300
commit     20b5eb06ebc29c30a5ed460b658fe48d1afc119e (patch)
tree       192a231991a43336b9049b38dd651b35d4157935
parent     4ed621615d2f0a54410976cdaaae22779eaec664 (diff)
Add a tracing framework (really just logging).

This isn't a performance tracing framework (unlike the old ruy tracing). This is about understanding what happens inside a ruy::Mul, with a view toward documenting how ruy works.

Added a 'parametrized_example' to help play with this tracing on any flavor of ruy::Mul call. It also serves as a more elaborate example of how to call ruy::Mul, and as a single binary containing several different instantiations of the ruy::Mul template, which is useful for measuring binary size and showing a breakdown of ruy symbols in a document.

A few code changes beyond tracing slipped in:
- Improved the logic determining the traversal order in MakeBlockMap: in rectangular cases, since we first do the top-level rectangularness subdivision with linear traversal anyway, the traversal order only applies within each subdivision past that, so it should be based on sizes already divided by rectangularness. In practice this nudges 1000x400x2000 from kFractalHilbert to kFractalU on Pixel4, without an observable perf difference in that case.
- Removed the old RUY_MAKEBLOCKMAP_DEBUG logging code: superseded by tracing. Kept only a minimal hook to force a block_size_log2 choice.
- Wrote new comments on BlockMap internals.
- Fixed Ctx::SetRuntimeEnabledPaths to behave as documented: passing Path::kNone reverts to the default behavior (auto-detect).
- Exposed Context::set_runtime_enabled_paths.
- Renamed UseSimpleLoop -> GetUseSimpleLoop (easier to read in the trace).

PiperOrigin-RevId: 352695092
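To make the tracing concrete, here is a minimal, hypothetical usage sketch (not part of this commit). It assumes the library is built with the RUY_TRACE preprocessor symbol defined, so that ruy/trace.h (added below) compiles the logging in, and that output is steered by the RUY_TRACE_FILE and RUY_TRACE_HTML environment variables as implemented in that header; the matrix setup follows example/parametrized_example.cc.

// Hypothetical client program; build together with ruy and -DRUY_TRACE.
#include <vector>

#include "ruy/context.h"
#include "ruy/matrix.h"
#include "ruy/mul_params.h"
#include "ruy/ruy.h"

int main() {
  constexpr int kRows = 100, kDepth = 100, kCols = 100;
  std::vector<float> lhs_data(kRows * kDepth, 1.0f);
  std::vector<float> rhs_data(kDepth * kCols, 1.0f);
  std::vector<float> dst_data(kRows * kCols);

  ruy::Matrix<float> lhs, rhs, dst;
  ruy::MakeSimpleLayout(kRows, kDepth, ruy::Order::kRowMajor, lhs.mutable_layout());
  ruy::MakeSimpleLayout(kDepth, kCols, ruy::Order::kColMajor, rhs.mutable_layout());
  ruy::MakeSimpleLayout(kRows, kCols, ruy::Order::kColMajor, dst.mutable_layout());
  lhs.set_data(lhs_data.data());
  rhs.set_data(rhs_data.data());
  dst.set_data(dst_data.data());

  ruy::Context context;
  ruy::MulParams<float, float> mul_params;
  // Each Mul call appends entries to per-thread traces; they are printed at
  // program exit by the AllThreadTraces destructor in trace.h.
  ruy::Mul(lhs, rhs, mul_params, &context, &dst);
  return 0;
}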
-rw-r--r--   example/BUILD                       6
-rw-r--r--   example/parametrized_example.cc   198
-rw-r--r--   ruy/BUILD                          30
-rw-r--r--   ruy/block_map.cc                  140
-rw-r--r--   ruy/block_map.h                    14
-rw-r--r--   ruy/context.cc                      4
-rw-r--r--   ruy/context.h                      11
-rw-r--r--   ruy/create_trmul_params.h           9
-rw-r--r--   ruy/ctx.cc                         14
-rw-r--r--   ruy/frontend.cc                     1
-rw-r--r--   ruy/frontend.h                      3
-rw-r--r--   ruy/kernel.h                        3
-rw-r--r--   ruy/pack.h                          3
-rw-r--r--   ruy/platform.h                      2
-rw-r--r--   ruy/prepare_packed_matrices.cc      4
-rw-r--r--   ruy/ruy.h                           3
-rw-r--r--   ruy/thread_pool.cc                 16
-rw-r--r--   ruy/trace.h                       836
-rw-r--r--   ruy/trmul.cc                       48
19 files changed, 1260 insertions(+), 85 deletions(-)
diff --git a/example/BUILD b/example/BUILD
index aa85701..738c33e 100644
--- a/example/BUILD
+++ b/example/BUILD
@@ -8,3 +8,9 @@ cc_binary(
srcs = ["example.cc"],
deps = ["//ruy"],
)
+
+cc_binary(
+ name = "parametrized_example",
+ srcs = ["parametrized_example.cc"],
+ deps = ["//ruy"],
+)
diff --git a/example/parametrized_example.cc b/example/parametrized_example.cc
new file mode 100644
index 0000000..ef6ad23
--- /dev/null
+++ b/example/parametrized_example.cc
@@ -0,0 +1,198 @@
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "ruy/context.h"
+#include "ruy/matrix.h"
+#include "ruy/mul_params.h"
+#include "ruy/ruy.h"
+
+template <typename... Dst>
+void read_cmdline_args(bool help, int argc, char* argv[], const char* name,
+ const char* format, const char* default_value,
+ const char* allowed_values, Dst... dst) {
+ if (help) {
+ fprintf(stderr, "%-20s %-12s %-16s %s\n", name, format, default_value,
+ allowed_values ? allowed_values : "");
+ return;
+ }
+ const char* value = default_value;
+ for (int i = 1; i < argc; i++) {
+ if (std::strstr(argv[i], name) == argv[i]) {
+ const char* equal_sign = std::strchr(argv[i], '=');
+ if (equal_sign == argv[i] + std::strlen(name)) {
+ value = equal_sign + 1;
+ }
+ break;
+ }
+ }
+ if (allowed_values) {
+ if (!std::strstr(allowed_values, value)) {
+ fprintf(stderr, "Illegal value %s. The legal values are %s.\n", value,
+ allowed_values);
+ exit(1);
+ }
+ }
+ if (sizeof...(Dst) != sscanf(value, format, dst...)) {
+ fprintf(stderr, "Failed to parse %s\n", value);
+ exit(1);
+ }
+}
+
+struct Params {
+ char types[100];
+ int m, k, n; // matmul shape m*k*n
+ int paths;
+ int num_threads;
+ int repeat;
+ int lhs_cache_policy;
+ int rhs_cache_policy;
+ int lhs_stride;
+ int rhs_stride;
+ int dst_stride;
+ int lhs_zero_point;
+ int rhs_zero_point;
+ int dst_zero_point;
+ char lhs_order[100];
+ char rhs_order[100];
+ char dst_order[100];
+};
+
+template <typename LhsType, typename RhsType, typename DstType>
+void run(const Params& params) {
+ using AccumType =
+ typename std::conditional<std::is_floating_point<DstType>::value, DstType,
+ std::int32_t>::type;
+
+ ruy::Matrix<LhsType> lhs;
+ ruy::Matrix<RhsType> rhs;
+ ruy::Matrix<DstType> dst;
+
+ auto parse_order = [](const char* name) {
+ if (!std::strcmp(name, "row-major")) {
+ return ruy::Order::kRowMajor;
+ } else if (!std::strcmp(name, "column-major")) {
+ return ruy::Order::kColMajor;
+ } else {
+ fprintf(stderr, "Failed to parse %s\n", name);
+ exit(1);
+ }
+ };
+
+ auto make_layout = [](int rows, int cols, int stride, ruy::Order order,
+ ruy::Layout* layout) {
+ layout->set_rows(rows);
+ layout->set_cols(cols);
+ layout->set_order(order);
+ int base_stride = order == ruy::Order::kRowMajor ? cols : rows;
+ layout->set_stride(stride ? stride : base_stride);
+ };
+
+ make_layout(params.m, params.k, params.lhs_stride,
+ parse_order(params.lhs_order), lhs.mutable_layout());
+ make_layout(params.k, params.n, params.rhs_stride,
+ parse_order(params.rhs_order), rhs.mutable_layout());
+ make_layout(params.m, params.n, params.dst_stride,
+ parse_order(params.dst_order), dst.mutable_layout());
+
+ lhs.set_zero_point(params.lhs_zero_point);
+ rhs.set_zero_point(params.rhs_zero_point);
+ dst.set_zero_point(params.dst_zero_point);
+
+ lhs.set_cache_policy(static_cast<ruy::CachePolicy>(params.lhs_cache_policy));
+ rhs.set_cache_policy(static_cast<ruy::CachePolicy>(params.rhs_cache_policy));
+
+ auto flat_size = [](const ruy::Layout& layout) {
+ int outer_size =
+ layout.order() == ruy::Order::kRowMajor ? layout.rows() : layout.cols();
+ return outer_size * layout.stride();
+ };
+
+ std::vector<LhsType> lhs_buf(flat_size(lhs.layout()));
+ std::vector<RhsType> rhs_buf(flat_size(rhs.layout()));
+ std::vector<DstType> dst_buf(flat_size(dst.layout()));
+
+ lhs.set_data(lhs_buf.data());
+ rhs.set_data(rhs_buf.data());
+ dst.set_data(dst_buf.data());
+
+ ruy::Context context;
+ context.set_max_num_threads(params.num_threads);
+ context.set_runtime_enabled_paths(static_cast<ruy::Path>(params.paths));
+
+ ruy::MulParams<AccumType, DstType> mul_params;
+ // Here an actual application might set some mul_params fields.
+ // Quantization multipliers, bias-vector, clamp bounds, etc.
+
+ for (int r = 0; r < params.repeat; r++) {
+ ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+ }
+}
+
+int main(int argc, char* argv[]) {
+ bool help = argc == 1 || (argc == 2 && !strcmp(argv[1], "--help"));
+ if (help) {
+ fprintf(stderr, "Command-line flags (all in the form --flag=value):\n");
+ fprintf(stderr, "%-20s %-12s %-16s %s\n", "flag", "format", "default",
+ "allowed");
+ }
+ Params params;
+ const char* allowed_types =
+ "f32xf32->f32, i8xi8->i8, i8xi8->i16, i8xi8->i32, u8xu8->i16, u8xi8->u8";
+ const char* allowed_orders = "row-major, column-major";
+ read_cmdline_args(help, argc, argv, "--types", "%s", "f32xf32->f32",
+ allowed_types, &params.types);
+ read_cmdline_args(help, argc, argv, "--shape", "%dx%dx%d", "100x100x100",
+ nullptr, &params.m, &params.k, &params.n);
+ read_cmdline_args(help, argc, argv, "--paths", "%x", "0", nullptr,
+ &params.paths);
+ read_cmdline_args(help, argc, argv, "--num_threads", "%d", "1", nullptr,
+ &params.num_threads);
+ read_cmdline_args(help, argc, argv, "--repeat", "%d", "1", nullptr,
+ &params.repeat);
+ read_cmdline_args(help, argc, argv, "--lhs_cache_policy", "%d", "0",
+ "0, 1, 2, 3", &params.lhs_cache_policy);
+ read_cmdline_args(help, argc, argv, "--rhs_cache_policy", "%d", "0",
+ "0, 1, 2, 3", &params.rhs_cache_policy);
+ read_cmdline_args(help, argc, argv, "--lhs_stride", "%d", "0", nullptr,
+ &params.lhs_stride);
+ read_cmdline_args(help, argc, argv, "--rhs_stride", "%d", "0", nullptr,
+ &params.rhs_stride);
+ read_cmdline_args(help, argc, argv, "--dst_stride", "%d", "0", nullptr,
+ &params.dst_stride);
+ read_cmdline_args(help, argc, argv, "--lhs_zero_point", "%d", "0", nullptr,
+ &params.lhs_zero_point);
+ read_cmdline_args(help, argc, argv, "--rhs_zero_point", "%d", "0", nullptr,
+ &params.rhs_zero_point);
+ read_cmdline_args(help, argc, argv, "--dst_zero_point", "%d", "0", nullptr,
+ &params.dst_zero_point);
+ read_cmdline_args(help, argc, argv, "--lhs_order", "%s", "row-major",
+ allowed_orders, &params.lhs_order);
+ read_cmdline_args(help, argc, argv, "--rhs_order", "%s", "row-major",
+ allowed_orders, &params.rhs_order);
+ read_cmdline_args(help, argc, argv, "--dst_order", "%s", "row-major",
+ allowed_orders, &params.dst_order);
+
+ if (help) {
+ exit(1);
+ }
+
+ if (!strcmp(params.types, "f32xf32->f32")) {
+ run<float, float, float>(params);
+ } else if (!strcmp(params.types, "i8xi8->i8")) {
+ run<std::int8_t, std::int8_t, std::int8_t>(params);
+ } else if (!strcmp(params.types, "i8xi8->i16")) {
+ run<std::int8_t, std::int8_t, std::int16_t>(params);
+ } else if (!strcmp(params.types, "i8xi8->i32")) {
+ run<std::int8_t, std::int8_t, std::int32_t>(params);
+ } else if (!strcmp(params.types, "u8xu8->i16")) {
+ run<std::uint8_t, std::uint8_t, std::int16_t>(params);
+ } else if (!strcmp(params.types, "u8xi8->u8")) {
+ run<std::uint8_t, std::int8_t, std::uint8_t>(params);
+ } else {
+ fprintf(stderr, "Unknown types: %s\n", params.types);
+ exit(1);
+ }
+}
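For reference, a couple of hypothetical invocations of the new binary, assuming a Bazel workspace rooted at the repository (flag names and defaults are taken from the code above; the quotes keep the shell from treating '->' as a redirection):

bazel run //example:parametrized_example -- --help
bazel run //example:parametrized_example -- '--types=i8xi8->i32' --shape=250x500x1000 --num_threads=4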
diff --git a/ruy/BUILD b/ruy/BUILD
index 784a3b8..4cbeee1 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD
@@ -94,6 +94,19 @@ selects.config_setting_group(
)
cc_library(
+ name = "trace",
+ hdrs = ["trace.h"],
+ copts = ruy_copts(),
+ deps = [
+ ":mat",
+ ":matrix",
+ ":path",
+ ":platform",
+ ":side_pair",
+ ],
+)
+
+cc_library(
name = "platform",
hdrs = ["platform.h"],
copts = ruy_copts(),
@@ -294,6 +307,7 @@ cc_library(
":opt_set",
":side_pair",
":size_util",
+ ":trace",
"//ruy/profiler:instrumentation",
],
)
@@ -344,6 +358,7 @@ cc_library(
":blocking_counter",
":check_macros",
":time",
+ ":trace",
":wait",
],
)
@@ -776,6 +791,7 @@ cc_library(
":platform",
":side_pair",
":size_util",
+ ":trace",
":tune",
"//ruy/profiler:instrumentation",
],
@@ -799,6 +815,7 @@ cc_library(
":pack_common",
":path",
":platform",
+ ":trace",
":tune",
"//ruy/profiler:instrumentation",
],
@@ -882,6 +899,7 @@ cc_library(
":platform",
":prepacked_cache",
":thread_pool",
+ ":trace",
":tune",
],
)
@@ -945,6 +963,7 @@ cc_library(
":side_pair",
":size_util",
":thread_pool",
+ ":trace",
":trmul_params",
":tune",
"//ruy/profiler:instrumentation",
@@ -962,6 +981,7 @@ cc_library(
":matrix",
":prepacked_cache",
":side_pair",
+ ":trace",
":trmul_params",
],
)
@@ -982,6 +1002,7 @@ cc_library(
":performance_advisory",
":platform",
":side_pair",
+ ":trace",
":trmul_params",
],
)
@@ -1014,6 +1035,7 @@ cc_library(
":mat",
":mul_params",
":prepare_packed_matrices",
+ ":trace",
":trmul",
":trmul_params",
":validate",
@@ -1025,11 +1047,16 @@ cc_library(
cc_library(
name = "ruy",
hdrs = [
+ "context.h",
+ "matrix.h",
+ "mul_params.h",
+ "path.h",
"ruy.h",
],
copts = ruy_copts(),
visibility = ["//visibility:public"],
deps = [
+ ":check_macros",
":context",
":context_get_ctx",
":frontend",
@@ -1037,6 +1064,9 @@ cc_library(
":matrix",
":mul_params",
":path",
+ ":platform",
+ ":size_util",
+ ":trace",
],
)
diff --git a/ruy/block_map.cc b/ruy/block_map.cc
index 44e5039..8240de2 100644
--- a/ruy/block_map.cc
+++ b/ruy/block_map.cc
@@ -28,6 +28,7 @@ limitations under the License.
#include "ruy/opt_set.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/size_util.h"
+#include "ruy/trace.h"
namespace ruy {
@@ -126,13 +127,18 @@ void GetBlockByIndex(const BlockMap& block_map, int index,
}
}
+namespace {
+
BlockMapTraversalOrder GetTraversalOrder(
- int rows, int cols, int depth, int lhs_scalar_size, int rhs_scalar_size,
- const CpuCacheParams& cpu_cache_params) {
+ int rows_after_rectangularness_division,
+ int cols_after_rectangularness_division, int depth, int lhs_scalar_size,
+ int rhs_scalar_size, const CpuCacheParams& cpu_cache_params) {
static constexpr bool kAnyFractal =
RUY_OPT(FRACTAL_Z) | RUY_OPT(FRACTAL_U) | RUY_OPT(FRACTAL_HILBERT);
const int working_set_size =
- (lhs_scalar_size * rows + rhs_scalar_size * cols) * depth;
+ (lhs_scalar_size * rows_after_rectangularness_division +
+ rhs_scalar_size * cols_after_rectangularness_division) *
+ depth;
if (kAnyFractal && (working_set_size > cpu_cache_params.local_cache_size)) {
if (RUY_OPT(FRACTAL_HILBERT) &&
(working_set_size > cpu_cache_params.last_level_cache_size)) {
@@ -147,8 +153,6 @@ BlockMapTraversalOrder GetTraversalOrder(
}
}
-namespace {
-
int floor_log2_quotient(int num, int denom) {
if (num <= denom) {
return 0;
@@ -313,37 +317,38 @@ int GetKernelAmortizationScore(int block_size_log2, int rows, int cols,
} // namespace
+bool IsObviouslyLinearTraversal(int rows, int cols, int depth,
+ int lhs_scalar_size, int rhs_scalar_size,
+ const CpuCacheParams& cpu_cache_params) {
+ if (rows == 1 || cols == 1) {
+ return true;
+ }
+ // Normally, GetTraversalOrder wants the dimensions (rows x cols) divided
+ // by the rectangularness factors, since any non-linear traversal order will
+ // be local to each subdivision. In the present function, we don't know the
+ // rectangularness factors yet, and we can't just call GetRectangularness
+ // as that requires knowing the kernel block layout. Since we just want
+ // a coarse estimate with only the guarantee that if we return `true` then
+ // linear traversal will be used, it is OK here to over-estimate `rows` and
+ // `cols`, by omitting to divide them by the rectangularness factors.
+ return GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
+ cpu_cache_params) == BlockMapTraversalOrder::kLinear;
+}
+
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
int tentative_thread_count,
const CpuCacheParams& cpu_cache_params, BlockMap* block_map) {
+ RUY_TRACE_SCOPE;
profiler::ScopeLabel label("MakeBlockMap");
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
-#if RUY_MAKEBLOCKMAP_DEBUG >= 2
- static constexpr bool debug_everytime = true;
-#else
- static constexpr bool debug_everytime = false;
-#endif
- static bool firsttime = true;
- if (firsttime || debug_everytime) {
- fprintf(stderr,
- "MakeBlockMap(rows=%d, cols=%d, depth=%d, kernel_rows=%d, "
- "kernel_cols=%d, lhs_scalar_size=%d, rhs_scalar_size=%d, "
- "tentative_thread_count=%d)\n",
- rows, cols, depth, kernel_rows, kernel_cols, lhs_scalar_size,
- rhs_scalar_size, tentative_thread_count);
- }
-#endif
-
RUY_DCHECK_GE(rows, kernel_rows);
RUY_DCHECK_GE(cols, kernel_cols);
RUY_DCHECK_EQ(rows % kernel_rows, 0);
RUY_DCHECK_EQ(cols % kernel_cols, 0);
- block_map->traversal_order = GetTraversalOrder(
- rows, cols, depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
-
+ // Estimate the 'rectangularness', the first level of subdivision bringing
+ // the shape to within 2x of a square shape.
int rows_rectangularness_log2 = 0;
int cols_rectangularness_log2 = 0;
GetRectangularness(rows, cols, kernel_rows, kernel_cols,
@@ -358,6 +363,18 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
RUY_DCHECK_GE(size_log2, kernel_size_log2);
+ // Heuristic selecting the power-of-two grid subdivision inside of each
+ // square-ish region (past the above subdivision by 'rectangularness').
+ // Note that it is the number of subdivisions, not the resulting block size,
+ // that will be a power of two. But inside of that heuristic, it simplifies
+ // code to talk in terms of 'block_size_log2', as if it were the block size
+ // that were a power of two. This 'block_size_log2' is to be interpreted as
+ // "log2 rounded below", e.g. when block_size_log2=8 we might have a block
+ // size in [256, 511]. When the shape is non-square, rows!=cols, this
+ // refers to the smaller of the two, so the other might be as large as
+ // 1021 (can't be 1022 because following the above 'rectangularness'
+ // subdivision, the aspect ratio is already < 2).
+
// We are going to try candidate values for block_size_log2 ranging from
// kernel_size_log2 to (kernel_size_log2 + kMaxKernelsPerBlockLog2).
// For each of them we will compute a 'score' by adding individual scores
@@ -368,12 +385,16 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
// kNeonDotprod, 8bit quantized path. Don't read too much into it, go ahead
// and tune this as needed to achieve good performance elsewhere. Use
// the unit test, block_map_test, to encode values that should be preserved
- // on specific architectures. Use RUY_MAKEBLOCKMAP_DEBUG to help tuning this.
+ // on specific architectures. Use RUY_TRACE to debug the current heuristics
+ // and RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2 to test the impact of a
+ // different block_size_log2 choice, to empirically find the optimal value
+ // before getting to updating the heuristic so that it produces that value.
static constexpr int kMaxKernelsPerBlockLog2 = 6;
const int max_block_size_log2 =
std::min(size_log2, kernel_size_log2 + kMaxKernelsPerBlockLog2);
int best_score = std::numeric_limits<int>::min();
int best_score_block_size_log2 = -1;
+ RUY_TRACE_INFO(MAKE_BLOCK_MAP_START);
for (int block_size_log2 = kernel_size_log2;
block_size_log2 <= max_block_size_log2; block_size_log2++) {
const int multithreading_score = GetMultithreadingScore(
@@ -385,58 +406,47 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
block_size_log2, rows, cols, kernel_rows_log2, kernel_cols_log2);
const int score =
multithreading_score + cache_locality_score + kernel_amortization_score;
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
- if (firsttime || debug_everytime) {
- fprintf(stderr,
- "block_size_log2=%d: score=%d multithreading_score=%d "
- "cache_locality_score=%d kernel_amortization_score=%d\n",
- block_size_log2, score, multithreading_score,
- cache_locality_score, kernel_amortization_score);
- }
-#endif
if (score >= best_score) {
best_score = score;
best_score_block_size_log2 = block_size_log2;
}
+ RUY_TRACE_INFO(MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE);
}
-#ifdef RUY_MAKEBLOCKMAP_DEBUG
- if (firsttime || debug_everytime) {
- fprintf(stderr, "best_score_block_size_log2=%d\n",
- best_score_block_size_log2);
- }
-
- static const char* explicit_block_size_log2_env =
- getenv("RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2");
- if (explicit_block_size_log2_env) {
- best_score_block_size_log2 = std::stoi(explicit_block_size_log2_env);
- if (firsttime || debug_everytime) {
- fprintf(stderr, "Overridden best_score_block_size_log2=%d\n",
- best_score_block_size_log2);
- }
- }
- firsttime = false;
+#ifdef RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2
+ // Useful for tuning.
+ best_score_block_size_log2 = RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2;
#endif
+ // As explained in the above comment, phrasing the above code in terms of
+ // block_size_log2 was only convenience inside of that heuristic. Now we
+ // revert to talking in terms of grid subdivision. That is what will actually
+ // be powers of two.
int num_blocks_base_log2 = size_log2 - best_score_block_size_log2;
RUY_DCHECK_GE(num_blocks_base_log2, 0);
-
const int num_blocks_of_rows_log2 =
num_blocks_base_log2 + rows_rectangularness_log2;
const int num_blocks_of_cols_log2 =
num_blocks_base_log2 + cols_rectangularness_log2;
- const int smallr =
+ // Now that we know the grid subdivision, we can pinpoint the exact block
+ // sizes. They can't be powers of two in general; they can't even be all
+ // equal in general; so the following few parameters will govern how blocks
+ // of slightly different shapes are put together in the block map.
+ const int small_block_rows =
round_down_pot(rows >> num_blocks_of_rows_log2, kernel_rows);
- const int smallc =
+ const int small_block_cols =
round_down_pot(cols >> num_blocks_of_cols_log2, kernel_cols);
- const int missr =
- round_up_pot(rows - (smallr << num_blocks_of_rows_log2), kernel_rows) >>
+ const int rows_of_large_blocks =
+ round_up_pot(rows - (small_block_rows << num_blocks_of_rows_log2),
+ kernel_rows) >>
pot_log2(kernel_rows);
- const int missc =
- round_up_pot(cols - (smallc << num_blocks_of_cols_log2), kernel_cols) >>
+ const int cols_of_large_blocks =
+ round_up_pot(cols - (small_block_cols << num_blocks_of_cols_log2),
+ kernel_cols) >>
pot_log2(kernel_cols);
+ // We have everything! Write out to the destination block_map.
block_map->dims[Side::kLhs] = rows;
block_map->dims[Side::kRhs] = cols;
block_map->kernel_dims[Side::kLhs] = kernel_rows;
@@ -444,13 +454,19 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
block_map->num_blocks_base_log2 = num_blocks_base_log2;
block_map->rectangularness_log2[Side::kLhs] = rows_rectangularness_log2;
block_map->rectangularness_log2[Side::kRhs] = cols_rectangularness_log2;
- block_map->small_block_dims[Side::kLhs] = smallr;
- block_map->small_block_dims[Side::kRhs] = smallc;
- block_map->large_blocks[Side::kLhs] = missr;
- block_map->large_blocks[Side::kRhs] = missc;
+ block_map->small_block_dims[Side::kLhs] = small_block_rows;
+ block_map->small_block_dims[Side::kRhs] = small_block_cols;
+ block_map->large_blocks[Side::kLhs] = rows_of_large_blocks;
+ block_map->large_blocks[Side::kRhs] = cols_of_large_blocks;
+ // See the comment on GetTraversalOrder for why we are dividing `rows` and
+ // `cols` by the rectangularness subdivision parameters here.
+ block_map->traversal_order = GetTraversalOrder(
+ rows >> rows_rectangularness_log2, cols >> cols_rectangularness_log2,
+ depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
// Done last: NumBlocks needs some of the block_map fields to be already set.
block_map->thread_count =
std::min(tentative_thread_count, NumBlocks(*block_map));
+ RUY_TRACE_INFO(MAKE_BLOCK_MAP_END);
}
void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
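Since the commit message singles out the 1000x400x2000 case, here is a small standalone sketch of the working-set arithmetic that GetTraversalOrder performs above. The cache sizes are made-up placeholders (the real values come from CpuCacheParams) and the rectangularness split is assumed rather than computed:

#include <cstdio>

int main() {
  // Hypothetical f32 matmul of shape rows x depth x cols = 1000x400x2000.
  const int lhs_scalar_size = 4, rhs_scalar_size = 4, depth = 400;
  // Assume the rectangularness subdivision splits the 2000 columns in two
  // (cols_rectangularness_log2 == 1), so each subdivision is 1000x1000.
  const int rows = 1000, cols = 2000 >> 1;
  const int working_set_size =
      (lhs_scalar_size * rows + rhs_scalar_size * cols) * depth;
  // Placeholder cache sizes, roughly in the range of a big mobile core.
  const int local_cache_size = 128 * 1024;
  const int last_level_cache_size = 4 * 1024 * 1024;
  std::printf("working set: %d bytes (local cache %d, LLC %d)\n",
              working_set_size, local_cache_size, last_level_cache_size);
  // Prints 3200000: larger than the local cache, so a fractal order is used,
  // but no longer larger than the last-level cache once rows/cols are divided
  // by rectangularness, so kFractalHilbert is not chosen. Without the division
  // the working set would be 4800000 bytes and kFractalHilbert would win; that
  // is the kFractalHilbert -> kFractalU nudge described in the commit message.
  return 0;
}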
diff --git a/ruy/block_map.h b/ruy/block_map.h
index 8053916..0057509 100644
--- a/ruy/block_map.h
+++ b/ruy/block_map.h
@@ -103,11 +103,15 @@ struct BlockMap {
SidePair<int> large_blocks;
};
-// Returns the traversal order to be used for the given matrix multiplication
-// parameters.
-BlockMapTraversalOrder GetTraversalOrder(
- int rows, int cols, int depth, int lhs_scalar_size, int rhs_scalar_size,
- const CpuCacheParams& cpu_cache_params);
+// This function produces a coarse estimate of whether linear traversal will
+// be used for this matmul. It offers a one-way guarantee: if this function
+// returns true then linear traversal will be used.
+//
+// The purpose of this function is to allow TrMul to make a cheap, early
+// decision to enter a "simple loop" code path for simple cases.
+bool IsObviouslyLinearTraversal(int rows, int cols, int depth,
+ int lhs_scalar_size, int rhs_scalar_size,
+ const CpuCacheParams& cpu_cache_params);
// Create a BlockMap suitable for tiling the destination matrix in a
// matrix multiplication with the given parameters.
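A hypothetical sketch of how a caller such as TrMul might use this predicate (the actual decision lives in ruy/trmul.cc, not shown in this excerpt; the single-thread and linear-traversal criteria are taken from the GET_USE_SIMPLE_LOOP trace messages in trace.h below):

#include "ruy/block_map.h"

// Illustrative helper only, not ruy's implementation of GetUseSimpleLoop.
bool WouldUseSimpleLoop(int tentative_thread_count, int rows, int cols,
                        int depth, int lhs_scalar_size, int rhs_scalar_size,
                        const ruy::CpuCacheParams& cpu_cache_params) {
  if (tentative_thread_count > 1) {
    return false;  // Multi-threading rules out the simple loop.
  }
  return ruy::IsObviouslyLinearTraversal(rows, cols, depth, lhs_scalar_size,
                                         rhs_scalar_size, cpu_cache_params);
}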
diff --git a/ruy/context.cc b/ruy/context.cc
index e21d4fd..4661738 100644
--- a/ruy/context.cc
+++ b/ruy/context.cc
@@ -51,4 +51,8 @@ bool Context::performance_advisory(PerformanceAdvisory advisory) const {
return ctx().performance_advisory(advisory);
}
+void Context::set_runtime_enabled_paths(Path paths) {
+ mutable_ctx()->SetRuntimeEnabledPaths(paths);
+}
+
} // namespace ruy
diff --git a/ruy/context.h b/ruy/context.h
index 3de0210..79a4b5c 100644
--- a/ruy/context.h
+++ b/ruy/context.h
@@ -79,6 +79,17 @@ class Context final {
// pre-packed matrix data. This function clears that cache.
void ClearPrepackedCache();
+ // Override auto-detection of supported code paths.
+ //
+ // Passing `paths == Path::kNone` means reverting to the default behavior.
+ // This will trigger auto-detection on the next use.
+ //
+ // Other values will override auto-detection with the explicitly provided set
+ // of paths.
+ //
+ // Paths in kNonArchPaths are always implicitly supported.
+ void set_runtime_enabled_paths(Path paths);
+
private:
CtxImpl* const impl_;
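A minimal sketch of client code exercising the newly exposed setter, assuming only what the header above documents (Path::kNone reverts to auto-detection, and kNonArchPaths remain implicitly enabled):

#include "ruy/context.h"
#include "ruy/path.h"

// Hypothetical configuration helper.
void ConfigurePaths(ruy::Context* context, bool restrict_to_portable_code) {
  if (restrict_to_portable_code) {
    // Override auto-detection; architecture-specific paths are disabled,
    // while paths in kNonArchPaths stay implicitly enabled.
    context->set_runtime_enabled_paths(ruy::Path::kStandardCpp);
  } else {
    // Revert to the default: auto-detection on next use (or the RUY_PATHS
    // environment variable, handled in ctx.cc below).
    context->set_runtime_enabled_paths(ruy::Path::kNone);
  }
}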
diff --git a/ruy/create_trmul_params.h b/ruy/create_trmul_params.h
index ebe36fb..531e066 100644
--- a/ruy/create_trmul_params.h
+++ b/ruy/create_trmul_params.h
@@ -30,6 +30,7 @@ limitations under the License.
#include "ruy/pack.h"
#include "ruy/path.h"
#include "ruy/performance_advisory.h"
+#include "ruy/trace.h"
#include "ruy/trmul_params.h"
namespace ruy {
@@ -119,6 +120,7 @@ void CheckKernelPath(Path expected_path) {
template <Path ThePath, typename LhsScalar, typename RhsScalar,
typename AccumScalar, typename DstScalar>
void PopulateTrMulParams(TrMulParams* params) {
+ RUY_TRACE_SCOPE;
using PackedLhsScalar = PackedType<ThePath, LhsScalar>;
using PackedRhsScalar = PackedType<ThePath, RhsScalar>;
using Kernel =
@@ -138,6 +140,7 @@ void PopulateTrMulParams(TrMulParams* params) {
&RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
params->run_kernel = &RunKernel<Kernel>::Run;
CheckKernelPath<Kernel>(ThePath);
+ RUY_TRACE_INFO(POPULATE_TRMUL_PARAMS);
}
// PopulateTrMulParamsAllCompiledPaths calls into one of multiple
@@ -237,6 +240,7 @@ struct PathSearchCountdown<CompiledPaths, -1, LhsScalar, RhsScalar, AccumScalar,
template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
typename AccumScalar, typename DstScalar>
void PopulateTrMulParamsAllCompiledPaths(Path the_path, TrMulParams* params) {
+ RUY_TRACE_SCOPE;
return PathSearchCountdown<CompiledPaths, 8 * sizeof(Path) - 1, LhsScalar,
RhsScalar, AccumScalar,
DstScalar>::Search(the_path, params);
@@ -403,6 +407,7 @@ void CreateTrMulParamsAssumingColMajorDst(
const Mat<DstScalar>& dst,
const MulParams<AccumScalar, DstScalar>& mul_params,
ChannelDimension channel_dimension, Ctx* ctx, TrMulParams* params) {
+ RUY_TRACE_SCOPE;
RUY_DCHECK(IsColMajor(dst.layout));
// Fill in the fields we already know.
@@ -417,6 +422,8 @@ void CreateTrMulParamsAssumingColMajorDst(
// might be exposed publicly in Context in the future.
const Path the_path = ctx->SelectPath(CompiledPaths);
+ RUY_TRACE_INFO(CREATE_TRMUL_PARAMS_ASSUMING_COLMAJOR_DST);
+
// If we ever need again to fall back to Path::kStandardCpp, this is a good
// place to do it -- just pass Path::kStandardCpp as both the template and
// runtime parameters in this function call.
@@ -459,11 +466,13 @@ void CreateTrMulParams(const Mat<LhsScalar>& lhs, const Mat<RhsScalar>& rhs,
const Mat<DstScalar>& dst,
const MulParams<AccumScalar, DstScalar>& mul_params,
Ctx* ctx, TrMulParams* params) {
+ RUY_TRACE_SCOPE;
ChannelDimension channel_dimension = mul_params.channel_dimension();
if (IsColMajor(dst.layout)) {
detail::CreateTrMulParamsAssumingColMajorDst<CompiledPaths>(
lhs, rhs, dst, mul_params, channel_dimension, ctx, params);
} else {
+ RUY_TRACE_INFO(CREATE_TRMUL_PARAMS_TRANSPOSING);
detail::CreateTrMulParamsAssumingColMajorDst<CompiledPaths>(
rhs, lhs, Transpose(dst), mul_params, Transpose(channel_dimension), ctx,
params);
diff --git a/ruy/ctx.cc b/ruy/ctx.cc
index 547f74c..0ef098d 100644
--- a/ruy/ctx.cc
+++ b/ruy/ctx.cc
@@ -27,6 +27,7 @@ limitations under the License.
#include "ruy/performance_advisory.h"
#include "ruy/platform.h"
#include "ruy/prepacked_cache.h"
+#include "ruy/trace.h"
namespace ruy {
@@ -57,7 +58,14 @@ bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
}
void Ctx::SetRuntimeEnabledPaths(Path paths) {
- mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
+ if (paths == Path::kNone) {
+ // Revert to default behavior using runtime detection.
+ mutable_impl()->runtime_enabled_paths_ = Path::kNone;
+ } else {
+ // Explicitly set enabled paths. Ensure that non-arch are always enabled
+ // (needed for fallbacks).
+ mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
+ }
}
CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }
@@ -133,6 +141,7 @@ Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
} // namespace
Path Ctx::GetRuntimeEnabledPaths() {
+ RUY_TRACE_SCOPE;
// Just a shorthand alias. Using a pointer to make it clear we're mutating
// this value in-place.
Path* paths = &mutable_impl()->runtime_enabled_paths_;
@@ -140,16 +149,19 @@ Path Ctx::GetRuntimeEnabledPaths() {
// The value Path::kNone indicates the initial state before detection has been
// performed.
if (*paths != Path::kNone) {
+ RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
return *paths;
}
// User may have set path explicitly in env var.
Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
if (paths_bitfield != Path::kNone) {
*paths = paths_bitfield;
+ RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
return *paths;
}
// Finally, use runtime detection.
*paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
+ RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
return *paths;
}
diff --git a/ruy/frontend.cc b/ruy/frontend.cc
index 58a8538..01ee474 100644
--- a/ruy/frontend.cc
+++ b/ruy/frontend.cc
@@ -23,6 +23,7 @@ limitations under the License.
namespace ruy {
void MulFrontEndFromTrMulParams(Ctx* ctx, TrMulParams* params) {
+ RUY_TRACE_SCOPE;
// Handle Matrix::cache_policy, possibly retrieving existing packed matrices
// or packing and caching now.
PreparePackedMatrices(ctx, params);
diff --git a/ruy/frontend.h b/ruy/frontend.h
index c3d96b6..a79f590 100644
--- a/ruy/frontend.h
+++ b/ruy/frontend.h
@@ -30,6 +30,7 @@ limitations under the License.
#include "ruy/create_trmul_params.h"
#include "ruy/ctx.h"
#include "ruy/profiler/instrumentation.h"
+#include "ruy/trace.h"
#include "ruy/trmul_params.h"
#include "ruy/validate.h"
@@ -48,6 +49,7 @@ void MulFrontEndUpToCreateTrMulParams(
const Mat<DstScalar>& dst,
const MulParams<AccumScalar, DstScalar>& mul_params, Ctx* ctx,
TrMulParams* params) {
+ RUY_TRACE_SCOPE;
static_assert(CompiledPaths != Path::kNone, "Must compile at least one Path");
static_assert(
(CompiledPaths & ~kAllPathsIncludingInternalVariants) == Path::kNone,
@@ -80,6 +82,7 @@ template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
void MulFrontEnd(const Mat<LhsScalar>& lhs, const Mat<RhsScalar>& rhs,
const MulParams<AccumScalar, DstScalar>& mul_params, Ctx* ctx,
Mat<DstScalar>* dst) {
+ RUY_TRACE_SCOPE;
profiler::ScopeLabel mul_label("Mul");
profiler::ScopeLabel shape_specific_label("matmul shape: %dx%dx%d",
lhs.layout.rows, lhs.layout.cols,
diff --git a/ruy/kernel.h b/ruy/kernel.h
index ae06055..6bfeb4a 100644
--- a/ruy/kernel.h
+++ b/ruy/kernel.h
@@ -19,6 +19,7 @@ limitations under the License.
#include "ruy/kernel_common.h"
#include "ruy/mul_params.h"
#include "ruy/platform.h"
+#include "ruy/trace.h"
// IWYU pragma: begin_exports
#if RUY_PLATFORM_NEON
@@ -58,9 +59,11 @@ class RunKernel final {
static void Run(Tuning tuning, const SidePair<PEMat>& src,
const void* mul_params, const SidePair<int>& start,
const SidePair<int>& end, EMat* dst) {
+ RUY_TRACE_SCOPE_NAME("RunKernel");
const auto& unerased_lhs = UneraseType<LhsScalar>(src[Side::kLhs]);
const auto& unerased_rhs = UneraseType<RhsScalar>(src[Side::kRhs]);
auto unerased_dst = UneraseType<DstScalar>(*dst);
+ RUY_TRACE_INFO(RUN_KERNEL);
RunTyped(tuning, unerased_lhs, unerased_rhs,
*static_cast<const MulParamsType*>(mul_params), start, end,
&unerased_dst);
diff --git a/ruy/pack.h b/ruy/pack.h
index e12caca..744f9bc 100644
--- a/ruy/pack.h
+++ b/ruy/pack.h
@@ -87,6 +87,7 @@ limitations under the License.
#include "ruy/pack_common.h"
#include "ruy/path.h"
#include "ruy/platform.h"
+#include "ruy/trace.h"
// IWYU pragma: begin_exports
#if RUY_PLATFORM_NEON
@@ -135,9 +136,11 @@ template <Path ThePath, typename FixedKernelLayout, typename Scalar,
typename PackedScalar>
void RunPack(Tuning tuning, const EMat& src_matrix, PEMat* packed_matrix,
int start_col, int end_col) {
+ RUY_TRACE_SCOPE;
using SumsType = typename PMat<PackedScalar>::SumsType;
Mat<Scalar> src = UneraseType<Scalar>(src_matrix);
PMat<PackedScalar> packed = UneraseType<PackedScalar>(*packed_matrix);
+ RUY_TRACE_INFO(RUN_PACK);
if (src.layout.order == Order::kColMajor) {
PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType,
Order::kColMajor>::Run(tuning, src, &packed, start_col, end_col);
diff --git a/ruy/platform.h b/ruy/platform.h
index eb51931..ffffeb1 100644
--- a/ruy/platform.h
+++ b/ruy/platform.h
@@ -33,7 +33,7 @@ limitations under the License.
#endif
// Detect APPLE.
-#if defined(__ppc__) || defined(__powerpc__)
+#ifdef __ppc__
#define RUY_PLATFORM_PPC 1
#else
#define RUY_PLATFORM_PPC 0
diff --git a/ruy/prepare_packed_matrices.cc b/ruy/prepare_packed_matrices.cc
index 5f02ce7..5a01af7 100644
--- a/ruy/prepare_packed_matrices.cc
+++ b/ruy/prepare_packed_matrices.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include "ruy/matrix.h"
#include "ruy/prepacked_cache.h"
#include "ruy/side_pair.h"
+#include "ruy/trace.h"
#include "ruy/trmul_params.h"
namespace ruy {
@@ -66,18 +67,21 @@ bool ShouldCache(const TrMulParams& params, Side side) {
} // namespace
void PreparePackedMatrices(Ctx* ctx, TrMulParams* params) {
+ RUY_TRACE_SCOPE;
for (Side side : {Side::kLhs, Side::kRhs}) {
PEMat& packed_matrix = params->packed_matrix[side];
if (ShouldCache(*params, side)) {
// Use a cached packed matrix (possibly packing and caching now).
auto* cache = ctx->GetPrepackedCache();
auto action = cache->Get(params->src[side].data, &packed_matrix);
+ RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_SHOULD_CACHE);
if (action == PrepackedCache::Action::kInsertedNewEntry) {
params->RunPack(side, ctx->GetMainThreadTuning(), 0,
packed_matrix.layout.cols);
}
params->is_prepacked[side] = true;
} else {
+ RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_NO_CACHE);
// Do not use a cached packed matrix. Only need to allocate buffers now.
Allocator* allocator = ctx->GetMainAllocator();
packed_matrix.data = allocator->AllocateBytesAvoidingAliasingWith(
diff --git a/ruy/ruy.h b/ruy/ruy.h
index b2a5f3c..3cf7bdd 100644
--- a/ruy/ruy.h
+++ b/ruy/ruy.h
@@ -25,6 +25,7 @@ limitations under the License.
#include "ruy/matrix.h"
#include "ruy/mul_params.h"
#include "ruy/path.h"
+#include "ruy/trace.h"
namespace ruy {
@@ -37,6 +38,8 @@ template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
void Mul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
const MulParams<AccumScalar, DstScalar>& mul_params, Context* context,
Matrix<DstScalar>* dst) {
+ RUY_TRACE_SCOPE;
+ RUY_TRACE_INFO(MUL);
Mat<LhsScalar> internal_lhs = ToInternal(lhs);
Mat<RhsScalar> internal_rhs = ToInternal(rhs);
Mat<DstScalar> internal_dst = ToInternal(*dst);
diff --git a/ruy/thread_pool.cc b/ruy/thread_pool.cc
index a3f0501..100cfe3 100644
--- a/ruy/thread_pool.cc
+++ b/ruy/thread_pool.cc
@@ -21,10 +21,11 @@ limitations under the License.
#include <cstdint>
#include <cstdlib>
#include <memory>
-#include <mutex> // NOLINT(build/c++11)
-#include <thread> // NOLINT(build/c++11)
+#include <mutex> // NOLINT(build/c++11)
+#include <thread> // NOLINT(build/c++11)
#include "ruy/check_macros.h"
+#include "ruy/trace.h"
#include "ruy/wait.h"
namespace ruy {
@@ -109,23 +110,28 @@ class Thread {
private:
// Thread entry point.
void ThreadFuncImpl() {
+ RUY_TRACE_SCOPE_NAME("Ruy worker thread function");
ChangeState(State::Ready);
// Thread main loop
while (true) {
+ RUY_TRACE_SCOPE_NAME("Ruy worker thread loop iteration");
// In the 'Ready' state, we have nothing to do but to wait until
// we switch to another state.
const auto& condition = [this]() {
return state_.load(std::memory_order_acquire) != State::Ready;
};
+ RUY_TRACE_INFO(THREAD_FUNC_IMPL_WAITING);
Wait(condition, spin_duration_, &state_cond_, &state_mutex_);
// Act on new state.
switch (state_.load(std::memory_order_acquire)) {
- case State::HasWork:
+ case State::HasWork: {
+ RUY_TRACE_SCOPE_NAME("Worker thread task");
// Got work to do! So do it, and then revert to 'Ready' state.
ChangeState(State::Ready);
break;
+ }
case State::ExitAsSoonAsPossible:
return;
default:
@@ -159,6 +165,7 @@ class Thread {
};
void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) {
+ RUY_TRACE_SCOPE_NAME("ThreadPool::Execute");
RUY_DCHECK_GE(task_count, 1);
// Case of 1 thread: just run the single task on the current thread.
@@ -171,13 +178,16 @@ void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) {
CreateThreads(task_count - 1);
counter_to_decrement_when_ready_.Reset(task_count - 1);
for (int i = 1; i < task_count; i++) {
+ RUY_TRACE_INFO(THREADPOOL_EXECUTE_STARTING_TASK);
auto task_address = reinterpret_cast<std::uintptr_t>(tasks) + i * stride;
threads_[i - 1]->StartWork(reinterpret_cast<Task*>(task_address));
}
+ RUY_TRACE_INFO(THREADPOOL_EXECUTE_STARTING_TASK_ZERO_ON_CUR_THREAD);
// Execute task #0 immediately on the current thread.
(tasks + 0)->Run();
+ RUY_TRACE_INFO(THREADPOOL_EXECUTE_WAITING_FOR_THREADS);
// Wait for the threads submitted above to finish.
counter_to_decrement_when_ready_.Wait(spin_duration_);
}
diff --git a/ruy/trace.h b/ruy/trace.h
new file mode 100644
index 0000000..4f5059e
--- /dev/null
+++ b/ruy/trace.h
@@ -0,0 +1,836 @@
+/* Copyright 2021 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef RUY_RUY_TRACE_H_
+#define RUY_RUY_TRACE_H_
+
+#ifdef RUY_TRACE
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "ruy/mat.h"
+#include "ruy/matrix.h"
+#include "ruy/path.h"
+#include "ruy/platform.h"
+#include "ruy/side_pair.h"
+
+namespace ruy {
+
+// Helper for `formatted` so we don't have to put .c_str() on strings.
+template <typename T>
+T value_for_snprintf(T value) {
+ return value;
+}
+
+inline const char* value_for_snprintf(const std::string& s) {
+ return s.c_str();
+}
+
+// A sprintf-like function returning a std::string.
+// Remove this once we can rely on std::format (c++20).
+template <typename... Args>
+std::string formatted(const char* format, Args... args) {
+ char buf[1024];
+#pragma GCC diagnostic push
+#pragma GCC diagnostic warning "-Wformat-security"
+ int size = snprintf(buf, sizeof buf, format, value_for_snprintf(args)...);
+#pragma GCC diagnostic pop
+ if (size <= 0) {
+ abort();
+ }
+ return std::string(buf);
+}
+
+// An entry in the trace.
+struct ThreadTraceEntry final {
+ std::string text;
+ int indent = 0;
+ const char* source_file = nullptr;
+ int source_line = 0;
+};
+
+// Trace for one thread.
+class ThreadTrace final {
+ public:
+ ~ThreadTrace() {}
+
+ void set_thread_id(int thread_id) { thread_id_ = thread_id; }
+ int thread_id() const { return thread_id_; }
+
+ bool is_in_run_ahead_packing_loop() const {
+ return is_in_run_ahead_packing_loop_;
+ }
+ void set_is_in_run_ahead_packing_loop(bool value) {
+ is_in_run_ahead_packing_loop_ = value;
+ }
+
+ void set_current_source_file(const char* source_file) {
+ current_source_file_ = source_file;
+ }
+
+ void set_current_source_line(int source_line) {
+ current_source_line_ = source_line;
+ }
+
+ const std::vector<ThreadTraceEntry>& entries() const { return entries_; }
+
+ template <typename... Args>
+ void Write(const char* format, Args... args) {
+ ThreadTraceEntry entry;
+ entry.text = formatted(format, args...);
+ entry.indent = indent_;
+ entry.source_file = current_source_file_;
+ entry.source_line = current_source_line_;
+ entries_.emplace_back(std::move(entry));
+ }
+
+ template <typename... Args>
+ void EnterScope(const char* scope_name) {
+ Write("%s {", scope_name);
+ indent_++;
+ }
+ void LeaveScope(const char* scope_name) {
+ indent_--;
+ Write("} // end of %s", scope_name);
+ }
+
+ private:
+ // The trace contents
+ std::vector<ThreadTraceEntry> entries_;
+
+ // Current indentation level.
+ int indent_ = 0;
+ // Thread's ID as set by Ruy, e.g. [0,N-1]. Not OS TID.
+ int thread_id_ = -1;
+ // The run-ahead loop in `EnsurePacked` may run many iterations when the
+ // thread is waiting for a block to be packed by another thread --- it's
+ // a busy wait. We track whether we are in that mode to avoid generating
+ // many uninteresting trace entries.
+ bool is_in_run_ahead_packing_loop_ = false;
+ // Last recorded value of __FILE__ and __LINE__, as a convenience so we don't
+ // have to pass these in every call to `Write`.
+ const char* current_source_file_ = nullptr;
+ int current_source_line_ = 0;
+};
+
+// Main components of ruy. Used for trace colorization.
+enum class Component { kNone, kFrontEnd, kMiddleEnd, kBackEnd, kThreadPool };
+
+// Output format for the trace.
+enum class TraceOutputFormat { kNone, kTerminal, kHtml };
+
+inline std::string IndentString(int indent) {
+ std::string s;
+ for (int i = 0; i < indent; i++) {
+ s += " ";
+ }
+ return s;
+}
+
+// Returns the text to write to the trace to open a colored section.
+inline const char* ColorSectionStart(TraceOutputFormat output_format,
+ Component component) {
+ switch (output_format) {
+ case TraceOutputFormat::kTerminal:
+ switch (component) {
+ case Component::kFrontEnd:
+ return "\x1b[36m";
+ case Component::kMiddleEnd:
+ return "\x1b[32m";
+ case Component::kBackEnd:
+ return "\x1b[31m";
+ case Component::kThreadPool:
+ return "\x1b[33m";
+ default:
+ abort();
+ return nullptr;
+ }
+ case TraceOutputFormat::kHtml:
+ switch (component) {
+ case Component::kFrontEnd:
+ return "<span style=\"background-color:#B2EBF2\">";
+ case Component::kMiddleEnd:
+ return "<span style=\"background-color:#C8E6C9\">";
+ case Component::kBackEnd:
+ return "<span style=\"background-color:#FFCDD2\">";
+ case Component::kThreadPool:
+ return "<span style=\"background-color:#FFF9C4\">";
+ default:
+ abort();
+ return nullptr;
+ }
+ default:
+ abort();
+ return nullptr;
+ }
+}
+
+// Returns the text to write to the trace to close a colored section.
+inline const char* ColorSectionEnd(TraceOutputFormat output_format) {
+ switch (output_format) {
+ case TraceOutputFormat::kTerminal:
+ return "\x1b[0m";
+ case TraceOutputFormat::kHtml:
+ return "</span>";
+ default:
+ abort();
+ return nullptr;
+ }
+}
+
+// Returns the output format to use for the trace.
+inline TraceOutputFormat GetOutputFormat() {
+ const char* html_env = getenv("RUY_TRACE_HTML");
+ if (html_env && strtol(html_env, nullptr, 10) != 0) {
+ return TraceOutputFormat::kHtml;
+ } else {
+ return TraceOutputFormat::kTerminal;
+ }
+}
+
+// A `basename` function that's good enough for ruy __FILE__'s.
+// Note: `basename` is POSIX-only and annoying (takes a char*, may mutate).
+inline const char* GetBaseName(const char* path) {
+ std::size_t len = strlen(path);
+ if (len == 0) {
+ return path;
+ }
+ const char* ptr = path + len - 1;
+ while (ptr != path) {
+ if (*ptr == '/' || *ptr == '\\') {
+ return ptr + 1;
+ }
+ --ptr;
+ }
+ // Path did not contain any path separator.
+ return path;
+}
+
+// Determines a Component (used for colorization) by source file.
+inline Component GetComponentBySourceFile(const char* base_name) {
+ if (!strcmp(base_name, "pack.h") || !strcmp(base_name, "kernel.h")) {
+ return Component::kBackEnd;
+ } else if (!strcmp(base_name, "trmul.cc") ||
+ !strcmp(base_name, "block_map.cc")) {
+ return Component::kMiddleEnd;
+ } else if (!strcmp(base_name, "thread_pool.cc")) {
+ return Component::kThreadPool;
+ } else {
+ return Component::kFrontEnd;
+ }
+}
+
+inline std::string EscapeText(TraceOutputFormat output_format,
+ const std::string& text) {
+ if (output_format == TraceOutputFormat::kHtml) {
+ std::string escaped_text;
+ for (char c : text) {
+ if (c == '<') {
+ escaped_text += "&lt;";
+ } else if (c == '>') {
+ escaped_text += "&gt;";
+ } else {
+ escaped_text += c;
+ }
+ }
+ return escaped_text;
+ } else {
+ return text;
+ }
+}
+
+// Prints an entry from the trace to the destination trace file.
+inline void Print(const ThreadTraceEntry& entry,
+ TraceOutputFormat output_format, FILE* file) {
+ const char* base_name = GetBaseName(entry.source_file);
+ Component component = GetComponentBySourceFile(base_name);
+ const std::string& source_location =
+ formatted("%s:%d", base_name, entry.source_line);
+ const std::string& escaped_text = EscapeText(output_format, entry.text);
+ fprintf(file, "%s%-32s%s%s%s\n", ColorSectionStart(output_format, component),
+ source_location.c_str(), IndentString(entry.indent).c_str(),
+ escaped_text.c_str(), ColorSectionEnd(output_format));
+}
+
+// Prints a thread's entire trace to the destination trace file.
+inline void Print(const ThreadTrace& trace, TraceOutputFormat output_format,
+ FILE* file) {
+ if (output_format == TraceOutputFormat::kHtml) {
+ fprintf(file, "<html><body><pre>\n<span style=\"font-weight:bold\">\n");
+ }
+ fprintf(file, "Ruy trace for thread %d:\n", trace.thread_id());
+ if (output_format == TraceOutputFormat::kHtml) {
+ fprintf(file, "</span>\n");
+ }
+ for (const ThreadTraceEntry& entry : trace.entries()) {
+ Print(entry, output_format, file);
+ }
+ fprintf(file, "\n");
+ if (output_format == TraceOutputFormat::kHtml) {
+ fprintf(file, "</pre></body></html>\n");
+ }
+}
+
+// Holds all the threads' traces. This is a global singleton class.
+// On exit, when the singleton is destroyed, the destructor prints out the
+// traces.
+class AllThreadTraces final {
+ public:
+ // Add a new ThreadTrace for the current thread. Should be called only once
+ // on each thread.
+ ThreadTrace* AddCurrentThread() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ ThreadTrace* thread_trace = new ThreadTrace;
+ thread_traces_.emplace_back(thread_trace);
+ return thread_trace;
+ }
+ ~AllThreadTraces() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ // Open the destination file.
+ const char* file_env = getenv("RUY_TRACE_FILE");
+ FILE* file = stdout;
+ if (file_env) {
+ file = fopen(file_env, "w");
+ if (!file) {
+ fprintf(stderr, "Failed to open %s for write\n", file_env);
+ exit(1);
+ }
+ }
+ // Sort the threads by Ruy Thread ID (not OS TID).
+ auto output_format = GetOutputFormat();
+ std::sort(std::begin(thread_traces_), std::end(thread_traces_),
+ [](const auto& a, const auto& b) {
+ return a->thread_id() < b->thread_id();
+ });
+ // Print all the threads' traces.
+ for (const auto& trace : thread_traces_) {
+ Print(*trace, output_format, file);
+ }
+ if (file_env) {
+ fclose(file);
+ }
+ }
+ static AllThreadTraces* Singleton() {
+ static AllThreadTraces all_thread_traces;
+ return &all_thread_traces;
+ }
+
+ private:
+ std::vector<std::unique_ptr<ThreadTrace>> thread_traces_;
+ std::mutex mutex_;
+};
+
+// Returns the thread-local ThreadTrace singleton, constructing it as needed.
+inline ThreadTrace* ThreadLocalTrace() {
+ static thread_local ThreadTrace* thread_local_trace =
+ AllThreadTraces::Singleton()->AddCurrentThread();
+ return thread_local_trace;
+}
+
+// RAII helper to trace a scope, e.g. a function scope.
+class RuyTraceScope {
+ const char* source_file_;
+ int source_line_;
+ const char* scope_name_;
+
+ public:
+ RuyTraceScope(const char* source_file, int source_line,
+ const char* scope_name)
+ : source_file_(source_file),
+ source_line_(source_line),
+ scope_name_(scope_name) {
+ ThreadLocalTrace()->set_current_source_file(source_file_);
+ ThreadLocalTrace()->set_current_source_line(source_line_);
+ ThreadLocalTrace()->EnterScope(scope_name_);
+ }
+ ~RuyTraceScope() {
+ ThreadLocalTrace()->set_current_source_file(source_file_);
+ ThreadLocalTrace()->set_current_source_line(source_line_);
+ ThreadLocalTrace()->LeaveScope(scope_name_);
+ }
+};
+
+#define RUY_TRACE_SCOPE_NAME_IMPL(file, line, name) \
+ RuyTraceScope ruy_trace_scope##line(file, line, name)
+#define RUY_TRACE_SCOPE_NAME(name) \
+ RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, name)
+#define RUY_TRACE_SCOPE \
+ RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, __FUNCTION__)
+
+// Helpers to trace Ruy objects.
+
+inline std::string str(Order o) {
+ return o == Order::kRowMajor ? "row-major" : "column-major";
+}
+
+inline std::string str(Side s) { return s == Side::kLhs ? "LHS" : "RHS"; }
+
+inline std::string str(const Layout& layout) {
+ std::string s =
+ formatted("%dx%d, %s", layout.rows(), layout.cols(), str(layout.order()));
+ int inner_size =
+ layout.order() == Order::kRowMajor ? layout.cols() : layout.rows();
+ if (inner_size != layout.stride()) {
+ s += formatted(", stride=%d", layout.stride());
+ } else {
+ s += formatted(", unstrided");
+ }
+ return s;
+}
+
+inline std::string str(const MatLayout& layout) {
+ std::string s =
+ formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order));
+ int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows;
+ if (inner_size != layout.stride) {
+ s += formatted(", stride=%d", layout.stride);
+ } else {
+ s += formatted(", unstrided");
+ }
+ return s;
+}
+
+inline std::string str(const PMatLayout& layout) {
+ std::string s =
+ formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order));
+ int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows;
+ if (inner_size != layout.stride) {
+ s += formatted(", stride=%d", layout.stride);
+ } else {
+ s += formatted(", unstrided");
+ }
+ s += formatted(", kernel blocks: %dx%d %s", layout.kernel.rows,
+ layout.kernel.cols, str(layout.kernel.order));
+ return s;
+}
+
+template <typename T>
+std::string str() {
+ return "<unknown type>";
+}
+#define RUY_IMPL_STR_TYPE_STD(T) \
+ template <> \
+ inline std::string str<std::T>() { \
+ return #T; \
+ }
+#define RUY_IMPL_STR_TYPE(T) \
+ template <> \
+ inline std::string str<T>() { \
+ return #T; \
+ }
+
+RUY_IMPL_STR_TYPE(float)
+RUY_IMPL_STR_TYPE(double)
+RUY_IMPL_STR_TYPE_STD(int8_t)
+RUY_IMPL_STR_TYPE_STD(uint8_t)
+RUY_IMPL_STR_TYPE_STD(int16_t)
+RUY_IMPL_STR_TYPE_STD(uint16_t)
+RUY_IMPL_STR_TYPE_STD(int32_t)
+RUY_IMPL_STR_TYPE_STD(uint32_t)
+RUY_IMPL_STR_TYPE_STD(int64_t)
+RUY_IMPL_STR_TYPE_STD(uint64_t)
+
+template <typename T>
+std::string str(const Matrix<T>& matrix) {
+ std::string s = formatted("Matrix<%s>, %s", str<T>(), str(matrix.layout()));
+ if (matrix.zero_point()) {
+ s += formatted(", zero_point=%d", static_cast<int>(matrix.zero_point()));
+ }
+ if (matrix.cache_policy() != CachePolicy::kNeverCache) {
+ s +=
+ formatted(", cache_policy=%d", static_cast<int>(matrix.cache_policy()));
+ }
+ return s;
+}
+
+inline std::string str(const Type& type) {
+ char c;
+ if (type.is_floating_point) {
+ c = 'f';
+ } else if (type.is_signed) {
+ c = 'i';
+ } else {
+ c = 'u';
+ }
+ return formatted("%c%d", c, type.size * 8);
+}
+
+inline std::string str(const EMat& mat) {
+ std::string s =
+ formatted("EMat, data_type=%s, %s", str(mat.data_type), str(mat.layout));
+ if (mat.zero_point) {
+ s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point));
+ }
+ if (mat.cache_policy != CachePolicy::kNeverCache) {
+ s += formatted(", cache_policy=%d", static_cast<int>(mat.cache_policy));
+ }
+ return s;
+}
+
+inline std::string str(const PEMat& mat) {
+ std::string s =
+ formatted("PEMat, data_type=%s, %s", str(mat.data_type), str(mat.layout));
+ if (mat.zero_point) {
+ s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point));
+ }
+ return s;
+}
+
+inline std::string str(Path paths) {
+ bool first = true;
+ std::string s;
+ for (int bit = 0; bit < 16; bit++) {
+ Path cur_path = static_cast<Path>(1 << bit);
+ if ((paths & cur_path) != Path::kNone) {
+ if (!first) {
+ s += " | ";
+ }
+ first = false;
+ switch (cur_path) {
+ case Path::kNone:
+ continue;
+#define RUY_HANDLE_PATH(p) \
+ case Path::p: \
+ s += #p; \
+ break;
+ RUY_HANDLE_PATH(kStandardCpp)
+ RUY_HANDLE_PATH(kInternalStandardCppVariant1)
+ RUY_HANDLE_PATH(kInternalStandardCppVariant2)
+ RUY_HANDLE_PATH(kInternalStandardCppVariant3)
+#if RUY_PLATFORM_ARM
+ RUY_HANDLE_PATH(kNeon)
+ RUY_HANDLE_PATH(kNeonDotprod)
+#endif // RUY_PLATFORM_ARM
+#if RUY_PLATFORM_X86
+ RUY_HANDLE_PATH(kAvx)
+ RUY_HANDLE_PATH(kAvx2Fma)
+ RUY_HANDLE_PATH(kAvx512)
+#endif // RUY_PLATFORM_X86
+#undef RUY_HANDLE_PATH
+ default:
+ fprintf(stderr, "Unhandled Path value 0x%x\n",
+ static_cast<int>(cur_path));
+ abort();
+ }
+ }
+ }
+ return s;
+}
+
+// Implementation of RUY_TRACE_INFO(X) macros.
+
+#define RUY_TRACE_INFO_MUL \
+ ThreadLocalTrace()->Write("CompiledPaths: %s", str(CompiledPaths)); \
+ ThreadLocalTrace()->Write("LHS: %s", str(lhs)); \
+ ThreadLocalTrace()->Write("RHS: %s", str(rhs)); \
+ ThreadLocalTrace()->Write("Destination: %s", str(*dst));
+
+#define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_TRANSPOSING \
+ ThreadLocalTrace()->Write("Canonicalizing to column-major destination:"); \
+ ThreadLocalTrace()->Write( \
+ "Swapping LHS<->RHS and flipping all storage orders.");
+
+#define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_ASSUMING_COLMAJOR_DST \
+ ThreadLocalTrace()->Write("Runtime-selected path: %s", str(the_path)); \
+ ThreadLocalTrace()->Write("LHS: %s", str(params->src[Side::kLhs])); \
+ ThreadLocalTrace()->Write("RHS: %s", str(params->src[Side::kRhs])); \
+ ThreadLocalTrace()->Write("Destination: %s", str(params->dst));
+
+#define RUY_TRACE_INFO_POPULATE_TRMUL_PARAMS \
+ ThreadLocalTrace()->Write( \
+ "Here we have this Path as a template parameter: %s", str(ThePath)); \
+ ThreadLocalTrace()->Write("PackedLhsScalar: %s", str<PackedLhsScalar>()); \
+ ThreadLocalTrace()->Write("PackedRhsScalar: %s", str<PackedRhsScalar>()); \
+ ThreadLocalTrace()->Write("Kernel function pointer: %p", \
+ params->run_kernel); \
+ ThreadLocalTrace()->Write("Kernel LHS block layout: %dx%d %s", \
+ LhsKernelLayout::kRows, LhsKernelLayout::kCols, \
+ str(LhsKernelLayout::kOrder)); \
+ ThreadLocalTrace()->Write("Kernel RHS block layout: %dx%d %s", \
+ RhsKernelLayout::kRows, RhsKernelLayout::kCols, \
+ str(RhsKernelLayout::kOrder)); \
+ ThreadLocalTrace()->Write("Created packed matrices:"); \
+ ThreadLocalTrace()->Write("Packed LHS matrix: %s", \
+ str(params->packed_matrix[Side::kLhs])); \
+ ThreadLocalTrace()->Write("Packed RHS matrix: %s", \
+ str(params->packed_matrix[Side::kRhs])); \
+ ThreadLocalTrace()->Write("LHS packing function pointer: %p", \
+ params->run_pack[Side::kLhs]); \
+ ThreadLocalTrace()->Write("RHS packing function pointer: %p", \
+ params->run_pack[Side::kRhs]);
+
+#define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE \
+ ThreadLocalTrace()->Write("SetRuntimeEnabledPaths forcing paths: %s", \
+ str(*paths));
+
+#define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR \
+ ThreadLocalTrace()->Write("Environment variable forcing paths: %s", \
+ str(*paths));
+
+#define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_DETECTION \
+ ThreadLocalTrace()->Write( \
+ "Runtime-detected paths: %s", \
+ str(*paths & ~kNonArchPathsIncludingInternalVariants));
+
+#define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_SHOULD_CACHE \
+ ThreadLocalTrace()->Write( \
+ "Caching the packed %s matrix. Already in cache: %s", str(side), \
+ action == PrepackedCache::Action::kInsertedNewEntry ? "no" : "yes");
+
+#define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_NO_CACHE \
+ ThreadLocalTrace()->Write("Not caching the packed %s matrix.", str(side));
+
+#define RUY_TRACE_INFO_GET_TENTATIVE_THREAD_COUNT \
+ ThreadLocalTrace()->Write( \
+ "tentative_thread_count=%d (determined based on shape %dx%dx%d and " \
+ "max_num_threads=%d)", \
+ tentative_thread_count, rows, depth, cols, ctx->max_num_threads());
+
+#define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_TRUE \
+ ThreadLocalTrace()->Write( \
+ "Choosing to use the simple loop code path in TrMul because of the " \
+ "linear traversal and single thread.");
+
+#define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_FALSE \
+ ThreadLocalTrace()->Write( \
+ "Choosing to use the general case code path in TrMul because of: %s", \
+ tentative_thread_count > 1 ? "multi-threading" \
+ : "non-linear traversal order");
+
+#define RUY_TRACE_INFO_TRMUL_SIMPLE_LOOP \
+ ThreadLocalTrace()->Write("Entering the simple loop code path of TrMul");
+
+#define RUY_TRACE_INFO_TRMUL_GENERAL_CASE \
+ ThreadLocalTrace()->Write("Entering the general case code path of TrMul");
+
+#define RUY_TRACE_INFO_MAKE_BLOCK_MAP_START \
+ ThreadLocalTrace()->Write("Kernel block: %dx%d", kernel_rows, kernel_cols); \
+ ThreadLocalTrace()->Write( \
+ "BlockMap shape: %dx%d (destination matrix shape rounded to next " \
+ "kernel blocks)", \
+ rows, cols); \
+ ThreadLocalTrace()->Write( \
+ "Rectangularness log2: %dx%d (powers of two factors bringing the shape " \
+ "closest to square)", \
+ rows_rectangularness_log2, cols_rectangularness_log2); \
+ ThreadLocalTrace()->Write("Accumulation depth: %d", depth); \
+ ThreadLocalTrace()->Write("LHS scalar type size: %d", lhs_scalar_size); \
+ ThreadLocalTrace()->Write("RHS scalar type size: %d", rhs_scalar_size); \
+ ThreadLocalTrace()->Write("Tentative thread count: %d", \
+ tentative_thread_count); \
+ ThreadLocalTrace()->Write( \
+ "CPU cache params: local_cache_size=%d, last_level_cache_size=%d", \
+ cpu_cache_params.local_cache_size, \
+ cpu_cache_params.last_level_cache_size); \
+ ThreadLocalTrace()->Write( \
+ "For the sizes below, when rows!=cols, we always retain the min of the " \
+ "two."); \
+ ThreadLocalTrace()->Write("Kernel block size_log2: %d", kernel_size_log2); \
+ ThreadLocalTrace()->Write( \
+ "BlockMap size_log2: %d (destination matrix shape rounded to next " \
+ "kernel blocks)", \
+ size_log2); \
+ ThreadLocalTrace()->Write( \
+ "Now we will pick the optimal log2 of BlockMap block size");
+
+#define RUY_TRACE_INFO_MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE \
+ ThreadLocalTrace()->Write( \
+ "For BlockMap block size_log2 %d, score=%d (" \
+ "multithreading_score=%d + cache_locality_score=%d + " \
+ "kernel_amortization_score=%d)", \
+ block_size_log2, score, multithreading_score, cache_locality_score, \
+ kernel_amortization_score);
+
+#define RUY_TRACE_INFO_MAKE_BLOCK_MAP_END \
+ ThreadLocalTrace()->Write("Selecting BlockMap block size_log2: %d", \
+ best_score_block_size_log2); \
+ ThreadLocalTrace()->Write( \
+ "BlockMap has %dx%d blocks, each of size between %dx%d and %dx%d.", \
+ 1 << num_blocks_of_rows_log2, 1 << num_blocks_of_cols_log2, \
+ block_map->small_block_dims[Side::kLhs], \
+ block_map->small_block_dims[Side::kRhs], \
+ block_map->small_block_dims[Side::kLhs] + \
+ block_map->kernel_dims[Side::kLhs], \
+ block_map->small_block_dims[Side::kRhs] + \
+ block_map->kernel_dims[Side::kRhs]); \
+  ThreadLocalTrace()->Write(                                                 \
+      "The first %d rows of blocks have %d rows, the remaining ones have %d " \
+      "rows.",                                                               \
+      block_map->large_blocks[Side::kLhs],                                   \
+      block_map->small_block_dims[Side::kLhs] +                              \
+          block_map->kernel_dims[Side::kLhs],                                \
+      block_map->small_block_dims[Side::kLhs]);                              \
+  ThreadLocalTrace()->Write(                                                 \
+      "The first %d columns of blocks have %d columns, the remaining ones "  \
+      "have %d columns.",                                                    \
+      block_map->large_blocks[Side::kRhs],                                   \
+      block_map->small_block_dims[Side::kRhs] +                              \
+          block_map->kernel_dims[Side::kRhs],                                \
+      block_map->small_block_dims[Side::kRhs]);                              \
+ ThreadLocalTrace()->Write( \
+ "Traversal order: %s", \
+ block_map->traversal_order == BlockMapTraversalOrder::kLinear ? "linear" \
+ : block_map->traversal_order == BlockMapTraversalOrder::kFractalZ \
+ ? "fractal Z-curve" \
+ : block_map->traversal_order == BlockMapTraversalOrder::kFractalU \
+ ? "fractal U-curve" \
+ : block_map->traversal_order == BlockMapTraversalOrder::kFractalHilbert \
+ ? "fractal Hilbert curve" \
+ : nullptr); \
+ ThreadLocalTrace()->Write("Finalized thread count: %d", \
+ block_map->thread_count);
+
+#define RUY_TRACE_SET_THEAD_ID(thread_id) \
+ ThreadLocalTrace()->set_thread_id(thread_id);
+
+#define RUY_TRACE_INFO_TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS \
+ ThreadLocalTrace()->Write( \
+ "Block #%d is at position (%d, %d) in the BlockMap.", block_id, \
+ block[Side::kLhs], block[Side::kRhs]); \
+ ThreadLocalTrace()->Write( \
+ "Block #%d has shape %dx%d and starts at position (%d, %d) in the " \
+ "destination matrix.", \
+ block_id, end[Side::kLhs] - start[Side::kLhs], \
+ end[Side::kRhs] - start[Side::kRhs], start[Side::kLhs], \
+ start[Side::kRhs]); \
+ ThreadLocalTrace()->Write( \
+ "Block #%d depends on LHS panel #%d and RHS panel #%d.", block_id, \
+ block[Side::kLhs], block[Side::kRhs]);
+
+#define RUY_TRACE_INFO_TRYPACK_PACKING \
+ ThreadLocalTrace()->Write( \
+ "%s panel #%d is not already packed. Packing it now.", str(side), \
+ block);
+
+#define RUY_TRACE_INFO_TRYPACK_ANOTHER_THREAD_PACKING \
+ if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
+ ThreadLocalTrace()->Write( \
+ "%s panel #%d is currently being packed by another thread.", \
+ str(side), block); \
+ }
+
+#define RUY_TRACE_INFO_TRYPACK_PREVIOUSLY_PACKED \
+ if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
+ ThreadLocalTrace()->Write("%s panel #%d had previously been packed.", \
+ str(side), block); \
+ }
+
+#define RUY_TRACE_INFO_TRYPACK_PACKED_BY_ANOTHER_THREAD \
+ ThreadLocalTrace()->Write( \
+ "%s panel #%d has just been packed by another thread.", str(side), \
+ block);
+
+#define RUY_TRACE_INFO_ENSURE_PACKED_ENTER_RUN_AHEAD \
+ if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
+ ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(true); \
+ ThreadLocalTrace()->Write( \
+ "We're blocked on other threads packing the panels that we need. " \
+ "Packing some other panels while we wait..."); \
+ }
+
+#define RUY_TRACE_INFO_ENSURE_PACKED_END \
+ if (ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
+ ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(false); \
+ ThreadLocalTrace()->Write( \
+ "Other threads have finished packing what we were waiting for."); \
+ }
+
+#define RUY_TRACE_INFO_RUN_PACK \
+ ThreadLocalTrace()->Write("Path: %s", str(ThePath)); \
+ ThreadLocalTrace()->Write("Packing panel consisting of columns [%d, %d)", \
+ start_col, end_col); \
+ ThreadLocalTrace()->Write("Source: columns [%d, %d) of %s", start_col, \
+ end_col, str(src_matrix)); \
+ ThreadLocalTrace()->Write("Destination: columns [%d, %d) of %s", start_col, \
+ end_col, str(*packed_matrix)); \
+ if (end_col > src_matrix.layout.cols) { \
+ ThreadLocalTrace()->Write( \
+ "This runs past the last column of the source matrix. Padding as " \
+ "needed."); \
+ } \
+ if (packed_matrix->layout.rows > src_matrix.layout.rows) { \
+ ThreadLocalTrace()->Write( \
+ "The packed matrix has more rows than the source matrix due to " \
+ "rounding up to the kernel block size. Padding as needed."); \
+ }
+
+#define RUY_TRACE_INFO_RUN_KERNEL \
+ { \
+ ThreadLocalTrace()->Write("Path: %s", str(KernelArgs<KernelType>::kPath)); \
+ int lhs_cols = end[Side::kLhs] - start[Side::kLhs]; \
+ int rhs_cols = end[Side::kRhs] - start[Side::kRhs]; \
+ int kernel_lhs_cols = src[Side::kLhs].layout.kernel.cols; \
+ int kernel_rhs_cols = src[Side::kRhs].layout.kernel.cols; \
+ ThreadLocalTrace()->Write("LHS: columns [%d, %d) of %s", \
+ start[Side::kLhs], end[Side::kLhs], \
+ str(src[Side::kLhs])); \
+ ThreadLocalTrace()->Write("RHS: columns [%d, %d) of %s", \
+ start[Side::kRhs], end[Side::kRhs], \
+ str(src[Side::kRhs])); \
+ ThreadLocalTrace()->Write("Destination: block [%d, %d)x[%d, %d) of %s", \
+ start[Side::kLhs], end[Side::kLhs], \
+ start[Side::kRhs], end[Side::kRhs], str(*dst)); \
+ if (end[Side::kLhs] > dst->layout.rows || \
+ end[Side::kRhs] > dst->layout.cols) { \
+ ThreadLocalTrace()->Write( \
+ "This runs over the destination matrix boundaries. The kernel will " \
+ "internally clamp stores to avoid overruns."); \
+ } \
+ ThreadLocalTrace()->Write( \
+ "The kernel's inner loop only produces a %dx%d block, so the " \
+ "kernel's outer loops will run %dx%d times.", \
+ kernel_lhs_cols, kernel_rhs_cols, lhs_cols / kernel_lhs_cols, \
+ rhs_cols / kernel_rhs_cols); \
+ }
+
+#define RUY_TRACE_INFO_THREAD_FUNC_IMPL_WAITING \
+ ThreadLocalTrace()->Write("Waiting for a task...");
+
+#define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK \
+ ThreadLocalTrace()->Write("Sending task #%d to a worker thread...", i);
+
+#define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK_ZERO_ON_CUR_THREAD \
+ ThreadLocalTrace()->Write("Running task #0 on the current thread...");
+
+#define RUY_TRACE_INFO_THREADPOOL_EXECUTE_WAITING_FOR_THREADS \
+ ThreadLocalTrace()->Write("Waiting for worker threads to finish..");
+
+#define RUY_TRACE_INFO(id) \
+ [=]() { \
+ ThreadLocalTrace()->set_current_source_file(__FILE__); \
+ ThreadLocalTrace()->set_current_source_line(__LINE__); \
+ RUY_TRACE_INFO_##id \
+ }()
+
+} // namespace ruy
+
+#else
+
+// Vacuous implementation when RUY_TRACE is not defined.
+#define RUY_TRACE_SCOPE_NAME(name)
+#define RUY_TRACE_SCOPE
+#define RUY_TRACE_SET_THEAD_ID(thread_id)
+#define RUY_TRACE_INFO(id)
+
+#endif
+
+#endif // RUY_RUY_TRACE_H_
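
For reference, a minimal sketch (not part of this change) of a ruy::Mul call that the tracing added above would annotate, assuming the library is built with the RUY_TRACE preprocessor symbol defined (otherwise the vacuous macros compile the tracing away). The 2x2 shapes and storage orders below are arbitrary illustration choices:

#include "ruy/context.h"
#include "ruy/matrix.h"
#include "ruy/mul_params.h"
#include "ruy/ruy.h"

int main() {
  // A tiny 2x2 float multiplication keeps the resulting trace short.
  const float lhs_data[] = {1, 2, 3, 4};
  const float rhs_data[] = {1, 0, 0, 1};
  float dst_data[4] = {};

  ruy::Context context;

  ruy::Matrix<float> lhs;
  ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, lhs.mutable_layout());
  lhs.set_data(lhs_data);

  ruy::Matrix<float> rhs;
  ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, rhs.mutable_layout());
  rhs.set_data(rhs_data);

  ruy::Matrix<float> dst;
  ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, dst.mutable_layout());
  dst.set_data(dst_data);

  // Default MulParams<float, float>: plain float multiply-accumulate.
  ruy::MulParams<float, float> mul_params;
  ruy::Mul(lhs, rhs, mul_params, &context, &dst);
  return 0;
}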
diff --git a/ruy/trmul.cc b/ruy/trmul.cc
index 5a385c0..9345f0c 100644
--- a/ruy/trmul.cc
+++ b/ruy/trmul.cc
@@ -38,6 +38,7 @@ limitations under the License.
#include "ruy/side_pair.h"
#include "ruy/size_util.h"
#include "ruy/thread_pool.h"
+#include "ruy/trace.h"
#include "ruy/tune.h"
namespace ruy {
@@ -72,6 +73,8 @@ class TrMulTask final : public Task {
// Thread main function. This is one thread's share of the TrMul work.
void Run() override {
+ RUY_TRACE_SCOPE_NAME("TrMulTask::Run");
+ RUY_TRACE_SET_THEAD_ID(thread_id_);
// Allocate and initialize `local_packed`.
for (Side side : {Side::kLhs, Side::kRhs}) {
if (!params_->is_prepacked[side]) {
@@ -89,6 +92,7 @@ class TrMulTask final : public Task {
int block_id = thread_id_;
// Loop until all blocks have been computed.
while (block_id < num_blocks) {
+ RUY_TRACE_SCOPE_NAME("Main loop iteration");
// Reserve the next block to handle, hiding the latency of this atomic op.
const int next_block_id =
atomic_block_id_->fetch_add(1, std::memory_order_relaxed);
@@ -98,6 +102,7 @@ class TrMulTask final : public Task {
// Get coordinates of the current block to handle, in matrix space.
SidePair<int> start, end;
GetBlockMatrixCoords(block_map_, block, &start, &end);
+ RUY_TRACE_INFO(TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS);
// Maybe pack the current LHS/RHS block, if not already packed.
EnsurePacked(block, start, end, tuning);
// Actually do matrix multiplication work
@@ -163,11 +168,15 @@ class TrMulTask final : public Task {
// In this branch, the status was kNotStarted and we just atomically
// changed it to kInProgress as we are about to handle the packing
// ourselves.
+ RUY_TRACE_INFO(TRYPACK_PACKING);
params_->RunPack(side, tuning, start, end);
status.store(PackingStatus::kFinished, std::memory_order_release);
} else if (exchanged_status == PackingStatus::kInProgress) {
// Another thread is currently packing this block.
+ RUY_TRACE_INFO(TRYPACK_ANOTHER_THREAD_PACKING);
return false;
+ } else {
+ RUY_TRACE_INFO(TRYPACK_PACKED_BY_ANOTHER_THREAD);
}
RUY_DCHECK(status.load(std::memory_order_acquire) ==
PackingStatus::kFinished);
@@ -177,6 +186,8 @@ class TrMulTask final : public Task {
params_->RunPack(side, tuning, start, end);
}
local_already_packed_[side][block] = true;
+ } else {
+ RUY_TRACE_INFO(TRYPACK_PREVIOUSLY_PACKED);
}
return true;
}
@@ -202,6 +213,7 @@ class TrMulTask final : public Task {
break;
}
#if RUY_OPT(PACK_AHEAD)
+ RUY_TRACE_INFO(ENSURE_PACKED_ENTER_RUN_AHEAD);
const Side runahead_side = next_runahead_side;
const int runahead_block = next_runahead_block[runahead_side];
next_runahead_side = OtherSide(next_runahead_side);
@@ -216,6 +228,7 @@ class TrMulTask final : public Task {
next_runahead_block[runahead_side] = runahead_block + 1;
#endif
}
+ RUY_TRACE_INFO(ENSURE_PACKED_END);
}
TrMulParams* params_;
@@ -233,32 +246,36 @@ class TrMulTask final : public Task {
CpuInfo* cpuinfo_;
};
-int GetThreadCount(Ctx* ctx, int rows, int cols, int depth) {
+int GetTentativeThreadCount(Ctx* ctx, int rows, int cols, int depth) {
#if RUY_PLATFORM_EMSCRIPTEN
// b/139927184, std::thread constructor raises exception
return 1;
#endif
+ RUY_TRACE_SCOPE;
// Empirically determined rule for reasonable number of
// threads to use. This is proportional to the number of arithmetic ops
// in this Mul (product of the 3 sizes).
static constexpr int kDivisorLog2 = 15;
const int guess_log2 = std::max(
0, ceil_log2(rows) + ceil_log2(cols) + ceil_log2(depth) - kDivisorLog2);
- return std::min(1 << guess_log2, ctx->max_num_threads());
+ int tentative_thread_count =
+ std::min(1 << guess_log2, ctx->max_num_threads());
+ RUY_TRACE_INFO(GET_TENTATIVE_THREAD_COUNT);
+ return tentative_thread_count;
}
-bool UseSimpleLoop(int tentative_thread_count, int rows, int cols, int depth,
- int lhs_scalar_size, int rhs_scalar_size,
- const CpuCacheParams& cpu_cache_params) {
+bool GetUseSimpleLoop(int tentative_thread_count, int rows, int cols, int depth,
+ int lhs_scalar_size, int rhs_scalar_size,
+ const CpuCacheParams& cpu_cache_params) {
+ RUY_TRACE_SCOPE;
if (tentative_thread_count == 1) {
- const BlockMapTraversalOrder traversal_order = GetTraversalOrder(
- rows, cols, depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
- // If we are in the GEMV case or the block_map would be using linear
- // traversal anyway, use the simple loop.
- if ((cols == 1) || traversal_order == BlockMapTraversalOrder::kLinear) {
+ if (IsObviouslyLinearTraversal(rows, cols, depth, lhs_scalar_size,
+ rhs_scalar_size, cpu_cache_params)) {
+ RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_TRUE);
return true;
}
}
+ RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_FALSE);
return false;
}
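
For reference, a standalone sketch (not part of this change) working through the GetTentativeThreadCount heuristic above: the guessed log2 thread count is the sum of ceil_log2 of the three sizes (an approximation of log2 of the number of multiply-accumulates) minus kDivisorLog2 = 15, clamped below at 0; the resulting power of two is then capped at max_num_threads. The shape and max_num_threads values below are arbitrary illustration choices:

#include <algorithm>
#include <cstdio>

// Local stand-in for ruy's ceil_log2: smallest log2 such that (1 << log2) >= n.
static int ceil_log2(int n) {
  int log2 = 0;
  while ((1 << log2) < n) ++log2;
  return log2;
}

int main() {
  const int rows = 512, cols = 512, depth = 512;
  const int max_num_threads = 8;
  constexpr int kDivisorLog2 = 15;
  const int guess_log2 = std::max(
      0, ceil_log2(rows) + ceil_log2(cols) + ceil_log2(depth) - kDivisorLog2);
  // 9 + 9 + 9 - 15 = 12, so the unclamped guess is 1 << 12 = 4096 threads,
  // which is then capped at max_num_threads, giving 8.
  printf("tentative_thread_count=%d\n",
         std::min(1 << guess_log2, max_num_threads));
  return 0;
}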
@@ -271,6 +288,7 @@ bool UseSimpleLoop(int tentative_thread_count, int rows, int cols, int depth,
// thread, the code that is potentially running on worker threads is in
// TrMulTask::Run().
void TrMul(Ctx* ctx, TrMulParams* params) {
+ RUY_TRACE_SCOPE;
profiler::ScopeLabel label(
"TrMul (Path=0x%x, max_num_threads=%d, is_prepacked=(%d,%d))",
static_cast<int>(params->path), ctx->max_num_threads(),
@@ -285,17 +303,20 @@ void TrMul(Ctx* ctx, TrMulParams* params) {
const int cols = rhs.layout.cols;
const int depth = lhs.layout.rows;
- const int tentative_thread_count = GetThreadCount(ctx, rows, cols, depth);
+ const int tentative_thread_count =
+ GetTentativeThreadCount(ctx, rows, cols, depth);
const auto& cpu_cache_params = ctx->mutable_cpuinfo()->CacheParams();
// Case of running this TrMul as a simple loop.
// This is a good place to start reading this function: all the rest
// of this function is just an optimized, but functionally equivalent,
// version of that.
- if (UseSimpleLoop(tentative_thread_count, rows, cols, depth,
- lhs.data_type.size, rhs.data_type.size, cpu_cache_params)) {
+ if (GetUseSimpleLoop(tentative_thread_count, rows, cols, depth,
+ lhs.data_type.size, rhs.data_type.size,
+ cpu_cache_params)) {
profiler::ScopeLabel label_simple("TrMulImpl, simple loop");
Tuning tuning = ctx->GetMainThreadTuning();
+ RUY_TRACE_INFO(TRMUL_SIMPLE_LOOP);
const SidePair<int> origin{0, 0};
const SidePair<int> rounded_dims{packed_lhs.layout.cols,
@@ -310,6 +331,7 @@ void TrMul(Ctx* ctx, TrMulParams* params) {
}
profiler::ScopeLabel label_general("TrMulImpl, general case");
+ RUY_TRACE_INFO(TRMUL_GENERAL_CASE);
Allocator* main_allocator = ctx->GetMainAllocator();
// Initialize block map.