1 files changed, 388 insertions, 0 deletions
diff --git a/ruy/internal_matrix.h b/ruy/internal_matrix.h
new file mode 100644
index 0000000..7fe13be
--- /dev/null
+++ b/ruy/internal_matrix.h
@@ -0,0 +1,388 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Internal types and helpers for matrices.
+//
+// Ruy has a couple slightly different notions of matrices, besides the
+// Matrix<T> class that we expose to the user-facing API.
+//
+// TODO(silvasean): Put parts of this architecture description somewhere more
+// prominent.
+//
+// The 4 main matrix types are:
+// - Matrix<T>: This is a user-facing type on Ruy's external API boundary. It is
+// also used internally.
+// - DMatrix: This is a type-erased version of Matrix<T>. "D" = "dynamic".
+// - PMatrix: This represents a packed matrix, which requires tracking kernel
+// layout and row/column sums for quantization. It is type-erased.
+// - PackedMatrix<T>: This is a statically typed variant of PMatrix for
+// convenience inside typed routines.
+//
+// Note that Matrix<T> is *not* implemented in terms of the internal types. It
+// is an independent, simple, and user-facing type.
+//
+// The use of type-erasure might seem surprising for a library like Ruy with a
+// heavily-templated entry point, but it is motivated by the desire for most of
+// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
+// main parts:
+// - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry
+// point, along with routines that select RunKernel and RunPack implementations
+// statically based on those template parameters.
+// - "back-end" (kernel.h, pack.h)- this consists of the implementations of
+// RunKernel and RunPack, often in assembly code, which are the building blocks
+// that Ruy calls to perform matrix multiplication.  These are templated so that
+// only the requested types/Path's are actually emitted by the compiler.
+// - "middle-end" (trmul.h) - this is the part of Ruy that orchestrates the
+// calls to the "back-end" optimized building blocks. This layer has to deal
+// with issues like cache locality and low-overhead multi-threading.
+//
+// There is a desire for the "middle-end" to be non-templated in order to
+// simplify the implementation and reduce code-size. We type-erase when going
+// from the "front-end" to the "middle-end", and un-type-erase going from the
+// "middle-end" to the "back-end". The un-type-erasure is possible because the
+// "front-end" is responsible for instantiating the needed "back-end" templates,
+// and thus the static type information is still present.
+//
+// Each layer of Ruy uses matrix types:
+// - "front-end": Matrix<T>
+// - "middle-end": DMatrix, PMatrix
+// - "back-end": Matrix<T>, PackedMatrix<T>
+//
+// The use of separate types for packed matrices is not essential, but makes it
+// obvious at a glance whether a matrix is a packed matrix or not. We would
+// reconsider this decision if there was significant duplication between packed
+// and unpacked matrices, but that doesn't seem to be the case at the moment.
+//
+// Another goal is to keep the user-facing Matrix<T> as simple and
+// understandable as possible. Ideally, a user should be able to read the struct
+// definition for Matrix<T> and see a very simple definition with no internal
+// details like sums and kernel block layout.
+//
+// To present another structured view of our various matrix types, here's a
+// table:
+//                Plain matrices   Packed matrices
+//             +----------------------------------
+// Templated   |  Matrix<T>        PackedMatrix<T>
+// Type-erased |  DMatrix          PMatrix
+//
+//
+// There is 1 additional matrix type not mentioned above, due to its low
+// importance:
+// - PrepackedMatrix: This is a user-facing version of PMatrix. It has the bare
+// minimum of fields needed for representing the raw data and sums buffers of a
+// packed matrix for the "advanced" explicit pre-packing API. This type plays no
+// role in Ruy's internals and can generally by ignored. The only reason it
+// exists is so that PMatrix is not exposed to users -- we prefer to keep the
+// internal matrix types hidden, even from "advanced" users.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_INTERNAL_MATRIX_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_INTERNAL_MATRIX_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+
+#include "ruy/check_macros.h"
+#include "ruy/common.h"
+#include "ruy/matrix.h"
+#include "ruy/size_util.h"
+
+namespace ruy {
+
+// KernelLayout describes small-scale block structure in a packed matrix layout.
+// It's a runtime (as opposed to compile-time-constant) version of the
+// FixedKernelLayout struct used to declare kernel layouts.
+//
+// This is is sometimes known as "tiling" in other contexts.
+//
+// For example, consider a packed matrix in column-major format with a
+// column-major KernelLayout. The matrix logically has a shape of
+// `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
+// of shape `[cols / kcols, rows / krows, kcols, krows]`.
+//
+// Note that in the case of kcols=1, krows=1, this degenerates to
+// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
+// structure.
+struct KernelLayout {
+  Order order = Order::kColMajor;
+  std::uint8_t rows = 1;
+  std::uint8_t cols = 1;
+};
+
+// A packed matrix has a small-scale block structure that is not present in in
+// the input matrices. This block structure is necessary for the kernels to
+// process data efficiently.
+//
+// This struct is very similar to Layout, but has the extra KernelLayout field.
+struct PackedLayout {
+  std::int32_t rows = 0;
+  std::int32_t cols = 0;
+  // Stride is the offset between two adjacent matrix elements
+  // in the non-contiguous direction.
+  std::int32_t stride = 0;
+  Order order = Order::kColMajor;
+  // Small scale layout shuffling, potentially departing from
+  // linear row-major or column-major storage. See KernelLayout.
+  KernelLayout kernel;
+};
+
+// Dynamic representation for a type.
+//
+// The most important field in this struct is the size, which Ruy uses to know
+// how much memory to allocate without having to be templated on a type.
+// Signed-ness and floating-point-ness are mainly present as debugging checks.
+//
+// Note: Ruy does not use this struct to to dynamically dispatch between
+// different typed implementations. As described in the comment at the top of
+// this file, Ruy's "front-end", which is templated, instantiates all the
+// necessary "back-end" routines with complete static knowledge of all the
+// types.
+struct Type {
+  template <typename T>
+  static Type Create() {
+    Type ret;
+    ret.is_signed = std::is_signed<T>::value;
+    ret.is_floating_point = std::is_floating_point<T>::value;
+    ret.size = sizeof(T);
+    return ret;
+  }
+
+  template <typename T>
+  void AssertIs() const {
+    RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
+    RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
+    RUY_DCHECK_EQ(size, Create<T>().size);
+  }
+
+  bool is_signed = false;
+  bool is_floating_point = false;
+  std::uint8_t size = 0;
+};
+
+// Type-erased matrix.
+struct DMatrix {
+  Type data_type;
+  void* data = nullptr;
+  Layout layout;
+  std::int32_t zero_point = 0;
+};
+
+// Type-erased packed matrix.
+struct PMatrix {
+  Type data_type;
+  void* data = nullptr;
+  Type sums_type;
+  void* sums = nullptr;
+  PackedLayout layout;
+  std::int32_t zero_point = 0;
+};
+
+// Convenient typed helper for packed matrices.
+template <typename Scalar>
+struct PackedMatrix {
+  // The row/column sums needed for quantized matrix multiplication when
+  // the opposite operand of the multiplication uses a non-symmetric zero
+  // point.
+  // This member is only relevant for packed matrices.
+  // Additionally, Ruy always uses 32-bit signed accumulators for quantized
+  // matrix multiplication.
+  // For floating point types, there is no quantization, so this pointer
+  // will always be null. We still need code referencing it to compile
+  // though, even if it is always branched around. Hence we use Scalar*
+  // itself as the type in that case.
+  using SumsType =
+      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
+                                std::int32_t>::type;
+
+  Scalar* data = nullptr;
+  SumsType* sums = nullptr;
+  PackedLayout layout;
+  std::int32_t zero_point = 0;
+};
+
+template <typename T>
+DMatrix ToDMatrix(const Matrix<T>& matrix) {
+  DMatrix ret;
+  ret.data_type = Type::Create<T>();
+  ret.data = ToVoidPtr(matrix.data.get());
+  ret.layout = matrix.layout;
+  ret.zero_point = matrix.zero_point;
+  return ret;
+}
+
+template <typename T>
+Matrix<T> ToMatrix(const DMatrix& dmatrix) {
+  dmatrix.data_type.AssertIs<T>();
+  Matrix<T> ret;
+  ret.data = static_cast<T*>(dmatrix.data);
+  ret.layout = dmatrix.layout;
+  ret.zero_point = dmatrix.zero_point;
+  return ret;
+}
+
+template <typename T>
+PackedMatrix<T> ToPackedMatrix(const PMatrix& pmatrix) {
+  using SumsType = typename PackedMatrix<T>::SumsType;
+  pmatrix.data_type.AssertIs<T>();
+  pmatrix.sums_type.AssertIs<SumsType>();
+  PackedMatrix<T> ret;
+  ret.data = static_cast<T*>(pmatrix.data);
+  ret.sums = static_cast<SumsType*>(pmatrix.sums);
+  ret.layout = pmatrix.layout;
+  ret.zero_point = pmatrix.zero_point;
+  return ret;
+}
+
+// Helpers for Layout / PackedLayout.
+
+inline bool IsPacked(const Layout& layout) {
+  if (layout.order == Order::kColMajor) {
+    return layout.stride == layout.rows;
+  } else {
+    return layout.stride == layout.cols;
+  }
+}
+
+inline bool IsRowMajor(const Layout& layout) {
+  return layout.order == Order::kRowMajor;
+}
+
+template <typename LayoutOrPackedLayout>
+inline bool IsColMajor(const LayoutOrPackedLayout& layout) {
+  return layout.order == Order::kColMajor;
+}
+
+template <typename LayoutOrPackedLayout>
+inline int FlatSize(const LayoutOrPackedLayout& layout) {
+  const int outerdim =
+      layout.order == Order::kColMajor ? layout.cols : layout.rows;
+  return layout.stride * outerdim;
+}
+
+// TODO(b/130417400) add a unit test
+inline int Offset(const Layout& layout, int row, int col) {
+  // TODO(benoitjacob)  - should check this but this make the _slow tests take
+  // 5x longer.  Find a mitigation like in Eigen with an 'internal' variant
+  // bypassing the check?
+  // RUY_DCHECK_GE(row, 0);
+  // RUY_DCHECK_GE(col, 0);
+  // RUY_DCHECK_LT(row, layout.rows);
+  // RUY_DCHECK_LT(col, layout.cols);
+  int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
+  int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
+  return row * row_stride + col * col_stride;
+}
+
+// TODO(b/130417400) add a unit test
+inline int Offset(const PackedLayout& layout, int row, int col) {
+  RUY_DCHECK(is_pot(layout.kernel.rows));
+  RUY_DCHECK(is_pot(layout.kernel.cols));
+  int row_outer = row & ~(layout.kernel.rows - 1);
+  int col_outer = col & ~(layout.kernel.cols - 1);
+  int row_stride_outer =
+      layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
+  int col_stride_outer =
+      layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
+  int offset_outer =
+      row_outer * row_stride_outer + col_outer * col_stride_outer;
+  int row_inner = row - row_outer;
+  int col_inner = col - col_outer;
+  int row_stride_inner =
+      layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
+  int col_stride_inner =
+      layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
+  int offset_inner =
+      row_inner * row_stride_inner + col_inner * col_stride_inner;
+  return offset_outer + offset_inner;
+}
+
+// Helpers for Matrix<T>.
+
+template <typename Scalar>
+const Scalar* ElementPtr(const Matrix<Scalar>& mat, int row, int col) {
+  return mat.data.get() + Offset(mat.layout, row, col);
+}
+
+template <typename Scalar>
+Scalar* ElementPtr(Matrix<Scalar>* mat, int row, int col) {
+  return mat->data.get() + Offset(mat->layout, row, col);
+}
+
+template <typename Scalar>
+Scalar Element(const Matrix<Scalar>& mat, int row, int col) {
+  return *ElementPtr(mat, row, col);
+}
+
+// Helpers for PackedMatrix<T>.
+// Duplicated from Matrix<T>, but the duplication seems acceptable.
+
+template <typename Scalar>
+const Scalar* ElementPtr(const PackedMatrix<Scalar>& mat, int row, int col) {
+  return mat.data + Offset(mat.layout, row, col);
+}
+
+template <typename Scalar>
+Scalar* ElementPtr(PackedMatrix<Scalar>* mat, int row, int col) {
+  return mat->data + Offset(mat->layout, row, col);
+}
+
+template <typename Scalar>
+Scalar Element(const PackedMatrix<Scalar>& mat, int row, int col) {
+  return *ElementPtr(mat, row, col);
+}
+
+// Helpers for PMatrix.
+
+inline std::size_t DataSize(const PMatrix& packed) {
+  return FlatSize(packed.layout) * packed.data_type.size;
+}
+
+inline std::size_t SumsSize(const PMatrix& packed) {
+  // Packed matrices are only relevant for Ruy's TrMul implementations. For
+  // TrMul, the number of sums is always equal to the number of columns.
+  return packed.layout.cols * packed.sums_type.size;
+}
+
+// Transpose helpers.
+
+inline void Transpose(Order* order) {
+  *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
+}
+
+inline void Transpose(Layout* layout) {
+  Transpose(&layout->order);
+  std::swap(layout->rows, layout->cols);
+}
+
+template <typename Scalar>
+inline void Transpose(Matrix<Scalar>* matrix) {
+  Transpose(&matrix->layout);
+}
+
+// Helpers for KernelLayout.
+
+template <typename FixedKernelLayout>
+KernelLayout ToKernelLayout() {
+  KernelLayout ret;
+  ret.order = FixedKernelLayout::kOrder;
+  ret.rows = FixedKernelLayout::kRows;
+  ret.cols = FixedKernelLayout::kCols;
+  return ret;
+}
+
+}  // namespace ruy
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_INTERNAL_MATRIX_H_