diff options
Diffstat (limited to 'ruy/internal_matrix.h')
-rw-r--r-- | ruy/internal_matrix.h | 388 |
1 files changed, 388 insertions, 0 deletions
diff --git a/ruy/internal_matrix.h b/ruy/internal_matrix.h new file mode 100644 index 0000000..7fe13be --- /dev/null +++ b/ruy/internal_matrix.h @@ -0,0 +1,388 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Internal types and helpers for matrices. +// +// Ruy has a couple slightly different notions of matrices, besides the +// Matrix<T> class that we expose to the user-facing API. +// +// TODO(silvasean): Put parts of this architecture description somewhere more +// prominent. +// +// The 4 main matrix types are: +// - Matrix<T>: This is a user-facing type on Ruy's external API boundary. It is +// also used internally. +// - DMatrix: This is a type-erased version of Matrix<T>. "D" = "dynamic". +// - PMatrix: This represents a packed matrix, which requires tracking kernel +// layout and row/column sums for quantization. It is type-erased. +// - PackedMatrix<T>: This is a statically typed variant of PMatrix for +// convenience inside typed routines. +// +// Note that Matrix<T> is *not* implemented in terms of the internal types. It +// is an independent, simple, and user-facing type. +// +// The use of type-erasure might seem surprising for a library like Ruy with a +// heavily-templated entry point, but it is motivated by the desire for most of +// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3 +// main parts: +// - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry +// point, along with routines that select RunKernel and RunPack implementations +// statically based on those template parameters. +// - "back-end" (kernel.h, pack.h)- this consists of the implementations of +// RunKernel and RunPack, often in assembly code, which are the building blocks +// that Ruy calls to perform matrix multiplication. These are templated so that +// only the requested types/Path's are actually emitted by the compiler. +// - "middle-end" (trmul.h) - this is the part of Ruy that orchestrates the +// calls to the "back-end" optimized building blocks. This layer has to deal +// with issues like cache locality and low-overhead multi-threading. +// +// There is a desire for the "middle-end" to be non-templated in order to +// simplify the implementation and reduce code-size. We type-erase when going +// from the "front-end" to the "middle-end", and un-type-erase going from the +// "middle-end" to the "back-end". The un-type-erasure is possible because the +// "front-end" is responsible for instantiating the needed "back-end" templates, +// and thus the static type information is still present. +// +// Each layer of Ruy uses matrix types: +// - "front-end": Matrix<T> +// - "middle-end": DMatrix, PMatrix +// - "back-end": Matrix<T>, PackedMatrix<T> +// +// The use of separate types for packed matrices is not essential, but makes it +// obvious at a glance whether a matrix is a packed matrix or not. We would +// reconsider this decision if there was significant duplication between packed +// and unpacked matrices, but that doesn't seem to be the case at the moment. +// +// Another goal is to keep the user-facing Matrix<T> as simple and +// understandable as possible. Ideally, a user should be able to read the struct +// definition for Matrix<T> and see a very simple definition with no internal +// details like sums and kernel block layout. +// +// To present another structured view of our various matrix types, here's a +// table: +// Plain matrices Packed matrices +// +---------------------------------- +// Templated | Matrix<T> PackedMatrix<T> +// Type-erased | DMatrix PMatrix +// +// +// There is 1 additional matrix type not mentioned above, due to its low +// importance: +// - PrepackedMatrix: This is a user-facing version of PMatrix. It has the bare +// minimum of fields needed for representing the raw data and sums buffers of a +// packed matrix for the "advanced" explicit pre-packing API. This type plays no +// role in Ruy's internals and can generally by ignored. The only reason it +// exists is so that PMatrix is not exposed to users -- we prefer to keep the +// internal matrix types hidden, even from "advanced" users. + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_INTERNAL_MATRIX_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_INTERNAL_MATRIX_H_ + +#include <cstddef> +#include <cstdint> +#include <type_traits> +#include <utility> + +#include "ruy/check_macros.h" +#include "ruy/common.h" +#include "ruy/matrix.h" +#include "ruy/size_util.h" + +namespace ruy { + +// KernelLayout describes small-scale block structure in a packed matrix layout. +// It's a runtime (as opposed to compile-time-constant) version of the +// FixedKernelLayout struct used to declare kernel layouts. +// +// This is is sometimes known as "tiling" in other contexts. +// +// For example, consider a packed matrix in column-major format with a +// column-major KernelLayout. The matrix logically has a shape of +// `[cols, rows]`. However, the matrix is laid out as though it were a 4D array +// of shape `[cols / kcols, rows / krows, kcols, krows]`. +// +// Note that in the case of kcols=1, krows=1, this degenerates to +// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block +// structure. +struct KernelLayout { + Order order = Order::kColMajor; + std::uint8_t rows = 1; + std::uint8_t cols = 1; +}; + +// A packed matrix has a small-scale block structure that is not present in in +// the input matrices. This block structure is necessary for the kernels to +// process data efficiently. +// +// This struct is very similar to Layout, but has the extra KernelLayout field. +struct PackedLayout { + std::int32_t rows = 0; + std::int32_t cols = 0; + // Stride is the offset between two adjacent matrix elements + // in the non-contiguous direction. + std::int32_t stride = 0; + Order order = Order::kColMajor; + // Small scale layout shuffling, potentially departing from + // linear row-major or column-major storage. See KernelLayout. + KernelLayout kernel; +}; + +// Dynamic representation for a type. +// +// The most important field in this struct is the size, which Ruy uses to know +// how much memory to allocate without having to be templated on a type. +// Signed-ness and floating-point-ness are mainly present as debugging checks. +// +// Note: Ruy does not use this struct to to dynamically dispatch between +// different typed implementations. As described in the comment at the top of +// this file, Ruy's "front-end", which is templated, instantiates all the +// necessary "back-end" routines with complete static knowledge of all the +// types. +struct Type { + template <typename T> + static Type Create() { + Type ret; + ret.is_signed = std::is_signed<T>::value; + ret.is_floating_point = std::is_floating_point<T>::value; + ret.size = sizeof(T); + return ret; + } + + template <typename T> + void AssertIs() const { + RUY_DCHECK_EQ(is_signed, Create<T>().is_signed); + RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point); + RUY_DCHECK_EQ(size, Create<T>().size); + } + + bool is_signed = false; + bool is_floating_point = false; + std::uint8_t size = 0; +}; + +// Type-erased matrix. +struct DMatrix { + Type data_type; + void* data = nullptr; + Layout layout; + std::int32_t zero_point = 0; +}; + +// Type-erased packed matrix. +struct PMatrix { + Type data_type; + void* data = nullptr; + Type sums_type; + void* sums = nullptr; + PackedLayout layout; + std::int32_t zero_point = 0; +}; + +// Convenient typed helper for packed matrices. +template <typename Scalar> +struct PackedMatrix { + // The row/column sums needed for quantized matrix multiplication when + // the opposite operand of the multiplication uses a non-symmetric zero + // point. + // This member is only relevant for packed matrices. + // Additionally, Ruy always uses 32-bit signed accumulators for quantized + // matrix multiplication. + // For floating point types, there is no quantization, so this pointer + // will always be null. We still need code referencing it to compile + // though, even if it is always branched around. Hence we use Scalar* + // itself as the type in that case. + using SumsType = + typename std::conditional<std::is_floating_point<Scalar>::value, Scalar, + std::int32_t>::type; + + Scalar* data = nullptr; + SumsType* sums = nullptr; + PackedLayout layout; + std::int32_t zero_point = 0; +}; + +template <typename T> +DMatrix ToDMatrix(const Matrix<T>& matrix) { + DMatrix ret; + ret.data_type = Type::Create<T>(); + ret.data = ToVoidPtr(matrix.data.get()); + ret.layout = matrix.layout; + ret.zero_point = matrix.zero_point; + return ret; +} + +template <typename T> +Matrix<T> ToMatrix(const DMatrix& dmatrix) { + dmatrix.data_type.AssertIs<T>(); + Matrix<T> ret; + ret.data = static_cast<T*>(dmatrix.data); + ret.layout = dmatrix.layout; + ret.zero_point = dmatrix.zero_point; + return ret; +} + +template <typename T> +PackedMatrix<T> ToPackedMatrix(const PMatrix& pmatrix) { + using SumsType = typename PackedMatrix<T>::SumsType; + pmatrix.data_type.AssertIs<T>(); + pmatrix.sums_type.AssertIs<SumsType>(); + PackedMatrix<T> ret; + ret.data = static_cast<T*>(pmatrix.data); + ret.sums = static_cast<SumsType*>(pmatrix.sums); + ret.layout = pmatrix.layout; + ret.zero_point = pmatrix.zero_point; + return ret; +} + +// Helpers for Layout / PackedLayout. + +inline bool IsPacked(const Layout& layout) { + if (layout.order == Order::kColMajor) { + return layout.stride == layout.rows; + } else { + return layout.stride == layout.cols; + } +} + +inline bool IsRowMajor(const Layout& layout) { + return layout.order == Order::kRowMajor; +} + +template <typename LayoutOrPackedLayout> +inline bool IsColMajor(const LayoutOrPackedLayout& layout) { + return layout.order == Order::kColMajor; +} + +template <typename LayoutOrPackedLayout> +inline int FlatSize(const LayoutOrPackedLayout& layout) { + const int outerdim = + layout.order == Order::kColMajor ? layout.cols : layout.rows; + return layout.stride * outerdim; +} + +// TODO(b/130417400) add a unit test +inline int Offset(const Layout& layout, int row, int col) { + // TODO(benoitjacob) - should check this but this make the _slow tests take + // 5x longer. Find a mitigation like in Eigen with an 'internal' variant + // bypassing the check? + // RUY_DCHECK_GE(row, 0); + // RUY_DCHECK_GE(col, 0); + // RUY_DCHECK_LT(row, layout.rows); + // RUY_DCHECK_LT(col, layout.cols); + int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride; + int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride; + return row * row_stride + col * col_stride; +} + +// TODO(b/130417400) add a unit test +inline int Offset(const PackedLayout& layout, int row, int col) { + RUY_DCHECK(is_pot(layout.kernel.rows)); + RUY_DCHECK(is_pot(layout.kernel.cols)); + int row_outer = row & ~(layout.kernel.rows - 1); + int col_outer = col & ~(layout.kernel.cols - 1); + int row_stride_outer = + layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride; + int col_stride_outer = + layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride; + int offset_outer = + row_outer * row_stride_outer + col_outer * col_stride_outer; + int row_inner = row - row_outer; + int col_inner = col - col_outer; + int row_stride_inner = + layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols; + int col_stride_inner = + layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows; + int offset_inner = + row_inner * row_stride_inner + col_inner * col_stride_inner; + return offset_outer + offset_inner; +} + +// Helpers for Matrix<T>. + +template <typename Scalar> +const Scalar* ElementPtr(const Matrix<Scalar>& mat, int row, int col) { + return mat.data.get() + Offset(mat.layout, row, col); +} + +template <typename Scalar> +Scalar* ElementPtr(Matrix<Scalar>* mat, int row, int col) { + return mat->data.get() + Offset(mat->layout, row, col); +} + +template <typename Scalar> +Scalar Element(const Matrix<Scalar>& mat, int row, int col) { + return *ElementPtr(mat, row, col); +} + +// Helpers for PackedMatrix<T>. +// Duplicated from Matrix<T>, but the duplication seems acceptable. + +template <typename Scalar> +const Scalar* ElementPtr(const PackedMatrix<Scalar>& mat, int row, int col) { + return mat.data + Offset(mat.layout, row, col); +} + +template <typename Scalar> +Scalar* ElementPtr(PackedMatrix<Scalar>* mat, int row, int col) { + return mat->data + Offset(mat->layout, row, col); +} + +template <typename Scalar> +Scalar Element(const PackedMatrix<Scalar>& mat, int row, int col) { + return *ElementPtr(mat, row, col); +} + +// Helpers for PMatrix. + +inline std::size_t DataSize(const PMatrix& packed) { + return FlatSize(packed.layout) * packed.data_type.size; +} + +inline std::size_t SumsSize(const PMatrix& packed) { + // Packed matrices are only relevant for Ruy's TrMul implementations. For + // TrMul, the number of sums is always equal to the number of columns. + return packed.layout.cols * packed.sums_type.size; +} + +// Transpose helpers. + +inline void Transpose(Order* order) { + *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; +} + +inline void Transpose(Layout* layout) { + Transpose(&layout->order); + std::swap(layout->rows, layout->cols); +} + +template <typename Scalar> +inline void Transpose(Matrix<Scalar>* matrix) { + Transpose(&matrix->layout); +} + +// Helpers for KernelLayout. + +template <typename FixedKernelLayout> +KernelLayout ToKernelLayout() { + KernelLayout ret; + ret.order = FixedKernelLayout::kOrder; + ret.rows = FixedKernelLayout::kRows; + ret.cols = FixedKernelLayout::kCols; + return ret; +} + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_INTERNAL_MATRIX_H_ |