ruy/spec.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_

#include <limits>
#include <type_traits>

#include "ruy/cpu_cache_size.h"
#include "ruy/matrix.h"

namespace ruy {

// Our 'general' loop structure (the default) involves multi-threading and
// complicated loops aiming to optimize cache-friendliness. One may opt out of
// this and pick the 'simple' loop structure instead, which only performs well
// for small matrix sizes and only allows using one thread, in exchange for
// smaller code size.
enum class LoopStructure { kGeneral, kSimple, kAuto };

// In general we allow zero_point's to have any Scalar value. This is called
// 'asymmetric' quantization. We do take advantage of the optimization
// opportunities when zero_points happen at runtime to be 'symmetric' (e.g. the
// int8 value 0 or the uint8 value 128), but we still generate code to handle
// the general asymmetric case. By choosing kSymmetric here, one opts out of
// this and supports only the symmetric case, in exchange for smaller code size.
enum class ZeroPointSupport { kGeneral, kSymmetric };

// In general we allow all Layout's, even if we may use slow paths for some
// kinds of layouts. By choosing kRCC, one may opt out of this and
// only keep support for the simplest and most efficient combination of
// Layout's, in exchange for smaller code size. The case covered by
// kRCC is where the storage orders are exactly the following:
//    - LHS is RowMajor
//    - RHS is ColMajor
//    - Destination is ColMajor
enum class LayoutSupport { kGeneral, kRCC };

// A Spec describes all about a matrix multiplication operation that isn't
// encoded in the LHS, RHS and destination matrices. Some of that information
// is encoded as compile-time constants and types (for instance, the choice
// of accumulator type, AccumScalar). Some of that information is encoded as
// runtime values (for instance, the optional bias vector).
template <typename tAccumScalar, typename tDstScalar>
struct BasicSpec {
  // Accumulator type. The type of accumulators used to compute the dot-products
  // before being ultimately casted to the destination type.
  using AccumScalar = tAccumScalar;
  // The destination scalar type.
  using DstScalar = tDstScalar;
  // The bias vector data, if not null.
  const AccumScalar* bias = nullptr;
  // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa)
  // of the multiplier by which accumulators are multiplied before being casted
  // to the destination type.
  AccumScalar multiplier_fixedpoint = 0;
  // Only for non-floating-point cases. The exponent part of the aforementioned
  // multiplier.
  int multiplier_exponent = 0;
  // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must
  // point to a buffer of as many values as there are rows in the destination
  // matrix. Each row of the destination matrix will use the corresponding
  // buffer element instead of multiplier_fixedpoint.
  const AccumScalar* multiplier_fixedpoint_perchannel = nullptr;
  // Per-channel variant of multiplier_exponent. If not nullptr, this must
  // point to a buffer of as many values as there are rows in the destination
  // matrix. Each row of the destination matrix will use the corresponding
  // buffer element instead of multiplier_exponent.
  //
  // Either none or both of multiplier_exponent_perchannel and
  // multiplier_fixedpoint_perchannel must be nullptr.
  const int* multiplier_exponent_perchannel = nullptr;
  // min clamp bound of destination values.
  DstScalar clamp_min = std::is_floating_point<DstScalar>::value
                            ? -std::numeric_limits<DstScalar>::infinity()
                            : std::numeric_limits<DstScalar>::lowest();
  // max clamp bound of destination values.
  DstScalar clamp_max = std::is_floating_point<DstScalar>::value
                            ? std::numeric_limits<DstScalar>::infinity()
                            : std::numeric_limits<DstScalar>::max();
  // See above enum LoopStructure
  static constexpr LoopStructure kLoopStructure = LoopStructure::kAuto;
  // See above enum LayoutSupport
  static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kGeneral;
  // See above enum ZeroPointSupport
  static constexpr ZeroPointSupport kZeroPointSupport =
      ZeroPointSupport::kGeneral;
  // Testing-only, not meant to be used by actual users:
  // Used for testing of various kernel layouts.
  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
  // Returns (a reasonable estimate of) the local CPU cache size.
  // See ruy::LocalDataCacheSize() which returns some coarse, sane default for
  // each CPU architecture.
  // This may be overridden, either to provide more accurate/runtime values,
  // or to test with other values to let testcases have more coverage.
  static int local_data_cache_size() { return LocalDataCacheSize(); }
  // Same as local_data_cache_size but for the total data cache size accessible
  // to each CPU core. See ruy::SharedDataCacheSize().
  static int shared_data_cache_size() { return SharedDataCacheSize(); }
};

}  // namespace ruy

#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_