diff options
Diffstat (limited to 'ruy/spec.h')
-rw-r--r-- | ruy/spec.h | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/ruy/spec.h b/ruy/spec.h new file mode 100644 index 0000000..d96b6a9 --- /dev/null +++ b/ruy/spec.h @@ -0,0 +1,118 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_ + +#include <limits> +#include <type_traits> + +#include "ruy/cpu_cache_size.h" +#include "ruy/matrix.h" + +namespace ruy { + +// Our 'general' loop structure (the default) involves multi-threading and +// complicated loops aiming to optimize cache-friendliness. One may opt out of +// this and pick the 'simple' loop structure instead, which only performs well +// for small matrix sizes and only allows using one thread, in exchange for +// smaller code size. +enum class LoopStructure { kGeneral, kSimple, kAuto }; + +// In general we allow zero_point's to have any Scalar value. This is called +// 'asymmetric' quantization. We do take advantage of the optimization +// opportunities when zero_points happen at runtime to be 'symmetric' (e.g. the +// int8 value 0 or the uint8 value 128), but we still generate code to handle +// the general asymmetric case. By choosing kSymmetric here, one opts out of +// this and supports only the symmetric case, in exchange for smaller code size. +enum class ZeroPointSupport { kGeneral, kSymmetric }; + +// In general we allow all Layout's, even if we may use slow paths for some +// kinds of layouts. By choosing kRCC, one may opt out of this and +// only keep support for the simplest and most efficient combination of +// Layout's, in exchange for smaller code size. The case covered by +// kRCC is where the storage orders are exactly the following: +// - LHS is RowMajor +// - RHS is ColMajor +// - Destination is ColMajor +enum class LayoutSupport { kGeneral, kRCC }; + +// A Spec describes all about a matrix multiplication operation that isn't +// encoded in the LHS, RHS and destination matrices. Some of that information +// is encoded as compile-time constants and types (for instance, the choice +// of accumulator type, AccumScalar). Some of that information is encoded as +// runtime values (for instance, the optional bias vector). +template <typename tAccumScalar, typename tDstScalar> +struct BasicSpec { + // Accumulator type. The type of accumulators used to compute the dot-products + // before being ultimately casted to the destination type. + using AccumScalar = tAccumScalar; + // The destination scalar type. + using DstScalar = tDstScalar; + // The bias vector data, if not null. + const AccumScalar* bias = nullptr; + // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) + // of the multiplier by which accumulators are multiplied before being casted + // to the destination type. + AccumScalar multiplier_fixedpoint = 0; + // Only for non-floating-point cases. The exponent part of the aforementioned + // multiplier. + int multiplier_exponent = 0; + // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_fixedpoint. + const AccumScalar* multiplier_fixedpoint_perchannel = nullptr; + // Per-channel variant of multiplier_exponent. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_exponent. + // + // Either none or both of multiplier_exponent_perchannel and + // multiplier_fixedpoint_perchannel must be nullptr. + const int* multiplier_exponent_perchannel = nullptr; + // min clamp bound of destination values. + DstScalar clamp_min = std::is_floating_point<DstScalar>::value + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); + // max clamp bound of destination values. + DstScalar clamp_max = std::is_floating_point<DstScalar>::value + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); + // See above enum LoopStructure + static constexpr LoopStructure kLoopStructure = LoopStructure::kAuto; + // See above enum LayoutSupport + static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kGeneral; + // See above enum ZeroPointSupport + static constexpr ZeroPointSupport kZeroPointSupport = + ZeroPointSupport::kGeneral; + // Testing-only, not meant to be used by actual users: + // Used for testing of various kernel layouts. + using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>; + using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>; + // Returns (a reasonable estimate of) the local CPU cache size. + // See ruy::LocalDataCacheSize() which returns some coarse, sane default for + // each CPU architecture. + // This may be overridden, either to provide more accurate/runtime values, + // or to test with other values to let testcases have more coverage. + static int local_data_cache_size() { return LocalDataCacheSize(); } + // Same as local_data_cache_size but for the total data cache size accessible + // to each CPU core. See ruy::SharedDataCacheSize(). + static int shared_data_cache_size() { return SharedDataCacheSize(); } +}; + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_ |