github.com/google/ruy.git

Diffstat (limited to 'ruy/spec.h')
-rw-r--r--  ruy/spec.h  118
1 file changed, 118 insertions, 0 deletions
diff --git a/ruy/spec.h b/ruy/spec.h
new file mode 100644
index 0000000..d96b6a9
--- /dev/null
+++ b/ruy/spec.h
@@ -0,0 +1,118 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_
+
+#include <limits>
+#include <type_traits>
+
+#include "ruy/cpu_cache_size.h"
+#include "ruy/matrix.h"
+
+namespace ruy {
+
+// Our 'general' loop structure (the default) involves multi-threading and
+// complicated loops aiming to optimize cache-friendliness. One may opt out of
+// this and pick the 'simple' loop structure instead, which only performs well
+// for small matrix sizes and only allows using one thread, in exchange for
+// smaller code size.
+enum class LoopStructure { kGeneral, kSimple, kAuto };
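+//
+// For illustration only (not itself part of this header): ruy reads this
+// choice from the Spec type's kLoopStructure member, so a hypothetical client
+// that only multiplies small matrices could opt into the simple loops by
+// shadowing that member in its own spec type, e.g.
+//
+//   struct MySmallMatrixSpec : BasicSpec<float, float> {
+//     static constexpr LoopStructure kLoopStructure = LoopStructure::kSimple;
+//   };
+//
+// (MySmallMatrixSpec is a made-up name; BasicSpec is defined further down in
+// this file.)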
+
+// In general we allow zero_points to have any Scalar value. This is called
+// 'asymmetric' quantization. We do take advantage of the optimization
+// opportunities that arise when zero_points happen, at runtime, to be
+// 'symmetric' (e.g. the int8 value 0 or the uint8 value 128), but we still
+// generate code to handle the general asymmetric case. By choosing kSymmetric
+// here, one opts out of this and supports only the symmetric case, in
+// exchange for smaller code size.
+enum class ZeroPointSupport { kGeneral, kSymmetric };
+
+// In general we allow all Layouts, even if we may use slow paths for some
+// kinds of layouts. By choosing kRCC, one may opt out of this and keep
+// support only for the simplest and most efficient combination of Layouts,
+// in exchange for smaller code size. The case covered by kRCC is where the
+// storage orders are exactly the following:
+// - LHS is RowMajor
+// - RHS is ColMajor
+// - Destination is ColMajor
+enum class LayoutSupport { kGeneral, kRCC };
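+//
+// For illustration only: the same shadowing pattern applies to the two enums
+// above. A hypothetical spec type that only handles symmetric-zero_point,
+// RCC-layout multiplications could declare, in a struct derived from
+// BasicSpec (defined below):
+//
+//   static constexpr ZeroPointSupport kZeroPointSupport =
+//       ZeroPointSupport::kSymmetric;
+//   static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;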
+
+// A Spec describes everything about a matrix multiplication operation that
+// isn't encoded in the LHS, RHS and destination matrices. Some of that
+// information is encoded as compile-time constants and types (for instance,
+// the choice of accumulator type, AccumScalar). Some of that information is
+// encoded as runtime values (for instance, the optional bias vector). An
+// illustrative usage sketch follows this struct definition.
+template <typename tAccumScalar, typename tDstScalar>
+struct BasicSpec {
+ // Accumulator type. The type of accumulators used to compute the
+ // dot-products before they are ultimately cast to the destination type.
+ using AccumScalar = tAccumScalar;
+ // The destination scalar type.
+ using DstScalar = tDstScalar;
+ // The bias vector data, if not null.
+ const AccumScalar* bias = nullptr;
+ // Only for non-floating-point cases. The fixed-point part (i.e. the
+ // mantissa) of the multiplier by which accumulators are multiplied before
+ // being cast to the destination type.
+ AccumScalar multiplier_fixedpoint = 0;
+ // Only for non-floating-point cases. The exponent part of the aforementioned
+ // multiplier.
+ int multiplier_exponent = 0;
+ // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must
+ // point to a buffer of as many values as there are rows in the destination
+ // matrix. Each row of the destination matrix will use the corresponding
+ // buffer element instead of multiplier_fixedpoint.
+ const AccumScalar* multiplier_fixedpoint_perchannel = nullptr;
+ // Per-channel variant of multiplier_exponent. If not nullptr, this must
+ // point to a buffer of as many values as there are rows in the destination
+ // matrix. Each row of the destination matrix will use the corresponding
+ // buffer element instead of multiplier_exponent.
+ //
+ // Either none or both of multiplier_exponent_perchannel and
+ // multiplier_fixedpoint_perchannel must be nullptr.
+ const int* multiplier_exponent_perchannel = nullptr;
+ // min clamp bound of destination values.
+ DstScalar clamp_min = std::is_floating_point<DstScalar>::value
+ ? -std::numeric_limits<DstScalar>::infinity()
+ : std::numeric_limits<DstScalar>::lowest();
+ // max clamp bound of destination values.
+ DstScalar clamp_max = std::is_floating_point<DstScalar>::value
+ ? std::numeric_limits<DstScalar>::infinity()
+ : std::numeric_limits<DstScalar>::max();
+ // See the LoopStructure enum above.
+ static constexpr LoopStructure kLoopStructure = LoopStructure::kAuto;
+ // See the LayoutSupport enum above.
+ static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kGeneral;
+ // See the ZeroPointSupport enum above.
+ static constexpr ZeroPointSupport kZeroPointSupport =
+ ZeroPointSupport::kGeneral;
+ // Testing-only, not meant to be used by actual users:
+ // Used for testing of various kernel layouts.
+ using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+ using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+ // Returns (a reasonable estimate of) the local CPU cache size.
+ // See ruy::LocalDataCacheSize(), which returns a coarse, sane default for
+ // each CPU architecture.
+ // This may be overridden, either to provide more accurate/runtime values,
+ // or to test with other values to give test cases more coverage.
+ static int local_data_cache_size() { return LocalDataCacheSize(); }
+ // Same as local_data_cache_size but for the total data cache size accessible
+ // to each CPU core. See ruy::SharedDataCacheSize().
+ static int shared_data_cache_size() { return SharedDataCacheSize(); }
+};
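+
+// Illustrative sketch only, not part of ruy's API: the helper below shows how
+// a caller might fill out a BasicSpec for a quantized multiplication whose
+// integer accumulators are rescaled by a fixed-point multiplier before being
+// cast to the destination type. The function name and parameters are made up
+// for this example; real values would come from the client's quantization
+// parameters.
+template <typename AccumScalar, typename DstScalar>
+BasicSpec<AccumScalar, DstScalar> ExampleQuantizedSpecSketch(
+    const AccumScalar* bias_data, AccumScalar multiplier_fixedpoint,
+    int multiplier_exponent) {
+  BasicSpec<AccumScalar, DstScalar> spec;
+  // Optional bias vector, with one entry per destination row.
+  spec.bias = bias_data;
+  // Mantissa and exponent of the output rescaling multiplier.
+  spec.multiplier_fixedpoint = multiplier_fixedpoint;
+  spec.multiplier_exponent = multiplier_exponent;
+  // clamp_min / clamp_max keep their defaults, i.e. the full range of
+  // DstScalar. The resulting spec would then be passed to ruy's Mul entry
+  // point together with the LHS, RHS and destination matrices, which are
+  // declared in other ruy headers.
+  return spec;
+}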
+
+} // namespace ruy
+
+#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_SPEC_H_