/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/ctx.h"

#include <cstdlib>
#include <functional>
#include <string>

#include "ruy/check_macros.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/performance_advisory.h"
#include "ruy/platform.h"
#include "ruy/strategy_controls.h"
#include "ruy/prepacked_cache.h"
#include "ruy/trace.h"

namespace ruy {

const CtxImpl& Ctx::impl() const { return static_cast<const CtxImpl&>(*this); }
CtxImpl* Ctx::mutable_impl() { return static_cast<CtxImpl*>(this); }

Path Ctx::last_used_path() const { return impl().last_used_path_; }
Tuning Ctx::explicit_tuning() const { return impl().explicit_tuning_; }
void Ctx::set_explicit_tuning(Tuning value) {
  mutable_impl()->explicit_tuning_ = value;
}
const ThreadPool& Ctx::thread_pool() const { return impl().thread_pool_; }
ThreadPool* Ctx::mutable_thread_pool() { return &mutable_impl()->thread_pool_; }
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
  mutable_impl()->max_num_threads_ = value;
}
void Ctx::clear_performance_advisories() {
  mutable_impl()->performance_advisory_ = PerformanceAdvisory::kNone;
}
void Ctx::set_performance_advisory(PerformanceAdvisory advisory) {
  mutable_impl()->performance_advisory_ =
      mutable_impl()->performance_advisory_ | advisory;
}
bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
  return (impl().performance_advisory_ & advisory) !=
         PerformanceAdvisory::kNone;
}
void Ctx::set_num_threads_strategy(NumThreadsStrategy strategy) {
  mutable_impl()->num_threads_strategy_ = strategy;
}
NumThreadsStrategy Ctx::num_threads_strategy() const {
  return impl().num_threads_strategy_;
}

void Ctx::SetRuntimeEnabledPaths(Path paths) {
  if (paths == Path::kNone) {
    // Revert to default behavior using runtime detection.
    mutable_impl()->runtime_enabled_paths_ = Path::kNone;
  } else {
    // Explicitly set enabled paths. Ensure that non-arch paths are always
    // enabled (they are needed as fallbacks).
    mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
  }
}
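// Illustrative sketch: given some `Ctx& ctx`, a call like
//   ctx.SetRuntimeEnabledPaths(Path::kNeonDotprod);
// leaves runtime_enabled_paths_ equal to Path::kNeonDotprod | kNonArchPaths,
// so the non-arch fallback paths stay available even when only one SIMD path
// was requested.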

CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }

namespace {

int GetHexIntEnvVarOrZero(const char* name) {
  const char* val = getenv(name);
  if (!val) {
    return 0;
  }
  return std::stoi(val, nullptr, 16);
}
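// Illustrative example: with RUY_PATHS=30 set in the environment,
// GetHexIntEnvVarOrZero("RUY_PATHS") parses the string as hexadecimal and
// returns 0x30; an unset variable yields 0. Note that std::stoi throws on
// input that isn't a valid number, so the variable is expected to hold a
// well-formed hex value.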

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in `paths_to_detect` are left unset
// in the return value, except for the always-enabled non-arch paths.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPathsIncludingInternalVariants are always implicitly
  // supported. Further logic below may add more bits to `result`.
  Path result = kNonArchPathsIncludingInternalVariants;

  // Conditionally sets the `path` bit in `result`, if reported as supported
  // by the `is_supported` predicate.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

#if RUY_PLATFORM_ARM
  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for NEON to be unavailable, but we've
  // always chosen to just crash on such devices. We could reevaluate that;
  // however, to actually support non-NEON devices we would also need to
  // address compiler-generated NEON code. That would mean removing
  // -mfpu=neon from ruy_copts, using this flag only in select NEON translation
  // units, and implementing have_built_path_for_neon, similar to the x86 SIMD
  // paths.
  maybe_add(Path::kNeon, []() { return true; });

  // NEON dotprod requires runtime detection; however, unlike the x86 SIMD
  // paths, it does not require have_built_path_for because we unconditionally
  // build it at the moment. That is largely because we have had to
  // machine-encode dotprod instructions, so we don't actually rely on
  // toolchain support for them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
  // x86 SIMD paths currently require both runtime detection and detection of
  // whether the path was built at all.
  maybe_add(Path::kAvx,
            [=]() { return HaveBuiltPathForAvx() && cpuinfo->Avx(); });
  maybe_add(Path::kAvx2Fma,
            [=]() { return HaveBuiltPathForAvx2Fma() && cpuinfo->Avx2Fma(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
#else
  (void)maybe_add;
  (void)cpuinfo;
#endif

  // Sanity checks: every non-arch path must be present in the result, and the
  // result must not contain any path that was neither requested for detection
  // nor an always-enabled non-arch path.
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  RUY_DCHECK_EQ(
      result & ~(kNonArchPathsIncludingInternalVariants | paths_to_detect),
      Path::kNone);
  return result;
}

}  // namespace

Path Ctx::GetRuntimeEnabledPaths() {
  RUY_TRACE_SCOPE;
  // Just a shorthand alias. Using a pointer to make it clear we're mutating
  // this value in-place.
  Path* paths = &mutable_impl()->runtime_enabled_paths_;

  // The value Path::kNone indicates the initial state before detection has been
  // performed.
  if (*paths != Path::kNone) {
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
    return *paths;
  }
  // User may have set path explicitly in env var.
  Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
  if (paths_bitfield != Path::kNone) {
    *paths = paths_bitfield;
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
    return *paths;
  }
  // Finally, use runtime detection.
  *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
  return *paths;
}
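// Summary of the resolution order above: (1) a value previously set via
// SetRuntimeEnabledPaths or cached by an earlier call, (2) the RUY_PATHS
// environment variable interpreted as a hexadecimal Path bitmask, (3) runtime
// detection over kAllPaths. Whichever source wins is stored in
// runtime_enabled_paths_, so detection runs at most once per Ctx.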

Path Ctx::SelectPath(Path compiled_paths) {
  return mutable_impl()->last_used_path_ =
             GetMostSignificantPath(compiled_paths & GetRuntimeEnabledPaths());
}
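// Illustrative sketch: if ruy was compiled with
// Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod but runtime detection
// only enabled plain NEON, then SelectPath(compiled_paths) intersects the two
// sets and returns their most significant common bit, Path::kNeon, which is
// also recorded as last_used_path_.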

void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& resources = mutable_impl()->thread_specific_resources_;
  while (thread_count > static_cast<int>(resources.size())) {
    resources.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(resources.size()));
}

TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->tuning_resolver;
}

Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->allocator;
}

Allocator* Ctx::GetMainAllocator() {
  if (!impl().main_allocator_) {
    mutable_impl()->main_allocator_.reset(new Allocator);
  }
  return impl().main_allocator_.get();
}

PrepackedCache* Ctx::GetPrepackedCache() {
  if (!impl().prepacked_cache_) {
    mutable_impl()->prepacked_cache_.reset(new PrepackedCache);
  }
  return impl().prepacked_cache_.get();
}

Tuning Ctx::GetMainThreadTuning() {
  EnsureThreadSpecificResources(1);
  TuningResolver* tuning_resolver = GetThreadSpecificTuningResolver(0);
  tuning_resolver->SetTuning(explicit_tuning());
  return tuning_resolver->Resolve(mutable_cpuinfo());
}

void Ctx::ClearPrepackedCache() { mutable_impl()->prepacked_cache_ = nullptr; }

}  // namespace ruy