/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The 'middle-end' in ruy. See TrMul function comment.

#include "ruy/trmul.h"

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

#include "ruy/allocator.h"
#include "ruy/block_map.h"
#include "ruy/check_macros.h"
#include "ruy/cpu_cache_params.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx.h"
#include "ruy/denormal.h"
#include "ruy/mat.h"
#include "ruy/matrix.h"
#include "ruy/mul_params.h"
#include "ruy/opt_set.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/side_pair.h"
#include "ruy/size_util.h"
#include "ruy/thread_pool.h"
#include "ruy/trace.h"
#include "ruy/tune.h"

namespace ruy {

namespace {

// Enum to track the packing status of a block of the LHS or RHS matrix.
enum class PackingStatus : std::uint8_t {
  kNotStarted,  // No thread has started packing this block yet.
  kInProgress,  // Some thread is currently packing this block.
  kFinished     // This block has already been packed.
};

// TrMulTask is the task that a ruy thread runs to perform the TrMul operation.
class TrMulTask final : public Task {
 public:
  TrMulTask(TrMulParams* params, const BlockMap& block_map,
            std::atomic<int>* atomic_block_id, int thread_id, bool need_atomics,
            SidePair<std::atomic<PackingStatus>*> packing_status,
            TuningResolver* tuning_resolver, Allocator* local_allocator,
            CpuInfo* cpuinfo)
      : params_(params),
        block_map_(block_map),
        atomic_block_id_(atomic_block_id),
        thread_id_(thread_id),
        need_atomics_(need_atomics),
        packing_status_(packing_status),
        tuning_resolver_(tuning_resolver),
        local_allocator_(local_allocator),
        local_already_packed_{nullptr, nullptr},
        cpuinfo_(cpuinfo) {}

  // Thread main function. This is one thread's share of the TrMul work.
  void Run() override {
    RUY_TRACE_SCOPE_NAME("TrMulTask::Run");
    RUY_TRACE_SET_THEAD_ID(thread_id_);
    // Allocate and initialize `local_packed`.
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params_->is_prepacked[side]) {
        const int size = NumBlocksPerSide(side, block_map_);
        local_allocator_->Allocate(size, &local_already_packed_[side]);
        memset(local_already_packed_[side], 0, size * sizeof(bool));
      }
    }

    const Tuning tuning = tuning_resolver_->Resolve(cpuinfo_);
    const int num_blocks = NumBlocks(block_map_);

    // Each thread initially reserves the block whose id equals its own
    // thread id.
    int block_id = thread_id_;
    // Loop until all blocks have been computed.
    while (block_id < num_blocks) {
      RUY_TRACE_SCOPE_NAME("Main loop iteration");
      // Reserve the next block to handle, hiding the latency of this atomic op.
      const int next_block_id =
          atomic_block_id_->fetch_add(1, std::memory_order_relaxed);
      // Get coordinates of the current block to handle, in "block space".
      SidePair<int> block;
      GetBlockByIndex(block_map_, block_id, &block);
      // Get coordinates of the current block to handle, in matrix space.
      SidePair<int> start, end;
      GetBlockMatrixCoords(block_map_, block, &start, &end);
      RUY_TRACE_INFO(TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS);
      // Maybe pack the current LHS/RHS block, if not already packed.
      EnsurePacked(block, start, end, tuning);
      // Actually do matrix multiplication work
      params_->RunKernel(tuning, start, end);
      // Move on to the next block as obtained by the atomic increment
      // at the start of this while loop iteration.
      block_id = next_block_id;
    }

    local_allocator_->FreeAll();
  }

 private:
  // Tries to pack a block, without blocking.
  // If the block was already packed, returns true.
  // If packing of the block had not started yet, packs it and returns true.
  // If the block is currently being packed by another thread, returns false.
  bool TryPack(Side side, int block, int start, int end, Tuning tuning) {
    if (params_->is_prepacked[side]) {
      return true;
    }
    if (!local_already_packed_[side][block]) {
      if (need_atomics_) {
        // Explanation of this compare_exchange_strong operation:
        // This atomically performs all of the following:
        // 1. Read `status` with "acquire" memory order.
        //    * That this read uses "acquire" is because both memory orders
        //      specified have "acquire" as their read-component.
        // 2. Compare (bitwise) with `exchanged_status`.
        // 3. If equal, stores the value kInProgress to `status` with "release"
        //    memory order, and returns true, so we take this 'if' branch.
        //    * That this store uses "release" is because of the _rel part in
        //      memory_order_acq_rel passed as the first memory order argument.
        // 4. If not equal, stores the loaded value of `status` to
        //    `exchanged_status` with "relaxed" semantics, and returns false,
        //    so we take the 'else' branch.
        //    * That this store uses "relaxed" is because the second memory
        //      order argument, memory_order_acquire, implies no particular
        //      store semantics. "relaxed" is acceptable here because this
        //      stores to a local stack variable.
        //
        // Rationale for compare_exchange_strong as opposed to
        // compare_exchange_weak:
        // The spurious-failure case with compare_exchange_weak will actually
        // happen a lot here, because the atomic 'status' bytes are stored
        // contiguously in arrays and neighboring values will be accessed
        // by multiple threads concurrently. On a typical ARM CPU, an exclusives
        // reservation granule is 64 bytes, so a lot of false-sharing may
        // happen. Using compare_exchange_weak would thus result in often having
        // TryPack return 'false' when it could instead have done the packing
        // work and returned 'true'. Heuristically, that is not a good thing.
        // Moreover, this changes the TryPack contract, loosening it and making
        // it harder for the caller to reason about. Finally, the overhead of
        // atomic operations is mitigated by the enclosing check on
        // local_already_packed, so maybe the overhead of
        // compare_exchange_strong isn't such a problem. But we don't really
        // know for sure; that would be interesting to experiment more with.
        PackingStatus exchanged_status = PackingStatus::kNotStarted;
        std::atomic<PackingStatus>& status = packing_status_[side][block];
        if (status.compare_exchange_strong(
                exchanged_status, PackingStatus::kInProgress,
                std::memory_order_acq_rel, std::memory_order_acquire)) {
          // In this branch, the status was kNotStarted and we just atomically
          // changed it to kInProgress as we are about to handle the packing
          // ourselves.
          RUY_TRACE_INFO(TRYPACK_PACKING);
          params_->RunPack(side, tuning, start, end);
          status.store(PackingStatus::kFinished, std::memory_order_release);
        } else if (exchanged_status == PackingStatus::kInProgress) {
          // Another thread is currently packing this block.
          RUY_TRACE_INFO(TRYPACK_ANOTHER_THREAD_PACKING);
          return false;
        } else {
          RUY_TRACE_INFO(TRYPACK_PACKED_BY_ANOTHER_THREAD);
        }
        RUY_DCHECK(status.load(std::memory_order_acquire) ==
                   PackingStatus::kFinished);
      } else {
        // Single-threaded case: no need for expensive atomics;
        // local_already_packed_ is already the source of truth.
        params_->RunPack(side, tuning, start, end);
      }
      local_already_packed_[side][block] = true;
    } else {
      RUY_TRACE_INFO(TRYPACK_PREVIOUSLY_PACKED);
    }
    return true;
  }

  // Ensures that both the LHS and RHS blocks required by the specified block
  // are packed. In the event that they are already being packed by another
  // thread, this function may perform the packing of some other block while
  // waiting for that other thread to finish packing the requested block.
  void EnsurePacked(const SidePair<int>& block, const SidePair<int>& start,
                    const SidePair<int>& end, Tuning tuning) {
#if RUY_OPT(PACK_AHEAD)
    SidePair<int> next_runahead_block{block[Side::kLhs] + 1,
                                      block[Side::kRhs] + 1};
    Side next_runahead_side = Side::kLhs;
#endif
    while (true) {
      bool both_sides_packed = true;
      for (Side side : {Side::kLhs, Side::kRhs}) {
        both_sides_packed &=
            TryPack(side, block[side], start[side], end[side], tuning);
      }
      if (both_sides_packed) {
        break;
      }
#if RUY_OPT(PACK_AHEAD)
      RUY_TRACE_INFO(ENSURE_PACKED_ENTER_RUN_AHEAD);
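      // One side is still being packed by another thread. Rather than spin,
      // opportunistically pack a later block, alternating between the LHS and
      // RHS sides on successive iterations of this loop.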
      const Side runahead_side = next_runahead_side;
      const int runahead_block = next_runahead_block[runahead_side];
      next_runahead_side = OtherSide(next_runahead_side);
      if (runahead_block >= NumBlocksPerSide(runahead_side, block_map_)) {
        continue;
      }
      int runahead_block_start, runahead_block_end;
      GetBlockMatrixCoords(runahead_side, block_map_, runahead_block,
                           &runahead_block_start, &runahead_block_end);
      TryPack(runahead_side, runahead_block, runahead_block_start,
              runahead_block_end, tuning);
      next_runahead_block[runahead_side] = runahead_block + 1;
#endif
    }
    RUY_TRACE_INFO(ENSURE_PACKED_END);
  }

  TrMulParams* params_;
  const BlockMap& block_map_;
  std::atomic<int>* atomic_block_id_;
  int thread_id_;
  bool need_atomics_;
  SidePair<std::atomic<PackingStatus>*> packing_status_;
  TuningResolver* tuning_resolver_;
  Allocator* local_allocator_;

  // Local indicators of packedness to avoid the overhead of atomic ops.
  SidePair<bool*> local_already_packed_;

  CpuInfo* cpuinfo_;
};

int GetTentativeThreadCount(Ctx* ctx, int rows, int cols, int depth) {
#if RUY_PLATFORM_EMSCRIPTEN
  // b/139927184, std::thread constructor raises exception
  return 1;
#endif
  RUY_TRACE_SCOPE;
  // Empirically determined rule for reasonable number of
  // threads to use. This is proportional to the number of arithmetic ops
  // in this Mul (product of the 3 sizes).
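  // For example, with rows = cols = depth = 256, the product is
  // 256^3 = 16777216, so the tentative count is 16777216 / kDivisor = 512
  // before clamping to [1, ctx->max_num_threads()].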
  // Cast before multiplying to avoid 32-bit integer overflow for large shapes.
  const std::int64_t total_number_of_elements =
      static_cast<std::int64_t>(rows) * cols * depth;
  static constexpr int64_t kDivisor = 32768;
  int tentative_thread_count = std::max(
      1, std::min(static_cast<int>(total_number_of_elements / kDivisor),
                  ctx->max_num_threads()));
  RUY_TRACE_INFO(GET_TENTATIVE_THREAD_COUNT);
  return tentative_thread_count;
}

bool GetUseSimpleLoop(int tentative_thread_count, int rows, int cols, int depth,
                      int lhs_scalar_size, int rhs_scalar_size,
                      const CpuCacheParams& cpu_cache_params) {
  RUY_TRACE_SCOPE;
  if (tentative_thread_count == 1) {
    if (IsObviouslyLinearTraversal(rows, cols, depth, lhs_scalar_size,
                                   rhs_scalar_size, cpu_cache_params)) {
      RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_TRUE);
      return true;
    }
  }
  RUY_TRACE_INFO(GET_USE_SIMPLE_LOOP_RETURNS_FALSE);
  return false;
}

}  // namespace

// TrMul is the ruy middle-end. It contains the high-level logic to perform
// a ruy::Mul's work, down to calls to back-end Kernel and Pack functions.
// This includes determining how many threads to use, computing the BlockMap,
// executing tasks on a thread pool. The TrMul function itself runs on the main
// thread; the code that potentially runs on worker threads is in
// TrMulTask::Run().
void TrMul(Ctx* ctx, TrMulParams* params) {
  RUY_TRACE_SCOPE;
  profiler::ScopeLabel label(
      "TrMul (Path=0x%x, max_num_threads=%d, is_prepacked=(%d,%d))",
      static_cast<int>(params->path), ctx->max_num_threads(),
      params->is_prepacked[Side::kLhs], params->is_prepacked[Side::kRhs]);

  PEMat& packed_lhs = params->packed_matrix[Side::kLhs];
  PEMat& packed_rhs = params->packed_matrix[Side::kRhs];
  EMat& lhs = params->src[Side::kLhs];
  EMat& rhs = params->src[Side::kRhs];

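  // TrMul computes Transpose(LHS) * RHS, so the LHS is stored with the depth
  // dimension as its rows: the destination row count is lhs.layout.cols and
  // the accumulation depth is lhs.layout.rows.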
  const int rows = lhs.layout.cols;
  const int cols = rhs.layout.cols;
  const int depth = lhs.layout.rows;

  const int tentative_thread_count =
      GetTentativeThreadCount(ctx, rows, cols, depth);
  const auto& cpu_cache_params = ctx->mutable_cpuinfo()->CacheParams();

  // Suppress denormals to avoid computation inefficiency.
  // Note this only handles the denormal suppression on the main thread. As for
  // worker threads, the suppression is handled in each thread's main loop. See
  // the corresponding code in thread_pool.cc for details.
  ScopedSuppressDenormals suppress_denormals;

  // Case of running this TrMul as a simple loop.
  // This is a good place to start reading this function: all the rest
  // of this function is just an optimized, but functionally equivalent,
  // version of that.
  if (GetUseSimpleLoop(tentative_thread_count, rows, cols, depth,
                       lhs.data_type.size, rhs.data_type.size,
                       cpu_cache_params)) {
    profiler::ScopeLabel label_simple("TrMulImpl, simple loop");
    Tuning tuning = ctx->GetMainThreadTuning();
    RUY_TRACE_INFO(TRMUL_SIMPLE_LOOP);

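    // Pack each whole (padded) side once, then run the kernel over the entire
    // rounded destination in a single call.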
    const SidePair<int> origin{0, 0};
    const SidePair<int> rounded_dims{packed_lhs.layout.cols,
                                     packed_rhs.layout.cols};
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params->is_prepacked[side]) {
        params->RunPack(side, tuning, origin[side], rounded_dims[side]);
      }
    }
    params->RunKernel(tuning, origin, rounded_dims);
    return;
  }

  profiler::ScopeLabel label_general("TrMulImpl, general case");
  RUY_TRACE_INFO(TRMUL_GENERAL_CASE);
  Allocator* main_allocator = ctx->GetMainAllocator();

  // Initialize block map.
  BlockMap block_map;
  MakeBlockMap(packed_lhs.layout.cols, packed_rhs.layout.cols, depth,
               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
               packed_lhs.data_type.size, packed_rhs.data_type.size,
               tentative_thread_count, cpu_cache_params, &block_map);

  // Initialize per-thread state.
  const int thread_count = block_map.thread_count;
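  // With a single thread there is no concurrent packing, so TryPack can rely
  // on its per-thread bookkeeping alone and skip the atomics.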
  const bool need_atomics = thread_count > 1;
  ctx->EnsureThreadSpecificResources(thread_count);
  for (int i = 0; i < thread_count; i++) {
    ctx->GetThreadSpecificTuningResolver(i)->SetTuning(ctx->explicit_tuning());
  }

  // In the need_atomics case, allocate and initialize atomic values tracking
  // the packing status of blocks.
  SidePair<std::atomic<PackingStatus>*> packing_status{nullptr, nullptr};
  if (need_atomics) {
    for (Side side : {Side::kLhs, Side::kRhs}) {
      if (!params->is_prepacked[side]) {
        const int size = NumBlocksPerSide(side, block_map);
        main_allocator->Allocate(size, &packing_status[side]);
        for (int i = 0; i < size; i++) {
          packing_status[side][i].store(PackingStatus::kNotStarted,
                                        std::memory_order_relaxed);
        }
      }
    }
  }

  // Create the atomic block id. Allocating it with the Allocator gives it the
  // alignment that ensures it sits alone in its exclusives reservation
  // granule.
  std::atomic<int>* atomic_block_id;
  main_allocator->Allocate(1, &atomic_block_id);
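  // Start the counter at thread_count: each task begins by taking the block
  // whose id equals its own thread id, so ids below thread_count are already
  // spoken for.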
  atomic_block_id->store(thread_count);

  // Create task objects. We allocate a single buffer and then use placement-new
  // to construct N TrMulTask objects within it. To avoid having the Clang CFI
  // sanitizer complain about a TrMulTask* pointer temporarily pointing to
  // garbage, we keep the pointer a plain char* until finished constructing.
  char* tasks_buf =
      main_allocator->Allocate<char>(thread_count * sizeof(TrMulTask));
  for (int i = 0; i < thread_count; i++) {
    auto* allocator = ctx->GetThreadSpecificAllocator(i);
    auto* tuning_resolver = ctx->GetThreadSpecificTuningResolver(i);
    new (tasks_buf + i * sizeof(TrMulTask)) TrMulTask(
        params, block_map, atomic_block_id, i, need_atomics, packing_status,
        tuning_resolver, allocator, ctx->mutable_cpuinfo());
  }
  TrMulTask* tasks = reinterpret_cast<TrMulTask*>(tasks_buf);

  // Do the computation.
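  // Execute() returns only once every task has finished running; the tasks
  // are destroyed immediately below.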
  ctx->mutable_thread_pool()->Execute(thread_count, tasks);

  // Finish up.
  for (int i = 0; i < thread_count; i++) {
    tasks[i].~TrMulTask();
  }
}

}  // namespace ruy