Welcome to mirror list, hosted at ThFree Co, Russian Federation.

cpu_cache_size.h « ruy - github.com/google/ruy.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 82f41cc22c5894c46c0daf4a225cdb89e4ac5c0d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CPU_CACHE_SIZE_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CPU_CACHE_SIZE_H_

#include "ruy/path.h"
#include "ruy/platform.h"

namespace ruy {

// LocalDataCacheSize returns a reasonable default size for the largest data
// cache that is private to a single CPU core (i.e. not shared with other
// cores). Code tuned against this value can aim for most of its memory
// traffic to stay within such a typically-fast cache.
//
// SharedDataCacheSize returns a reasonable default for the total data cache
// reachable from one CPU core, including any cache levels shared across cores.
//
// Example: when tuning for an ARM Cortex-A55 with a 32KB private L1, a 128KB
// private L2, and a 1MB shared L3, LocalDataCacheSize should report 128KB and
// SharedDataCacheSize should report 1MB.
//
// Ideally these numbers would be discovered at runtime; that is feasible on
// x86 but hard on ARM, so compile-time per-architecture defaults are used.
#if RUY_PLATFORM(ARM_64)
inline int LocalDataCacheSize() { return 32 * 1024; }
inline int SharedDataCacheSize() { return 512 * 1024; }
#elif RUY_PLATFORM(ARM_32)
inline int LocalDataCacheSize() { return 16 * 1024; }
inline int SharedDataCacheSize() { return 256 * 1024; }
#elif RUY_PLATFORM(X86)
inline int LocalDataCacheSize() { return 128 * 1024; }
inline int SharedDataCacheSize() { return 2 * 1024 * 1024; }
#else
inline int LocalDataCacheSize() { return 16 * 1024; }
inline int SharedDataCacheSize() { return 256 * 1024; }
#endif
// Overload taking a Path. The Path serves as a hint about how recent/powerful
// the targeted CPU generation is, allowing a more specific cache-size guess.
inline int LocalDataCacheSize(Path path) {
#if RUY_PLATFORM(ARM_64)
  // Dotprod support currently implies at least Cortex-A55-class hardware,
  // whose smallest per-core cache configuration has a 128KB local L2.
  if (path == Path::kNeonDotprod) return 128 * 1024;
#else
  // The hint is only meaningful on ARM64; silence unused-parameter warnings.
  (void)path;
#endif
  return LocalDataCacheSize();
}
// Overload taking a Path hint; see LocalDataCacheSize(Path) above.
inline int SharedDataCacheSize(Path path) {
#if RUY_PLATFORM(ARM_64)
  // Dotprod support currently implies at least Cortex-A55-class hardware,
  // whose smallest cache configuration has a 1MB shared L3.
  if (path == Path::kNeonDotprod) return 1024 * 1024;
#else
  // The hint is only meaningful on ARM64; silence unused-parameter warnings.
  (void)path;
#endif
  return SharedDataCacheSize();
}

}  // namespace ruy

#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_CPU_CACHE_SIZE_H_