diff options
author | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-11-10 13:40:19 +0300 |
---|---|---|
committer | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-11-17 17:46:09 +0300 |
commit | 79e68324a8c6449531404164f3210bdfd271a68b (patch) | |
tree | 663f6b16f9df07dafea0071c8a300b745b9c716e | |
parent | e66f6ec5ad5b47cb43e69a4690c3e9e1a9a49b4e (diff) |
limiter: Ignore `total_inactive_file` instead of `inactive_file`
In https://gitlab.com/gitlab-org/gitaly/-/merge_requests/6508, we
implemented a fix to ignore highly evictable Page Caches from cgroup
memory threshold. The used metric was parent cgroup's `inactive_file`.
Unfortunately, that metric reflects the inactive Page Caches of direct
processes inside the parent cgroup. It doesn't account for the indirect
processes in children repository cgroups. We don't spawn any process in
the parent cgroup. So, the fix was useless.
In Cgroup V1, `memory.stat` has `total_inactive_file`. It's exactly what
we are looking for. In Cgroup V2, `inactive_file` includes all of its
substree consumptions. So, we can keep using that field.
-rw-r--r-- | internal/cgroups/cgroups.go | 20 | ||||
-rw-r--r-- | internal/cgroups/handler_linux_test.go | 30 | ||||
-rw-r--r-- | internal/cgroups/v1_linux.go | 12 | ||||
-rw-r--r-- | internal/cgroups/v2_linux.go | 16 | ||||
-rw-r--r-- | internal/limiter/watchers/cgroup_memory_watcher.go | 4 | ||||
-rw-r--r-- | internal/limiter/watchers/cgroup_memory_watcher_test.go | 18 |
6 files changed, 67 insertions, 33 deletions
diff --git a/internal/cgroups/cgroups.go b/internal/cgroups/cgroups.go index 8cf8df116..f8bca0609 100644 --- a/internal/cgroups/cgroups.go +++ b/internal/cgroups/cgroups.go @@ -34,18 +34,18 @@ type CgroupStats struct { // UnderOOM is the current OOM status of a cgroup. This information is available for Cgroup V1 only. It's read // from the `memory.oom_control` file. UnderOOM bool - // Anon reflects the `rss` (V1) or `anon` (V2) of `memory.stat` file. That's amount of memory used in anonymous mappings - Anon uint64 + // TotalAnon reflects the `rss` (V1) or `anon` (V2) of `memory.stat` file. That's amount of memory used in anonymous mappings + TotalAnon uint64 // ActiveAnon reflects the `active_file` of `memory.stat` file, anonymous and swap cache memory on active LRU list. - ActiveAnon uint64 + TotalActiveAnon uint64 // InactiveAnon reflects the `inactive_anon` of `memory.stat` file, anonymous and swap cache memory on inactive LRU list. - InactiveAnon uint64 - // File reflects the `cache` (V1) or `file` (V2) of `memory.stat` file, bytes of page cache memory. - File uint64 - // ActiveFile reflects the `active_file` of `memory.stat` file, bytes of file-backed memory that cannot be reclaimed. - ActiveFile uint64 - // InactiveFile reflects the `inactive_file` of `memory.stat` file, bytes of file-backed memory on inactive LRU list. - InactiveFile uint64 + TotalInactiveAnon uint64 + // TotalFile reflects the `cache` (V1) or `file` (V2) of `memory.stat` file, bytes of page cache memory. + TotalFile uint64 + // TotalActiveFile reflects the `active_file` of `memory.stat` file, bytes of file-backed memory that cannot be reclaimed. + TotalActiveFile uint64 + // TotalInactiveFile reflects the `inactive_file` of `memory.stat` file, bytes of file-backed memory on inactive LRU list. + TotalInactiveFile uint64 } // Stats stores statistics of all cgroups managed by a manager diff --git a/internal/cgroups/handler_linux_test.go b/internal/cgroups/handler_linux_test.go index 9556b393b..ac87939e6 100644 --- a/internal/cgroups/handler_linux_test.go +++ b/internal/cgroups/handler_linux_test.go @@ -735,6 +735,18 @@ oom_kill 3`}, {"cpu.stat", `nr_periods 10 nr_throttled 50 throttled_time 1000000`}, // 0.001 seconds + {"memory.stat", `cache 0 +rss 0 +inactive_anon 100 +active_anon 200 +inactive_file 300 +active_file 400 +total_cache 235000000 +total_rss 234000000 +total_inactive_anon 200000000 +total_active_anon 34000000 +total_inactive_file 100000000 +total_active_file 135000000`}, }, expectedStats: Stats{ ParentStats: CgroupStats{ @@ -744,6 +756,12 @@ throttled_time 1000000`}, // 0.001 seconds MemoryLimit: 2000000000, OOMKills: 3, UnderOOM: true, + TotalAnon: 234000000, + TotalActiveAnon: 34000000, + TotalInactiveAnon: 200000000, + TotalFile: 235000000, + TotalActiveFile: 135000000, + TotalInactiveFile: 100000000, }, }, }, @@ -773,6 +791,12 @@ oom_kill 5`}, {"cpu.stat", `nr_periods 10 nr_throttled 50 throttled_usec 1000000`}, // 0.001 seconds + {"memory.stat", `anon 234000000 +file 235000000 +inactive_anon 200000000 +active_anon 34000000 +inactive_file 100000000 +active_file 135000000`}, }, expectedStats: Stats{ ParentStats: CgroupStats{ @@ -781,6 +805,12 @@ throttled_usec 1000000`}, // 0.001 seconds MemoryUsage: 1234000000, MemoryLimit: 2000000000, OOMKills: 5, + TotalAnon: 234000000, + TotalActiveAnon: 34000000, + TotalInactiveAnon: 200000000, + TotalFile: 235000000, + TotalActiveFile: 135000000, + TotalInactiveFile: 100000000, }, }, }, diff --git a/internal/cgroups/v1_linux.go b/internal/cgroups/v1_linux.go index 53f6cb5a4..e56829c26 100644 --- a/internal/cgroups/v1_linux.go +++ b/internal/cgroups/v1_linux.go @@ -195,12 +195,12 @@ func (cvh *cgroupV1Handler) stats() (Stats, error) { MemoryLimit: metrics.Memory.Usage.Limit, OOMKills: metrics.MemoryOomControl.OomKill, UnderOOM: metrics.MemoryOomControl.UnderOom != 0, - Anon: metrics.Memory.RSS, - ActiveAnon: metrics.Memory.ActiveAnon, - InactiveAnon: metrics.Memory.InactiveAnon, - File: metrics.Memory.Cache, - ActiveFile: metrics.Memory.ActiveFile, - InactiveFile: metrics.Memory.InactiveFile, + TotalAnon: metrics.Memory.TotalRSS, + TotalActiveAnon: metrics.Memory.TotalActiveAnon, + TotalInactiveAnon: metrics.Memory.TotalInactiveAnon, + TotalFile: metrics.Memory.TotalCache, + TotalActiveFile: metrics.Memory.TotalActiveFile, + TotalInactiveFile: metrics.Memory.TotalInactiveFile, }, }, nil } diff --git a/internal/cgroups/v2_linux.go b/internal/cgroups/v2_linux.go index 396cf104f..479b2c5ce 100644 --- a/internal/cgroups/v2_linux.go +++ b/internal/cgroups/v2_linux.go @@ -189,14 +189,18 @@ func (cvh *cgroupV2Handler) stats() (Stats, error) { CPUThrottledDuration: float64(metrics.CPU.ThrottledUsec) / float64(time.Second), MemoryUsage: metrics.Memory.Usage, MemoryLimit: metrics.Memory.UsageLimit, - Anon: metrics.Memory.Anon, - ActiveAnon: metrics.Memory.ActiveAnon, - InactiveAnon: metrics.Memory.InactiveAnon, - File: metrics.Memory.File, - ActiveFile: metrics.Memory.ActiveFile, - InactiveFile: metrics.Memory.InactiveFile, + // memory.stat breaks down the cgroup's memory footprint into different types of memory. In + // Cgroup V2, this file includes the consumption of the cgroup’s entire subtree. Total_* stats + // were removed. + TotalAnon: metrics.Memory.Anon, + TotalActiveAnon: metrics.Memory.ActiveAnon, + TotalInactiveAnon: metrics.Memory.InactiveAnon, + TotalFile: metrics.Memory.File, + TotalActiveFile: metrics.Memory.ActiveFile, + TotalInactiveFile: metrics.Memory.InactiveFile, }, } + if metrics.MemoryEvents != nil { stats.ParentStats.OOMKills = metrics.MemoryEvents.OomKill } diff --git a/internal/limiter/watchers/cgroup_memory_watcher.go b/internal/limiter/watchers/cgroup_memory_watcher.go index 453e5eee6..ab686ba12 100644 --- a/internal/limiter/watchers/cgroup_memory_watcher.go +++ b/internal/limiter/watchers/cgroup_memory_watcher.go @@ -73,14 +73,14 @@ func (c *CgroupMemoryWatcher) Poll(context.Context) (*limiter.BackoffEvent, erro // for some special insignificant cases (LazyFree for example). A portion of the Page Caches, noted by `inactive_file`, // is the target for the eviction first. So, it makes sense to exclude the easy evictable memory from the threshold. if parentStats.MemoryLimit > 0 && parentStats.MemoryUsage > 0 && - float64(parentStats.MemoryUsage-parentStats.InactiveFile)/float64(parentStats.MemoryLimit) >= c.memoryThreshold { + float64(parentStats.MemoryUsage-parentStats.TotalInactiveFile)/float64(parentStats.MemoryLimit) >= c.memoryThreshold { return &limiter.BackoffEvent{ WatcherName: c.Name(), ShouldBackoff: true, Reason: "cgroup memory exceeds threshold", Stats: map[string]any{ "memory_usage": parentStats.MemoryUsage, - "inactive_file": parentStats.InactiveFile, + "inactive_file": parentStats.TotalInactiveFile, "memory_limit": parentStats.MemoryLimit, "memory_threshold": c.memoryThreshold, }, diff --git a/internal/limiter/watchers/cgroup_memory_watcher_test.go b/internal/limiter/watchers/cgroup_memory_watcher_test.go index 1e30eac8c..ccc2df3a0 100644 --- a/internal/limiter/watchers/cgroup_memory_watcher_test.go +++ b/internal/limiter/watchers/cgroup_memory_watcher_test.go @@ -89,9 +89,9 @@ func TestCgroupMemoryWatcher_Poll(t *testing.T) { statsList: []cgroups.Stats{ { ParentStats: cgroups.CgroupStats{ - MemoryUsage: 1900000000, - InactiveFile: 100000000, - MemoryLimit: 2000000000, + MemoryUsage: 1900000000, + TotalInactiveFile: 100000000, + MemoryLimit: 2000000000, }, }, }, @@ -156,9 +156,9 @@ func TestCgroupMemoryWatcher_Poll(t *testing.T) { statsList: []cgroups.Stats{ { ParentStats: cgroups.CgroupStats{ - MemoryUsage: 1900000000, - InactiveFile: 200000000, - MemoryLimit: 2000000000, + MemoryUsage: 1900000000, + TotalInactiveFile: 200000000, + MemoryLimit: 2000000000, }, }, }, @@ -176,9 +176,9 @@ func TestCgroupMemoryWatcher_Poll(t *testing.T) { statsList: []cgroups.Stats{ { ParentStats: cgroups.CgroupStats{ - MemoryUsage: 1200000000, - InactiveFile: 100000000, - MemoryLimit: 2000000000, + MemoryUsage: 1200000000, + TotalInactiveFile: 100000000, + MemoryLimit: 2000000000, }, }, }, |