diff options
author | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-07-25 14:04:40 +0300 |
---|---|---|
committer | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-07-28 06:59:10 +0300 |
commit | 577436851cf80904f27603e75e716dc6a006a7dc (patch) | |
tree | 9aef8354eba61c6c8145fb5fc83949376553a3f5 | |
parent | bd2ea331fe723a1f487176a2ac699d9531e0d434 (diff) |
limiter: Implement Cgroup memory resource watcher
This commit implements a Cgroup memory resource watcher to monitor the
memory usage of the parent Cgroup. When the usage reaches 90% of the
memory limit or the cgroup is under OOM, the watcher treats it as a
backoff event.
We target the parent cgroup only for the sake of simplicity. Observing
the memory usage of repository cgroups adds a lot of overhead. In
addition, when the parent cgroup reaches its limit, all commands are
affected. The impact of a repository cgroup exceeding its limit is local
to certain repositories.
-rw-r--r-- | internal/limiter/watchers/cgroup_memory_watcher.go | 70 | ||||
-rw-r--r-- | internal/limiter/watchers/cgroup_memory_watcher_test.go | 132 |
2 files changed, 202 insertions, 0 deletions
diff --git a/internal/limiter/watchers/cgroup_memory_watcher.go b/internal/limiter/watchers/cgroup_memory_watcher.go new file mode 100644 index 000000000..4d293bf2e --- /dev/null +++ b/internal/limiter/watchers/cgroup_memory_watcher.go @@ -0,0 +1,70 @@ +package watchers + +import ( + "context" + "fmt" + + "gitlab.com/gitlab-org/gitaly/v16/internal/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/limiter" +) + +const ( + cgroupMemoryWatcherName = "CgroupMemory" + memoryThreshold = 0.9 +) + +// CgroupMemoryWatcher implements ResourceWatcher interface. This watcher polls +// the statistics from the cgroup manager. It returns a backoff event in two +// conditions: +// * The current memory usage exceeds a soft threshold (90%). +// * The cgroup is under OOM. +type CgroupMemoryWatcher struct { + manager cgroups.Manager +} + +// NewCgroupMemoryWatcher is the initializer of CgroupMemoryWatcher +func NewCgroupMemoryWatcher(manager cgroups.Manager) *CgroupMemoryWatcher { + return &CgroupMemoryWatcher{ + manager: manager, + } +} + +// Name returns the name of CgroupMemoryWatcher +func (c *CgroupMemoryWatcher) Name() string { + return cgroupMemoryWatcherName +} + +// Poll asserts the cgroup statistics and returns a backoff event accordingly +// when it is triggered. These stats are fetched from cgroup manager. +func (c *CgroupMemoryWatcher) Poll(context.Context) (*limiter.BackoffEvent, error) { + if !c.manager.Ready() { + return &limiter.BackoffEvent{WatcherName: c.Name(), ShouldBackoff: false}, nil + } + + stats, err := c.manager.Stats() + if err != nil { + return nil, fmt.Errorf("cgroup watcher: poll stats from cgroup manager: %w", err) + } + parentStats := stats.ParentStats + + // When the parent memory cgroup is under OOM, tasks may be stopped. This stat is available in + // Cgroup V1 only. 
+ if parentStats.UnderOOM { + return &limiter.BackoffEvent{ + WatcherName: c.Name(), + ShouldBackoff: true, + Reason: "cgroup is under OOM", + }, nil + } + + if parentStats.MemoryLimit > 0 && parentStats.MemoryUsage > 0 && + float64(parentStats.MemoryUsage)/float64(parentStats.MemoryLimit) >= memoryThreshold { + return &limiter.BackoffEvent{ + WatcherName: c.Name(), + ShouldBackoff: true, + Reason: fmt.Sprintf("cgroup memory exceeds limit: %d/%d", parentStats.MemoryUsage, parentStats.MemoryLimit), + }, nil + } + + return &limiter.BackoffEvent{WatcherName: c.Name(), ShouldBackoff: false}, nil +} diff --git a/internal/limiter/watchers/cgroup_memory_watcher_test.go b/internal/limiter/watchers/cgroup_memory_watcher_test.go new file mode 100644 index 000000000..551c040cb --- /dev/null +++ b/internal/limiter/watchers/cgroup_memory_watcher_test.go @@ -0,0 +1,132 @@ +package watchers + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/limiter" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" +) + +func TestCgroupMemoryWatcher_Name(t *testing.T) { + t.Parallel() + + manager := NewCgroupMemoryWatcher(&testCgroupManager{}) + require.Equal(t, cgroupMemoryWatcherName, manager.Name()) +} + +func TestCgroupMemoryWatcher_Poll(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + desc string + manager *testCgroupManager + expectedEvent *limiter.BackoffEvent + expectedErr error + }{ + { + desc: "disabled watcher", + manager: &testCgroupManager{ready: false}, + expectedEvent: &limiter.BackoffEvent{ + WatcherName: cgroupMemoryWatcherName, + ShouldBackoff: false, + }, + expectedErr: nil, + }, + { + desc: "cgroup stats return empty stats", + manager: &testCgroupManager{ + ready: true, + statsList: []cgroups.Stats{{}}, + }, + expectedEvent: &limiter.BackoffEvent{ + WatcherName: cgroupMemoryWatcherName, + ShouldBackoff: false, + }, + }, + { + desc: 
"cgroup stats query returns errors", + manager: &testCgroupManager{ + ready: true, + statsErr: fmt.Errorf("something goes wrong"), + statsList: []cgroups.Stats{{}}, + }, + expectedErr: fmt.Errorf("cgroup watcher: poll stats from cgroup manager: %w", fmt.Errorf("something goes wrong")), + }, + { + desc: "cgroup memory usage is more than 90%", + manager: &testCgroupManager{ + ready: true, + statsList: []cgroups.Stats{ + { + ParentStats: cgroups.CgroupStats{ + MemoryUsage: 1800000000, + MemoryLimit: 2000000000, + }, + }, + }, + }, + expectedEvent: &limiter.BackoffEvent{ + WatcherName: cgroupMemoryWatcherName, + ShouldBackoff: true, + Reason: "cgroup memory exceeds limit: 1800000000/2000000000", + }, + expectedErr: nil, + }, + { + desc: "cgroup is under OOM", + manager: &testCgroupManager{ + ready: true, + statsList: []cgroups.Stats{ + { + ParentStats: cgroups.CgroupStats{ + MemoryUsage: 1900000000, + MemoryLimit: 2000000000, + UnderOOM: true, + }, + }, + }, + }, + expectedEvent: &limiter.BackoffEvent{ + WatcherName: cgroupMemoryWatcherName, + ShouldBackoff: true, + Reason: "cgroup is under OOM", + }, + expectedErr: nil, + }, + { + desc: "cgroup memory usage normal", + manager: &testCgroupManager{ + ready: true, + statsList: []cgroups.Stats{ + { + ParentStats: cgroups.CgroupStats{ + MemoryUsage: 1700000000, + MemoryLimit: 2000000000, + }, + }, + }, + }, + expectedEvent: &limiter.BackoffEvent{ + WatcherName: cgroupMemoryWatcherName, + ShouldBackoff: false, + }, + expectedErr: nil, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + watcher := NewCgroupMemoryWatcher(tc.manager) + event, err := watcher.Poll(testhelper.Context(t)) + + if tc.expectedErr != nil { + require.Equal(t, tc.expectedErr, err) + require.Nil(t, event) + } else { + require.NoError(t, err) + require.Equal(t, tc.expectedEvent, event) + } + }) + } +} |