author    Quang-Minh Nguyen <qmnguyen@gitlab.com>    2023-11-09 07:53:00 +0300
committer Quang-Minh Nguyen <qmnguyen@gitlab.com>    2023-11-10 12:29:08 +0300
commit    0767596516b0d8457094023b40e28ac883bfc8d4
tree      adc20c9c5a08cd4f834ef3819f59a0d39547b376
parent    c65b631d971809d9e0294356d7892860d4800cf3
limiter: Make CpuThrottledThreshold configurable
In the current implementation, adaptive limiting kicks in when the
resource level exceeds hard-coded thresholds:
* 90% of the parent cgroup's memory is used.
* The cgroup's CPU is throttled for 50% of the observation time.
Although the current CPU throttling threshold is reasonable, it is not a
good fit for every case. A more powerful machine can tolerate a higher
throttling rate, while a less powerful machine may want to lower the limit
sooner. This commit adds the ability to customize the CPU throttling
threshold.
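For illustration, a configuration sketch: the section and key names below are taken from the `toml` struct tags introduced in this commit (`adaptive_limiting` and `cpu_throttled_threshold`), but the exact placement inside Gitaly's config file is an assumption.

```toml
# Hypothetical snippet for Gitaly's config.toml; section and key names come
# from the struct tags in this commit, the file placement is assumed.
[adaptive_limiting]
# Back off only once the cgroup has been throttled for more than 90% of the
# observation window (the default is 0.5, i.e. 50%).
cpu_throttled_threshold = 0.9
```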
```
 internal/cli/gitaly/serve.go                         |  2
 internal/gitaly/config/config.go                     | 18
 internal/gitaly/config/config_test.go                | 20
 internal/limiter/watchers/cgroup_cpu_watcher.go      | 23
 internal/limiter/watchers/cgroup_cpu_watcher_test.go | 48

 5 files changed, 99 insertions, 12 deletions
```
```diff
diff --git a/internal/cli/gitaly/serve.go b/internal/cli/gitaly/serve.go
index edca64585..223880507 100644
--- a/internal/cli/gitaly/serve.go
+++ b/internal/cli/gitaly/serve.go
@@ -353,7 +353,7 @@ func run(cfg config.Cfg, logger log.Logger) error {
 		logger,
 		adaptiveLimits,
 		[]limiter.ResourceWatcher{
-			watchers.NewCgroupCPUWatcher(cgroupMgr),
+			watchers.NewCgroupCPUWatcher(cgroupMgr, cfg.AdaptiveLimiting.CPUThrottledThreshold),
 			watchers.NewCgroupMemoryWatcher(cgroupMgr),
 		},
 	)
```

```diff
diff --git a/internal/gitaly/config/config.go b/internal/gitaly/config/config.go
index 735fa2d65..869871f11 100644
--- a/internal/gitaly/config/config.go
+++ b/internal/gitaly/config/config.go
@@ -123,6 +123,7 @@ type Cfg struct {
 	Backup           BackupConfig     `toml:"backup,omitempty" json:"backup"`
 	Timeout          TimeoutConfig    `toml:"timeout,omitempty" json:"timeout"`
 	Transactions     Transactions     `toml:"transactions,omitempty" json:"transactions,omitempty"`
+	AdaptiveLimiting AdaptiveLimiting `toml:"adaptive_limiting,omitempty" json:"adaptive_limiting,omitempty"`
 }
 
 // Transactions configures transaction related options.
@@ -487,6 +488,23 @@ func (c Concurrency) Validate() error {
 	return errs.AsError()
 }
 
+// AdaptiveLimiting defines a set of global config for the adaptive limiter. This config customizes how the resource
+// watchers and calculator works. Specific limits for each RPC or pack-objects operation should be configured
+// individually using the Concurrency and PackObjectsLimiting structs respectively.
+type AdaptiveLimiting struct {
+	// CPUThrottledThreshold defines the CPU throttling ratio threshold for a backoff event. The resource watcher
+	// compares the recorded total throttled time between two polls. If the throttled time exceeds this threshold of
+	// the observation window, it returns a backoff event. By default, the threshold is 0.5 (50%).
+	CPUThrottledThreshold float64 `toml:"cpu_throttled_threshold" json:"cpu_throttled_threshold"`
+}
+
+// Validate runs validation on all fields and compose all found errors.
+func (c AdaptiveLimiting) Validate() error {
+	return cfgerror.New().
+		Append(cfgerror.Comparable(c.CPUThrottledThreshold).GreaterOrEqual(0), "cpu_throttled_threshold").
+		AsError()
+}
+
 // RateLimiting allows endpoints to be limited to a maximum request rate per
 // second. The rate limiter uses a concept of a "token bucket". In order to serve a
 // request, a token is retrieved from the token bucket. The size of the token
```
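One subtlety worth noting: `Validate` accepts a threshold of 0, and the constructor in `cgroup_cpu_watcher.go` further below treats 0 as "unset" and falls back to the 0.5 default. A minimal, self-contained sketch of that behavior; `effectiveThreshold` is a hypothetical helper for illustration, not part of Gitaly:

```go
// Standalone sketch of the zero-value fallback implied by NewCgroupCPUWatcher
// below; effectiveThreshold is a hypothetical helper, not Gitaly code.
package main

import "fmt"

const defaultCPUThrottledThreshold = 0.5

func effectiveThreshold(configured float64) float64 {
	if configured == 0 {
		return defaultCPUThrottledThreshold // unset in config: keep the default
	}
	return configured
}

func main() {
	fmt.Println(effectiveThreshold(0))   // 0.5: cpu_throttled_threshold left unset
	fmt.Println(effectiveThreshold(0.9)) // 0.9: custom, more tolerant threshold
}
```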
```diff
diff --git a/internal/gitaly/config/config_test.go b/internal/gitaly/config/config_test.go
index 41c12c800..273c7b8d7 100644
--- a/internal/gitaly/config/config_test.go
+++ b/internal/gitaly/config/config_test.go
@@ -1989,6 +1989,26 @@ func TestConcurrency_Validate(t *testing.T) {
 	)
 }
 
+func TestAdaptiveLimiting_Validate(t *testing.T) {
+	t.Parallel()
+
+	require.NoError(t, AdaptiveLimiting{CPUThrottledThreshold: 0}.Validate())
+	require.NoError(t, AdaptiveLimiting{CPUThrottledThreshold: 0.1}.Validate())
+	require.NoError(t, AdaptiveLimiting{CPUThrottledThreshold: 0.9}.Validate())
+	require.NoError(t, AdaptiveLimiting{CPUThrottledThreshold: 2.0}.Validate())
+
+	require.Equal(
+		t,
+		cfgerror.ValidationErrors{
+			cfgerror.NewValidationError(
+				fmt.Errorf("%w: -0.1 is not greater than or equal to 0", cfgerror.ErrNotInRange),
+				"cpu_throttled_threshold",
+			),
+		},
+		AdaptiveLimiting{CPUThrottledThreshold: -0.1}.Validate(),
+	)
+}
+
 func TestStorage_Validate(t *testing.T) {
 	t.Parallel()
```
```diff
diff --git a/internal/limiter/watchers/cgroup_cpu_watcher.go b/internal/limiter/watchers/cgroup_cpu_watcher.go
index 5ef5ba7d0..30f5c8454 100644
--- a/internal/limiter/watchers/cgroup_cpu_watcher.go
+++ b/internal/limiter/watchers/cgroup_cpu_watcher.go
@@ -10,8 +10,8 @@ import (
 )
 
 const (
-	cgroupCPUWatcherName  = "CgroupCpu"
-	cpuThrottledThreshold = 0.5
+	cgroupCPUWatcherName         = "CgroupCpu"
+	defaultCPUThrottledThreshold = 0.5
 )
 
 // CgroupCPUWatcher implements ResourceWatcher interface for watching CPU throttling of cgroup. Cgroup doesn't have an
@@ -19,9 +19,10 @@ const (
 // between two polls. If the throttled time exceeds 50% of the observation window, it returns a backoff event. The
 // watcher uses `throttled_time` (CgroupV1) or `throttled_usec` (CgroupV2) stats from the cgroup manager.
 type CgroupCPUWatcher struct {
-	manager         cgroups.Manager
-	lastPoll        time.Time
-	lastParentStats cgroups.CgroupStats
+	manager               cgroups.Manager
+	cpuThrottledThreshold float64
+	lastPoll              time.Time
+	lastParentStats       cgroups.CgroupStats
 
 	// currentTime is the function that returns the current time. If it's not set, time.Now() is used
 	// instead. It's used for tests only.
@@ -29,9 +30,13 @@ type CgroupCPUWatcher struct {
 }
 
 // NewCgroupCPUWatcher is the initializer of CgroupCPUWatcher
-func NewCgroupCPUWatcher(manager cgroups.Manager) *CgroupCPUWatcher {
+func NewCgroupCPUWatcher(manager cgroups.Manager, cpuThrottledThreshold float64) *CgroupCPUWatcher {
+	if cpuThrottledThreshold == 0 {
+		cpuThrottledThreshold = defaultCPUThrottledThreshold
+	}
 	return &CgroupCPUWatcher{
-		manager: manager,
+		manager:               manager,
+		cpuThrottledThreshold: cpuThrottledThreshold,
 	}
 }
 
@@ -81,7 +86,7 @@ func (c *CgroupCPUWatcher) Poll(ctx context.Context) (*limiter.BackoffEvent, err
 	timeDiff := currentPoll.Sub(c.lastPoll).Abs().Seconds()
 
 	// If the total throttled duration since the last poll exceeds 50%.
-	if timeDiff > 0 && throttledDuration/timeDiff > cpuThrottledThreshold {
+	if timeDiff > 0 && throttledDuration/timeDiff > c.cpuThrottledThreshold {
 		return &limiter.BackoffEvent{
 			WatcherName:   c.Name(),
 			ShouldBackoff: true,
@@ -89,7 +94,7 @@ func (c *CgroupCPUWatcher) Poll(ctx context.Context) (*limiter.BackoffEvent, err
 			Stats: map[string]any{
 				"time_diff":           timeDiff,
 				"throttled_duration":  throttledDuration,
-				"throttled_threshold": cpuThrottledThreshold,
+				"throttled_threshold": c.cpuThrottledThreshold,
 			},
 		}, nil
 	}
```

```diff
diff --git a/internal/limiter/watchers/cgroup_cpu_watcher_test.go b/internal/limiter/watchers/cgroup_cpu_watcher_test.go
index f7207eded..8b007b399 100644
--- a/internal/limiter/watchers/cgroup_cpu_watcher_test.go
+++ b/internal/limiter/watchers/cgroup_cpu_watcher_test.go
@@ -14,7 +14,7 @@ import (
 func TestCgroupCPUWatcher_Name(t *testing.T) {
 	t.Parallel()
 
-	manager := NewCgroupCPUWatcher(&testCgroupManager{})
+	manager := NewCgroupCPUWatcher(&testCgroupManager{}, 0.5)
 	require.Equal(t, cgroupCPUWatcherName, manager.Name())
 }
 
@@ -27,6 +27,7 @@ func TestCgroupCPUWatcher_Poll(t *testing.T) {
 		desc           string
 		manager        *testCgroupManager
 		pollTimes      []recentTimeFunc
+		cpuThreshold   float64
 		expectedEvents []*limiter.BackoffEvent
 		expectedErrs   []error
 	}{
@@ -395,9 +396,52 @@ func TestCgroupCPUWatcher_Poll(t *testing.T) {
 				},
 			},
 		},
+		{
+			desc: "customized CPU threshold",
+			manager: &testCgroupManager{
+				ready: true,
+				statsList: []cgroups.Stats{
+					testCPUStat(1, 100),
+					testCPUStat(2, 108), // 8 seconds - okay
+					testCPUStat(3, 123), // 15 seconds - 15 over 15, exceeding 90%
+					testCPUStat(4, 136), // 13 seconds - fine
+				},
+			},
+			cpuThreshold: 0.9,
+			pollTimes: []recentTimeFunc{
+				mockRecentTime(t, "2023-01-01T11:00:00Z"),
+				mockRecentTime(t, "2023-01-01T11:00:15Z"),
+				mockRecentTime(t, "2023-01-01T11:00:30Z"),
+				mockRecentTime(t, "2023-01-01T11:00:45Z"),
+			},
+			expectedEvents: []*limiter.BackoffEvent{
+				{
+					WatcherName:   cgroupCPUWatcherName,
+					ShouldBackoff: false,
+				},
+				{
+					WatcherName:   cgroupCPUWatcherName,
+					ShouldBackoff: false,
+				},
+				{
+					WatcherName:   cgroupCPUWatcherName,
+					ShouldBackoff: true,
+					Reason:        "cgroup CPU throttled too much",
+					Stats: map[string]any{
+						"time_diff":           15.0,
+						"throttled_duration":  15.0,
+						"throttled_threshold": 0.9,
+					},
+				},
+				{
+					WatcherName:   cgroupCPUWatcherName,
+					ShouldBackoff: false,
+				},
+			},
+		},
 	} {
 		t.Run(tc.desc, func(t *testing.T) {
-			watcher := NewCgroupCPUWatcher(tc.manager)
+			watcher := NewCgroupCPUWatcher(tc.manager, tc.cpuThreshold)
 
 			if tc.pollTimes != nil {
 				require.Equal(t, len(tc.expectedEvents), len(tc.pollTimes), "poll times set up incorrectly")
```
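To make the "customized CPU threshold" test case above concrete, here is a standalone sketch of the throttled-fraction arithmetic the watcher applies: the same comparison as in `Poll`, fed with the deltas from the test data. `shouldBackoff` is a hypothetical helper for illustration, not Gitaly code.

```go
// Sketch of the backoff decision exercised by the test case above: polls are
// 15s apart, and the cgroup accrued 8s, 15s, and 13s of throttled time in the
// three windows. Only 15s/15s = 1.0 exceeds the custom 0.9 threshold.
package main

import "fmt"

// shouldBackoff mirrors the comparison in Poll: back off when the throttled
// fraction of the observation window exceeds the configured threshold.
func shouldBackoff(throttledDuration, timeDiff, threshold float64) bool {
	return timeDiff > 0 && throttledDuration/timeDiff > threshold
}

func main() {
	fmt.Println(shouldBackoff(8, 15, 0.9))  // false: 8s/15s ≈ 0.53
	fmt.Println(shouldBackoff(15, 15, 0.9)) // true: 15s/15s = 1.0
	fmt.Println(shouldBackoff(13, 15, 0.9)) // false: 13s/15s ≈ 0.87
}
```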