diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-03-29 12:43:55 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-03-29 12:46:53 +0300 |
commit | 74882d5778003bcdf8ef30a1ddfd3902170747fa (patch) | |
tree | 9558900782c94d6565abd624f40f0e31c829c19a | |
parent | 6cc6d4b4ac491dddbeff72b0b2525c188d0cd23b (diff) |
maintenance: Introduce rate limiting for `OptimizeRepo`
With the current implementation of `OptimizeRepo`, we're trying to do as
much work as possible in the timeframe dictated by the maintenance
schedule. This has proven to be problematic even in times of reduced
load on deployments because we were essentially DoS'ing ourselves with
so many requests that it caused alerts to trigger. On staging systems,
we see more than 200 calls per second for the `OptimizeRepository` RPC.
Even if those jobs don't need to do any heavy repacking, it does cause
heavy read-load on Gitaly nodes just to determine that nothing needs to
be done.
As a first iteration towards betterment, this commit introduces rate
limiting to the maintenance job. Instead of going as quickly as we can,
we limit requests per second to 1.
Of course this means that we're now able to optimize fewer repositories
than we previously did. But this is still much better than driving a DoS
against ourselves and waking up SREs on weekends. Depending on the
typical load we'll see with this rate limiting, we may be able to enable
the maintenance task 24/7.
-rw-r--r-- | internal/gitaly/maintenance/optimize.go | 45 | ||||
-rw-r--r-- | internal/gitaly/maintenance/optimize_test.go | 19 | ||||
-rw-r--r-- | internal/gitaly/server/server_factory.go | 2 |
3 files changed, 52 insertions, 14 deletions
diff --git a/internal/gitaly/maintenance/optimize.go b/internal/gitaly/maintenance/optimize.go index 53f4f323a..854d68a04 100644 --- a/internal/gitaly/maintenance/optimize.go +++ b/internal/gitaly/maintenance/optimize.go @@ -11,6 +11,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/internal/gitaly/config" + "gitlab.com/gitlab-org/gitaly/internal/helper" "gitlab.com/gitlab-org/gitaly/internal/storage" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" "google.golang.org/grpc" @@ -68,12 +69,15 @@ func optimizeRepoAtPath(ctx context.Context, l logrus.FieldLogger, s config.Stor return nil } -func walkReposShuffled(ctx context.Context, walker *randomWalker, l logrus.FieldLogger, s config.Storage, o Optimizer) error { +func walkReposShuffled( + ctx context.Context, + walker *randomWalker, + l logrus.FieldLogger, + s config.Storage, + o Optimizer, + ticker helper.Ticker, +) error { for { - if err := ctx.Err(); err != nil { - return err - } - fi, path, err := walker.next() switch { case errors.Is(err, errIterOver): @@ -89,21 +93,33 @@ func walkReposShuffled(ctx context.Context, walker *randomWalker, l logrus.Field } walker.skipDir() + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C(): + } + + // Reset the ticker before doing the optimization such that we essentially limit + // ourselves to doing optimizations once per tick, not once per tick plus the time + // it takes to do the optimization. It's best effort given that traversing the + // directory hierarchy takes some time, too, but it should be good enough for now. + ticker.Reset() + if err := optimizeRepoAtPath(ctx, l, s, path, o); err != nil { return err } } } -// OptimizeReposRandomly returns a function to walk through each storage and -// attempt to optimize any repos encountered. +// OptimizeReposRandomly returns a function to walk through each storage and attempts to optimize +// any repos encountered. 
The ticker is used to rate-limit optimizations. // -// Only storage paths that map to an enabled storage name will be walked. -// Any storage paths shared by multiple storages will only be walked once. +// Only storage paths that map to an enabled storage name will be walked. Any storage paths shared +// by multiple storages will only be walked once. // -// Any errors during the optimization will be logged. Any other errors will be -// returned and cause the walk to end prematurely. -func OptimizeReposRandomly(storages []config.Storage, optimizer Optimizer, rand *rand.Rand) StoragesJob { +// Any errors during the optimization will be logged. Any other errors will be returned and cause +// the walk to end prematurely. +func OptimizeReposRandomly(storages []config.Storage, optimizer Optimizer, ticker helper.Ticker, rand *rand.Rand) StoragesJob { return func(ctx context.Context, l logrus.FieldLogger, enabledStorageNames []string) error { enabledNames := map[string]struct{}{} for _, sName := range enabledStorageNames { @@ -112,6 +128,9 @@ func OptimizeReposRandomly(storages []config.Storage, optimizer Optimizer, rand visitedPaths := map[string]bool{} + ticker.Reset() + defer ticker.Stop() + for _, storage := range shuffledStoragesCopy(rand, storages) { if _, ok := enabledNames[storage.Name]; !ok { continue // storage not enabled @@ -126,7 +145,7 @@ func OptimizeReposRandomly(storages []config.Storage, optimizer Optimizer, rand walker := newRandomWalker(storage.Path, rand) - if err := walkReposShuffled(ctx, walker, l, storage, optimizer); err != nil { + if err := walkReposShuffled(ctx, walker, l, storage, optimizer, ticker); err != nil { l.WithError(err). WithField("storage_path", storage.Path). 
Errorf("maintenance: unable to completely walk storage") diff --git a/internal/gitaly/maintenance/optimize_test.go b/internal/gitaly/maintenance/optimize_test.go index 1cfd91bc0..90d1c9f9a 100644 --- a/internal/gitaly/maintenance/optimize_test.go +++ b/internal/gitaly/maintenance/optimize_test.go @@ -12,6 +12,7 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/internal/gitaly/service/repository" "gitlab.com/gitlab-org/gitaly/internal/gitaly/transaction" + "gitlab.com/gitlab-org/gitaly/internal/helper" "gitlab.com/gitlab-org/gitaly/internal/testhelper" "gitlab.com/gitlab-org/gitaly/internal/testhelper/testcfg" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" @@ -78,14 +79,30 @@ func TestOptimizeReposRandomly(t *testing.T) { }, } { t.Run(tc.desc, func(t *testing.T) { + tickerDone := false + tickerCount := 0 + + ticker := helper.NewManualTicker() + ticker.ResetFunc = func() { + tickerCount++ + ticker.Tick() + } + ticker.StopFunc = func() { + tickerDone = true + } + mo := &mockOptimizer{ t: t, cfg: cfg, } - walker := OptimizeReposRandomly(cfg.Storages, mo, rand.New(rand.NewSource(1))) + walker := OptimizeReposRandomly(cfg.Storages, mo, ticker, rand.New(rand.NewSource(1))) require.NoError(t, walker(ctx, testhelper.DiscardTestEntry(t), tc.storages)) require.ElementsMatch(t, tc.expected, mo.actual) + require.True(t, tickerDone) + // We expect one more tick than optimized repositories because of the + // initial tick up front to re-start the timer. 
+ require.Equal(t, len(tc.expected)+1, tickerCount) }) } } diff --git a/internal/gitaly/server/server_factory.go b/internal/gitaly/server/server_factory.go index 60a5bdae9..4de443d69 100644 --- a/internal/gitaly/server/server_factory.go +++ b/internal/gitaly/server/server_factory.go @@ -12,6 +12,7 @@ import ( "gitlab.com/gitlab-org/gitaly/client" "gitlab.com/gitlab-org/gitaly/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/internal/gitaly/maintenance" + "gitlab.com/gitlab-org/gitaly/internal/helper" gitalylog "gitlab.com/gitlab-org/gitaly/internal/log" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" "google.golang.org/grpc" @@ -56,6 +57,7 @@ func (s *GitalyServerFactory) StartWorkers(ctx context.Context, l logrus.FieldLo maintenance.OptimizeReposRandomly( cfg.Storages, gitalypb.NewRepositoryServiceClient(cc), + helper.NewTimerTicker(1*time.Second), rand.New(rand.NewSource(time.Now().UnixNano())), ), ) |