diff options
author | Pavlo Strokov <pstrokov@gitlab.com> | 2021-12-16 17:12:58 +0300 |
---|---|---|
committer | Pavlo Strokov <pstrokov@gitlab.com> | 2021-12-16 17:29:03 +0300 |
commit | 7412a98422c47269e18c937672a25e6047577bb1 (patch) | |
tree | 8da155d78661c2c466a68bbfb5e3aa146d8c080e | |
parent | 1762b3a49fef1f12c2dc6ac153807b2c8c25b1fe (diff) |
maintenance: Override deadline for RPC call
Gitaly service runs a maintenance job each day for a certain
period of time. The log shows that OptimizeRepository RPC
fails with DeadlineExceeded error. The root cause is a usage
of the context that is limited by the duration of the maintenance.
When job is out of maintenance window the context is cancelled
and RPC is cancelled as well. Instead, we should handle it gracefully.
That is why before RPC call we first suppress context cancellation
and them apply a new default timeout for it. That protects execution
from cancellation of the parent context as well as cancels long
running RPC calls. If we don't cancel long running RPC the other
repositories will suffer from not being optimized for a long time.
Closes: https://gitlab.com/gitlab-org/gitaly/-/issues/3963
Changelog: fixed
-rw-r--r-- | internal/gitaly/maintenance/optimize.go | 8 | ||||
-rw-r--r-- | internal/gitaly/maintenance/optimize_test.go | 39 |
2 files changed, 47 insertions, 0 deletions
diff --git a/internal/gitaly/maintenance/optimize.go b/internal/gitaly/maintenance/optimize.go index f4da15c84..d7231fb24 100644 --- a/internal/gitaly/maintenance/optimize.go +++ b/internal/gitaly/maintenance/optimize.go @@ -56,6 +56,14 @@ func optimizeRepoAtPath(ctx context.Context, l logrus.FieldLogger, s config.Stor Repository: repo, } + // In order to prevent RPC cancellation because of the parent context cancellation + // (job execution duration passed) we suppress cancellation of the parent and add a + // new time limit to make sure it won't block forever. + // It also helps us to be protected over repositories that are taking too long to complete + // a request, so no any other repository can be optimized. + ctx, cancel := context.WithTimeout(helper.SuppressCancellation(ctx), 5*time.Minute) + defer cancel() + start := time.Now() if _, err := o.OptimizeRepository(ctx, optimizeReq); err != nil { l.WithFields(map[string]interface{}{ diff --git a/internal/gitaly/maintenance/optimize_test.go b/internal/gitaly/maintenance/optimize_test.go index 2ca182f13..8c03229d4 100644 --- a/internal/gitaly/maintenance/optimize_test.go +++ b/internal/gitaly/maintenance/optimize_test.go @@ -5,6 +5,7 @@ import ( "math/rand" "path/filepath" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -121,3 +122,41 @@ func TestOptimizeReposRandomly(t *testing.T) { }) } } + +type mockOptimizerCancel struct { + t *testing.T + startedAt time.Time +} + +func (m mockOptimizerCancel) OptimizeRepository(ctx context.Context, _ *gitalypb.OptimizeRepositoryRequest, _ ...grpc.CallOption) (*gitalypb.OptimizeRepositoryResponse, error) { + timeline, ok := ctx.Deadline() + if assert.True(m.t, ok) { + assert.True(m.t, timeline.After(m.startedAt), m.startedAt) + future := m.startedAt.Add(10 * time.Minute) + assert.True(m.t, timeline.Before(future), future) + } + return &gitalypb.OptimizeRepositoryResponse{}, nil +} + +func TestOptimizeReposRandomly_cancellationOverride(t *testing.T) { + cfgBuilder := testcfg.NewGitalyCfgBuilder() + cfg := cfgBuilder.Build(t) + + gittest.InitRepo(t, cfg, cfg.Storages[0]) + + ctx, cancel := testhelper.Context() + defer cancel() + + // The timeout should be overwritten by the default 5 min timeout. + //nolint:forbidigo // We're explicitly testing deadline override. + ctx, cancel = context.WithTimeout(ctx, 72*time.Hour) + defer cancel() + + ticker := helper.NewManualTicker() + ticker.Tick() + + mo := &mockOptimizerCancel{t: t, startedAt: time.Now()} + walker := OptimizeReposRandomly(cfg.Storages, mo, ticker, rand.New(rand.NewSource(1))) + + require.NoError(t, walker(ctx, testhelper.NewDiscardingLogEntry(t), []string{cfg.Storages[0].Name})) +} |