diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-02-10 16:02:18 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-02-11 10:17:00 +0300 |
commit | 6f2f5bc6c62082a94e3676dcb915042d5be2f257 (patch) | |
tree | 87a426b010ca55230eaf4ad357f0d565ec50ceeb | |
parent | 8cdbdb46b4fa31e0c2f1e2646baaf0ffb271b3a0 (diff) |
hook: Increase the timeout when casting votes
In production, we're hitting timeouts from time to time when voting on
transactions, where those timeouts typically always happen on secondary
nodes which are waiting for the primary to cast its vote. The root cause
of this is that the primary node executes logic in hooks which secondary
nodes don't, like posting to the `/internal/allowed` API to check
whether a push is allowed and custom hook logic.
Unfortunately, both of those actions may take quite some time to finish:
the allowed check scales with repository size as Rails will execute
several RPCs to inspect the pushed data, while the custom hook logic is
not controlled by us and may potentially run in unbounded time. Those
primary-only actions can thus easily bust the 10 seconds we have set up
as timeout for transactional voting.
Given that those steps are potentially unbounded, we could just go ahead
and remove the timeout altogether. This does feel a bit dangerous
though: if a node part of an ongoing transaction is failing, then it
wouldn't ever cast its vote and other nodes may potentially be stuck. If
we didn't have a timeout, we'd now rely on the overarching RPC to have
a timeout set, otherwise we would be stuck forever waiting for quorum to
be reached. There are RPCs where having a timeout doesn't really make a
lot of sense, and unfortunately the relevant RPCs in the context of
hooks are part of that class as git pushes may take a very long time.
So we cannot rely on those RPCs having a sensible timeout.
Given that we cannot rely on any timeout being set but still want to
eventually abort the transaction to not get stuck forever, this commit
instead introduces an arbitrarily chosen high timeout of 5 minutes to
the voting logic. This assumes that the combination of access checks and
custom hook logic should never exceed these 5 minutes, which feels like
a reasonable assumption to make.
In order to assert that it's still possible to set shorter timeouts if
the surrounding RPC has one, a new testcase is added which uses a
practically immediate timeout on the RPC.
-rw-r--r-- | changelogs/unreleased/pks-hook-increase-tx-timeouts.yml | 5 | ||||
-rw-r--r-- | internal/gitaly/hook/transactions.go | 17 | ||||
-rw-r--r-- | internal/gitaly/hook/transactions_test.go | 41 |
3 files changed, 60 insertions, 3 deletions
diff --git a/changelogs/unreleased/pks-hook-increase-tx-timeouts.yml b/changelogs/unreleased/pks-hook-increase-tx-timeouts.yml new file mode 100644 index 000000000..9e8ae19df --- /dev/null +++ b/changelogs/unreleased/pks-hook-increase-tx-timeouts.yml @@ -0,0 +1,5 @@ +--- +title: 'hook: Increase the timeout when casting votes' +merge_request: 3115 +author: +type: fixed diff --git a/internal/gitaly/hook/transactions.go b/internal/gitaly/hook/transactions.go index 9e7ef7324..7d2304b3e 100644 --- a/internal/gitaly/hook/transactions.go +++ b/internal/gitaly/hook/transactions.go @@ -9,6 +9,21 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/praefect/metadata" ) +const ( + // transactionTimeout is the timeout used for all transactional + // actions like voting and stopping of transactions. This timeout is + // quite high: usually, a transaction should finish in at most a few + // milliseconds. There are cases though where it may take a lot longer, + // like when executing logic on the primary node only: the primary's + // vote will be delayed until that logic finishes while secondaries are + // waiting for the primary to cast its vote on the transaction. Given + // that the primary-only logic's execution time scales with repository + // size for the access checks and that it is potentially even unbounded + // due to custom hooks, we thus use a high timeout. It shouldn't + // normally be hit, but if it is hit then it indicates a real problem. + transactionTimeout = 5 * time.Minute +) + func isPrimary(payload git.HooksPayload) bool { if payload.Transaction == nil { return true @@ -30,7 +45,7 @@ func (m *GitLabHookManager) runWithTransaction(ctx context.Context, payload git. return errors.New("transaction without Praefect server") } - ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + ctx, cancel := context.WithTimeout(ctx, transactionTimeout) defer cancel() if err := handler(ctx, *payload.Transaction, *payload.Praefect); err != nil { diff --git a/internal/gitaly/hook/transactions_test.go b/internal/gitaly/hook/transactions_test.go index 256e70a7e..474c46c44 100644 --- a/internal/gitaly/hook/transactions_test.go +++ b/internal/gitaly/hook/transactions_test.go @@ -3,9 +3,11 @@ package hook import ( "context" "errors" + "fmt" "io/ioutil" "strings" "testing" + "time" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/internal/git" @@ -15,11 +17,12 @@ import ( ) type mockTransactionManager struct { + vote func(context.Context, metadata.Transaction, metadata.PraefectServer, []byte) error stop func(context.Context, metadata.Transaction, metadata.PraefectServer) error } -func (m *mockTransactionManager) Vote(context.Context, metadata.Transaction, metadata.PraefectServer, []byte) error { - return nil +func (m *mockTransactionManager) Vote(ctx context.Context, tx metadata.Transaction, praefect metadata.PraefectServer, vote []byte) error { + return m.vote(ctx, tx, praefect, vote) } func (m *mockTransactionManager) Stop(ctx context.Context, tx metadata.Transaction, praefect metadata.PraefectServer) error { @@ -121,3 +124,37 @@ func TestHookManager_stopCalled(t *testing.T) { }) } } + +func TestHookManager_contextCancellationCancelsVote(t *testing.T) { + mockTxMgr := mockTransactionManager{ + vote: func(ctx context.Context, tx metadata.Transaction, praefect metadata.PraefectServer, vote []byte) error { + <-ctx.Done() + return fmt.Errorf("mock error: %s", ctx.Err()) + }, + } + + hookManager := NewManager(config.NewLocator(config.Config), &mockTxMgr, GitlabAPIStub, config.Config) + + repo, _, cleanup := testhelper.NewTestRepo(t) + defer cleanup() + + hooksPayload, err := git.NewHooksPayload( + config.Config, + repo, + &metadata.Transaction{ + ID: 1234, Node: "primary", Primary: true, + }, + &metadata.PraefectServer{ + SocketPath: "does_not", + Token: "matter", + }, + nil, + ).Env() + require.NoError(t, err) + + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Nanosecond) + defer cancel() + + err = hookManager.ReferenceTransactionHook(ctx, ReferenceTransactionPrepared, []string{hooksPayload}, strings.NewReader("changes")) + require.Equal(t, "error voting on transaction: mock error: context deadline exceeded", err.Error()) +} |