Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Steinhardt <psteinhardt@gitlab.com>2021-02-10 16:02:18 +0300
committerPatrick Steinhardt <psteinhardt@gitlab.com>2021-02-11 10:17:00 +0300
commit6f2f5bc6c62082a94e3676dcb915042d5be2f257 (patch)
tree87a426b010ca55230eaf4ad357f0d565ec50ceeb
parent8cdbdb46b4fa31e0c2f1e2646baaf0ffb271b3a0 (diff)
hook: Increase the timeout when casting votes
In production, we're hitting timeouts from time to time when voting on transactions, where those timeouts typically always happen on secondary nodes which are waiting for the primary to cast its vote. The root cause of this is that the primary node executes logic in hooks which secondary nodes don't, like posting to the `/internal/allowed` API to check whether a push is allowed and custom hook logic. Unfortunately, both of those actions may take quite some time to finish: the allowed check scales with repository size as Rails will execute several RPCs to inspect the pushed data, while the custom hook logic is not controlled by us and may potentially run in unbounded time. Those primary-only actions can thus easily bust the 10 seconds we have set up as timeout for transactional voting. Given that those steps are potentially unbounded, we could just go ahead and remove the timeout altogether. This does feel a bit dangerous though: if a node part of an ongoing transaction is failing, then it wouldn't ever cast its vote and other nodes may potentially be stuck. If we didn't have a timeout, we'd now rely on the overarching RPC to have a timeout set, otherwise we would be stuck forever waiting for quorum to be reached. There are RPCs where having a timeout doesn't really make a lot of sense, and unfortunately the relevant RPCs in the context of hooks are part of that class as git pushes may take a very long time. So we cannot rely on those RPCs having a sensible timeout. Given that we cannot rely on any timeout being set but still want to eventually abort the transaction to not get stuck forever, this commit instead introduces an arbitrarily chosen high timeout of 5 minutes to the voting logic. This assumes that the combination of access checks and custom hook logic should never exceed these 5 minutes, which feels like a reasonable assumption to make. In order to assert that it's still possible to set shorter timeouts if the surrounding RPC has one, a new testcase is added which uses a practically immediate timeout on the RPC.
-rw-r--r--changelogs/unreleased/pks-hook-increase-tx-timeouts.yml5
-rw-r--r--internal/gitaly/hook/transactions.go17
-rw-r--r--internal/gitaly/hook/transactions_test.go41
3 files changed, 60 insertions, 3 deletions
diff --git a/changelogs/unreleased/pks-hook-increase-tx-timeouts.yml b/changelogs/unreleased/pks-hook-increase-tx-timeouts.yml
new file mode 100644
index 000000000..9e8ae19df
--- /dev/null
+++ b/changelogs/unreleased/pks-hook-increase-tx-timeouts.yml
@@ -0,0 +1,5 @@
+---
+title: 'hook: Increase the timeout when casting votes'
+merge_request: 3115
+author:
+type: fixed
diff --git a/internal/gitaly/hook/transactions.go b/internal/gitaly/hook/transactions.go
index 9e7ef7324..7d2304b3e 100644
--- a/internal/gitaly/hook/transactions.go
+++ b/internal/gitaly/hook/transactions.go
@@ -9,6 +9,21 @@ import (
"gitlab.com/gitlab-org/gitaly/internal/praefect/metadata"
)
+const (
+ // transactionTimeout is the timeout used for all transactional
+ // actions like voting and stopping of transactions. This timeout is
+ // quite high: usually, a transaction should finish in at most a few
+ // milliseconds. There are cases though where it may take a lot longer,
+ // like when executing logic on the primary node only: the primary's
+ // vote will be delayed until that logic finishes while secondaries are
+ // waiting for the primary to cast its vote on the transaction. Given
+ // that the primary-only logic's execution time scales with repository
+ // size for the access checks and that it is potentially even unbounded
+ // due to custom hooks, we thus use a high timeout. It shouldn't
+ // normally be hit, but if it is hit then it indicates a real problem.
+ transactionTimeout = 5 * time.Minute
+)
+
func isPrimary(payload git.HooksPayload) bool {
if payload.Transaction == nil {
return true
@@ -30,7 +45,7 @@ func (m *GitLabHookManager) runWithTransaction(ctx context.Context, payload git.
return errors.New("transaction without Praefect server")
}
- ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+ ctx, cancel := context.WithTimeout(ctx, transactionTimeout)
defer cancel()
if err := handler(ctx, *payload.Transaction, *payload.Praefect); err != nil {
diff --git a/internal/gitaly/hook/transactions_test.go b/internal/gitaly/hook/transactions_test.go
index 256e70a7e..474c46c44 100644
--- a/internal/gitaly/hook/transactions_test.go
+++ b/internal/gitaly/hook/transactions_test.go
@@ -3,9 +3,11 @@ package hook
import (
"context"
"errors"
+ "fmt"
"io/ioutil"
"strings"
"testing"
+ "time"
"github.com/stretchr/testify/require"
"gitlab.com/gitlab-org/gitaly/internal/git"
@@ -15,11 +17,12 @@ import (
)
type mockTransactionManager struct {
+ vote func(context.Context, metadata.Transaction, metadata.PraefectServer, []byte) error
stop func(context.Context, metadata.Transaction, metadata.PraefectServer) error
}
-func (m *mockTransactionManager) Vote(context.Context, metadata.Transaction, metadata.PraefectServer, []byte) error {
- return nil
+func (m *mockTransactionManager) Vote(ctx context.Context, tx metadata.Transaction, praefect metadata.PraefectServer, vote []byte) error {
+ return m.vote(ctx, tx, praefect, vote)
}
func (m *mockTransactionManager) Stop(ctx context.Context, tx metadata.Transaction, praefect metadata.PraefectServer) error {
@@ -121,3 +124,37 @@ func TestHookManager_stopCalled(t *testing.T) {
})
}
}
+
+func TestHookManager_contextCancellationCancelsVote(t *testing.T) {
+ mockTxMgr := mockTransactionManager{
+ vote: func(ctx context.Context, tx metadata.Transaction, praefect metadata.PraefectServer, vote []byte) error {
+ <-ctx.Done()
+ return fmt.Errorf("mock error: %s", ctx.Err())
+ },
+ }
+
+ hookManager := NewManager(config.NewLocator(config.Config), &mockTxMgr, GitlabAPIStub, config.Config)
+
+ repo, _, cleanup := testhelper.NewTestRepo(t)
+ defer cleanup()
+
+ hooksPayload, err := git.NewHooksPayload(
+ config.Config,
+ repo,
+ &metadata.Transaction{
+ ID: 1234, Node: "primary", Primary: true,
+ },
+ &metadata.PraefectServer{
+ SocketPath: "does_not",
+ Token: "matter",
+ },
+ nil,
+ ).Env()
+ require.NoError(t, err)
+
+ ctx, cancel := context.WithTimeout(context.Background(), 1*time.Nanosecond)
+ defer cancel()
+
+ err = hookManager.ReferenceTransactionHook(ctx, ReferenceTransactionPrepared, []string{hooksPayload}, strings.NewReader("changes"))
+ require.Equal(t, "error voting on transaction: mock error: context deadline exceeded", err.Error())
+}