diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-09-13 13:25:35 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-09-14 10:08:37 +0300 |
commit | 4abf27024d6411d535d2e71526b9f682526e9c68 (patch) | |
tree | 028b7abb2413c87a814487191f51eafff471ba6d | |
parent | 15e0e47e570493d60b7f81429d3362d3d17a5931 (diff) |
updateref: Fix indeterminate state when locking refs
When using the updateref package, one would typically first queue
updates, then prepare the transaction and finally commit it. At prepare
time, the expectation is that all refs will be locked after the function
call has succeeded. But because git-update-ref(1) had a bug which kept
us from reading the status report, we couldn't assert that this is the
case and just went ahead after requesting the state transition. The
result is that callers may act on supposedly-locked refs while locking
either hasn't finished yet or failed already. In the context of Gitaly
Cluster this means that we may cast votes on already-failed updates.
Now that we have backported the upstreamed patch to properly flush out
status reports we can finally fix this bug by waiting for the status
report before returning from `Prepare()`. This fixes the possibility for
races and will cause us to abort early in case we can already determine
that the update will fail anyway. Furthermore, it fixes a set of flaky
tests in the updateref package which hit this exact race.
Changelog: fixed
-rw-r--r-- | internal/git/updateref/updateref.go | 47 | ||||
-rw-r--r-- | internal/git/updateref/updateref_test.go | 68 | ||||
-rw-r--r-- | internal/metadata/featureflag/ff_updateref_assert_locking.go | 4 |
3 files changed, 106 insertions, 13 deletions
diff --git a/internal/git/updateref/updateref.go b/internal/git/updateref/updateref.go index c3eca46bf..dde7b5b76 100644 --- a/internal/git/updateref/updateref.go +++ b/internal/git/updateref/updateref.go @@ -1,6 +1,7 @@ package updateref import ( + "bufio" "bytes" "context" "fmt" @@ -8,6 +9,7 @@ import ( "gitlab.com/gitlab-org/gitaly/v14/internal/command" "gitlab.com/gitlab-org/gitaly/v14/internal/git" "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config" + "gitlab.com/gitlab-org/gitaly/v14/internal/metadata/featureflag" ) // Updater wraps a `git update-ref --stdin` process, presenting an interface @@ -16,7 +18,12 @@ import ( type Updater struct { repo git.RepositoryExecutor cmd *command.Command + stdout *bufio.Reader stderr *bytes.Buffer + + // withStatusFlushing determines whether the Git version used supports proper flushing of + // status messages. + withStatusFlushing bool } // UpdaterOpt is a type representing options for the Updater. @@ -65,10 +72,22 @@ func New(ctx context.Context, conf config.Cfg, repo git.RepositoryExecutor, opts return nil, err } + withStatusFlushing := false + if featureflag.UpdaterefVerifyStateChanges.IsEnabled(ctx) { + gitVersion, err := git.CurrentVersionForExecutor(ctx, repo) + if err != nil { + return nil, fmt.Errorf("determining git version: %w", err) + } + + withStatusFlushing = gitVersion.FlushesUpdaterefStatus() + } + updater := &Updater{ - repo: repo, - cmd: cmd, - stderr: &stderr, + repo: repo, + cmd: cmd, + stderr: &stderr, + stdout: bufio.NewReader(cmd), + withStatusFlushing: withStatusFlushing, } // By writing an explicit "start" to the command, we enable @@ -134,16 +153,30 @@ func (u *Updater) Cancel() error { func (u *Updater) setState(state string) error { _, err := fmt.Fprintf(u.cmd, "%s\x00", state) if err != nil { - return fmt.Errorf("updating state to %s: %w", state, err) + return fmt.Errorf("updating state to %q: %w", state, err) } // For each state-changing command, git-update-ref(1) will report successful execution via // "<command>: ok" lines printed to its stdout. Ideally, we should thus verify here whether // the command was successfully executed by checking for exactly this line, otherwise we // cannot be sure whether the command has correctly been processed by Git or if an error was - // raised. Unfortunately, Git won't flush the output though, and as such we would be left - // waiting for the message to appear here. This needs to be fixed upstream first, and after - // this we should adapt this function to assert commands. + // raised. Unfortunately, Git only knows to flush these reports either starting with v2.34.0 + // or with our backported version v2.33.0.gl3. + if u.withStatusFlushing { + line, err := u.stdout.ReadString('\n') + if err != nil { + // We need to explicitly cancel the command here and wait for it to + // terminate such that we can retrieve the command's stderr in a race-free + // manner. + _ = u.Cancel() + + return fmt.Errorf("state update to %q failed: %w, stderr: %q", state, err, u.stderr) + } + + if line != fmt.Sprintf("%s: ok\n", state) { + return fmt.Errorf("state update to %q not successful: expected ok, got %q", state, line) + } + } return nil } diff --git a/internal/git/updateref/updateref_test.go b/internal/git/updateref/updateref_test.go index 89ab269f4..aea0de5cb 100644 --- a/internal/git/updateref/updateref_test.go +++ b/internal/git/updateref/updateref_test.go @@ -12,6 +12,7 @@ import ( "gitlab.com/gitlab-org/gitaly/v14/internal/git/hooks" "gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config" + "gitlab.com/gitlab-org/gitaly/v14/internal/metadata/featureflag" "gitlab.com/gitlab-org/gitaly/v14/internal/testhelper" "gitlab.com/gitlab-org/gitaly/v14/internal/testhelper/testcfg" ) @@ -133,6 +134,46 @@ func TestUpdater_prepareLocksTransaction(t *testing.T) { require.Contains(t, err.Error(), "fatal: prepared transactions can only be closed") } +func TestUpdater_concurrentLocking(t *testing.T) { + testhelper.NewFeatureSets([]featureflag.FeatureFlag{ + featureflag.UpdaterefVerifyStateChanges, + }).Run(t, testUpdaterConcurrentLocking) +} + +func testUpdaterConcurrentLocking(t *testing.T, ctx context.Context) { + cfg, protoRepo, _ := testcfg.BuildWithRepo(t) + repo := localrepo.NewTestRepo(t, cfg, protoRepo) + + commit, logErr := repo.ReadCommit(ctx, "refs/heads/master") + require.NoError(t, logErr) + + firstUpdater, err := New(ctx, cfg, repo) + require.NoError(t, err) + require.NoError(t, firstUpdater.Update("refs/heads/master", "", commit.Id)) + require.NoError(t, firstUpdater.Prepare()) + + secondUpdater, err := New(ctx, cfg, repo) + require.NoError(t, err) + require.NoError(t, secondUpdater.Update("refs/heads/master", "", commit.Id)) + + // With flushing, we're able to detect concurrent locking at prepare time already instead of + // at commit time. + if gitSupportsStatusFlushing(t, ctx, cfg) && featureflag.UpdaterefVerifyStateChanges.IsEnabled(ctx) { + err := secondUpdater.Prepare() + require.Error(t, err) + require.Contains(t, err.Error(), "state update to \"prepare\" failed: EOF, stderr: \"warning: update refs/heads/master: missing <newvalue>, treating as zero\\nfatal: prepare: cannot lock ref 'refs/heads/master'") + + require.NoError(t, firstUpdater.Commit()) + } else { + require.NoError(t, secondUpdater.Prepare()) + require.NoError(t, firstUpdater.Commit()) + + err := secondUpdater.Commit() + require.Error(t, err) + require.Contains(t, err.Error(), "git update-ref: exit status 128, stderr: \"warning: update refs/heads/master: missing <newvalue>, treating as zero\\nfatal: prepare: cannot lock ref 'refs/heads/master'") + } +} + func TestBulkOperation(t *testing.T) { ctx, cancel := testhelper.Context() defer cancel() @@ -235,10 +276,13 @@ func TestUpdater_closingStdinAbortsChanges(t *testing.T) { } func TestUpdater_capturesStderr(t *testing.T) { - ctx, cancel := testhelper.Context() - defer cancel() + testhelper.NewFeatureSets([]featureflag.FeatureFlag{ + featureflag.UpdaterefVerifyStateChanges, + }).Run(t, testUpdaterCapturesStderr) +} - _, _, updater := setupUpdater(t, ctx) +func testUpdaterCapturesStderr(t *testing.T, ctx context.Context) { + cfg, _, updater := setupUpdater(t, ctx) ref := "refs/heads/a" newValue := strings.Repeat("1", 40) @@ -246,11 +290,23 @@ func TestUpdater_capturesStderr(t *testing.T) { require.NoError(t, updater.Update(git.ReferenceName(ref), newValue, oldValue)) - expectedErr := fmt.Sprintf("git update-ref: exit status 128, stderr: "+ - "\"fatal: commit: cannot update ref '%s': "+ - "trying to write ref '%s' with nonexistent object %s\\n\"", ref, ref, newValue) + var expectedErr string + if gitSupportsStatusFlushing(t, ctx, cfg) && featureflag.UpdaterefVerifyStateChanges.IsEnabled(ctx) { + expectedErr = fmt.Sprintf("state update to \"commit\" failed: EOF, stderr: \"fatal: commit: cannot update ref '%s': "+ + "trying to write ref '%s' with nonexistent object %s\\n\"", ref, ref, newValue) + } else { + expectedErr = fmt.Sprintf("git update-ref: exit status 128, stderr: "+ + "\"fatal: commit: cannot update ref '%s': "+ + "trying to write ref '%s' with nonexistent object %s\\n\"", ref, ref, newValue) + } err := updater.Commit() require.NotNil(t, err) require.Equal(t, err.Error(), expectedErr) } + +func gitSupportsStatusFlushing(t *testing.T, ctx context.Context, cfg config.Cfg) bool { + version, err := git.CurrentVersion(ctx, git.NewExecCommandFactory(cfg)) + require.NoError(t, err) + return version.FlushesUpdaterefStatus() +} diff --git a/internal/metadata/featureflag/ff_updateref_assert_locking.go b/internal/metadata/featureflag/ff_updateref_assert_locking.go new file mode 100644 index 000000000..e018fd1e6 --- /dev/null +++ b/internal/metadata/featureflag/ff_updateref_assert_locking.go @@ -0,0 +1,4 @@ +package featureflag + +// UpdaterefVerifyStateChanges causes us to assert that transactional state changes finish correctly. +var UpdaterefVerifyStateChanges = NewFeatureFlag("updateref_verify_state_changes", true) |