updateref: Fix indeterminate state when locking refs

When using the updateref package, one would typically first queue updates, then prepare the transaction and finally commit it. At prepare time, the expectation is that all refs will be locked after the function call has succeeded. But because git-update-ref(1) had a bug which kept us from reading the status report, we couldn't assert that this is the case and just went ahead after requesting the state transition. The result is that callers may act on supposedly-locked refs while locking either hasn't finished yet or failed already. In the context of Gitaly Cluster this means that we may cast votes on already-failed updates. Now that we have backported the upstreamed patch to properly flush out status reports we can finally fix this bug by waiting for the status report before returning from `Prepare()`. This fixes the possibility for races and will cause us to abort early in case we can already determine that the update will fail anyway. Furthermore, it fixes a set of flaky tests in the updateref package which hit this exact race. Changelog: fixed
author: Patrick Steinhardt <psteinhardt@gitlab.com> 2021-09-13 13:25:35 +0300
committer: Patrick Steinhardt <psteinhardt@gitlab.com> 2021-09-14 10:08:37 +0300
commit: 4abf27024d6411d535d2e71526b9f682526e9c68 (patch)
tree: 028b7abb2413c87a814487191f51eafff471ba6d
parent: 15e0e47e570493d60b7f81429d3362d3d17a5931 (diff)
3 files changed, 106 insertions, 13 deletions
diff --git a/internal/git/updateref/updateref.go b/internal/git/updateref/updateref.go
index c3eca46bf..dde7b5b76 100644
--- a/internal/git/updateref/updateref.go
+++ b/internal/git/updateref/updateref.go
@@ -1,6 +1,7 @@
 package updateref
 
 import (
+	"bufio"
 	"bytes"
 	"context"
 	"fmt"
@@ -8,6 +9,7 @@ import (
 	"gitlab.com/gitlab-org/gitaly/v14/internal/command"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/git"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config"
+	"gitlab.com/gitlab-org/gitaly/v14/internal/metadata/featureflag"
 )
 
 // Updater wraps a `git update-ref --stdin` process, presenting an interface
@@ -16,7 +18,12 @@ import (
 type Updater struct {
 	repo   git.RepositoryExecutor
 	cmd    *command.Command
+	stdout *bufio.Reader
 	stderr *bytes.Buffer
+
+	// withStatusFlushing determines whether the Git version used supports proper flushing of
+	// status messages.
+	withStatusFlushing bool
 }
 
 // UpdaterOpt is a type representing options for the Updater.
@@ -65,10 +72,22 @@ func New(ctx context.Context, conf config.Cfg, repo git.RepositoryExecutor, opts
 		return nil, err
 	}
 
+	withStatusFlushing := false
+	if featureflag.UpdaterefVerifyStateChanges.IsEnabled(ctx) {
+		gitVersion, err := git.CurrentVersionForExecutor(ctx, repo)
+		if err != nil {
+			return nil, fmt.Errorf("determining git version: %w", err)
+		}
+
+		withStatusFlushing = gitVersion.FlushesUpdaterefStatus()
+	}
+
 	updater := &Updater{
-		repo:   repo,
-		cmd:    cmd,
-		stderr: &stderr,
+		repo:               repo,
+		cmd:                cmd,
+		stderr:             &stderr,
+		stdout:             bufio.NewReader(cmd),
+		withStatusFlushing: withStatusFlushing,
 	}
 
 	// By writing an explicit "start" to the command, we enable
@@ -134,16 +153,30 @@ func (u *Updater) Cancel() error {
 func (u *Updater) setState(state string) error {
 	_, err := fmt.Fprintf(u.cmd, "%s\x00", state)
 	if err != nil {
-		return fmt.Errorf("updating state to %s: %w", state, err)
+		return fmt.Errorf("updating state to %q: %w", state, err)
 	}
 
 	// For each state-changing command, git-update-ref(1) will report successful execution via
 	// "<command>: ok" lines printed to its stdout. Ideally, we should thus verify here whether
 	// the command was successfully executed by checking for exactly this line, otherwise we
 	// cannot be sure whether the command has correctly been processed by Git or if an error was
-	// raised. Unfortunately, Git won't flush the output though, and as such we would be left
-	// waiting for the message to appear here. This needs to be fixed upstream first, and after
-	// this we should adapt this function to assert commands.
+	// raised. Unfortunately, Git only knows to flush these reports either starting with v2.34.0
+	// or with our backported version v2.33.0.gl3.
+	if u.withStatusFlushing {
+		line, err := u.stdout.ReadString('\n')
+		if err != nil {
+			// We need to explicitly cancel the command here and wait for it to
+			// terminate such that we can retrieve the command's stderr in a race-free
+			// manner.
+			_ = u.Cancel()
+
+			return fmt.Errorf("state update to %q failed: %w, stderr: %q", state, err, u.stderr)
+		}
+
+		if line != fmt.Sprintf("%s: ok\n", state) {
+			return fmt.Errorf("state update to %q not successful: expected ok, got %q", state, line)
+		}
+	}
 
 	return nil
 }
diff --git a/internal/git/updateref/updateref_test.go b/internal/git/updateref/updateref_test.go
index 89ab269f4..aea0de5cb 100644
--- a/internal/git/updateref/updateref_test.go
+++ b/internal/git/updateref/updateref_test.go
@@ -12,6 +12,7 @@ import (
 	"gitlab.com/gitlab-org/gitaly/v14/internal/git/hooks"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/git/localrepo"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config"
+	"gitlab.com/gitlab-org/gitaly/v14/internal/metadata/featureflag"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/testhelper"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/testhelper/testcfg"
 )
@@ -133,6 +134,46 @@ func TestUpdater_prepareLocksTransaction(t *testing.T) {
 	require.Contains(t, err.Error(), "fatal: prepared transactions can only be closed")
 }
 
+func TestUpdater_concurrentLocking(t *testing.T) {
+	testhelper.NewFeatureSets([]featureflag.FeatureFlag{
+		featureflag.UpdaterefVerifyStateChanges,
+	}).Run(t, testUpdaterConcurrentLocking)
+}
+
+func testUpdaterConcurrentLocking(t *testing.T, ctx context.Context) {
+	cfg, protoRepo, _ := testcfg.BuildWithRepo(t)
+	repo := localrepo.NewTestRepo(t, cfg, protoRepo)
+
+	commit, logErr := repo.ReadCommit(ctx, "refs/heads/master")
+	require.NoError(t, logErr)
+
+	firstUpdater, err := New(ctx, cfg, repo)
+	require.NoError(t, err)
+	require.NoError(t, firstUpdater.Update("refs/heads/master", "", commit.Id))
+	require.NoError(t, firstUpdater.Prepare())
+
+	secondUpdater, err := New(ctx, cfg, repo)
+	require.NoError(t, err)
+	require.NoError(t, secondUpdater.Update("refs/heads/master", "", commit.Id))
+
+	// With flushing, we're able to detect concurrent locking at prepare time already instead of
+	// at commit time.
+	if gitSupportsStatusFlushing(t, ctx, cfg) && featureflag.UpdaterefVerifyStateChanges.IsEnabled(ctx) {
+		err := secondUpdater.Prepare()
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "state update to \"prepare\" failed: EOF, stderr: \"warning: update refs/heads/master: missing <newvalue>, treating as zero\\nfatal: prepare: cannot lock ref 'refs/heads/master'")
+
+		require.NoError(t, firstUpdater.Commit())
+	} else {
+		require.NoError(t, secondUpdater.Prepare())
+		require.NoError(t, firstUpdater.Commit())
+
+		err := secondUpdater.Commit()
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "git update-ref: exit status 128, stderr: \"warning: update refs/heads/master: missing <newvalue>, treating as zero\\nfatal: prepare: cannot lock ref 'refs/heads/master'")
+	}
+}
+
 func TestBulkOperation(t *testing.T) {
 	ctx, cancel := testhelper.Context()
 	defer cancel()
@@ -235,10 +276,13 @@ func TestUpdater_closingStdinAbortsChanges(t *testing.T) {
 }
 
 func TestUpdater_capturesStderr(t *testing.T) {
-	ctx, cancel := testhelper.Context()
-	defer cancel()
+	testhelper.NewFeatureSets([]featureflag.FeatureFlag{
+		featureflag.UpdaterefVerifyStateChanges,
+	}).Run(t, testUpdaterCapturesStderr)
+}
 
-	_, _, updater := setupUpdater(t, ctx)
+func testUpdaterCapturesStderr(t *testing.T, ctx context.Context) {
+	cfg, _, updater := setupUpdater(t, ctx)
 
 	ref := "refs/heads/a"
 	newValue := strings.Repeat("1", 40)
@@ -246,11 +290,23 @@ func TestUpdater_capturesStderr(t *testing.T) {
 
 	require.NoError(t, updater.Update(git.ReferenceName(ref), newValue, oldValue))
 
-	expectedErr := fmt.Sprintf("git update-ref: exit status 128, stderr: "+
-		"\"fatal: commit: cannot update ref '%s': "+
-		"trying to write ref '%s' with nonexistent object %s\\n\"", ref, ref, newValue)
+	var expectedErr string
+	if gitSupportsStatusFlushing(t, ctx, cfg) && featureflag.UpdaterefVerifyStateChanges.IsEnabled(ctx) {
+		expectedErr = fmt.Sprintf("state update to \"commit\" failed: EOF, stderr: \"fatal: commit: cannot update ref '%s': "+
+			"trying to write ref '%s' with nonexistent object %s\\n\"", ref, ref, newValue)
+	} else {
+		expectedErr = fmt.Sprintf("git update-ref: exit status 128, stderr: "+
+			"\"fatal: commit: cannot update ref '%s': "+
+			"trying to write ref '%s' with nonexistent object %s\\n\"", ref, ref, newValue)
+	}
 
 	err := updater.Commit()
 	require.NotNil(t, err)
 	require.Equal(t, err.Error(), expectedErr)
 }
+
+func gitSupportsStatusFlushing(t *testing.T, ctx context.Context, cfg config.Cfg) bool {
+	version, err := git.CurrentVersion(ctx, git.NewExecCommandFactory(cfg))
+	require.NoError(t, err)
+	return version.FlushesUpdaterefStatus()
+}
diff --git a/internal/metadata/featureflag/ff_updateref_assert_locking.go b/internal/metadata/featureflag/ff_updateref_assert_locking.go
new file mode 100644
index 000000000..e018fd1e6
--- /dev/null
+++ b/internal/metadata/featureflag/ff_updateref_assert_locking.go
@@ -0,0 +1,4 @@
+package featureflag
+
+// UpdaterefVerifyStateChanges causes us to assert that transactional state changes finish correctly.
+var UpdaterefVerifyStateChanges = NewFeatureFlag("updateref_verify_state_changes", true)
author	Patrick Steinhardt <psteinhardt@gitlab.com>	2021-09-13 13:25:35 +0300
committer	Patrick Steinhardt <psteinhardt@gitlab.com>	2021-09-14 10:08:37 +0300
commit	4abf27024d6411d535d2e71526b9f682526e9c68 (patch)
tree	028b7abb2413c87a814487191f51eafff471ba6d
parent	15e0e47e570493d60b7f81429d3362d3d17a5931 (diff)