From a103d1d7e84585f627d9cee8ed6a0117a77c5698 Mon Sep 17 00:00:00 2001 From: Will Chandler Date: Fri, 12 Jan 2024 15:13:26 -0500 Subject: cleanup: Don't run filter-repo in-place git-filter-repo(1) uses git-fast-import(1) to import the rewritten repository history. This will unpack the new objects, then iterate over references serially and update them using reference transactions. This does not atomically update the references[0], so an interruption during this final stage will result in partially applied changes. To mitigate this risk, create a temporary staging repository to write the updated history into, then atomically force fetch that into the original repo. This has the downside of being slower than modifying the repository in-place[1], but improving safety for a high-risk operation like this is a greater priority. [0] https://gitlab.com/gitlab-org/git/-/blob/d4dbce1db5cd227a57074bcfc7ec9f0655961bba/builtin/fast-import.c#L1659-1668 [1] https://github.com/newren/git-filter-repo/issues/66#issuecomment-602100316 --- internal/gitaly/service/cleanup/rewrite_history.go | 132 +++++++++++++++++++-- 1 file changed, 125 insertions(+), 7 deletions(-) diff --git a/internal/gitaly/service/cleanup/rewrite_history.go b/internal/gitaly/service/cleanup/rewrite_history.go index 7bbdd258e..85e45ab55 100644 --- a/internal/gitaly/service/cleanup/rewrite_history.go +++ b/internal/gitaly/service/cleanup/rewrite_history.go @@ -85,8 +85,8 @@ func (s *server) RewriteHistory(server gitalypb.CleanupService_RewriteHistorySer } } - if err := s.runFilterRepo(ctx, repo, repoProto, blobsToRemove, redactions); err != nil { - return fmt.Errorf("rewriting repository history: %w", err) + if err := s.rewriteHistory(ctx, repo, repoProto, blobsToRemove, redactions); err != nil { + return err } if err := server.SendAndClose(&gitalypb.RewriteHistoryResponse{}); err != nil { @@ -96,15 +96,110 @@ func (s *server) RewriteHistory(server gitalypb.CleanupService_RewriteHistorySer return nil } -func (s *server) runFilterRepo( +func (s *server) rewriteHistory( ctx context.Context, repo *localrepo.Repo, repoProto *gitalypb.Repository, blobsToRemove []string, redactions [][]byte, +) error { + defaultBranch, err := repo.HeadReference(ctx) + if err != nil { + return fmt.Errorf("finding HEAD reference: %w", err) + } + + stagingRepo, stagingRepoPath, err := s.initStagingRepo(ctx, repoProto, defaultBranch) + if err != nil { + return fmt.Errorf("setting up staging repo: %w", err) + } + + if err := s.runFilterRepo(ctx, repo, stagingRepo, blobsToRemove, redactions); err != nil { + return fmt.Errorf("rewriting repository history: %w", err) + } + + var stderr strings.Builder + if err := repo.ExecAndWait(ctx, + git.Command{ + Name: "fetch", + Flags: []git.Option{ + // Delete any refs that were removed by filter-repo. + git.Flag{Name: "--prune"}, + // The mirror refspec includes tags, don't fetch them again. + git.Flag{Name: "--no-tags"}, + // New history will be disjoint from the original repo. + git.Flag{Name: "--force"}, + // Ensure we don't partially apply the rewritten history. + // We don't expect file / directory conflicts as all refs + // in the staging repo are from the original. + git.Flag{Name: "--atomic"}, + // We're going to have a lot of these, don't waste + // time displaying them. + git.Flag{Name: "--no-show-forced-updates"}, + // No need for FETCH_HEAD when mirroring. + git.Flag{Name: "--no-write-fetch-head"}, + git.Flag{Name: "--quiet"}, + }, + Args: append( + []string{"file://" + stagingRepoPath}, + git.MirrorRefSpec, + ), + }, + git.WithRefTxHook(repo), + git.WithStderr(&stderr), + git.WithConfig(git.ConfigPair{ + Key: "advice.fetchShowForcedUpdates", Value: "false", + }), + ); err != nil { + return structerr.New("fetching rewritten history: %w", err).WithMetadata("stderr", &stderr) + } + + return nil +} + +// initStagingRepo creates a new bare repository to write the rewritten history into +// with default branch is set to match the source repo. +func (s *server) initStagingRepo(ctx context.Context, repo *gitalypb.Repository, defaultBranch git.ReferenceName) (*localrepo.Repo, string, error) { + stagingRepoProto, stagingRepoDir, err := tempdir.NewRepository(ctx, repo.GetStorageName(), s.logger, s.locator) + if err != nil { + return nil, "", err + } + + var stderr strings.Builder + cmd, err := s.gitCmdFactory.NewWithoutRepo(ctx, git.Command{ + Name: "init", + Flags: []git.Option{ + git.Flag{Name: "--bare"}, + git.Flag{Name: "--quiet"}, + }, + Args: []string{stagingRepoDir.Path()}, + }, git.WithStderr(&stderr)) + if err != nil { + return nil, "", fmt.Errorf("spawning git-init: %w", err) + } + + if err := cmd.Wait(); err != nil { + return nil, "", structerr.New("creating repository: %w", err).WithMetadata("stderr", &stderr) + } + + stagingRepo := s.localrepo(stagingRepoProto) + + // Ensure HEAD matches the source repository. In practice a mismatch doesn't cause problems, + // but out of an abundance of caution let's keep the two repos as similar as possible. + if err := stagingRepo.SetDefaultBranch(ctx, s.txManager, defaultBranch); err != nil { + return nil, "", fmt.Errorf("setting default branch: %w", err) + } + + return stagingRepo, stagingRepoDir.Path(), nil +} + +func (s *server) runFilterRepo( + ctx context.Context, + srcRepo, stagingRepo *localrepo.Repo, + blobsToRemove []string, + redactions [][]byte, ) error { // Place argument files in a tempdir so that cleanup is handled automatically. - tmpDir, err := tempdir.New(ctx, repo.GetStorageName(), s.logger, s.locator) + tmpDir, err := tempdir.New(ctx, srcRepo.GetStorageName(), s.logger, s.locator) if err != nil { return fmt.Errorf("create tempdir: %w", err) } @@ -129,11 +224,29 @@ func (s *server) runFilterRepo( flags = append(flags, git.Flag{Name: "--replace-text=" + replacePath}) } + srcPath, err := srcRepo.Path() + if err != nil { + return fmt.Errorf("getting source repo path: %w", err) + } + + stagingPath, err := stagingRepo.Path() + if err != nil { + return fmt.Errorf("getting target repo path: %w", err) + } + + // We must run this using 'NewWithoutRepo' because setting '--git-dir', + // as 'repo.ExecAndWait' does, will override the '--target' flag and + // write the updates directly to the original repository. var stdout, stderr strings.Builder - if err := repo.ExecAndWait(ctx, + cmd, err := s.gitCmdFactory.NewWithoutRepo(ctx, git.Command{ Name: "filter-repo", Flags: append([]git.Option{ + // Repository to write filtered history into. + git.Flag{Name: "--target=" + stagingPath}, + // Repository to read from. + git.Flag{Name: "--source=" + srcPath}, + // git.Flag{Name: "--refs=refs/*"}, // Prevent automatic cleanup tasks like deleting 'origin' and running git-gc(1). git.Flag{Name: "--partial"}, // Bypass check that repository is not a fresh clone. @@ -146,10 +259,15 @@ func (s *server) runFilterRepo( git.Flag{Name: "--quiet"}, }, flags...), }, - git.WithRefTxHook(repo), + git.WithDisabledHooks(), git.WithStdout(&stdout), git.WithStderr(&stderr), - ); err != nil { + ) + if err != nil { + return fmt.Errorf("spawning git-filter-repo: %w", err) + } + + if err := cmd.Wait(); err != nil { var exitErr *exec.ExitError if errors.As(err, &exitErr) { return structerr.New("git-filter-repo failed with exit code %d", exitErr.ExitCode()).WithMetadataItems( -- cgit v1.2.3