diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-07-07 11:31:05 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2021-07-16 10:15:01 +0300 |
commit | 22e386d7cdecbb8023918918f7224a1791e975c6 (patch) | |
tree | 578559e5fc9bd1b5ef8ed53501067da372a19f00 | |
parent | aa09579c973f8565a473fee8fc05c896ba8b1a9d (diff) |
coordinator: Fix repo creation/removal race for up-to-date secondaries
When creating repositories, Rails will in some cases first schedule a
call to `RemoveRepository()` to make sure that there's no old repository
obstructing the path we are about to create the repository at. Given
that `RemoveRepository()` isn't transactional, this action will be
replicated to secondaries asynchronously. With our current design, these
replication jobs will not cause a bump of the repository generation
given that we may not even have an entry for the repository. Combined
with the fact that we do not take outstanding replication jobs into
account when computing whether a node is up-to-date or not, this means
that secondaries will be considered up-to-date even though they have
deletions pending. This results in a race between the replication jobs
which are about to delete the target repository and the subsequent call
to `CreateRepository()` et al. If this race is lost (that is, the
deletion wasn't scheduled before `CreateRepository()` gets executed,
which is almost always), then we will end up with the repository getting
deleted during or after its creation.
This race needs to be fixed via two changes:
1. We need to take outstanding modifying replication jobs into
account when computing whether a node is up to date or not.
Otherwise, we may be in the middle of processing a transactional
RPC when a replication job kicks in and modifies repository state
while we're operating on it.
2. With (1) implemented, we need to make `RemoveRepository()`
transactional, otherwise it would always cause secondaries to be
considered out-of-date and repository creation wouldn't use
transactions in most cases.
This commit implements (2) and enables transactional behaviour for
`RemoveRepository()` which has been implemented in the preceding commit.
Note that this requires a modification of `TestRemoveRepository()`:
previously, we didn't set up the backchannel handshaker and thus
couldn't accept votes from Gitaly nodes. Now that we have transactions
enabled for this RPC, we must be able to handle votes or otherwise the
RPC is slated for failure. We thus manually need to set up the node
manager to have a backchannel handshaker.
Changelog: fixed
-rw-r--r-- | internal/metadata/featureflag/feature_flags.go | 3 | ||||
-rw-r--r-- | internal/praefect/coordinator.go | 6 | ||||
-rw-r--r-- | internal/praefect/server_test.go | 21 |
3 files changed, 25 insertions, 5 deletions
diff --git a/internal/metadata/featureflag/feature_flags.go b/internal/metadata/featureflag/feature_flags.go index f42cab447..f08bd5ea2 100644 --- a/internal/metadata/featureflag/feature_flags.go +++ b/internal/metadata/featureflag/feature_flags.go @@ -30,6 +30,8 @@ var ( // via a direct call instead of doing an RPC call to its own server. This fixes calls of // `ReplicateRepository()` in case it's invoked via Praefect with transactions enabled. ReplicateRepositoryDirectFetch = FeatureFlag{Name: "replicate_repository_direct_fetch", OnByDefault: false} + // TxRemoveRepository enables transactionsal voting for the RemoveRepository RPC. + TxRemoveRepository = FeatureFlag{Name: "tx_remove_repository", OnByDefault: false} ) // All includes all feature flags. @@ -42,4 +44,5 @@ var All = []FeatureFlag{ LFSPointersPipeline, CreateRepositoryFromBundleAtomicFetch, ReplicateRepositoryDirectFetch, + TxRemoveRepository, } diff --git a/internal/praefect/coordinator.go b/internal/praefect/coordinator.go index 8e02f9d69..700026915 100644 --- a/internal/praefect/coordinator.go +++ b/internal/praefect/coordinator.go @@ -88,8 +88,9 @@ var transactionRPCs = map[string]transactionsCondition{ "/gitaly.WikiService/WikiUpdatePage": transactionsEnabled, "/gitaly.WikiService/WikiWritePage": transactionsEnabled, - "/gitaly.RepositoryService/SetConfig": transactionsFlag(featureflag.TxConfig), - "/gitaly.RepositoryService/DeleteConfig": transactionsFlag(featureflag.TxConfig), + "/gitaly.RepositoryService/SetConfig": transactionsFlag(featureflag.TxConfig), + "/gitaly.RepositoryService/DeleteConfig": transactionsFlag(featureflag.TxConfig), + "/gitaly.RepositoryService/RemoveRepository": transactionsFlag(featureflag.TxRemoveRepository), // The following RPCs currently aren't transactional, but we may consider making them // transactional in the future if the need arises. @@ -99,7 +100,6 @@ var transactionRPCs = map[string]transactionsCondition{ "/gitaly.ObjectPoolService/LinkRepositoryToObjectPool": transactionsDisabled, "/gitaly.ObjectPoolService/ReduplicateRepository": transactionsDisabled, "/gitaly.ObjectPoolService/UnlinkRepositoryFromObjectPool": transactionsDisabled, - "/gitaly.RepositoryService/RemoveRepository": transactionsDisabled, "/gitaly.RepositoryService/RenameRepository": transactionsDisabled, // The following list of RPCs are considered idempotent RPCs: while they write into the diff --git a/internal/praefect/server_test.go b/internal/praefect/server_test.go index 216d62d04..cba0fef0c 100644 --- a/internal/praefect/server_test.go +++ b/internal/praefect/server_test.go @@ -493,7 +493,24 @@ func TestRemoveRepository(t *testing.T) { return queue.Acknowledge(ctx, state, ids) }) - cc, _, cleanup := runPraefectServer(t, praefectCfg, buildOptions{withQueue: queueInterceptor}) + repoStore := defaultRepoStore(praefectCfg) + txMgr := defaultTxMgr(praefectCfg) + nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), praefectCfg, nil, + repoStore, promtest.NewMockHistogramVec(), protoregistry.GitalyProtoPreregistered, + nil, backchannel.NewClientHandshaker( + testhelper.DiscardTestEntry(t), + NewBackchannelServerFactory(testhelper.DiscardTestEntry(t), transaction.NewServer(txMgr)), + ), + ) + require.NoError(t, err) + nodeMgr.Start(0, time.Hour) + + cc, _, cleanup := runPraefectServer(t, praefectCfg, buildOptions{ + withQueue: queueInterceptor, + withRepoStore: repoStore, + withNodeMgr: nodeMgr, + withTxMgr: txMgr, + }) defer cleanup() ctx, cancel := testhelper.Context() @@ -502,7 +519,7 @@ func TestRemoveRepository(t *testing.T) { virtualRepo := proto.Clone(repos[0][0]).(*gitalypb.Repository) virtualRepo.StorageName = praefectCfg.VirtualStorages[0].Name - _, err := gitalypb.NewRepositoryServiceClient(cc).RemoveRepository(ctx, &gitalypb.RemoveRepositoryRequest{ + _, err = gitalypb.NewRepositoryServiceClient(cc).RemoveRepository(ctx, &gitalypb.RemoveRepositoryRequest{ Repository: virtualRepo, }) require.NoError(t, err) |