Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Steinhardt <psteinhardt@gitlab.com>2021-07-07 11:31:05 +0300
committerPatrick Steinhardt <psteinhardt@gitlab.com>2021-07-16 10:15:01 +0300
commit22e386d7cdecbb8023918918f7224a1791e975c6 (patch)
tree578559e5fc9bd1b5ef8ed53501067da372a19f00
parentaa09579c973f8565a473fee8fc05c896ba8b1a9d (diff)
coordinator: Fix repo creation/removal race for up-to-date secondaries
When creating repositories, Rails will in some cases first schedule a call to `RemoveRepository()` to make sure that there's no old repository obstructing the path we are about to create the repository at. Given that `RemoveRepository()` isn't transactional, this action will be replicated to secondaries asynchronously. With our current design, these replication jobs will not cause a bump of the repository generation given that we may not even have an entry for the repository. Combined with the fact that we do not take outstanding replication jobs into account when computing whether a node is up-to-date or not, this means that secondaries will be considered up-to-date even though they have deletions pending. This results in a race between the replication jobs which are about to delete the target repository and the subsequent call to `CreateRepository()` et al. If this race is lost (that is, the deletion wasn't scheduled before `CreateRepository()` gets executed, which is almost always), then we will end up with the repository getting deleted during or after its creation. This race needs to be fixed via two changes: 1. We need to take outstanding modifying replication jobs into account when computing whether a node is up to date or not. Otherwise, we may be in the middle of processing a transactional RPC when a replication job kicks in and modifies repository state while we're operating on it. 2. With (1) implemented, we need to make `RemoveRepository()` transactional, otherwise it would always cause secondaries to be considered out-of-date and repository creation wouldn't use transactions in most cases. This commit implements (2) and enables transactional behaviour for `RemoveRepository()` which has been implemented in the preceding commit. Note that this requires a modification of `TestRemoveRepository()`: previously, we didn't set up the backchannel handshaker and thus couldn't accept votes from Gitaly nodes. Now that we have transactions enabled for this RPC, we must be able to handle votes or otherwise the RPC is slated for failure. We thus manually need to set up the node manager to have a backchannel handshaker. Changelog: fixed
-rw-r--r--internal/metadata/featureflag/feature_flags.go3
-rw-r--r--internal/praefect/coordinator.go6
-rw-r--r--internal/praefect/server_test.go21
3 files changed, 25 insertions, 5 deletions
diff --git a/internal/metadata/featureflag/feature_flags.go b/internal/metadata/featureflag/feature_flags.go
index f42cab447..f08bd5ea2 100644
--- a/internal/metadata/featureflag/feature_flags.go
+++ b/internal/metadata/featureflag/feature_flags.go
@@ -30,6 +30,8 @@ var (
// via a direct call instead of doing an RPC call to its own server. This fixes calls of
// `ReplicateRepository()` in case it's invoked via Praefect with transactions enabled.
ReplicateRepositoryDirectFetch = FeatureFlag{Name: "replicate_repository_direct_fetch", OnByDefault: false}
+ // TxRemoveRepository enables transactionsal voting for the RemoveRepository RPC.
+ TxRemoveRepository = FeatureFlag{Name: "tx_remove_repository", OnByDefault: false}
)
// All includes all feature flags.
@@ -42,4 +44,5 @@ var All = []FeatureFlag{
LFSPointersPipeline,
CreateRepositoryFromBundleAtomicFetch,
ReplicateRepositoryDirectFetch,
+ TxRemoveRepository,
}
diff --git a/internal/praefect/coordinator.go b/internal/praefect/coordinator.go
index 8e02f9d69..700026915 100644
--- a/internal/praefect/coordinator.go
+++ b/internal/praefect/coordinator.go
@@ -88,8 +88,9 @@ var transactionRPCs = map[string]transactionsCondition{
"/gitaly.WikiService/WikiUpdatePage": transactionsEnabled,
"/gitaly.WikiService/WikiWritePage": transactionsEnabled,
- "/gitaly.RepositoryService/SetConfig": transactionsFlag(featureflag.TxConfig),
- "/gitaly.RepositoryService/DeleteConfig": transactionsFlag(featureflag.TxConfig),
+ "/gitaly.RepositoryService/SetConfig": transactionsFlag(featureflag.TxConfig),
+ "/gitaly.RepositoryService/DeleteConfig": transactionsFlag(featureflag.TxConfig),
+ "/gitaly.RepositoryService/RemoveRepository": transactionsFlag(featureflag.TxRemoveRepository),
// The following RPCs currently aren't transactional, but we may consider making them
// transactional in the future if the need arises.
@@ -99,7 +100,6 @@ var transactionRPCs = map[string]transactionsCondition{
"/gitaly.ObjectPoolService/LinkRepositoryToObjectPool": transactionsDisabled,
"/gitaly.ObjectPoolService/ReduplicateRepository": transactionsDisabled,
"/gitaly.ObjectPoolService/UnlinkRepositoryFromObjectPool": transactionsDisabled,
- "/gitaly.RepositoryService/RemoveRepository": transactionsDisabled,
"/gitaly.RepositoryService/RenameRepository": transactionsDisabled,
// The following list of RPCs are considered idempotent RPCs: while they write into the
diff --git a/internal/praefect/server_test.go b/internal/praefect/server_test.go
index 216d62d04..cba0fef0c 100644
--- a/internal/praefect/server_test.go
+++ b/internal/praefect/server_test.go
@@ -493,7 +493,24 @@ func TestRemoveRepository(t *testing.T) {
return queue.Acknowledge(ctx, state, ids)
})
- cc, _, cleanup := runPraefectServer(t, praefectCfg, buildOptions{withQueue: queueInterceptor})
+ repoStore := defaultRepoStore(praefectCfg)
+ txMgr := defaultTxMgr(praefectCfg)
+ nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), praefectCfg, nil,
+ repoStore, promtest.NewMockHistogramVec(), protoregistry.GitalyProtoPreregistered,
+ nil, backchannel.NewClientHandshaker(
+ testhelper.DiscardTestEntry(t),
+ NewBackchannelServerFactory(testhelper.DiscardTestEntry(t), transaction.NewServer(txMgr)),
+ ),
+ )
+ require.NoError(t, err)
+ nodeMgr.Start(0, time.Hour)
+
+ cc, _, cleanup := runPraefectServer(t, praefectCfg, buildOptions{
+ withQueue: queueInterceptor,
+ withRepoStore: repoStore,
+ withNodeMgr: nodeMgr,
+ withTxMgr: txMgr,
+ })
defer cleanup()
ctx, cancel := testhelper.Context()
@@ -502,7 +519,7 @@ func TestRemoveRepository(t *testing.T) {
virtualRepo := proto.Clone(repos[0][0]).(*gitalypb.Repository)
virtualRepo.StorageName = praefectCfg.VirtualStorages[0].Name
- _, err := gitalypb.NewRepositoryServiceClient(cc).RemoveRepository(ctx, &gitalypb.RemoveRepositoryRequest{
+ _, err = gitalypb.NewRepositoryServiceClient(cc).RemoveRepository(ctx, &gitalypb.RemoveRepositoryRequest{
Repository: virtualRepo,
})
require.NoError(t, err)