Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavlo Strokov <pstrokov@gitlab.com>2021-07-15 18:47:39 +0300
committerPavlo Strokov <pstrokov@gitlab.com>2021-07-21 14:10:44 +0300
commit943fdd846af8ccb755b104dceb65bd8926f960b4 (patch)
tree828a4cf24f571655d23eed07aea565053e70032c
parent7b95c34000d871ea6bed65f3c908043138aa11be (diff)
Remove orphan repository replicas
When repository removal operation fails for secondaries for max attempts to process the job the repositories remain on the disk with no use. It happens because the entry is removed from repositories table when removal operation runs for the primary storage. That is why we should extend our reconciler to handle removal of those orphan repositories stored on the secondary gitaly nodes. Because of the delete_replica_unique_index it is possible to schedule only a single removal operation for one reconciliation run. But eventually all repositories will be removed. In order to return a single storage into proper state faster we use ordering for the scheduling, it also simplifies testing. Closes: https://gitlab.com/gitlab-org/gitaly/-/issues/3480 Changelog: fixed
-rw-r--r--internal/praefect/reconciler/reconciler.go27
-rw-r--r--internal/praefect/reconciler/reconciler_test.go109
2 files changed, 130 insertions, 6 deletions
diff --git a/internal/praefect/reconciler/reconciler.go b/internal/praefect/reconciler/reconciler.go
index 6b2a7a594..308a7aff1 100644
--- a/internal/praefect/reconciler/reconciler.go
+++ b/internal/praefect/reconciler/reconciler.go
@@ -120,7 +120,7 @@ func (r *Reconciler) reconcile(ctx context.Context) error {
for virtualStorage, healthyStorages := range r.hc.HealthyNodes() {
if len(healthyStorages) < 2 {
- // minimum two healthy storages within a virtual stoage needed for valid
+ // minimum two healthy storages within a virtual storage needed for valid
// replication source and target
r.log.WithField("virtual_storage", virtualStorage).Info("reconciliation skipped for virtual storage due to not having enough healthy storages")
continue
@@ -190,6 +190,31 @@ delete_jobs AS (
AND job->>'relative_path' = relative_path
AND job->>'change' = 'delete_replica'
)
+ UNION ALL
+ -- Schedules repository removal operations for the storages where repository exists, but shouldn't.
+ -- That can happen when repository removal replication job failed on the secondaries and
+ -- repository remains on storage unused (because it doesn't exist in the 'repositories' table anymore).
+ SELECT * FROM (
+ SELECT DISTINCT ON (virtual_storage, relative_path)
+ virtual_storage,
+ relative_path,
+ storage
+ FROM storage_repositories
+ JOIN healthy_storages USING (virtual_storage, storage)
+ LEFT JOIN repositories USING (virtual_storage, relative_path)
+ WHERE repositories.virtual_storage IS NULL AND
+ NOT EXISTS (
+ -- Ensure there are no scheduled 'delete_replica' type jobs for the repository.
+ SELECT FROM replication_queue
+ WHERE state NOT IN ('completed', 'cancelled', 'dead')
+ AND job->>'virtual_storage' = virtual_storage
+ AND job->>'relative_path' = relative_path
+ AND job->>'change' = 'delete_replica'
+ )
+ -- this sorting is not required, but it simplifies testing and doesn't influence
+ -- performance as a noticeable degradation
+ ORDER BY virtual_storage, relative_path, storage
+ ) orphans
),
update_jobs AS (
diff --git a/internal/praefect/reconciler/reconciler_test.go b/internal/praefect/reconciler/reconciler_test.go
index 7f034b60b..9ba2d1c79 100644
--- a/internal/praefect/reconciler/reconciler_test.go
+++ b/internal/praefect/reconciler/reconciler_test.go
@@ -96,11 +96,12 @@ func TestReconciler(t *testing.T) {
}
for _, tc := range []struct {
- desc string
- healthyStorages storages
- repositories repositories
- existingJobs existingJobs
- reconciliationJobs jobs
+ desc string
+ healthyStorages storages
+ repositories repositories
+ existingJobs existingJobs
+ deletedRepositories map[string][]string
+ reconciliationJobs jobs
}{
{
desc: "no repositories",
@@ -984,6 +985,94 @@ func TestReconciler(t *testing.T) {
},
},
},
+ {
+ desc: "orphan repositories replicas should be scheduled for deletion",
+ healthyStorages: configuredStoragesWithout("storage-3"),
+ repositories: repositories{
+ "virtual-storage-1": {
+ "relative-path-1": {
+ "storage-1": {generation: 1},
+ "storage-2": {generation: 1},
+ },
+ "relative-path-2": {
+ "storage-1": {generation: 1},
+ "storage-2": {generation: 1},
+ },
+ "relative-path-3": {
+ "storage-1": {generation: 1},
+ "storage-2": {generation: 1},
+ },
+ },
+ },
+ deletedRepositories: map[string][]string{"virtual-storage-1": {"relative-path-2", "relative-path-3"}},
+ reconciliationJobs: jobs{
+ {
+ Change: datastore.DeleteReplica,
+ VirtualStorage: "virtual-storage-1",
+ RelativePath: "relative-path-2",
+ TargetNodeStorage: "storage-1",
+ },
+ {
+ Change: datastore.DeleteReplica,
+ VirtualStorage: "virtual-storage-1",
+ RelativePath: "relative-path-3",
+ TargetNodeStorage: "storage-1",
+ },
+ },
+ },
+ {
+ desc: "orphan repositories replicas should be scheduled for deletion (only for healthy nodes)",
+ healthyStorages: configuredStoragesWithout("storage-1"),
+ repositories: repositories{
+ "virtual-storage-1": {
+ "relative-path-1": {
+ "storage-1": {generation: 1},
+ "storage-2": {generation: 1},
+ "storage-3": {generation: 1},
+ },
+ "relative-path-2": {
+ "storage-1": {generation: 1},
+ "storage-2": {generation: 1},
+ "storage-3": {generation: 1},
+ },
+ },
+ },
+ deletedRepositories: map[string][]string{"virtual-storage-1": {"relative-path-2"}},
+ reconciliationJobs: jobs{
+ {
+ Change: datastore.DeleteReplica,
+ VirtualStorage: "virtual-storage-1",
+ RelativePath: "relative-path-2",
+ TargetNodeStorage: "storage-2",
+ },
+ },
+ },
+ {
+ desc: "orphan repositories replicas should not be scheduled if replication event already exists",
+ healthyStorages: configuredStorages,
+ repositories: repositories{
+ "virtual-storage-1": {
+ "relative-path-1": {
+ "storage-1": {generation: 1},
+ "storage-2": {generation: 1},
+ "storage-3": {generation: 1},
+ },
+ },
+ },
+ deletedRepositories: map[string][]string{"virtual-storage-1": {"relative-path-1"}},
+ existingJobs: []datastore.ReplicationEvent{
+ {
+ State: datastore.JobStateReady,
+ Job: datastore.ReplicationJob{
+ Change: datastore.DeleteReplica,
+ VirtualStorage: "virtual-storage-1",
+ TargetNodeStorage: "storage-1",
+ RelativePath: "relative-path-1",
+ },
+ },
+ },
+ reconciliationJobs: nil,
+ },
} {
t.Run(tc.desc, func(t *testing.T) {
ctx, cancel := testhelper.Context()
@@ -1055,6 +1144,16 @@ func TestReconciler(t *testing.T) {
require.True(t, resetted)
}
+ for vs, repos := range tc.deletedRepositories {
+ for _, repo := range repos {
+ _, err := db.Exec(
+ "DELETE FROM repositories WHERE virtual_storage = $1 AND relative_path = $2",
+ vs, repo,
+ )
+ require.NoError(t, err)
+ }
+ }
+
// run reconcile in two concurrent transactions to ensure everything works
// as expected with multiple Praefect's reconciling at the same time
firstTx, err := db.Begin()