diff options
author | Pavlo Strokov <pstrokov@gitlab.com> | 2021-07-15 18:47:39 +0300 |
---|---|---|
committer | Pavlo Strokov <pstrokov@gitlab.com> | 2021-07-21 14:10:44 +0300 |
commit | 943fdd846af8ccb755b104dceb65bd8926f960b4 (patch) | |
tree | 828a4cf24f571655d23eed07aea565053e70032c | |
parent | 7b95c34000d871ea6bed65f3c908043138aa11be (diff) |
Remove orphan repository replicas
When the repository removal operation fails on the secondaries for
the maximum number of job processing attempts, the repositories
remain on disk unused. This happens because the entry is removed
from the repositories table when the removal operation runs for the primary
storage. That is why we should extend our reconciler to handle
removal of those orphan repositories stored on the secondary gitaly
nodes. Because of the delete_replica_unique_index, it is possible to
schedule only a single removal operation per reconciliation run.
But eventually all repositories will be removed. In order to return
a single storage to a proper state faster, we use ordering for the
scheduling; it also simplifies testing.
Closes: https://gitlab.com/gitlab-org/gitaly/-/issues/3480
Changelog: fixed
-rw-r--r-- | internal/praefect/reconciler/reconciler.go | 27 | ||||
-rw-r--r-- | internal/praefect/reconciler/reconciler_test.go | 109 |
2 files changed, 130 insertions, 6 deletions
diff --git a/internal/praefect/reconciler/reconciler.go b/internal/praefect/reconciler/reconciler.go index 6b2a7a594..308a7aff1 100644 --- a/internal/praefect/reconciler/reconciler.go +++ b/internal/praefect/reconciler/reconciler.go @@ -120,7 +120,7 @@ func (r *Reconciler) reconcile(ctx context.Context) error { for virtualStorage, healthyStorages := range r.hc.HealthyNodes() { if len(healthyStorages) < 2 { - // minimum two healthy storages within a virtual stoage needed for valid + // minimum two healthy storages within a virtual storage needed for valid // replication source and target r.log.WithField("virtual_storage", virtualStorage).Info("reconciliation skipped for virtual storage due to not having enough healthy storages") continue @@ -190,6 +190,31 @@ delete_jobs AS ( AND job->>'relative_path' = relative_path AND job->>'change' = 'delete_replica' ) + UNION ALL + -- Schedules repository removal operations for the storages where repository exists, but shouldn't. + -- That can happen when repository removal replication job failed on the secondaries and + -- repository remains on storage unused (because it doesn't exist in the 'repositories' table anymore). + SELECT * FROM ( + SELECT DISTINCT ON (virtual_storage, relative_path) + virtual_storage, + relative_path, + storage + FROM storage_repositories + JOIN healthy_storages USING (virtual_storage, storage) + LEFT JOIN repositories USING (virtual_storage, relative_path) + WHERE repositories.virtual_storage IS NULL AND + NOT EXISTS ( + -- Ensure there are no scheduled 'delete_replica' type jobs for the repository. 
+ SELECT FROM replication_queue + WHERE state NOT IN ('completed', 'cancelled', 'dead') + AND job->>'virtual_storage' = virtual_storage + AND job->>'relative_path' = relative_path + AND job->>'change' = 'delete_replica' + ) + -- this sorting is not required, but it simplifies testing and doesn't influence + -- performance as a noticeable degradation + ORDER BY virtual_storage, relative_path, storage + ) orphans ), update_jobs AS ( diff --git a/internal/praefect/reconciler/reconciler_test.go b/internal/praefect/reconciler/reconciler_test.go index 7f034b60b..9ba2d1c79 100644 --- a/internal/praefect/reconciler/reconciler_test.go +++ b/internal/praefect/reconciler/reconciler_test.go @@ -96,11 +96,12 @@ func TestReconciler(t *testing.T) { } for _, tc := range []struct { - desc string - healthyStorages storages - repositories repositories - existingJobs existingJobs - reconciliationJobs jobs + desc string + healthyStorages storages + repositories repositories + existingJobs existingJobs + deletedRepositories map[string][]string + reconciliationJobs jobs }{ { desc: "no repositories", @@ -984,6 +985,94 @@ func TestReconciler(t *testing.T) { }, }, }, + { + desc: "orphan repositories replicas should be scheduled for deletion", + healthyStorages: configuredStoragesWithout("storage-3"), + repositories: repositories{ + "virtual-storage-1": { + "relative-path-1": { + "storage-1": {generation: 1}, + "storage-2": {generation: 1}, + }, + "relative-path-2": { + "storage-1": {generation: 1}, + "storage-2": {generation: 1}, + }, + "relative-path-3": { + "storage-1": {generation: 1}, + "storage-2": {generation: 1}, + }, + }, + }, + deletedRepositories: map[string][]string{"virtual-storage-1": {"relative-path-2", "relative-path-3"}}, + reconciliationJobs: jobs{ + { + Change: datastore.DeleteReplica, + VirtualStorage: "virtual-storage-1", + RelativePath: "relative-path-2", + TargetNodeStorage: "storage-1", + }, + { + Change: datastore.DeleteReplica, + VirtualStorage: "virtual-storage-1", 
+ RelativePath: "relative-path-3", + TargetNodeStorage: "storage-1", + }, + }, + }, + { + desc: "orphan repositories replicas should be scheduled for deletion (only for healthy nodes)", + healthyStorages: configuredStoragesWithout("storage-1"), + repositories: repositories{ + "virtual-storage-1": { + "relative-path-1": { + "storage-1": {generation: 1}, + "storage-2": {generation: 1}, + "storage-3": {generation: 1}, + }, + "relative-path-2": { + "storage-1": {generation: 1}, + "storage-2": {generation: 1}, + "storage-3": {generation: 1}, + }, + }, + }, + deletedRepositories: map[string][]string{"virtual-storage-1": {"relative-path-2"}}, + reconciliationJobs: jobs{ + { + Change: datastore.DeleteReplica, + VirtualStorage: "virtual-storage-1", + RelativePath: "relative-path-2", + TargetNodeStorage: "storage-2", + }, + }, + }, + { + desc: "orphan repositories replicas should not be scheduled if replication event already exists", + healthyStorages: configuredStorages, + repositories: repositories{ + "virtual-storage-1": { + "relative-path-1": { + "storage-1": {generation: 1}, + "storage-2": {generation: 1}, + "storage-3": {generation: 1}, + }, + }, + }, + deletedRepositories: map[string][]string{"virtual-storage-1": {"relative-path-1"}}, + existingJobs: []datastore.ReplicationEvent{ + { + State: datastore.JobStateReady, + Job: datastore.ReplicationJob{ + Change: datastore.DeleteReplica, + VirtualStorage: "virtual-storage-1", + TargetNodeStorage: "storage-1", + RelativePath: "relative-path-1", + }, + }, + }, + reconciliationJobs: nil, + }, } { t.Run(tc.desc, func(t *testing.T) { ctx, cancel := testhelper.Context() @@ -1055,6 +1144,16 @@ func TestReconciler(t *testing.T) { require.True(t, resetted) } + for vs, repos := range tc.deletedRepositories { + for _, repo := range repos { + _, err := db.Exec( + "DELETE FROM repositories WHERE virtual_storage = $1 AND relative_path = $2", + vs, repo, + ) + require.NoError(t, err) + } + } + // run reconcile in two concurrent 
transactions to ensure everything works // as expected with multiple Praefect's reconciling at the same time firstTx, err := db.Begin() |