diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2022-05-05 12:27:56 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2022-05-05 12:42:11 +0300 |
commit | 7462739f477a3f55062192b7bcba1f97afd50b62 (patch) | |
tree | 1fc0f37c18975321406baaf232c8176ddbae2ecc | |
parent | 1f98d5a94c880e3e556ae3ace095f83e44f002fb (diff) |
Disable deletions by default in background verifier
Praefect's background verifier goes through the replica metadata
records and verifies the replicas still exist. If not, it deletes
their metadata records so no traffic is routed to them and they'll be
reconciled.
Due to renames not being atomic in Praefect, there is a race that can
cause the verifier to remove metadata records that belong to renamed
repositories. The rename process first renames the repository on the
disk and afterwards updates the metadata record to match. If the
verifier checks the replica after it's renamed on the disk but before
it's metadata record is renamed, the verifier would errorenously remove
the record. This can be a problem with Geo in particular as it renames
a lot of repositories, and soft deletions.
To prevent these errorenous removals but to still get data from production
by running the verifier, this commit disables the removal behavior. The
metrics and logs produced still reflect the invalidity of the metadata
record but the record itself is not deleted. The replica is instead marked
as successfully verified so the verifier can proceed with checking other
replicas. The invalid replicas will be found again after the verification
interval has passed.
There is a fix pending for making the renames atomic which will do away with
this problem. Until that lands, the deletion logic should not be default
enabled.
-rw-r--r-- | cmd/praefect/main.go | 1 | ||||
-rw-r--r-- | internal/praefect/verifier.go | 19 | ||||
-rw-r--r-- | internal/praefect/verifier_test.go | 75 |
3 files changed, 82 insertions, 13 deletions
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go index ab28bdea0..52c30f923 100644 --- a/cmd/praefect/main.go +++ b/cmd/praefect/main.go @@ -358,6 +358,7 @@ func run( nodeSet.Connections(), hm, conf.BackgroundVerification.VerificationInterval, + false, ) promreg.MustRegister(verifier) diff --git a/internal/praefect/verifier.go b/internal/praefect/verifier.go index cf23ae7f4..f1e789219 100644 --- a/internal/praefect/verifier.go +++ b/internal/praefect/verifier.go @@ -28,6 +28,12 @@ type MetadataVerifier struct { leaseDuration time.Duration healthChecker HealthChecker verificationInterval time.Duration + // If performDeletions is set, the worker deletes invalid metadata records. If it is not + // set, the worker marks the replica as successfully verified in the database but produces + // logs and metrics for the invalid replica. Marking the verification successful in the database + // allows the worker to proceed. The invalid replicas will be found again after the configured + // verificationInterval has passed. + performDeletions bool dequeuedJobsTotal *prometheus.CounterVec completedJobsTotal *prometheus.CounterVec @@ -47,6 +53,7 @@ func NewMetadataVerifier( conns Connections, healthChecker HealthChecker, verificationInterval time.Duration, + performDeletions bool, ) *MetadataVerifier { v := &MetadataVerifier{ log: log, @@ -56,6 +63,7 @@ func NewMetadataVerifier( leaseDuration: 30 * time.Second, healthChecker: healthChecker, verificationInterval: verificationInterval, + performDeletions: performDeletions, dequeuedJobsTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "gitaly_praefect_verification_jobs_dequeued_total", @@ -298,7 +306,10 @@ func (v *MetadataVerifier) updateMetadata(ctx context.Context, results []verific } if len(logRecords) > 0 { - v.log.WithField("replicas", logRecords).Info("removing metadata records of non-existent replicas") + v.log.WithFields(logrus.Fields{ + "perform_deletions": v.performDeletions, + "replicas": logRecords, + }).Info("removing metadata records of non-existent replicas") } _, err := v.db.ExecContext(ctx, ` @@ -331,8 +342,10 @@ DELETE FROM storage_repositories USING results WHERE storage_repositories.repository_id = results.repository_id AND storage_repositories.storage = results.storage -AND successfully_verified AND NOT exists - `, repositoryIDs, storages, successfullyVerifieds, exists) +AND successfully_verified +AND NOT exists +AND $5 + `, repositoryIDs, storages, successfullyVerifieds, exists, v.performDeletions) if err != nil { return fmt.Errorf("query: %w", err) } diff --git a/internal/praefect/verifier_test.go b/internal/praefect/verifier_test.go index 9c2ffda9b..674089166 100644 --- a/internal/praefect/verifier_test.go +++ b/internal/praefect/verifier_test.go @@ -79,14 +79,15 @@ func TestVerifier(t *testing.T) { } for _, tc := range []struct { - desc string - erroringGitalys map[string]bool - replicas replicas - healthyStorages StaticHealthChecker - batchSize int - steps []step - dequeuedJobsTotal map[string]map[string]int - completedJobsTotal map[string]map[string]map[string]int + desc string + dontPerformDeletions bool + erroringGitalys map[string]bool + replicas replicas + healthyStorages StaticHealthChecker + batchSize int + steps []step + dequeuedJobsTotal map[string]map[string]int + completedJobsTotal map[string]map[string]map[string]int }{ { desc: "all replicas exist", @@ -279,6 +280,60 @@ func TestVerifier(t *testing.T) { }, }, { + desc: "metadata is not deleted if deletions are disabled", + dontPerformDeletions: true, + batchSize: 2, + replicas: replicas{ + "virtual-storage": { + "repository-1": { + gitaly1: {exists: true, lastVerified: neverVerified}, + gitaly2: {exists: false, lastVerified: neverVerified}, + gitaly3: {exists: false, lastVerified: pendingVerification}, + }, + }, + }, + steps: []step{ + { + expectedRemovals: logRecord{ + "virtual-storage": { + "repository-1": {gitaly2}, + }, + }, + expectedReplicas: map[string]map[string][]string{ + "virtual-storage": { + "repository-1": {gitaly1, gitaly2, gitaly3}, + }, + }, + }, + { + expectedRemovals: logRecord{ + "virtual-storage": { + "repository-1": {gitaly3}, + }, + }, + expectedReplicas: map[string]map[string][]string{ + "virtual-storage": { + "repository-1": {gitaly1, gitaly2, gitaly3}, + }, + }, + }, + }, + dequeuedJobsTotal: map[string]map[string]int{ + "virtual-storage": { + gitaly1: 1, + gitaly2: 1, + gitaly3: 1, + }, + }, + completedJobsTotal: map[string]map[string]map[string]int{ + "virtual-storage": { + gitaly1: {"valid": 1}, + gitaly2: {"invalid": 1}, + gitaly3: {"invalid": 1}, + }, + }, + }, + { desc: "verification time is updated when repository exists", replicas: replicas{ "virtual-storage": { @@ -581,7 +636,7 @@ func TestVerifier(t *testing.T) { healthyStorages = tc.healthyStorages } - verifier := NewMetadataVerifier(logger, db, conns, healthyStorages, 24*7*time.Hour) + verifier := NewMetadataVerifier(logger, db, conns, healthyStorages, 24*7*time.Hour, !tc.dontPerformDeletions) if tc.batchSize > 0 { verifier.batchSize = tc.batchSize } @@ -744,7 +799,7 @@ func TestVerifier_runExpiredLeaseReleaser(t *testing.T) { defer tx.Rollback(t) logger, hook := test.NewNullLogger() - verifier := NewMetadataVerifier(logrus.NewEntry(logger), tx, nil, nil, 0) + verifier := NewMetadataVerifier(logrus.NewEntry(logger), tx, nil, nil, 0, true) // set batch size lower than the number of locked leases to ensure the batching works verifier.batchSize = 2 |