Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSami Hiltunen <shiltunen@gitlab.com>2022-05-05 12:27:56 +0300
committerSami Hiltunen <shiltunen@gitlab.com>2022-05-05 12:42:11 +0300
commit7462739f477a3f55062192b7bcba1f97afd50b62 (patch)
tree1fc0f37c18975321406baaf232c8176ddbae2ecc
parent1f98d5a94c880e3e556ae3ace095f83e44f002fb (diff)
Disable deletions by default in background verifier
Praefect's background verifier goes through the replica metadata records and verifies the replicas still exist. If not, it deletes their metadata records so no traffic is routed to them and they'll be reconciled. Due to renames not being atomic in Praefect, there is a race that can cause the verifier to remove metadata records that belong to renamed repositories. The rename process first renames the repository on the disk and afterwards updates the metadata record to match. If the verifier checks the replica after it's renamed on the disk but before it's metadata record is renamed, the verifier would errorenously remove the record. This can be a problem with Geo in particular as it renames a lot of repositories, and soft deletions. To prevent these errorenous removals but to still get data from production by running the verifier, this commit disables the removal behavior. The metrics and logs produced still reflect the invalidity of the metadata record but the record itself is not deleted. The replica is instead marked as successfully verified so the verifier can proceed with checking other replicas. The invalid replicas will be found again after the verification interval has passed. There is a fix pending for making the renames atomic which will do away with this problem. Until that lands, the deletion logic should not be default enabled.
-rw-r--r--cmd/praefect/main.go1
-rw-r--r--internal/praefect/verifier.go19
-rw-r--r--internal/praefect/verifier_test.go75
3 files changed, 82 insertions, 13 deletions
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go
index ab28bdea0..52c30f923 100644
--- a/cmd/praefect/main.go
+++ b/cmd/praefect/main.go
@@ -358,6 +358,7 @@ func run(
nodeSet.Connections(),
hm,
conf.BackgroundVerification.VerificationInterval,
+ false,
)
promreg.MustRegister(verifier)
diff --git a/internal/praefect/verifier.go b/internal/praefect/verifier.go
index cf23ae7f4..f1e789219 100644
--- a/internal/praefect/verifier.go
+++ b/internal/praefect/verifier.go
@@ -28,6 +28,12 @@ type MetadataVerifier struct {
leaseDuration time.Duration
healthChecker HealthChecker
verificationInterval time.Duration
+ // If performDeletions is set, the worker deletes invalid metadata records. If it is not
+ // set, the worker marks the replica as successfully verified in the database but produces
+ // logs and metrics for the invalid replica. Marking the verification successful in the database
+ // allows the worker to proceed. The invalid replicas will be found again after the configured
+ // verificationInterval has passed.
+ performDeletions bool
dequeuedJobsTotal *prometheus.CounterVec
completedJobsTotal *prometheus.CounterVec
@@ -47,6 +53,7 @@ func NewMetadataVerifier(
conns Connections,
healthChecker HealthChecker,
verificationInterval time.Duration,
+ performDeletions bool,
) *MetadataVerifier {
v := &MetadataVerifier{
log: log,
@@ -56,6 +63,7 @@ func NewMetadataVerifier(
leaseDuration: 30 * time.Second,
healthChecker: healthChecker,
verificationInterval: verificationInterval,
+ performDeletions: performDeletions,
dequeuedJobsTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gitaly_praefect_verification_jobs_dequeued_total",
@@ -298,7 +306,10 @@ func (v *MetadataVerifier) updateMetadata(ctx context.Context, results []verific
}
if len(logRecords) > 0 {
- v.log.WithField("replicas", logRecords).Info("removing metadata records of non-existent replicas")
+ v.log.WithFields(logrus.Fields{
+ "perform_deletions": v.performDeletions,
+ "replicas": logRecords,
+ }).Info("removing metadata records of non-existent replicas")
}
_, err := v.db.ExecContext(ctx, `
@@ -331,8 +342,10 @@ DELETE FROM storage_repositories
USING results
WHERE storage_repositories.repository_id = results.repository_id
AND storage_repositories.storage = results.storage
-AND successfully_verified AND NOT exists
- `, repositoryIDs, storages, successfullyVerifieds, exists)
+AND successfully_verified
+AND NOT exists
+AND $5
+ `, repositoryIDs, storages, successfullyVerifieds, exists, v.performDeletions)
if err != nil {
return fmt.Errorf("query: %w", err)
}
diff --git a/internal/praefect/verifier_test.go b/internal/praefect/verifier_test.go
index 9c2ffda9b..674089166 100644
--- a/internal/praefect/verifier_test.go
+++ b/internal/praefect/verifier_test.go
@@ -79,14 +79,15 @@ func TestVerifier(t *testing.T) {
}
for _, tc := range []struct {
- desc string
- erroringGitalys map[string]bool
- replicas replicas
- healthyStorages StaticHealthChecker
- batchSize int
- steps []step
- dequeuedJobsTotal map[string]map[string]int
- completedJobsTotal map[string]map[string]map[string]int
+ desc string
+ dontPerformDeletions bool
+ erroringGitalys map[string]bool
+ replicas replicas
+ healthyStorages StaticHealthChecker
+ batchSize int
+ steps []step
+ dequeuedJobsTotal map[string]map[string]int
+ completedJobsTotal map[string]map[string]map[string]int
}{
{
desc: "all replicas exist",
@@ -279,6 +280,60 @@ func TestVerifier(t *testing.T) {
},
},
{
+ desc: "metadata is not deleted if deletions are disabled",
+ dontPerformDeletions: true,
+ batchSize: 2,
+ replicas: replicas{
+ "virtual-storage": {
+ "repository-1": {
+ gitaly1: {exists: true, lastVerified: neverVerified},
+ gitaly2: {exists: false, lastVerified: neverVerified},
+ gitaly3: {exists: false, lastVerified: pendingVerification},
+ },
+ },
+ },
+ steps: []step{
+ {
+ expectedRemovals: logRecord{
+ "virtual-storage": {
+ "repository-1": {gitaly2},
+ },
+ },
+ expectedReplicas: map[string]map[string][]string{
+ "virtual-storage": {
+ "repository-1": {gitaly1, gitaly2, gitaly3},
+ },
+ },
+ },
+ {
+ expectedRemovals: logRecord{
+ "virtual-storage": {
+ "repository-1": {gitaly3},
+ },
+ },
+ expectedReplicas: map[string]map[string][]string{
+ "virtual-storage": {
+ "repository-1": {gitaly1, gitaly2, gitaly3},
+ },
+ },
+ },
+ },
+ dequeuedJobsTotal: map[string]map[string]int{
+ "virtual-storage": {
+ gitaly1: 1,
+ gitaly2: 1,
+ gitaly3: 1,
+ },
+ },
+ completedJobsTotal: map[string]map[string]map[string]int{
+ "virtual-storage": {
+ gitaly1: {"valid": 1},
+ gitaly2: {"invalid": 1},
+ gitaly3: {"invalid": 1},
+ },
+ },
+ },
+ {
desc: "verification time is updated when repository exists",
replicas: replicas{
"virtual-storage": {
@@ -581,7 +636,7 @@ func TestVerifier(t *testing.T) {
healthyStorages = tc.healthyStorages
}
- verifier := NewMetadataVerifier(logger, db, conns, healthyStorages, 24*7*time.Hour)
+ verifier := NewMetadataVerifier(logger, db, conns, healthyStorages, 24*7*time.Hour, !tc.dontPerformDeletions)
if tc.batchSize > 0 {
verifier.batchSize = tc.batchSize
}
@@ -744,7 +799,7 @@ func TestVerifier_runExpiredLeaseReleaser(t *testing.T) {
defer tx.Rollback(t)
logger, hook := test.NewNullLogger()
- verifier := NewMetadataVerifier(logrus.NewEntry(logger), tx, nil, nil, 0)
+ verifier := NewMetadataVerifier(logrus.NewEntry(logger), tx, nil, nil, 0, true)
// set batch size lower than the number of locked leases to ensure the batching works
verifier.batchSize = 2