Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSami Hiltunen <shiltunen@gitlab.com>2021-07-20 12:33:19 +0300
committerSami Hiltunen <shiltunen@gitlab.com>2021-07-20 12:33:19 +0300
commit5b3c3ad85b8ee290785ffcc405d97055d46af0c2 (patch)
tree0bd4f1c6cf7a369fe47cba895058f13e0db360ce
parenteaf66c05cf4d63c2dc1ab8db50a67918d0180e6d (diff)
parent73839029f79d4ebdbc8d96475cf9bd0e2a599b2b (diff)
Merge branch 'pks-tx-coordinator-replication-relax-primary-error' into 'master'
coordinator: Only schedule replication for differing error states See merge request gitlab-org/gitaly!3660
-rw-r--r--internal/praefect/coordinator.go21
-rw-r--r--internal/praefect/coordinator_pg_test.go11
-rw-r--r--internal/praefect/coordinator_test.go75
3 files changed, 82 insertions, 25 deletions
diff --git a/internal/praefect/coordinator.go b/internal/praefect/coordinator.go
index 34d6c117d..490287b1a 100644
--- a/internal/praefect/coordinator.go
+++ b/internal/praefect/coordinator.go
@@ -793,8 +793,9 @@ func (c *Coordinator) createTransactionFinalizer(
// - The node failed to be part of the quorum. As a special case, if the primary fails the vote, all
// nodes need to get replication jobs.
//
-// - The node has errored. As a special case, if the primary fails all nodes need to get replication
-// jobs.
+// - The node has a different error state than the primary. If both primary and secondary have
+// returned the same error, then we assume they did the same thing and failed in the same
+// controlled way.
//
// Note that this function cannot and should not fail: if anything goes wrong, we need to create
// replication jobs to repair state.
@@ -851,13 +852,6 @@ func getUpdatedAndOutdatedSecondaries(
// for them.
markOutdated("outdated", route.ReplicationTargets)
- // If the primary errored, then we need to assume that it has modified on-disk state and
- // thus need to replicate those changes to secondaries.
- if primaryErr != nil {
- markOutdated("primary-failed", routerNodesToStorages(route.Secondaries))
- return
- }
-
// If no subtransaction happened, then the called RPC may not be aware of transactions or
// the nodes failed before casting any votes. If the primary failed the RPC, we assume
// no changes were done and the nodes hit an error prior to voting. If the primary processed
@@ -883,11 +877,12 @@ func getUpdatedAndOutdatedSecondaries(
return
}
- // Now we finally got the potentially happy case: in case the secondary didn't run into an
- // error and committed, it's considered up to date and thus does not need replication.
+ // Now we finally got the potentially happy case: when the secondary committed the
+ // transaction and has the same error state as the primary, then it's considered up to date
+ // and thus does not need replication.
for _, secondary := range route.Secondaries {
- if nodeErrors.errByNode[secondary.Storage] != nil {
- markOutdated("node-failed", []string{secondary.Storage})
+ if nodeErrors.errByNode[secondary.Storage] != primaryErr {
+ markOutdated("node-error-status", []string{secondary.Storage})
continue
}
diff --git a/internal/praefect/coordinator_pg_test.go b/internal/praefect/coordinator_pg_test.go
index d43f2f02c..05dee5e60 100644
--- a/internal/praefect/coordinator_pg_test.go
+++ b/internal/praefect/coordinator_pg_test.go
@@ -125,20 +125,15 @@ func TestStreamDirectorMutator_Transaction(t *testing.T) {
},
},
{
- // If the RPC fails without any subtransactions, the Gitalys would not have performed any changes yet.
- // We don't have to consider the secondaries outdated.
- desc: "unstarted transaction doesn't create replication jobs if the primary fails",
+ desc: "unstarted transaction does not create replication job",
primaryFails: true,
nodes: []node{
{primary: true, expectedGeneration: 0},
- {primary: false, expectedGeneration: 0},
+ {primary: false, shouldGetRepl: false, expectedGeneration: 0},
},
},
{
- // If there were no subtransactions and the RPC failed, the primary should not have performed any changes.
- // We don't need to schedule replication jobs to replication targets either as they'd have jobs
- // already scheduled by the earlier RPC that made them outdated or by the reconciler.
- desc: "unstarted transaction should not create replication jobs for outdated node if the primary fails",
+ desc: "unstarted transaction should not create replication jobs for outdated node if the primary does not vote",
primaryFails: true,
nodes: []node{
{primary: true, shouldGetRepl: false, generation: 1, expectedGeneration: 1},
diff --git a/internal/praefect/coordinator_test.go b/internal/praefect/coordinator_test.go
index d7dcdd369..811f40c87 100644
--- a/internal/praefect/coordinator_test.go
+++ b/internal/praefect/coordinator_test.go
@@ -1707,7 +1707,51 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) {
expectedPrimaryDirtied: true,
expectedOutdated: []string{"s1", "s2"},
expectedMetrics: map[string]int{
- "primary-failed": 2,
+ "node-error-status": 2,
+ },
+ },
+ {
+ desc: "multiple committed nodes with same error as primary",
+ primary: node{
+ name: "primary",
+ state: transactions.VoteCommitted,
+ err: anyErr,
+ },
+ secondaries: []node{
+ {name: "s1", state: transactions.VoteCommitted, err: anyErr},
+ {name: "s2", state: transactions.VoteCommitted, err: anyErr},
+ },
+ didVote: map[string]bool{
+ "primary": true,
+ },
+ subtransactions: 1,
+ expectedPrimaryDirtied: true,
+ expectedUpdated: []string{"s1", "s2"},
+ expectedMetrics: map[string]int{
+ "updated": 2,
+ },
+ },
+ {
+ desc: "multiple committed nodes with different error as primary",
+ primary: node{
+ name: "primary",
+ state: transactions.VoteCommitted,
+ err: anyErr,
+ },
+ secondaries: []node{
+ {name: "s1", state: transactions.VoteCommitted, err: errors.New("somethingsomething")},
+ {name: "s2", state: transactions.VoteCommitted, err: anyErr},
+ },
+ didVote: map[string]bool{
+ "primary": true,
+ },
+ subtransactions: 1,
+ expectedPrimaryDirtied: true,
+ expectedUpdated: []string{"s2"},
+ expectedOutdated: []string{"s1"},
+ expectedMetrics: map[string]int{
+ "node-error-status": 1,
+ "updated": 1,
},
},
{
@@ -1728,8 +1772,31 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) {
expectedUpdated: []string{"s2"},
expectedOutdated: []string{"s1"},
expectedMetrics: map[string]int{
- "node-failed": 1,
- "updated": 1,
+ "node-error-status": 1,
+ "updated": 1,
+ },
+ },
+ {
+ desc: "multiple committed nodes with primary and missing secondary err",
+ primary: node{
+ name: "primary",
+ state: transactions.VoteCommitted,
+ err: anyErr,
+ },
+ secondaries: []node{
+ {name: "s1", state: transactions.VoteCommitted, err: anyErr},
+ {name: "s2", state: transactions.VoteCommitted},
+ },
+ didVote: map[string]bool{
+ "primary": true,
+ },
+ subtransactions: 1,
+ expectedPrimaryDirtied: true,
+ expectedUpdated: []string{"s1"},
+ expectedOutdated: []string{"s2"},
+ expectedMetrics: map[string]int{
+ "node-error-status": 1,
+ "updated": 1,
},
},
{
@@ -1849,7 +1916,7 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) {
expectedPrimaryDirtied: true,
expectedOutdated: []string{"s1", "s2", "r1", "r2"},
expectedMetrics: map[string]int{
- "node-failed": 1,
+ "node-error-status": 1,
"node-not-committed": 1,
"outdated": 2,
},