Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Steinhardt <psteinhardt@gitlab.com>2021-07-09 15:00:20 +0300
committerPatrick Steinhardt <psteinhardt@gitlab.com>2021-09-02 08:23:56 +0300
commit23d7161a4a80bb18d9e0b0b304f2d5bd3cd6d467 (patch)
treec84a06fec7c8f5b32f3bba72bd138b0e7d508c54
parent62cdae73245218c73033800222524f7642e9bcd2 (diff)
coordinator: Only schedule replication for differing error states
When finalizing a transaction, we always schedule replication jobs in case the primary has returned an error. Given that there are many RPCs which are expected to return errors in a controlled way, e.g. if a commit is missing, this causes us to create replication in many contexts where it's not necessary at all. Thinking about the issue, what we really care for is not whether an RPC failed or not. It's that primary and secondary nodes behaved the same. If both primary and secondaries succeeded, we're good. But if both failed with the same error, then we're good to as long as all transactions have been committed: quorum was reached on all votes and nodes failed in the same way, so we can assume that nodes did indeed perform the same changes. This commit thus relaxes the error condition to not schedule replication jobs anymore in case the primary failed, but to only schedule replication jobs to any node which has a different error than the primary. This has both the advantage that we only need to selectively schedule jobs for disagreeing nodes instead of targeting all secondaries and it avoids scheduling jobs in many cases where we do hit errors. Changelog: performance (cherry picked from commit 73839029f79d4ebdbc8d96475cf9bd0e2a599b2b)
-rw-r--r--internal/praefect/coordinator.go21
-rw-r--r--internal/praefect/coordinator_pg_test.go11
-rw-r--r--internal/praefect/coordinator_test.go75
3 files changed, 82 insertions, 25 deletions
diff --git a/internal/praefect/coordinator.go b/internal/praefect/coordinator.go
index 7e2eaaba8..d08a1df67 100644
--- a/internal/praefect/coordinator.go
+++ b/internal/praefect/coordinator.go
@@ -769,8 +769,9 @@ func (c *Coordinator) createTransactionFinalizer(
// - The node failed to be part of the quorum. As a special case, if the primary fails the vote, all
// nodes need to get replication jobs.
//
-// - The node has errored. As a special case, if the primary fails all nodes need to get replication
-// jobs.
+// - The node has a different error state than the primary. If both primary and secondary have
+// returned the same error, then we assume they did the same thing and failed in the same
+// controlled way.
//
// Note that this function cannot and should not fail: if anything goes wrong, we need to create
// replication jobs to repair state.
@@ -827,13 +828,6 @@ func getUpdatedAndOutdatedSecondaries(
// for them.
markOutdated("outdated", route.ReplicationTargets)
- // If the primary errored, then we need to assume that it has modified on-disk state and
- // thus need to replicate those changes to secondaries.
- if primaryErr != nil {
- markOutdated("primary-failed", routerNodesToStorages(route.Secondaries))
- return
- }
-
// If no subtransaction happened, then the called RPC may not be aware of transactions or
// the nodes failed before casting any votes. If the primary failed the RPC, we assume
// no changes were done and the nodes hit an error prior to voting. If the primary processed
@@ -859,11 +853,12 @@ func getUpdatedAndOutdatedSecondaries(
return
}
- // Now we finally got the potentially happy case: in case the secondary didn't run into an
- // error and committed, it's considered up to date and thus does not need replication.
+ // Now we finally got the potentially happy case: when the secondary committed the
+ // transaction and has the same error state as the primary, then it's considered up to date
+ // and thus does not need replication.
for _, secondary := range route.Secondaries {
- if nodeErrors.errByNode[secondary.Storage] != nil {
- markOutdated("node-failed", []string{secondary.Storage})
+ if nodeErrors.errByNode[secondary.Storage] != primaryErr {
+ markOutdated("node-error-status", []string{secondary.Storage})
continue
}
diff --git a/internal/praefect/coordinator_pg_test.go b/internal/praefect/coordinator_pg_test.go
index d40b8b14e..dc03142dc 100644
--- a/internal/praefect/coordinator_pg_test.go
+++ b/internal/praefect/coordinator_pg_test.go
@@ -125,20 +125,15 @@ func TestStreamDirectorMutator_Transaction(t *testing.T) {
},
},
{
- // If the RPC fails without any subtransactions, the Gitalys would not have performed any changes yet.
- // We don't have to consider the secondaries outdated.
- desc: "unstarted transaction doesn't create replication jobs if the primary fails",
+ desc: "unstarted transaction does not create replication job",
primaryFails: true,
nodes: []node{
{primary: true, expectedGeneration: 0},
- {primary: false, expectedGeneration: 0},
+ {primary: false, shouldGetRepl: false, expectedGeneration: 0},
},
},
{
- // If there were no subtransactions and the RPC failed, the primary should not have performed any changes.
- // We don't need to schedule replication jobs to replication targets either as they'd have jobs
- // already scheduled by the earlier RPC that made them outdated or by the reconciler.
- desc: "unstarted transaction should not create replication jobs for outdated node if the primary fails",
+ desc: "unstarted transaction should not create replication jobs for outdated node if the primary does not vote",
primaryFails: true,
nodes: []node{
{primary: true, shouldGetRepl: false, generation: 1, expectedGeneration: 1},
diff --git a/internal/praefect/coordinator_test.go b/internal/praefect/coordinator_test.go
index 8c926d9db..c7383491e 100644
--- a/internal/praefect/coordinator_test.go
+++ b/internal/praefect/coordinator_test.go
@@ -1727,7 +1727,51 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) {
expectedPrimaryDirtied: true,
expectedOutdated: []string{"s1", "s2"},
expectedMetrics: map[string]int{
- "primary-failed": 2,
+ "node-error-status": 2,
+ },
+ },
+ {
+ desc: "multiple committed nodes with same error as primary",
+ primary: node{
+ name: "primary",
+ state: transactions.VoteCommitted,
+ err: anyErr,
+ },
+ secondaries: []node{
+ {name: "s1", state: transactions.VoteCommitted, err: anyErr},
+ {name: "s2", state: transactions.VoteCommitted, err: anyErr},
+ },
+ didVote: map[string]bool{
+ "primary": true,
+ },
+ subtransactions: 1,
+ expectedPrimaryDirtied: true,
+ expectedUpdated: []string{"s1", "s2"},
+ expectedMetrics: map[string]int{
+ "updated": 2,
+ },
+ },
+ {
+ desc: "multiple committed nodes with different error as primary",
+ primary: node{
+ name: "primary",
+ state: transactions.VoteCommitted,
+ err: anyErr,
+ },
+ secondaries: []node{
+ {name: "s1", state: transactions.VoteCommitted, err: errors.New("somethingsomething")},
+ {name: "s2", state: transactions.VoteCommitted, err: anyErr},
+ },
+ didVote: map[string]bool{
+ "primary": true,
+ },
+ subtransactions: 1,
+ expectedPrimaryDirtied: true,
+ expectedUpdated: []string{"s2"},
+ expectedOutdated: []string{"s1"},
+ expectedMetrics: map[string]int{
+ "node-error-status": 1,
+ "updated": 1,
},
},
{
@@ -1748,8 +1792,31 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) {
expectedUpdated: []string{"s2"},
expectedOutdated: []string{"s1"},
expectedMetrics: map[string]int{
- "node-failed": 1,
- "updated": 1,
+ "node-error-status": 1,
+ "updated": 1,
+ },
+ },
+ {
+ desc: "multiple committed nodes with primary and missing secondary err",
+ primary: node{
+ name: "primary",
+ state: transactions.VoteCommitted,
+ err: anyErr,
+ },
+ secondaries: []node{
+ {name: "s1", state: transactions.VoteCommitted, err: anyErr},
+ {name: "s2", state: transactions.VoteCommitted},
+ },
+ didVote: map[string]bool{
+ "primary": true,
+ },
+ subtransactions: 1,
+ expectedPrimaryDirtied: true,
+ expectedUpdated: []string{"s1"},
+ expectedOutdated: []string{"s2"},
+ expectedMetrics: map[string]int{
+ "node-error-status": 1,
+ "updated": 1,
},
},
{
@@ -1869,7 +1936,7 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) {
expectedPrimaryDirtied: true,
expectedOutdated: []string{"s1", "s2", "r1", "r2"},
expectedMetrics: map[string]int{
- "node-failed": 1,
+ "node-error-status": 1,
"node-not-committed": 1,
"outdated": 2,
},