diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2021-07-20 12:33:19 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2021-07-20 12:33:19 +0300 |
commit | 5b3c3ad85b8ee290785ffcc405d97055d46af0c2 (patch) | |
tree | 0bd4f1c6cf7a369fe47cba895058f13e0db360ce | |
parent | eaf66c05cf4d63c2dc1ab8db50a67918d0180e6d (diff) | |
parent | 73839029f79d4ebdbc8d96475cf9bd0e2a599b2b (diff) |
Merge branch 'pks-tx-coordinator-replication-relax-primary-error' into 'master'
coordinator: Only schedule replication for differing error states
See merge request gitlab-org/gitaly!3660
-rw-r--r-- | internal/praefect/coordinator.go | 21 | ||||
-rw-r--r-- | internal/praefect/coordinator_pg_test.go | 11 | ||||
-rw-r--r-- | internal/praefect/coordinator_test.go | 75 |
3 files changed, 82 insertions, 25 deletions
diff --git a/internal/praefect/coordinator.go b/internal/praefect/coordinator.go index 34d6c117d..490287b1a 100644 --- a/internal/praefect/coordinator.go +++ b/internal/praefect/coordinator.go @@ -793,8 +793,9 @@ func (c *Coordinator) createTransactionFinalizer( // - The node failed to be part of the quorum. As a special case, if the primary fails the vote, all // nodes need to get replication jobs. // -// - The node has errored. As a special case, if the primary fails all nodes need to get replication -// jobs. +// - The node has a different error state than the primary. If both primary and secondary have +// returned the same error, then we assume they did the same thing and failed in the same +// controlled way. // // Note that this function cannot and should not fail: if anything goes wrong, we need to create // replication jobs to repair state. @@ -851,13 +852,6 @@ func getUpdatedAndOutdatedSecondaries( // for them. markOutdated("outdated", route.ReplicationTargets) - // If the primary errored, then we need to assume that it has modified on-disk state and - // thus need to replicate those changes to secondaries. - if primaryErr != nil { - markOutdated("primary-failed", routerNodesToStorages(route.Secondaries)) - return - } - // If no subtransaction happened, then the called RPC may not be aware of transactions or // the nodes failed before casting any votes. If the primary failed the RPC, we assume // no changes were done and the nodes hit an error prior to voting. If the primary processed @@ -883,11 +877,12 @@ func getUpdatedAndOutdatedSecondaries( return } - // Now we finally got the potentially happy case: in case the secondary didn't run into an - // error and committed, it's considered up to date and thus does not need replication. + // Now we finally got the potentially happy case: when the secondary committed the + // transaction and has the same error state as the primary, then it's considered up to date + // and thus does not need replication. for _, secondary := range route.Secondaries { - if nodeErrors.errByNode[secondary.Storage] != nil { - markOutdated("node-failed", []string{secondary.Storage}) + if nodeErrors.errByNode[secondary.Storage] != primaryErr { + markOutdated("node-error-status", []string{secondary.Storage}) continue } diff --git a/internal/praefect/coordinator_pg_test.go b/internal/praefect/coordinator_pg_test.go index d43f2f02c..05dee5e60 100644 --- a/internal/praefect/coordinator_pg_test.go +++ b/internal/praefect/coordinator_pg_test.go @@ -125,20 +125,15 @@ func TestStreamDirectorMutator_Transaction(t *testing.T) { }, }, { - // If the RPC fails without any subtransactions, the Gitalys would not have performed any changes yet. - // We don't have to consider the secondaries outdated. - desc: "unstarted transaction doesn't create replication jobs if the primary fails", + desc: "unstarted transaction does not create replication job", primaryFails: true, nodes: []node{ {primary: true, expectedGeneration: 0}, - {primary: false, expectedGeneration: 0}, + {primary: false, shouldGetRepl: false, expectedGeneration: 0}, }, }, { - // If there were no subtransactions and the RPC failed, the primary should not have performed any changes. - // We don't need to schedule replication jobs to replication targets either as they'd have jobs - // already scheduled by the earlier RPC that made them outdated or by the reconciler. - desc: "unstarted transaction should not create replication jobs for outdated node if the primary fails", + desc: "unstarted transaction should not create replication jobs for outdated node if the primary does not vote", primaryFails: true, nodes: []node{ {primary: true, shouldGetRepl: false, generation: 1, expectedGeneration: 1}, diff --git a/internal/praefect/coordinator_test.go b/internal/praefect/coordinator_test.go index d7dcdd369..811f40c87 100644 --- a/internal/praefect/coordinator_test.go +++ b/internal/praefect/coordinator_test.go @@ -1707,7 +1707,51 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) { expectedPrimaryDirtied: true, expectedOutdated: []string{"s1", "s2"}, expectedMetrics: map[string]int{ - "primary-failed": 2, + "node-error-status": 2, + }, + }, + { + desc: "multiple committed nodes with same error as primary", + primary: node{ + name: "primary", + state: transactions.VoteCommitted, + err: anyErr, + }, + secondaries: []node{ + {name: "s1", state: transactions.VoteCommitted, err: anyErr}, + {name: "s2", state: transactions.VoteCommitted, err: anyErr}, + }, + didVote: map[string]bool{ + "primary": true, + }, + subtransactions: 1, + expectedPrimaryDirtied: true, + expectedUpdated: []string{"s1", "s2"}, + expectedMetrics: map[string]int{ + "updated": 2, + }, + }, + { + desc: "multiple committed nodes with different error as primary", + primary: node{ + name: "primary", + state: transactions.VoteCommitted, + err: anyErr, + }, + secondaries: []node{ + {name: "s1", state: transactions.VoteCommitted, err: errors.New("somethingsomething")}, + {name: "s2", state: transactions.VoteCommitted, err: anyErr}, + }, + didVote: map[string]bool{ + "primary": true, + }, + subtransactions: 1, + expectedPrimaryDirtied: true, + expectedUpdated: []string{"s2"}, + expectedOutdated: []string{"s1"}, + expectedMetrics: map[string]int{ + "node-error-status": 1, + "updated": 1, }, }, { @@ -1728,8 +1772,31 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) { expectedUpdated: []string{"s2"}, expectedOutdated: []string{"s1"}, expectedMetrics: map[string]int{ - "node-failed": 1, - "updated": 1, + "node-error-status": 1, + "updated": 1, + }, + }, + { + desc: "multiple committed nodes with primary and missing secondary err", + primary: node{ + name: "primary", + state: transactions.VoteCommitted, + err: anyErr, + }, + secondaries: []node{ + {name: "s1", state: transactions.VoteCommitted, err: anyErr}, + {name: "s2", state: transactions.VoteCommitted}, + }, + didVote: map[string]bool{ + "primary": true, + }, + subtransactions: 1, + expectedPrimaryDirtied: true, + expectedUpdated: []string{"s1"}, + expectedOutdated: []string{"s2"}, + expectedMetrics: map[string]int{ + "node-error-status": 1, + "updated": 1, }, }, { @@ -1849,7 +1916,7 @@ func TestGetUpdatedAndOutdatedSecondaries(t *testing.T) { expectedPrimaryDirtied: true, expectedOutdated: []string{"s1", "s2", "r1", "r2"}, expectedMetrics: map[string]int{ - "node-failed": 1, + "node-error-status": 1, "node-not-committed": 1, "outdated": 2, }, |