diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2021-11-15 15:07:06 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2021-11-15 16:28:00 +0300 |
commit | 731fadf3c8b869ea55371375191e9c8c8105c8a9 (patch) | |
tree | 7ff08f2901758ed973cc96898a73dacc9bcc0283 | |
parent | 3e45d3384caa633730f3e51698d13a009b90de3b (diff) |
Perform health check updates in an ordered manner
This commit updates the HealthManager to perform upserts into the
node_status table in an ordered manner. This ensures concurrent
updates to the records will not deadlock due to the random lock
acquisition order. Generally this is not a concern as each Praefect
have their own records. During upgrades though, the new Praefect
process shares its identity with the old Process and can perform
concurrent health updates to the same record. Ordering the inserts
thus prevents deadlocks from happening during an upgrade. Postgres
would resolve the deadlock by allowing one query to proceed and aborting
the other one. Thus, the impact should not be big but we should avoid
deadlocking anyway.
Changelog: fixed
-rw-r--r-- | internal/praefect/nodes/health_manager.go | 1 | ||||
-rw-r--r-- | internal/praefect/nodes/health_manager_test.go | 12 |
2 files changed, 8 insertions, 5 deletions
diff --git a/internal/praefect/nodes/health_manager.go b/internal/praefect/nodes/health_manager.go index 75946dbe9..cb2d22345 100644 --- a/internal/praefect/nodes/health_manager.go +++ b/internal/praefect/nodes/health_manager.go @@ -146,6 +146,7 @@ FROM ( SELECT unnest($2::text[]) AS shard_name, unnest($3::text[]) AS node_name, unnest($4::boolean[]) AS is_healthy + ORDER BY shard_name, node_name ) AS results ON CONFLICT (praefect_name, shard_name, node_name) DO UPDATE SET diff --git a/internal/praefect/nodes/health_manager_test.go b/internal/praefect/nodes/health_manager_test.go index 98faf9410..9ca498ab9 100644 --- a/internal/praefect/nodes/health_manager_test.go +++ b/internal/praefect/nodes/health_manager_test.go @@ -604,13 +604,15 @@ func TestHealthManager_orderedWrites(t *testing.T) { // Wait for tx2 to be blocked on the gitaly-1 lock acquired by tx1 glsql.WaitForQueries(ctx, t, db, "INSERT INTO node_status", 1) - // This triggers a deadlock. Postgres allows tx1 to proceed and kills tx2. + // Ensure tx1 can acquire lock on gitaly-2. require.NoError(t, hm1.updateHealthChecks(ctx, []string{virtualStorage}, []string{"gitaly-2"}, []bool{true})) - require.EqualError(t, <-tx2Err, "update checks: pq: deadlock detected") - - // Tx1 succeeds and Tx2 failed + // Committing tx1 releases locks and unblocks tx2. require.NoError(t, tx1.Commit()) - require.Error(t, tx2.Commit()) + + // tx2 should succeed afterwards. + require.NoError(t, <-tx2Err) + require.NoError(t, tx2.Commit()) require.Equal(t, map[string][]string{"virtual-storage": {"gitaly-1", "gitaly-2"}}, hm1.HealthConsensus()) + require.Equal(t, map[string][]string{"virtual-storage": {"gitaly-1", "gitaly-2"}}, hm2.HealthConsensus()) } |