diff options
author | Stan Hu <stanhu@gmail.com> | 2021-11-20 01:17:30 +0300 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2021-11-24 00:47:30 +0300 |
commit | edb7c80a8b443381528281e119cd4348fd050e43 (patch) | |
tree | f486e33c76a0a30adc1b4a20669b842f79a540f8 | |
parent | 4e18794f846ad0d27bea3443caa2b51cd9afd722 (diff) |
Use a CTE instead of healthy_storages view (sh-use-cte-healthy-storages)
Right now we have a `healthy_storages` database view that has hard-coded
thresholds. We break this out into a common table expression (CTE) in
preparation for making the node failure thresholds configurable and
possibly replacing them with a gossip-based protocol.
Changelog: changed
-rw-r--r-- | internal/praefect/datastore/healthy_storages.go | 19 | ||||
-rw-r--r-- | internal/praefect/datastore/repository_store.go | 7 | ||||
-rw-r--r-- | internal/praefect/nodes/health_manager.go | 3 |
3 files changed, 25 insertions, 4 deletions
```diff
diff --git a/internal/praefect/datastore/healthy_storages.go b/internal/praefect/datastore/healthy_storages.go
new file mode 100644
index 000000000..fea2f2fca
--- /dev/null
+++ b/internal/praefect/datastore/healthy_storages.go
@@ -0,0 +1,19 @@
+package datastore
+
+// HealthyStoragesQuery returns the SQL to determine whether a node is up.
+// The view in internal/praefect/datastore/migrations/20210525143540_healthy_storages_view.go
+// can be dropped once this is deployed.
+func HealthyStoragesQuery() string {
+	return `
+SELECT shard_name AS virtual_storage, node_name AS storage
+FROM node_status AS ns
+WHERE last_seen_active_at >= NOW() - INTERVAL '10 SECOND'
+GROUP BY shard_name, node_name
+HAVING COUNT(praefect_name) >= (
+	SELECT CEIL(COUNT(DISTINCT praefect_name) / 2.0) AS quorum_count
+	FROM node_status
+	WHERE shard_name = ns.shard_name
+	AND last_contact_attempt_at >= NOW() - INTERVAL '60 SECOND'
+)
+ORDER BY shard_name, node_name`
+}
diff --git a/internal/praefect/datastore/repository_store.go b/internal/praefect/datastore/repository_store.go
index b9ee33ba3..a4c71c5a4 100644
--- a/internal/praefect/datastore/repository_store.go
+++ b/internal/praefect/datastore/repository_store.go
@@ -658,7 +658,7 @@ func (rs *PostgresRepositoryStore) GetPartiallyAvailableRepositories(ctx context
 	// and there can't be any assignments for deleted repositories, this is still needed as long as the
 	// fallback behavior of no assignments is in place.
 	//
-	// 4. We join the `healthy_storages` view to return the storages current health.
+	// 4. We join the `healthy_storages` CTE to return the storages current health.
 	//
 	// 5. We join the `valid_primaries` view to return whether the storage is ready to act as a primary in case
 	// of a failover.
@@ -670,6 +670,7 @@ func (rs *PostgresRepositoryStore) GetPartiallyAvailableRepositories(ctx context
 	// than the assigned ones.
 	//
 	rows, err := rs.db.QueryContext(ctx, `
+WITH healthy_storages_cte AS (`+HealthyStoragesQuery()+`) `+`
 SELECT
 	json_build_object (
 		'RelativePath', relative_path,
@@ -691,7 +692,7 @@ FROM (
 		storage,
 		repositories.generation - COALESCE(storage_repositories.generation, -1) AS behind_by,
 		repository_assignments.storage IS NOT NULL AS assigned,
-		healthy_storages.storage IS NOT NULL AS healthy,
+		healthy_storages_cte.storage IS NOT NULL AS healthy,
 		valid_primaries.storage IS NOT NULL AS valid_primary
 	FROM ( SELECT repository_id, storage, generation FROM storage_repositories ) AS storage_repositories
 	FULL JOIN (
@@ -706,7 +707,7 @@ FROM (
 		)
 	) AS repository_assignments USING (repository_id, storage)
 	JOIN repositories USING (repository_id)
-	LEFT JOIN healthy_storages USING (virtual_storage, storage)
+	LEFT JOIN healthy_storages_cte USING (virtual_storage, storage)
 	LEFT JOIN ( SELECT repository_id, storage FROM valid_primaries ) AS valid_primaries USING (repository_id, storage)
 	WHERE virtual_storage = $1
 	ORDER BY relative_path, "primary", storage
diff --git a/internal/praefect/nodes/health_manager.go b/internal/praefect/nodes/health_manager.go
index cb2d22345..76068e0c9 100644
--- a/internal/praefect/nodes/health_manager.go
+++ b/internal/praefect/nodes/health_manager.go
@@ -10,6 +10,7 @@ import (
 	"github.com/lib/pq"
 	"github.com/sirupsen/logrus"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/helper"
+	"gitlab.com/gitlab-org/gitaly/v14/internal/praefect/datastore"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/praefect/datastore/glsql"
 	"google.golang.org/grpc/health/grpc_health_v1"
 )
@@ -161,7 +162,7 @@ ON CONFLICT (praefect_name, shard_name, node_name)
 		return fmt.Errorf("update checks: %w", err)
 	}
 
-	rows, err := hm.db.QueryContext(ctx, `SELECT virtual_storage, storage FROM healthy_storages`)
+	rows, err := hm.db.QueryContext(ctx, `WITH healthy_storages_cte AS (`+datastore.HealthyStoragesQuery()+`) `+`SELECT virtual_storage, storage FROM healthy_storages_cte`)
 	if err != nil {
 		return fmt.Errorf("query healthy storages: %w", err)
 	}
```