diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2022-11-30 23:51:01 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2022-11-30 23:54:22 +0300 |
commit | 36e2349865ae8c713923f4fe8b7b8973a5104d73 (patch) | |
tree | 36688d9b34a5496f7b135205ef43c2302deedaa0 | |
parent | ab405cca9de3358f498615344c663279eaa9922f (diff) |
Support health checking connections locally only
HealthManager is responsible for health checking connections to
Gitaly in Praefect. Doing so, it also persists the information in
the database so the other Praefects can take the connection status
of other Praefects into consideration when promoting primaries and so
forth. As we are about to add a second set of connections behind a
feature flag, we want to health check the connections so we don't
route traffic to them if they are unhealthy, but we don't want the
status of the feature-flagged connections to affect the health consensus
used for the elections. This commit makes this possible with the health
manager. If the HealthManager is not provided a database handle, it will
simply locally check the connections without persisting any information
in the database.
-rw-r--r-- | internal/praefect/nodes/health_manager.go | 25 | ||||
-rw-r--r-- | internal/praefect/nodes/health_manager_test.go | 25 |
2 files changed, 41 insertions, 9 deletions
diff --git a/internal/praefect/nodes/health_manager.go b/internal/praefect/nodes/health_manager.go index 21cdfc9fe..0743bf520 100644 --- a/internal/praefect/nodes/health_manager.go +++ b/internal/praefect/nodes/health_manager.go @@ -58,6 +58,9 @@ type HealthManager struct { // NewHealthManager returns a new health manager that monitors which nodes in the cluster // are healthy. +// +// If db is nil, the HealthManager checks the connection health normally but doesn't persist +// any information about the nodes in the database. func NewHealthManager( log logrus.FieldLogger, db glsql.Querier, @@ -138,10 +141,13 @@ func (hm *HealthManager) updateHealthChecks(ctx context.Context, virtualStorages hm.locallyHealthy.Store(locallyHealthy) - ctx, cancel := hm.databaseTimeout(ctx) - defer cancel() + if hm.db != nil { + // Database is nil only when an alternative set of connections is being tested behind a feature flag + // and we do not want to affect the consensus in the database, just the routing decisions. 
+ ctx, cancel := hm.databaseTimeout(ctx) + defer cancel() - if _, err := hm.db.ExecContext(ctx, ` + if _, err := hm.db.ExecContext(ctx, ` INSERT INTO node_status (praefect_name, shard_name, node_name, last_contact_attempt_at, last_seen_active_at) SELECT $1, shard_name, node_name, NOW(), CASE WHEN is_healthy THEN NOW() ELSE NULL END FROM ( @@ -155,12 +161,13 @@ ON CONFLICT (praefect_name, shard_name, node_name) last_contact_attempt_at = NOW(), last_seen_active_at = COALESCE(EXCLUDED.last_seen_active_at, node_status.last_seen_active_at) `, - hm.praefectName, - virtualStorages, - physicalStorages, - healthy, - ); err != nil { - return fmt.Errorf("update checks: %w", err) + hm.praefectName, + virtualStorages, + physicalStorages, + healthy, + ); err != nil { + return fmt.Errorf("update checks: %w", err) + } } if hm.firstUpdate { diff --git a/internal/praefect/nodes/health_manager_test.go b/internal/praefect/nodes/health_manager_test.go index f53297854..15b191cda 100644 --- a/internal/praefect/nodes/health_manager_test.go +++ b/internal/praefect/nodes/health_manager_test.go @@ -49,6 +49,31 @@ func getHealthConsensus(t *testing.T, ctx context.Context, db glsql.Querier) map return consensus } +func TestHealthManagerWithoutDatabase(t *testing.T) { + t.Parallel() + + hm := NewHealthManager(testhelper.NewDiscardingLogger(t), nil, "ignored", HealthClients{ + "virtual-storage": { + "healthy-storage": mockHealthClient{ + CheckFunc: func(context.Context, *grpc_health_v1.HealthCheckRequest, ...grpc.CallOption) (*grpc_health_v1.HealthCheckResponse, error) { + return &grpc_health_v1.HealthCheckResponse{Status: grpc_health_v1.HealthCheckResponse_SERVING}, nil + }, + }, + "unhealthy-storage": mockHealthClient{ + CheckFunc: func(context.Context, *grpc_health_v1.HealthCheckRequest, ...grpc.CallOption) (*grpc_health_v1.HealthCheckResponse, error) { + return &grpc_health_v1.HealthCheckResponse{Status: grpc_health_v1.HealthCheckResponse_NOT_SERVING}, nil + }, + }, + }, + }) + 
hm.handleError = func(err error) error { return err } + + runCtx, cancelRun := context.WithCancel(testhelper.Context(t)) + require.Equal(t, context.Canceled, hm.Run(runCtx, helper.NewCountTicker(1, cancelRun))) + require.Equal(t, map[string][]string{"virtual-storage": {"healthy-storage"}}, hm.HealthyNodes()) + <-hm.Updated() +} + func TestHealthManager(t *testing.T) { t.Parallel() ctx := testhelper.Context(t) |