From d5303e924079c32961d8f5a8a7d96cfe09a3d892 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Tue, 8 Nov 2022 13:53:42 -0600 Subject: Praefect: Remove clock drift readiness check Currently the Praefect readiness RPC performs a clock drift check when invoked. This clock drift check calls the configured NTP service to validate the time against. If the NTP service call fails, the readiness check is considered failed. The clock drift check should not be included in the readiness check as it is more of a host problem. Also for deployments that depend on the readiness RPC to signal readiness, a failed response from the NTP service could inaccurately reflect the state of Praefect. This change removes the clock drift check from the checks performed by the readiness RPC. --- cmd/praefect/main.go | 2 +- internal/praefect/service/checks.go | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go index 078db25df..daed37853 100644 --- a/cmd/praefect/main.go +++ b/cmd/praefect/main.go @@ -436,7 +436,7 @@ func run( protoregistry.GitalyProtoPreregistered, nodeSet.Connections(), primaryGetter, - service.AllChecks(), + service.ReadinessChecks(), ) ) metricsCollectors = append(metricsCollectors, transactionManager, coordinator, repl) diff --git a/internal/praefect/service/checks.go b/internal/praefect/service/checks.go index 9bf5e7bb0..37f51256c 100644 --- a/internal/praefect/service/checks.go +++ b/internal/praefect/service/checks.go @@ -38,9 +38,10 @@ const ( Fatal = "fatal" ) -// Check is a struct representing a check on the health of a Gitaly cluster's setup. These are separate from the "healthcheck" -// concept which is more concerned with the health of the praefect service. These checks are meant to diagnose any issues with -// the praefect cluster setup itself and will be run on startup/restarts. +// Check represents a check to be performed to validate Gitaly cluster health. +// Checks are used to diagnose issues with Praefect cluster setup through the +// Praefect CLI `check` subcommand and also performed through the invoking of +// the readiness RPC. type Check struct { Run func(ctx context.Context) error Name string @@ -62,6 +63,16 @@ func AllChecks() []CheckFunc { } } +// ReadinessChecks returns the checks invoked by the Praefect readiness RPC. +func ReadinessChecks() []CheckFunc { + return []CheckFunc{ + NewPraefectMigrationCheck, + NewGitalyNodeConnectivityCheck, + NewPostgresReadWriteCheck, + NewUnavailableReposCheck, + } +} + // NewPraefectMigrationCheck returns a Check that checks if all praefect migrations have run func NewPraefectMigrationCheck(conf config.Config, w io.Writer, quiet bool) *Check { return &Check{ @@ -204,7 +215,7 @@ func NewUnavailableReposCheck(conf config.Config, w io.Writer, quiet bool) *Chec } // NewClockSyncCheck returns a function that returns a check that verifies if system clock is in sync. -func NewClockSyncCheck(clockDriftCheck func(ntpHost string, driftThreshold time.Duration) (bool, error)) func(_ config.Config, _ io.Writer, _ bool) *Check { +func NewClockSyncCheck(clockDriftCheck func(ntpHost string, driftThreshold time.Duration) (bool, error)) CheckFunc { return func(conf config.Config, w io.Writer, quite bool) *Check { return &Check{ Name: "clock synchronization", -- cgit v1.2.3