praefect: Add ability to have separate database metrics endpoint

By default, when metrics are enabled, each Praefect exposes information about how many read-only repositories there are, which requires Praefect to query the database. First, this results in the same metrics being exposed by every Praefect, given that the database is shared between all of them. Second, this causes one query per Praefect per scraping run. This cost adds up and generates quite some load on the database, especially if there are a lot of repositories in that database, up to the point where it may overload the database completely. Fix this issue by splitting metrics which hit the database into a separate endpoint "/db_metrics". This allows admins to set up a separate scraper with a different scraping interval for this metric, and furthermore it gives the ability to only scrape this metric for one of the Praefect instances so the work isn't unnecessarily duplicated. Given that this is a breaking change which will get backported, we must make this behaviour opt-in for now. We thus include a new configuration key "prometheus_exclude_database_from_default_metrics" which enables the new behaviour such that existing installations' metrics won't break on a simple point release. The intent is to eventually remove this configuration though and enable it for all setups on a major release. Changelog: added (cherry picked from commit 7e74b7333ca6f2d1e55e7a17350cccc7c856c847)
author: John Cai <jcai@gitlab.com> 2021-11-17 02:12:56 +0300
committer: Patrick Steinhardt <psteinhardt@gitlab.com> 2021-11-19 12:48:37 +0300
commit: ebaade4a4816704e6c5bf5696f070fd16273fe09 (patch)
tree: 205917045b91f4bc9f5018b08c32820b231a2f9b
parent: aac5d5e573b6a48f724ea8337ea74e24e4c4167f (diff)
3 files changed, 42 insertions, 10 deletions
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go
index 262b2f90d..fdfbb2789 100644
--- a/cmd/praefect/main.go
+++ b/cmd/praefect/main.go
@@ -63,11 +63,13 @@ import (
 	"flag"
 	"fmt"
 	"math/rand"
+	"net/http"
 	"os"
 	"strings"
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"github.com/sirupsen/logrus"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/backchannel"
 	"gitlab.com/gitlab-org/gitaly/v14/internal/bootstrap"
@@ -149,7 +151,9 @@ func main() {
 		logger.Fatalf("unable to create a bootstrap: %v", err)
 	}
 
-	if err := run(starterConfigs, conf, b, prometheus.DefaultRegisterer); err != nil {
+	dbPromRegistry := prometheus.NewRegistry()
+
+	if err := run(starterConfigs, conf, b, prometheus.DefaultRegisterer, dbPromRegistry); err != nil {
 		logger.Fatalf("%v", err)
 	}
 }
@@ -192,7 +196,16 @@ func configure(conf config.Config) {
 	sentry.ConfigureSentry(version.GetVersion(), conf.Sentry)
 }
 
-func run(cfgs []starter.Config, conf config.Config, b bootstrap.Listener, promreg prometheus.Registerer) error {
+func run(
+	cfgs []starter.Config,
+	conf config.Config,
+	b bootstrap.Listener,
+	promreg prometheus.Registerer,
+	dbPromRegistry interface {
+		prometheus.Registerer
+		prometheus.Gatherer
+	},
+) error {
 	nodeLatencyHistogram, err := metrics.RegisterNodeLatency(conf.Prometheus, promreg)
 	if err != nil {
 		return err
@@ -390,9 +403,18 @@ func run(cfgs []starter.Config, conf config.Config, b bootstrap.Listener, promre
 	)
 	metricsCollectors = append(metricsCollectors, transactionManager, coordinator, repl)
 	if db != nil {
-		promreg.MustRegister(
-			datastore.NewRepositoryStoreCollector(logger, conf.VirtualStorageNames(), db, conf.Prometheus.ScrapeTimeout),
-		)
+		repositoryStoreCollector := datastore.NewRepositoryStoreCollector(logger, conf.VirtualStorageNames(), db, conf.Prometheus.ScrapeTimeout)
+
+		// Eventually, database-related metrics will always be exported via a separate
+		// endpoint such that it's possible to set a different scraping interval and thus to
+		// reduce database load. For now though, we register the metrics twice, once for the
+		// standard and once for the database-specific endpoint. This is done to ensure a
+		// transitory period where deployments can be moved to the new endpoint without
+		// causing breakage if they still use the old endpoint.
+		dbPromRegistry.MustRegister(repositoryStoreCollector)
+		if !conf.PrometheusExcludeDatabaseFromDefaultMetrics {
+			promreg.MustRegister(repositoryStoreCollector)
+		}
 	}
 	promreg.MustRegister(metricsCollectors...)
 
@@ -415,9 +437,13 @@ func run(cfgs []starter.Config, conf config.Config, b bootstrap.Listener, promre
 				return err
 			}
 
+			serveMux := http.NewServeMux()
+			serveMux.Handle("/db_metrics", promhttp.HandlerFor(dbPromRegistry, promhttp.HandlerOpts{}))
+
 			go func() {
 				if err := monitoring.Start(
 					monitoring.WithListener(l),
+					monitoring.WithServeMux(serveMux),
 					monitoring.WithBuildInformation(praefect.GetVersion(), praefect.GetBuildTime())); err != nil {
 					logger.WithError(err).Errorf("Unable to start prometheus listener: %v", conf.PrometheusListenAddr)
 				}
diff --git a/cmd/praefect/subcmd_remove_repository_test.go b/cmd/praefect/subcmd_remove_repository_test.go
index abc4b93b9..bb56dee3c 100644
--- a/cmd/praefect/subcmd_remove_repository_test.go
+++ b/cmd/praefect/subcmd_remove_repository_test.go
@@ -112,7 +112,7 @@ func TestRemoveRepository_Exec(t *testing.T) {
 	bootstrapper := bootstrap.NewNoop()
 	go func() {
 		defer close(stopped)
-		assert.NoError(t, run(starterConfigs, conf, bootstrapper, prometheus.NewRegistry()))
+		assert.NoError(t, run(starterConfigs, conf, bootstrapper, prometheus.NewRegistry(), prometheus.NewRegistry()))
 	}()
 
 	cc, err := client.Dial("unix://"+conf.SocketPath, nil)
diff --git a/internal/praefect/config/config.go b/internal/praefect/config/config.go
index bef7f8d62..1e3cf682d 100644
--- a/internal/praefect/config/config.go
+++ b/internal/praefect/config/config.go
@@ -123,10 +123,16 @@ type Config struct {
 	Sentry               sentry.Config     `toml:"sentry"`
 	PrometheusListenAddr string            `toml:"prometheus_listen_addr"`
 	Prometheus           prometheus.Config `toml:"prometheus"`
-	Auth                 auth.Config       `toml:"auth"`
-	TLS                  config.TLS        `toml:"tls"`
-	DB                   `toml:"database"`
-	Failover             Failover `toml:"failover"`
+	// PrometheusExcludeDatabaseFromDefaultMetrics excludes database-related metrics from the
+	// default metrics. If set to `false`, then database metrics will be available both via
+	// `/metrics` and `/db_metrics`. Otherwise, they will only be accessible via `/db_metrics`.
+	// Defaults to `false`. This is used as a transitory configuration key: eventually, database
+	// metrics will always be removed from the standard metrics endpoint.
+	PrometheusExcludeDatabaseFromDefaultMetrics bool        `toml:"prometheus_exclude_database_from_default_metrics"`
+	Auth                                        auth.Config `toml:"auth"`
+	TLS                                         config.TLS  `toml:"tls"`
+	DB                                          `toml:"database"`
+	Failover                                    Failover `toml:"failover"`
 	// Keep for legacy reasons: remove after Omnibus has switched
 	FailoverEnabled     bool                `toml:"failover_enabled"`
 	MemoryQueueEnabled  bool                `toml:"memory_queue_enabled"`
author	John Cai <jcai@gitlab.com>	2021-11-17 02:12:56 +0300
committer	Patrick Steinhardt <psteinhardt@gitlab.com>	2021-11-19 12:48:37 +0300
commit	ebaade4a4816704e6c5bf5696f070fd16273fe09 (patch)
tree	205917045b91f4bc9f5018b08c32820b231a2f9b
parent	aac5d5e573b6a48f724ea8337ea74e24e4c4167f (diff)