diff options
author | Sami Hiltunen <shiltunen@gitlab.com> | 2020-05-19 14:30:47 +0300 |
---|---|---|
committer | Sami Hiltunen <shiltunen@gitlab.com> | 2020-05-25 14:58:44 +0300 |
commit | 11de6a393182b19ded863488251d320d437f0e40 (patch) | |
tree | 110e1b398697ed9345d5492249223b220f7b9474 | |
parent | a08561eebea15f32dd4217ca122ce0fc2364a2d6 (diff) |
metrics examplesmh-instrument-read-only
-rw-r--r-- | cmd/praefect/main.go | 8 | ||||
-rw-r--r-- | internal/praefect/nodes/local_elector.go | 35 | ||||
-rw-r--r-- | internal/praefect/nodes/manager.go | 12 | ||||
-rw-r--r-- | internal/praefect/nodes/sql_elector.go | 76 | ||||
-rw-r--r-- | internal/praefect/nodes/sql_elector_test.go | 66 | ||||
-rw-r--r-- | internal/service/repository/optimize.go | 4 |
6 files changed, 170 insertions, 31 deletions
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go index 322ae28b5..86bffb2c9 100644 --- a/cmd/praefect/main.go +++ b/cmd/praefect/main.go @@ -91,6 +91,7 @@ import ( "strings" "time" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/internal/bootstrap" "gitlab.com/gitlab-org/gitaly/internal/bootstrap/starter" @@ -244,6 +245,13 @@ func run(cfgs []starter.Config, conf config.Config) error { } nodeManager.Start(1*time.Second, 3*time.Second) + // We register the component directly with Prometheus, in this case with + // prometheus.DefaultRegisterer. During the scrape the describe method in nodeMgr + // relays the call to the configured strategies. The strategies then describe their + // own metrics which include the counters and the const metrics. The metrics are + // collected via the Collect method, giving us better control on the metric collection. + prometheus.MustRegister(nodeManager) + transactionCounterMetric, err := metrics.RegisterTransactionCounter() if err != nil { return err diff --git a/internal/praefect/nodes/local_elector.go b/internal/praefect/nodes/local_elector.go index 04711f760..ea6d4a872 100644 --- a/internal/praefect/nodes/local_elector.go +++ b/internal/praefect/nodes/local_elector.go @@ -5,8 +5,8 @@ import ( "sync" "time" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" - "gitlab.com/gitlab-org/gitaly/internal/praefect/metrics" ) type nodeCandidate struct { @@ -119,11 +119,24 @@ func (s *localElector) monitor(d time.Duration) { } } +func (*localElector) Describe(chan<- *prometheus.Desc) { + // ignored in the example +} + +func (*localElector) Collect(chan<- prometheus.Metric) { + // ignored in the example. + // + // If you look at the implementation of sqlElector.Collect, we can + // see that it would be straightforward to extract the instrumentation + // code in to a wrapper that wraps a leaderElectionStrategy and instruments + // the calls to GetShard and uses GetShard to fetch the current primaries + // and secondaries, reducing code duplication between the local and the sql + // elector +} + // checkNodes issues a gRPC health check for each node managed by the // shard. func (s *localElector) checkNodes(ctx context.Context) error { - defer s.updateMetrics() - for _, n := range s.nodes { n.checkNode(ctx) } @@ -195,19 +208,3 @@ func (s *localElector) enableWrites(context.Context) error { s.isReadOnly = false return nil } - -func (s *localElector) updateMetrics() { - s.m.RLock() - primary := s.primaryNode - s.m.RUnlock() - - for _, n := range s.nodes { - var val float64 - - if n == primary { - val = 1 - } - - metrics.PrimaryGauge.WithLabelValues(s.shardName, n.node.GetStorage()).Set(val) - } -} diff --git a/internal/praefect/nodes/manager.go b/internal/praefect/nodes/manager.go index d9d79ec2f..d932f1e71 100644 --- a/internal/praefect/nodes/manager.go +++ b/internal/praefect/nodes/manager.go @@ -11,6 +11,7 @@ import ( grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware" "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" gitalyauth "gitlab.com/gitlab-org/gitaly/auth" "gitlab.com/gitlab-org/gitaly/client" @@ -89,6 +90,7 @@ type leaderElectionStrategy interface { checkNodes(context.Context) error GetShard() (Shard, error) enableWrites(context.Context) error + prometheus.Collector } // ErrPrimaryNotHealthy indicates the primary of a shard is not in a healthy state and hence @@ -152,6 +154,16 @@ func NewManager(log *logrus.Entry, c config.Config, db *sql.DB, ds datastore.Dat }, nil } +func (n *Mgr) Describe(descs chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(n, descs) +} + +func (n *Mgr) Collect(metrics chan<- prometheus.Metric) { + for _, vs := range n.strategies { + vs.Collect(metrics) + } +} + // Start will bootstrap the node manager by calling healthcheck on the nodes as well as kicking off // the monitoring process. Start must be called before NodeMgr can be used. func (n *Mgr) Start(bootstrapInterval, monitorInterval time.Duration) { diff --git a/internal/praefect/nodes/sql_elector.go b/internal/praefect/nodes/sql_elector.go index e747b5265..ce680ca01 100644 --- a/internal/praefect/nodes/sql_elector.go +++ b/internal/praefect/nodes/sql_elector.go @@ -12,9 +12,9 @@ import ( "github.com/google/uuid" "github.com/lib/pq" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/internal/praefect/config" - "gitlab.com/gitlab-org/gitaly/internal/praefect/metrics" ) const ( @@ -84,6 +84,9 @@ type sqlElector struct { failoverSeconds int activePraefectSeconds int readOnlyAfterFailover bool + + getShardTotal prometheus.Counter + neverSet prometheus.Gauge } func newSQLElector(name string, c config.Config, failoverTimeoutSeconds int, activePraefectSeconds int, db *sql.DB, log logrus.FieldLogger, ns []*nodeStatus) *sqlElector { @@ -107,6 +110,25 @@ func newSQLElector(name string, c config.Config, failoverTimeoutSeconds int, act nodes: nodes, primaryNode: nodes[0], readOnlyAfterFailover: c.Failover.ReadOnlyAfterFailover, + + // We create the counter here with all the label values already set. This means the time series + // shows up immediately after registration in Prometheus without any prior observations being made. + // The metric is defined where it is used, reducing the need to jump around the code base. + getShardTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitaly_get_shard_total", + Help: "Total count of GetShard calls", + ConstLabels: prometheus.Labels{ + "virtual_storage": name, + }, + }), + + neverSet: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "gitaly_never_set_gauge", + Help: "This is never touched, yet is visible in scrape", + ConstLabels: prometheus.Labels{ + "virtual_storage": name, + }, + }), } } @@ -160,8 +182,6 @@ func (s *sqlElector) monitor(d time.Duration) { func (s *sqlElector) checkNodes(ctx context.Context) error { var wg sync.WaitGroup - defer s.updateMetrics() - for _, n := range s.nodes { wg.Add(1) @@ -256,6 +276,12 @@ last_contact_attempt_at = NOW()` // GetShard gets the current status of the shard. ErrPrimaryNotHealthy // is returned if a primary does not exist. func (s *sqlElector) GetShard() (Shard, error) { + // Since we set the constant labels when we created the counter, + // we can simply increment here without worrying about the labels. + // Since the metric is always created, we do not need to nil check + // either. + s.getShardTotal.Inc() + primary, readOnly, err := s.lookupPrimary() if err != nil { return Shard{}, err @@ -280,19 +306,49 @@ func (s *sqlElector) GetShard() (Shard, error) { }, nil } -func (s *sqlElector) updateMetrics() { - s.m.RLock() - primary := s.primaryNode - s.m.RUnlock() +func (s *sqlElector) Describe(descs chan<- *prometheus.Desc) { + // DescribeByCollect is a convenience method from prometheus that allows us to avoid + // explicitly listing every metric here and only focus on the Collect method. + prometheus.DescribeByCollect(s, descs) +} + +func (s *sqlElector) Collect(metrics chan<- prometheus.Metric) { + // First we collect every concrete metric we have + s.getShardTotal.Collect(metrics) + s.neverSet.Collect(metrics) + + // Then we collect metrics that do not directly map to a persistent + // metric. In this case, the primary information comes from the database + // and could have been modified by another praefect node. We get the value + // when it is needed for a scrape, guaranteeing it is always up to date. + // This is also useful if we have other derived metrics that we do not hold + // in the memory. + shard, err := s.GetShard() + if err != nil { + // In the example, we only log the error. We could also expose a metric + // indicating the scrape's failure: https://prometheus.io/docs/instrumenting/writing_exporters/#failed-scrapes + s.log.WithFields(logrus.Fields{ + "error": err, + "virtual-storage": s.shardName, + }).Error("prometheus scrape failed") + return + } for _, node := range s.nodes { var val float64 - - if primary == node { + if node == shard.Primary { val = 1 } - metrics.PrimaryGauge.WithLabelValues(s.shardName, node.GetStorage()).Set(val) + // Since the information about the primary is derived from GetShard during scrape, we do not have a persistent + // gauge for it. Instead, we just give the derived value directly to Prometheus via a ConstMetric. Since there + // are no gauges that could be set from the outside during a scrape, the view is always consistent. + metrics <- prometheus.MustNewConstMetric(prometheus.NewDesc( + "gitaly_praefect_primaries", + "Primary/Secondary statuses of Gitaly nodes.", + []string{"virtual_storage", "gitaly_storage"}, + nil, + ), prometheus.GaugeValue, val, s.shardName, node.GetStorage()) } } diff --git a/internal/praefect/nodes/sql_elector_test.go b/internal/praefect/nodes/sql_elector_test.go index adc602952..e0354d72d 100644 --- a/internal/praefect/nodes/sql_elector_test.go +++ b/internal/praefect/nodes/sql_elector_test.go @@ -3,13 +3,16 @@ package nodes import ( + "strings" "testing" "time" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/sirupsen/logrus" "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/internal/praefect/config" + "gitlab.com/gitlab-org/gitaly/internal/praefect/datastore" "gitlab.com/gitlab-org/gitaly/internal/praefect/datastore/glsql" "gitlab.com/gitlab-org/gitaly/internal/praefect/models" "gitlab.com/gitlab-org/gitaly/internal/testhelper" @@ -20,6 +23,69 @@ import ( var shardName string = "test-shard-0" +func TestMetrics(t *testing.T) { + db := getDB(t) + + // set up copied from below + const virtualStorageName = "virtual-storage-0" + const primaryStorage = "praefect-internal-0" + socket0, socket1 := testhelper.GetTemporaryGitalySocketFileName(), testhelper.GetTemporaryGitalySocketFileName() + virtualStorage := &config.VirtualStorage{ + Name: virtualStorageName, + Nodes: []*models.Node{ + { + Storage: primaryStorage, + Address: "unix://" + socket0, + DefaultPrimary: true, + }, + { + Storage: "praefect-internal-1", + Address: "unix://" + socket1, + }, + }, + } + + srv0, _ := testhelper.NewServerWithHealth(t, socket0) + defer srv0.Stop() + + srv1, _ := testhelper.NewServerWithHealth(t, socket1) + defer srv1.Stop() + + conf := config.Config{ + Failover: config.Failover{Enabled: true, ElectionStrategy: "sql"}, + VirtualStorages: []*config.VirtualStorage{virtualStorage}, + } + nm, err := NewManager(testhelper.DiscardTestEntry(t), conf, db.DB, datastore.Datastore{}, promtest.NewMockHistogramVec()) + require.NoError(t, err) + + ctx, cancel := testhelper.Context() + defer cancel() + err = nm.strategies[virtualStorageName].checkNodes(ctx) + db.RequireRowsInTable(t, "shard_primaries", 1) + + // the actual test code + expected := strings.NewReader(` +# HELP gitaly_get_shard_total Total count of GetShard calls +# TYPE gitaly_get_shard_total counter +gitaly_get_shard_total{virtual_storage="virtual-storage-0"} 2 + +# HELP gitaly_never_set_gauge This is never touched, yet is visible in scrape +# TYPE gitaly_never_set_gauge gauge +gitaly_never_set_gauge{virtual_storage="virtual-storage-0"} 0 + +# HELP gitaly_praefect_primaries Primary/Secondary statuses of Gitaly nodes. +# TYPE gitaly_praefect_primaries gauge +gitaly_praefect_primaries{gitaly_storage="praefect-internal-0",virtual_storage="virtual-storage-0"} 1 +gitaly_praefect_primaries{gitaly_storage="praefect-internal-1",virtual_storage="virtual-storage-0"} 0 +`) + + // Multiple tests can now use the metrics as the metrics are per elector instance. + // The metrics are not registered in the tests, so there won't be duplicate registrations + // in the global registry. Alternatively, if the test does not care about metrics, it can + // just completely ignore them without having to provide mock metrics. + require.NoError(t, testutil.CollectAndCompare(nm, expected)) +} + func TestGetPrimaryAndSecondaries(t *testing.T) { db := getDB(t) diff --git a/internal/service/repository/optimize.go b/internal/service/repository/optimize.go index 5c366e881..1b831c1ef 100644 --- a/internal/service/repository/optimize.go +++ b/internal/service/repository/optimize.go @@ -31,8 +31,8 @@ func init() { func removeEmptyDirs(ctx context.Context, target string) error { if err := ctx.Err(); err != nil { - return err - } + return err + } entries, err := ioutil.ReadDir(target) switch { |