Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'internal/praefect/nodes/sql_elector.go')
-rw-r--r--internal/praefect/nodes/sql_elector.go76
1 files changed, 66 insertions, 10 deletions
diff --git a/internal/praefect/nodes/sql_elector.go b/internal/praefect/nodes/sql_elector.go
index e747b5265..ce680ca01 100644
--- a/internal/praefect/nodes/sql_elector.go
+++ b/internal/praefect/nodes/sql_elector.go
@@ -12,9 +12,9 @@ import (
"github.com/google/uuid"
"github.com/lib/pq"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/internal/praefect/config"
- "gitlab.com/gitlab-org/gitaly/internal/praefect/metrics"
)
const (
@@ -84,6 +84,9 @@ type sqlElector struct {
failoverSeconds int
activePraefectSeconds int
readOnlyAfterFailover bool
+
+ getShardTotal prometheus.Counter
+ neverSet prometheus.Gauge
}
func newSQLElector(name string, c config.Config, failoverTimeoutSeconds int, activePraefectSeconds int, db *sql.DB, log logrus.FieldLogger, ns []*nodeStatus) *sqlElector {
@@ -107,6 +110,25 @@ func newSQLElector(name string, c config.Config, failoverTimeoutSeconds int, act
nodes: nodes,
primaryNode: nodes[0],
readOnlyAfterFailover: c.Failover.ReadOnlyAfterFailover,
+
+ // We create the counter here with all the label values already set. This means the time series
+ // shows up immediately after registration in Prometheus without any prior observations being made.
+ // The metric is defined where it is used, reducing the need to jump around the code base.
+ getShardTotal: prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "gitaly_get_shard_total",
+ Help: "Total count of GetShard calls",
+ ConstLabels: prometheus.Labels{
+ "virtual_storage": name,
+ },
+ }),
+
+ neverSet: prometheus.NewGauge(prometheus.GaugeOpts{
+ Name: "gitaly_never_set_gauge",
+ Help: "This is never touched, yet is visible in scrape",
+ ConstLabels: prometheus.Labels{
+ "virtual_storage": name,
+ },
+ }),
}
}
@@ -160,8 +182,6 @@ func (s *sqlElector) monitor(d time.Duration) {
func (s *sqlElector) checkNodes(ctx context.Context) error {
var wg sync.WaitGroup
- defer s.updateMetrics()
-
for _, n := range s.nodes {
wg.Add(1)
@@ -256,6 +276,12 @@ last_contact_attempt_at = NOW()`
// GetShard gets the current status of the shard. ErrPrimaryNotHealthy
// is returned if a primary does not exist.
func (s *sqlElector) GetShard() (Shard, error) {
+ // Since we set the constant labels when we created the counter,
+ // we can simply increment here without worrying about the labels.
+ // Since the metric is always created, we do not need to nil check
+ // either.
+ s.getShardTotal.Inc()
+
primary, readOnly, err := s.lookupPrimary()
if err != nil {
return Shard{}, err
@@ -280,19 +306,49 @@ func (s *sqlElector) GetShard() (Shard, error) {
}, nil
}
-func (s *sqlElector) updateMetrics() {
- s.m.RLock()
- primary := s.primaryNode
- s.m.RUnlock()
+func (s *sqlElector) Describe(descs chan<- *prometheus.Desc) {
+ // DescribeByCollect is a convenience method from prometheus that allows us to avoid
+ // explicitly listing every metric here and only focus on the Collect method.
+ prometheus.DescribeByCollect(s, descs)
+}
+
+func (s *sqlElector) Collect(metrics chan<- prometheus.Metric) {
+ // First we collect every concrete metric we have
+ s.getShardTotal.Collect(metrics)
+ s.neverSet.Collect(metrics)
+
+ // Then we collect metrics that do not directly map to a persistent
+ // metric. In this case, the primary information comes from the database
+ // and could have been modified by another praefect node. We get the value
+ // when it is needed for a scrape, guaranteeing it is always up to date.
+ // This is also useful if we have other derived metrics that we do not hold
+ // in the memory.
+ shard, err := s.GetShard()
+ if err != nil {
+ // In the example, we only log the error. We could also expose a metric
+ // indicating the scrape's failure: https://prometheus.io/docs/instrumenting/writing_exporters/#failed-scrapes
+ s.log.WithFields(logrus.Fields{
+ "error": err,
+ "virtual-storage": s.shardName,
+ }).Error("prometheus scrape failed")
+ return
+ }
for _, node := range s.nodes {
var val float64
-
- if primary == node {
+ if node == shard.Primary {
val = 1
}
- metrics.PrimaryGauge.WithLabelValues(s.shardName, node.GetStorage()).Set(val)
+ // Since the information about the primary is derived from GetShard during scrape, we do not have a persistent
+ // gauge for it. Instead, we just give the derived value directly to Prometheus via a ConstMetric. Since there
+ // are no gauges that could be set from the outside during a scrape, the view is always consistent.
+ metrics <- prometheus.MustNewConstMetric(prometheus.NewDesc(
+ "gitaly_praefect_primaries",
+ "Primary/Secondary statuses of Gitaly nodes.",
+ []string{"virtual_storage", "gitaly_storage"},
+ nil,
+ ), prometheus.GaugeValue, val, s.shardName, node.GetStorage())
}
}