Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSami Hiltunen <shiltunen@gitlab.com>2020-05-19 14:30:47 +0300
committerSami Hiltunen <shiltunen@gitlab.com>2020-05-25 14:58:44 +0300
commit11de6a393182b19ded863488251d320d437f0e40 (patch)
tree110e1b398697ed9345d5492249223b220f7b9474
parenta08561eebea15f32dd4217ca122ce0fc2364a2d6 (diff)
-rw-r--r--cmd/praefect/main.go8
-rw-r--r--internal/praefect/nodes/local_elector.go35
-rw-r--r--internal/praefect/nodes/manager.go12
-rw-r--r--internal/praefect/nodes/sql_elector.go76
-rw-r--r--internal/praefect/nodes/sql_elector_test.go66
-rw-r--r--internal/service/repository/optimize.go4
6 files changed, 170 insertions, 31 deletions
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go
index 322ae28b5..86bffb2c9 100644
--- a/cmd/praefect/main.go
+++ b/cmd/praefect/main.go
@@ -91,6 +91,7 @@ import (
"strings"
"time"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/internal/bootstrap"
"gitlab.com/gitlab-org/gitaly/internal/bootstrap/starter"
@@ -244,6 +245,13 @@ func run(cfgs []starter.Config, conf config.Config) error {
}
nodeManager.Start(1*time.Second, 3*time.Second)
+ // We register the component directly with Prometheus, in this case with
+ // prometheus.DefaultRegisterer. During the scrape the describe method in nodeMgr
+ // relays the call to the configured strategies. The strategies then describe their
+ // own metrics which include the counters and the const metrics. The metrics are
+ // collected via the Collect method, giving us better control on the metric collection.
+ prometheus.MustRegister(nodeManager)
+
transactionCounterMetric, err := metrics.RegisterTransactionCounter()
if err != nil {
return err
diff --git a/internal/praefect/nodes/local_elector.go b/internal/praefect/nodes/local_elector.go
index 04711f760..ea6d4a872 100644
--- a/internal/praefect/nodes/local_elector.go
+++ b/internal/praefect/nodes/local_elector.go
@@ -5,8 +5,8 @@ import (
"sync"
"time"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
- "gitlab.com/gitlab-org/gitaly/internal/praefect/metrics"
)
type nodeCandidate struct {
@@ -119,11 +119,24 @@ func (s *localElector) monitor(d time.Duration) {
}
}
+func (*localElector) Describe(chan<- *prometheus.Desc) {
+ // ignored in the example
+}
+
+func (*localElector) Collect(chan<- prometheus.Metric) {
+ // ignored in the example.
+ //
+ // If you look at the implementation of sqlElector.Collect, we can
+ // see that it would be straightforward to extract the instrumentation
+ // code in to a wrapper that wraps a leaderElectionStrategy and instruments
+ // the calls to GetShard and uses GetShard to fetch the current primaries
+ // and secondaries, reducing code duplication between the local and the sql
+ // elector
+}
+
// checkNodes issues a gRPC health check for each node managed by the
// shard.
func (s *localElector) checkNodes(ctx context.Context) error {
- defer s.updateMetrics()
-
for _, n := range s.nodes {
n.checkNode(ctx)
}
@@ -195,19 +208,3 @@ func (s *localElector) enableWrites(context.Context) error {
s.isReadOnly = false
return nil
}
-
-func (s *localElector) updateMetrics() {
- s.m.RLock()
- primary := s.primaryNode
- s.m.RUnlock()
-
- for _, n := range s.nodes {
- var val float64
-
- if n == primary {
- val = 1
- }
-
- metrics.PrimaryGauge.WithLabelValues(s.shardName, n.node.GetStorage()).Set(val)
- }
-}
diff --git a/internal/praefect/nodes/manager.go b/internal/praefect/nodes/manager.go
index d9d79ec2f..d932f1e71 100644
--- a/internal/praefect/nodes/manager.go
+++ b/internal/praefect/nodes/manager.go
@@ -11,6 +11,7 @@ import (
grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
"github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus"
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
gitalyauth "gitlab.com/gitlab-org/gitaly/auth"
"gitlab.com/gitlab-org/gitaly/client"
@@ -89,6 +90,7 @@ type leaderElectionStrategy interface {
checkNodes(context.Context) error
GetShard() (Shard, error)
enableWrites(context.Context) error
+ prometheus.Collector
}
// ErrPrimaryNotHealthy indicates the primary of a shard is not in a healthy state and hence
@@ -152,6 +154,16 @@ func NewManager(log *logrus.Entry, c config.Config, db *sql.DB, ds datastore.Dat
}, nil
}
+func (n *Mgr) Describe(descs chan<- *prometheus.Desc) {
+ prometheus.DescribeByCollect(n, descs)
+}
+
+func (n *Mgr) Collect(metrics chan<- prometheus.Metric) {
+ for _, vs := range n.strategies {
+ vs.Collect(metrics)
+ }
+}
+
// Start will bootstrap the node manager by calling healthcheck on the nodes as well as kicking off
// the monitoring process. Start must be called before NodeMgr can be used.
func (n *Mgr) Start(bootstrapInterval, monitorInterval time.Duration) {
diff --git a/internal/praefect/nodes/sql_elector.go b/internal/praefect/nodes/sql_elector.go
index e747b5265..ce680ca01 100644
--- a/internal/praefect/nodes/sql_elector.go
+++ b/internal/praefect/nodes/sql_elector.go
@@ -12,9 +12,9 @@ import (
"github.com/google/uuid"
"github.com/lib/pq"
+ "github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/internal/praefect/config"
- "gitlab.com/gitlab-org/gitaly/internal/praefect/metrics"
)
const (
@@ -84,6 +84,9 @@ type sqlElector struct {
failoverSeconds int
activePraefectSeconds int
readOnlyAfterFailover bool
+
+ getShardTotal prometheus.Counter
+ neverSet prometheus.Gauge
}
func newSQLElector(name string, c config.Config, failoverTimeoutSeconds int, activePraefectSeconds int, db *sql.DB, log logrus.FieldLogger, ns []*nodeStatus) *sqlElector {
@@ -107,6 +110,25 @@ func newSQLElector(name string, c config.Config, failoverTimeoutSeconds int, act
nodes: nodes,
primaryNode: nodes[0],
readOnlyAfterFailover: c.Failover.ReadOnlyAfterFailover,
+
+ // We create the counter here with all the label values already set. This means the time series
+ // shows up immediately after registration in Prometheus without any prior observations being made.
+ // The metric is defined where it is used, reducing the need to jump around the code base.
+ getShardTotal: prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "gitaly_get_shard_total",
+ Help: "Total count of GetShard calls",
+ ConstLabels: prometheus.Labels{
+ "virtual_storage": name,
+ },
+ }),
+
+ neverSet: prometheus.NewGauge(prometheus.GaugeOpts{
+ Name: "gitaly_never_set_gauge",
+ Help: "This is never touched, yet is visible in scrape",
+ ConstLabels: prometheus.Labels{
+ "virtual_storage": name,
+ },
+ }),
}
}
@@ -160,8 +182,6 @@ func (s *sqlElector) monitor(d time.Duration) {
func (s *sqlElector) checkNodes(ctx context.Context) error {
var wg sync.WaitGroup
- defer s.updateMetrics()
-
for _, n := range s.nodes {
wg.Add(1)
@@ -256,6 +276,12 @@ last_contact_attempt_at = NOW()`
// GetShard gets the current status of the shard. ErrPrimaryNotHealthy
// is returned if a primary does not exist.
func (s *sqlElector) GetShard() (Shard, error) {
+ // Since we set the constant labels when we created the counter,
+ // we can simply increment here without worrying about the labels.
+ // Since the metric is always created, we do not need to nil check
+ // either.
+ s.getShardTotal.Inc()
+
primary, readOnly, err := s.lookupPrimary()
if err != nil {
return Shard{}, err
@@ -280,19 +306,49 @@ func (s *sqlElector) GetShard() (Shard, error) {
}, nil
}
-func (s *sqlElector) updateMetrics() {
- s.m.RLock()
- primary := s.primaryNode
- s.m.RUnlock()
+func (s *sqlElector) Describe(descs chan<- *prometheus.Desc) {
+ // DescribeByCollect is a convenience method from prometheus that allows us to avoid
+ // explicitly listing every metric here and only focus on the Collect method.
+ prometheus.DescribeByCollect(s, descs)
+}
+
+func (s *sqlElector) Collect(metrics chan<- prometheus.Metric) {
+ // First we collect every concrete metric we have
+ s.getShardTotal.Collect(metrics)
+ s.neverSet.Collect(metrics)
+
+ // Then we collect metrics that do not directly map to a persistent
+ // metric. In this case, the primary information comes from the database
+ // and could have been modified by another praefect node. We get the value
+ // when it is needed for a scrape, guaranteeing it is always up to date.
+ // This is also useful if we have other derived metrics that we do not hold
+ // in the memory.
+ shard, err := s.GetShard()
+ if err != nil {
+ // In the example, we only log the error. We could also expose a metric
+ // indicating the scrape's failure: https://prometheus.io/docs/instrumenting/writing_exporters/#failed-scrapes
+ s.log.WithFields(logrus.Fields{
+ "error": err,
+ "virtual-storage": s.shardName,
+ }).Error("prometheus scrape failed")
+ return
+ }
for _, node := range s.nodes {
var val float64
-
- if primary == node {
+ if node == shard.Primary {
val = 1
}
- metrics.PrimaryGauge.WithLabelValues(s.shardName, node.GetStorage()).Set(val)
+ // Since the information about the primary is derived from GetShard during scrape, we do not have a persistent
+ // gauge for it. Instead, we just give the derived value directly to Prometheus via a ConstMetric. Since there
+ // are no gauges that could be set from the outside during a scrape, the view is always consistent.
+ metrics <- prometheus.MustNewConstMetric(prometheus.NewDesc(
+ "gitaly_praefect_primaries",
+ "Primary/Secondary statuses of Gitaly nodes.",
+ []string{"virtual_storage", "gitaly_storage"},
+ nil,
+ ), prometheus.GaugeValue, val, s.shardName, node.GetStorage())
}
}
diff --git a/internal/praefect/nodes/sql_elector_test.go b/internal/praefect/nodes/sql_elector_test.go
index adc602952..e0354d72d 100644
--- a/internal/praefect/nodes/sql_elector_test.go
+++ b/internal/praefect/nodes/sql_elector_test.go
@@ -3,13 +3,16 @@
package nodes
import (
+ "strings"
"testing"
"time"
+ "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
"gitlab.com/gitlab-org/gitaly/internal/praefect/config"
+ "gitlab.com/gitlab-org/gitaly/internal/praefect/datastore"
"gitlab.com/gitlab-org/gitaly/internal/praefect/datastore/glsql"
"gitlab.com/gitlab-org/gitaly/internal/praefect/models"
"gitlab.com/gitlab-org/gitaly/internal/testhelper"
@@ -20,6 +23,69 @@ import (
var shardName string = "test-shard-0"
+func TestMetrics(t *testing.T) {
+ db := getDB(t)
+
+ // set up copied from below
+ const virtualStorageName = "virtual-storage-0"
+ const primaryStorage = "praefect-internal-0"
+ socket0, socket1 := testhelper.GetTemporaryGitalySocketFileName(), testhelper.GetTemporaryGitalySocketFileName()
+ virtualStorage := &config.VirtualStorage{
+ Name: virtualStorageName,
+ Nodes: []*models.Node{
+ {
+ Storage: primaryStorage,
+ Address: "unix://" + socket0,
+ DefaultPrimary: true,
+ },
+ {
+ Storage: "praefect-internal-1",
+ Address: "unix://" + socket1,
+ },
+ },
+ }
+
+ srv0, _ := testhelper.NewServerWithHealth(t, socket0)
+ defer srv0.Stop()
+
+ srv1, _ := testhelper.NewServerWithHealth(t, socket1)
+ defer srv1.Stop()
+
+ conf := config.Config{
+ Failover: config.Failover{Enabled: true, ElectionStrategy: "sql"},
+ VirtualStorages: []*config.VirtualStorage{virtualStorage},
+ }
+ nm, err := NewManager(testhelper.DiscardTestEntry(t), conf, db.DB, datastore.Datastore{}, promtest.NewMockHistogramVec())
+ require.NoError(t, err)
+
+ ctx, cancel := testhelper.Context()
+ defer cancel()
+ err = nm.strategies[virtualStorageName].checkNodes(ctx)
+ db.RequireRowsInTable(t, "shard_primaries", 1)
+
+ // the actual test code
+ expected := strings.NewReader(`
+# HELP gitaly_get_shard_total Total count of GetShard calls
+# TYPE gitaly_get_shard_total counter
+gitaly_get_shard_total{virtual_storage="virtual-storage-0"} 2
+
+# HELP gitaly_never_set_gauge This is never touched, yet is visible in scrape
+# TYPE gitaly_never_set_gauge gauge
+gitaly_never_set_gauge{virtual_storage="virtual-storage-0"} 0
+
+# HELP gitaly_praefect_primaries Primary/Secondary statuses of Gitaly nodes.
+# TYPE gitaly_praefect_primaries gauge
+gitaly_praefect_primaries{gitaly_storage="praefect-internal-0",virtual_storage="virtual-storage-0"} 1
+gitaly_praefect_primaries{gitaly_storage="praefect-internal-1",virtual_storage="virtual-storage-0"} 0
+`)
+
+ // Multiple tests can now use the metrics as the metrics are per elector instance.
+ // The metrics are not registered in the tests, so there won't be duplicate registrations
+ // in the global registry. Alternatively, if the test does not care about metrics, it can
+ // just completely ignore them without having to provide mock metrics.
+ require.NoError(t, testutil.CollectAndCompare(nm, expected))
+}
+
func TestGetPrimaryAndSecondaries(t *testing.T) {
db := getDB(t)
diff --git a/internal/service/repository/optimize.go b/internal/service/repository/optimize.go
index 5c366e881..1b831c1ef 100644
--- a/internal/service/repository/optimize.go
+++ b/internal/service/repository/optimize.go
@@ -31,8 +31,8 @@ func init() {
func removeEmptyDirs(ctx context.Context, target string) error {
if err := ctx.Err(); err != nil {
- return err
- }
+ return err
+ }
entries, err := ioutil.ReadDir(target)
switch {