Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Steinhardt <psteinhardt@gitlab.com>2020-03-19 10:04:48 +0300
committerPatrick Steinhardt <psteinhardt@gitlab.com>2020-03-19 10:04:48 +0300
commit4a573695a41192d25913d02b2ea68cc67cc9033d (patch)
tree0ff56c45eaea6f5c499147c3cd29d752b8945c2f
parent6b80a74b91d034f935c04793c005082e0b741975 (diff)
parent999c0391390115c3f0002bee5bcca8590c23eebd (diff)
Merge branch 'jc-metric-node-latency' into 'master'
Add histogram to keep track of node healthcheck latency Closes #2250 See merge request gitlab-org/gitaly!1921
-rw-r--r--changelogs/unreleased/jc-metric-node-latency.yml5
-rw-r--r--cmd/praefect/main.go9
-rw-r--r--internal/praefect/auth_test.go3
-rw-r--r--internal/praefect/coordinator_test.go3
-rw-r--r--internal/praefect/helper_test.go4
-rw-r--r--internal/praefect/metrics/prometheus.go19
-rw-r--r--internal/praefect/nodes/manager.go24
-rw-r--r--internal/praefect/nodes/manager_test.go20
-rw-r--r--internal/praefect/replicator_test.go6
-rw-r--r--internal/praefect/server_test.go3
-rw-r--r--internal/testhelper/promtest/histogram.go50
11 files changed, 124 insertions, 22 deletions
diff --git a/changelogs/unreleased/jc-metric-node-latency.yml b/changelogs/unreleased/jc-metric-node-latency.yml
new file mode 100644
index 000000000..ec04f7583
--- /dev/null
+++ b/changelogs/unreleased/jc-metric-node-latency.yml
@@ -0,0 +1,5 @@
+---
+title: Add histogram to keep track of node healthcheck latency
+merge_request: 1921
+author:
+type: added
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go
index ae81f1680..6ec446451 100644
--- a/cmd/praefect/main.go
+++ b/cmd/praefect/main.go
@@ -136,11 +136,10 @@ func configure(conf config.Config) {
}
func run(cfgs []starter.Config, conf config.Config) error {
- nodeManager, err := nodes.NewManager(logger, conf)
+ nodeLatencyHistogram, err := metrics.RegisterNodeLatency(conf.Prometheus)
if err != nil {
return err
}
- nodeManager.Start(1*time.Second, 3*time.Second)
latencyMetric, err := metrics.RegisterReplicationLatency(conf.Prometheus)
if err != nil {
@@ -152,6 +151,12 @@ func run(cfgs []starter.Config, conf config.Config) error {
return err
}
+ nodeManager, err := nodes.NewManager(logger, conf, nodeLatencyHistogram)
+ if err != nil {
+ return err
+ }
+ nodeManager.Start(1*time.Second, 3*time.Second)
+
registry := protoregistry.New()
if err = registry.RegisterFiles(protoregistry.GitalyProtoFileDescriptors...); err != nil {
return err
diff --git a/internal/praefect/auth_test.go b/internal/praefect/auth_test.go
index 48f1daf7a..0229590a0 100644
--- a/internal/praefect/auth_test.go
+++ b/internal/praefect/auth_test.go
@@ -17,6 +17,7 @@ import (
"gitlab.com/gitlab-org/gitaly/internal/praefect/nodes"
"gitlab.com/gitlab-org/gitaly/internal/praefect/protoregistry"
"gitlab.com/gitlab-org/gitaly/internal/testhelper"
+ "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
)
@@ -190,7 +191,7 @@ func runServer(t *testing.T, token string, required bool) (*Server, string, func
logEntry := testhelper.DiscardTestEntry(t)
ds := datastore.NewInMemory(conf)
- nodeMgr, err := nodes.NewManager(logEntry, conf)
+ nodeMgr, err := nodes.NewManager(logEntry, conf, promtest.NewMockHistogramVec())
require.NoError(t, err)
registry := protoregistry.New()
diff --git a/internal/praefect/coordinator_test.go b/internal/praefect/coordinator_test.go
index e51d30333..6e396a3ea 100644
--- a/internal/praefect/coordinator_test.go
+++ b/internal/praefect/coordinator_test.go
@@ -13,6 +13,7 @@ import (
"gitlab.com/gitlab-org/gitaly/internal/praefect/nodes"
"gitlab.com/gitlab-org/gitaly/internal/praefect/protoregistry"
"gitlab.com/gitlab-org/gitaly/internal/testhelper"
+ "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest"
"gitlab.com/gitlab-org/gitaly/proto/go/gitalypb"
"gitlab.com/gitlab-org/labkit/correlation"
)
@@ -59,7 +60,7 @@ func TestStreamDirector(t *testing.T) {
entry := testhelper.DiscardTestEntry(t)
- nodeMgr, err := nodes.NewManager(entry, conf)
+ nodeMgr, err := nodes.NewManager(entry, conf, promtest.NewMockHistogramVec())
require.NoError(t, err)
r := protoregistry.New()
require.NoError(t, r.RegisterFiles(protoregistry.GitalyProtoFileDescriptors...))
diff --git a/internal/praefect/helper_test.go b/internal/praefect/helper_test.go
index fec88b1a2..4c93443fb 100644
--- a/internal/praefect/helper_test.go
+++ b/internal/praefect/helper_test.go
@@ -115,7 +115,7 @@ func runPraefectServerWithMock(t *testing.T, conf config.Config, backends map[st
conf.VirtualStorages[0].Nodes[i] = node
}
- nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf)
+ nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf, promtest.NewMockHistogramVec())
require.NoError(t, err)
nodeMgr.Start(1*time.Millisecond, 5*time.Millisecond)
@@ -176,7 +176,7 @@ func runPraefectServerWithGitaly(t *testing.T, conf config.Config) (*grpc.Client
ds := datastore.NewInMemory(conf)
logEntry := log.Default()
- nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf)
+ nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf, promtest.NewMockHistogramVec())
require.NoError(t, err)
nodeMgr.Start(1*time.Millisecond, 5*time.Millisecond)
diff --git a/internal/praefect/metrics/prometheus.go b/internal/praefect/metrics/prometheus.go
index 12a7d7dfc..f53a176b9 100644
--- a/internal/praefect/metrics/prometheus.go
+++ b/internal/praefect/metrics/prometheus.go
@@ -20,6 +20,21 @@ func RegisterReplicationLatency(conf promconfig.Config) (Histogram, error) {
return replicationLatency, prometheus.Register(replicationLatency)
}
+// RegisterNodeLatency creates and registers a prometheus histogram to
+// observe internal node latency
+func RegisterNodeLatency(conf promconfig.Config) (HistogramVec, error) {
+ nodeLatency := prometheus.NewHistogramVec(
+ prometheus.HistogramOpts{
+ Namespace: "gitaly",
+ Subsystem: "praefect",
+ Name: "node_latency",
+ Buckets: conf.GRPCLatencyBuckets,
+ }, []string{"gitaly_storage"},
+ )
+
+ return nodeLatency, prometheus.Register(nodeLatency)
+}
+
// RegisterReplicationJobsInFlight creates and registers a gauge
// to track the size of the replication queue
func RegisterReplicationJobsInFlight() (Gauge, error) {
@@ -64,3 +79,7 @@ type Gauge interface {
type Histogram interface {
Observe(float64)
}
+
+type HistogramVec interface {
+ WithLabelValues(lvs ...string) prometheus.Observer
+}
diff --git a/internal/praefect/nodes/manager.go b/internal/praefect/nodes/manager.go
index 78ac5494a..7952233fb 100644
--- a/internal/praefect/nodes/manager.go
+++ b/internal/praefect/nodes/manager.go
@@ -75,6 +75,7 @@ type Mgr struct {
// VirtualStorages
failoverEnabled bool
log *logrus.Entry
+ nodeLatencyHist metrics.HistogramVec
}
// ErrPrimaryNotHealthy indicates the primary of a shard is not in a healthy state and hence
@@ -82,7 +83,7 @@ type Mgr struct {
var ErrPrimaryNotHealthy = errors.New("primary is not healthy")
// NewNodeManager creates a new NodeMgr based on virtual storage configs
-func NewManager(log *logrus.Entry, c config.Config, dialOpts ...grpc.DialOption) (*Mgr, error) {
+func NewManager(log *logrus.Entry, c config.Config, latencyHistogram metrics.HistogramVec, dialOpts ...grpc.DialOption) (*Mgr, error) {
shards := make(map[string]*shard)
for _, virtualStorage := range c.VirtualStorages {
var secondaries []*nodeStatus
@@ -108,7 +109,7 @@ func NewManager(log *logrus.Entry, c config.Config, dialOpts ...grpc.DialOption)
if err != nil {
return nil, err
}
- ns := newConnectionStatus(*node, conn, log)
+ ns := newConnectionStatus(*node, conn, log, latencyHistogram)
if node.DefaultPrimary {
primary = ns
@@ -229,20 +230,22 @@ func (n *Mgr) checkShards() {
}
}
-func newConnectionStatus(node models.Node, cc *grpc.ClientConn, l *logrus.Entry) *nodeStatus {
+func newConnectionStatus(node models.Node, cc *grpc.ClientConn, l *logrus.Entry, latencyHist metrics.HistogramVec) *nodeStatus {
return &nodeStatus{
- Node: node,
- ClientConn: cc,
- statuses: make([]healthpb.HealthCheckResponse_ServingStatus, 0),
- log: l,
+ Node: node,
+ ClientConn: cc,
+ statuses: make([]healthpb.HealthCheckResponse_ServingStatus, 0),
+ log: l,
+ latencyHist: latencyHist,
}
}
type nodeStatus struct {
models.Node
*grpc.ClientConn
- statuses []healthpb.HealthCheckResponse_ServingStatus
- log *logrus.Entry
+ statuses []healthpb.HealthCheckResponse_ServingStatus
+ log *logrus.Entry
+ latencyHist metrics.HistogramVec
}
// GetStorage gets the storage name of a node
@@ -284,7 +287,10 @@ func (n *nodeStatus) check() {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel()
+ start := time.Now()
resp, err := client.Check(ctx, &healthpb.HealthCheckRequest{Service: ""})
+ n.latencyHist.WithLabelValues(n.Storage).Observe(time.Since(start).Seconds())
+
if err != nil {
n.log.WithError(err).WithField("storage", n.Storage).WithField("address", n.Address).Warn("error when pinging healthcheck")
resp = &healthpb.HealthCheckResponse{
diff --git a/internal/praefect/nodes/manager_test.go b/internal/praefect/nodes/manager_test.go
index 69a24c10c..8f5e1443b 100644
--- a/internal/praefect/nodes/manager_test.go
+++ b/internal/praefect/nodes/manager_test.go
@@ -8,6 +8,7 @@ import (
"gitlab.com/gitlab-org/gitaly/internal/praefect/config"
"gitlab.com/gitlab-org/gitaly/internal/praefect/models"
"gitlab.com/gitlab-org/gitaly/internal/testhelper"
+ "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest"
"google.golang.org/grpc"
"google.golang.org/grpc/health/grpc_health_v1"
)
@@ -24,14 +25,21 @@ func TestNodeStatus(t *testing.T) {
require.NoError(t, err)
- cs := newConnectionStatus(models.Node{}, cc, testhelper.DiscardTestEntry(t))
+ mockHistogramVec := promtest.NewMockHistogramVec()
+
+ storageName := "default"
+ cs := newConnectionStatus(models.Node{Storage: storageName}, cc, testhelper.DiscardTestEntry(t), mockHistogramVec)
require.False(t, cs.isHealthy())
+ var expectedLabels [][]string
for i := 0; i < healthcheckThreshold; i++ {
cs.check()
+ expectedLabels = append(expectedLabels, []string{storageName})
}
require.True(t, cs.isHealthy())
+ require.Equal(t, expectedLabels, mockHistogramVec.LabelsCalled())
+ require.Len(t, mockHistogramVec.Observer().Observed(), healthcheckThreshold)
healthSvr.SetServingStatus("", grpc_health_v1.HealthCheckResponse_NOT_SERVING)
@@ -73,10 +81,11 @@ func TestNodeManager(t *testing.T) {
FailoverEnabled: false,
}
- nm, err := NewManager(testhelper.DiscardTestEntry(t), confWithFailover)
+ mockHistogram := promtest.NewMockHistogramVec()
+ nm, err := NewManager(testhelper.DiscardTestEntry(t), confWithFailover, mockHistogram)
require.NoError(t, err)
- nmWithoutFailover, err := NewManager(testhelper.DiscardTestEntry(t), confWithoutFailover)
+ nmWithoutFailover, err := NewManager(testhelper.DiscardTestEntry(t), confWithoutFailover, mockHistogram)
require.NoError(t, err)
nm.Start(1*time.Millisecond, 5*time.Second)
@@ -115,6 +124,11 @@ func TestNodeManager(t *testing.T) {
healthSrv0.SetServingStatus("", grpc_health_v1.HealthCheckResponse_UNKNOWN)
nm.checkShards()
+ labelsCalled := mockHistogram.LabelsCalled()
+ for _, node := range virtualStorages[0].Nodes {
+ require.Contains(t, labelsCalled, []string{node.Storage})
+ }
+
// since the primary is unhealthy, we expect checkShards to demote primary to secondary, and promote the healthy
// secondary to primary
diff --git a/internal/praefect/replicator_test.go b/internal/praefect/replicator_test.go
index 495ff1bdc..2dfe83467 100644
--- a/internal/praefect/replicator_test.go
+++ b/internal/praefect/replicator_test.go
@@ -141,7 +141,7 @@ func TestProcessReplicationJob(t *testing.T) {
entry := testhelper.DiscardTestEntry(t)
replicator.log = entry
- nodeMgr, err := nodes.NewManager(entry, config)
+ nodeMgr, err := nodes.NewManager(entry, config, promtest.NewMockHistogramVec())
require.NoError(t, err)
nodeMgr.Start(1*time.Millisecond, 5*time.Millisecond)
@@ -280,7 +280,7 @@ func TestProcessBacklog_FailedJobs(t *testing.T) {
require.NoError(t, ds.UpdateReplJobState(ids[0], datastore.JobStateReady))
- nodeMgr, err := nodes.NewManager(entry, config)
+ nodeMgr, err := nodes.NewManager(entry, config, promtest.NewMockHistogramVec())
require.NoError(t, err)
replMgr := NewReplMgr("default", entry, ds, nodeMgr)
@@ -410,7 +410,7 @@ func TestProcessBacklog_Success(t *testing.T) {
require.NoError(t, ds.UpdateReplJobState(id, datastore.JobStateReady))
}
- nodeMgr, err := nodes.NewManager(entry, config)
+ nodeMgr, err := nodes.NewManager(entry, config, promtest.NewMockHistogramVec())
require.NoError(t, err)
replMgr := NewReplMgr("default", entry, ds, nodeMgr)
diff --git a/internal/praefect/server_test.go b/internal/praefect/server_test.go
index b5d01664b..c70777fb9 100644
--- a/internal/praefect/server_test.go
+++ b/internal/praefect/server_test.go
@@ -24,6 +24,7 @@ import (
"gitlab.com/gitlab-org/gitaly/internal/praefect/nodes"
"gitlab.com/gitlab-org/gitaly/internal/praefect/protoregistry"
"gitlab.com/gitlab-org/gitaly/internal/testhelper"
+ "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest"
"gitlab.com/gitlab-org/gitaly/internal/version"
"gitlab.com/gitlab-org/gitaly/proto/go/gitalypb"
healthpb "google.golang.org/grpc/health/grpc_health_v1"
@@ -132,7 +133,7 @@ func TestGitalyServerInfoBadNode(t *testing.T) {
}
entry := testhelper.DiscardTestEntry(t)
- nodeMgr, err := nodes.NewManager(entry, conf)
+ nodeMgr, err := nodes.NewManager(entry, conf, promtest.NewMockHistogramVec())
require.NoError(t, err)
registry := protoregistry.New()
diff --git a/internal/testhelper/promtest/histogram.go b/internal/testhelper/promtest/histogram.go
index b35d58e0a..35208128f 100644
--- a/internal/testhelper/promtest/histogram.go
+++ b/internal/testhelper/promtest/histogram.go
@@ -2,6 +2,8 @@ package promtest
import (
"sync"
+
+ "github.com/prometheus/client_golang/prometheus"
)
// MockHistogram is a mock histogram that adheres to prometheus.Histogram for use in unit tests
@@ -16,3 +18,51 @@ func (m *MockHistogram) Observe(v float64) {
defer m.m.Unlock()
m.Values = append(m.Values, v)
}
+
+func NewMockHistogramVec() *MockHistogramVec {
+ return &MockHistogramVec{}
+}
+
+type MockHistogramVec struct {
+ m sync.RWMutex
+ labelsCalled [][]string
+ observer MockObserver
+}
+
+func (m *MockHistogramVec) LabelsCalled() [][]string {
+ m.m.RLock()
+ defer m.m.RUnlock()
+
+ return m.labelsCalled
+}
+
+func (m *MockHistogramVec) Observer() *MockObserver {
+ return &m.observer
+}
+
+func (m *MockHistogramVec) WithLabelValues(lvs ...string) prometheus.Observer {
+ m.m.Lock()
+ defer m.m.Unlock()
+
+ m.labelsCalled = append(m.labelsCalled, lvs)
+ return &m.observer
+}
+
+type MockObserver struct {
+ m sync.RWMutex
+ observed []float64
+}
+
+func (m *MockObserver) Observe(v float64) {
+ m.m.Lock()
+ defer m.m.Unlock()
+
+ m.observed = append(m.observed, v)
+}
+
+func (m *MockObserver) Observed() []float64 {
+ m.m.RLock()
+ defer m.m.RUnlock()
+
+ return m.observed
+}