diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2020-03-19 10:04:48 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2020-03-19 10:04:48 +0300 |
commit | 4a573695a41192d25913d02b2ea68cc67cc9033d (patch) | |
tree | 0ff56c45eaea6f5c499147c3cd29d752b8945c2f | |
parent | 6b80a74b91d034f935c04793c005082e0b741975 (diff) | |
parent | 999c0391390115c3f0002bee5bcca8590c23eebd (diff) |
Merge branch 'jc-metric-node-latency' into 'master'
Add histogram to keep track of node healthcheck latency
Closes #2250
See merge request gitlab-org/gitaly!1921
-rw-r--r-- | changelogs/unreleased/jc-metric-node-latency.yml | 5 | ||||
-rw-r--r-- | cmd/praefect/main.go | 9 | ||||
-rw-r--r-- | internal/praefect/auth_test.go | 3 | ||||
-rw-r--r-- | internal/praefect/coordinator_test.go | 3 | ||||
-rw-r--r-- | internal/praefect/helper_test.go | 4 | ||||
-rw-r--r-- | internal/praefect/metrics/prometheus.go | 19 | ||||
-rw-r--r-- | internal/praefect/nodes/manager.go | 24 | ||||
-rw-r--r-- | internal/praefect/nodes/manager_test.go | 20 | ||||
-rw-r--r-- | internal/praefect/replicator_test.go | 6 | ||||
-rw-r--r-- | internal/praefect/server_test.go | 3 | ||||
-rw-r--r-- | internal/testhelper/promtest/histogram.go | 50 |
11 files changed, 124 insertions, 22 deletions
diff --git a/changelogs/unreleased/jc-metric-node-latency.yml b/changelogs/unreleased/jc-metric-node-latency.yml new file mode 100644 index 000000000..ec04f7583 --- /dev/null +++ b/changelogs/unreleased/jc-metric-node-latency.yml @@ -0,0 +1,5 @@ +--- +title: Add histogram to keep track of node healthcheck latency +merge_request: 1921 +author: +type: added diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go index ae81f1680..6ec446451 100644 --- a/cmd/praefect/main.go +++ b/cmd/praefect/main.go @@ -136,11 +136,10 @@ func configure(conf config.Config) { } func run(cfgs []starter.Config, conf config.Config) error { - nodeManager, err := nodes.NewManager(logger, conf) + nodeLatencyHistogram, err := metrics.RegisterNodeLatency(conf.Prometheus) if err != nil { return err } - nodeManager.Start(1*time.Second, 3*time.Second) latencyMetric, err := metrics.RegisterReplicationLatency(conf.Prometheus) if err != nil { @@ -152,6 +151,12 @@ func run(cfgs []starter.Config, conf config.Config) error { return err } + nodeManager, err := nodes.NewManager(logger, conf, nodeLatencyHistogram) + if err != nil { + return err + } + nodeManager.Start(1*time.Second, 3*time.Second) + registry := protoregistry.New() if err = registry.RegisterFiles(protoregistry.GitalyProtoFileDescriptors...); err != nil { return err diff --git a/internal/praefect/auth_test.go b/internal/praefect/auth_test.go index 48f1daf7a..0229590a0 100644 --- a/internal/praefect/auth_test.go +++ b/internal/praefect/auth_test.go @@ -17,6 +17,7 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/praefect/nodes" "gitlab.com/gitlab-org/gitaly/internal/praefect/protoregistry" "gitlab.com/gitlab-org/gitaly/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest" "google.golang.org/grpc" "google.golang.org/grpc/codes" ) @@ -190,7 +191,7 @@ func runServer(t *testing.T, token string, required bool) (*Server, string, func logEntry := testhelper.DiscardTestEntry(t) ds := datastore.NewInMemory(conf) - nodeMgr, err := nodes.NewManager(logEntry, conf) + nodeMgr, err := nodes.NewManager(logEntry, conf, promtest.NewMockHistogramVec()) require.NoError(t, err) registry := protoregistry.New() diff --git a/internal/praefect/coordinator_test.go b/internal/praefect/coordinator_test.go index e51d30333..6e396a3ea 100644 --- a/internal/praefect/coordinator_test.go +++ b/internal/praefect/coordinator_test.go @@ -13,6 +13,7 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/praefect/nodes" "gitlab.com/gitlab-org/gitaly/internal/praefect/protoregistry" "gitlab.com/gitlab-org/gitaly/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" "gitlab.com/gitlab-org/labkit/correlation" ) @@ -59,7 +60,7 @@ func TestStreamDirector(t *testing.T) { entry := testhelper.DiscardTestEntry(t) - nodeMgr, err := nodes.NewManager(entry, conf) + nodeMgr, err := nodes.NewManager(entry, conf, promtest.NewMockHistogramVec()) require.NoError(t, err) r := protoregistry.New() require.NoError(t, r.RegisterFiles(protoregistry.GitalyProtoFileDescriptors...)) diff --git a/internal/praefect/helper_test.go b/internal/praefect/helper_test.go index fec88b1a2..4c93443fb 100644 --- a/internal/praefect/helper_test.go +++ b/internal/praefect/helper_test.go @@ -115,7 +115,7 @@ func runPraefectServerWithMock(t *testing.T, conf config.Config, backends map[st conf.VirtualStorages[0].Nodes[i] = node } - nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf) + nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf, promtest.NewMockHistogramVec()) require.NoError(t, err) nodeMgr.Start(1*time.Millisecond, 5*time.Millisecond) @@ -176,7 +176,7 @@ func runPraefectServerWithGitaly(t *testing.T, conf config.Config) (*grpc.Client ds := datastore.NewInMemory(conf) logEntry := log.Default() - nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf) + nodeMgr, err := nodes.NewManager(testhelper.DiscardTestEntry(t), conf, promtest.NewMockHistogramVec()) require.NoError(t, err) nodeMgr.Start(1*time.Millisecond, 5*time.Millisecond) diff --git a/internal/praefect/metrics/prometheus.go b/internal/praefect/metrics/prometheus.go index 12a7d7dfc..f53a176b9 100644 --- a/internal/praefect/metrics/prometheus.go +++ b/internal/praefect/metrics/prometheus.go @@ -20,6 +20,21 @@ func RegisterReplicationLatency(conf promconfig.Config) (Histogram, error) { return replicationLatency, prometheus.Register(replicationLatency) } +// RegisterNodeLatency creates and registers a prometheus histogram to +// observe internal node latency +func RegisterNodeLatency(conf promconfig.Config) (HistogramVec, error) { + nodeLatency := prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "gitaly", + Subsystem: "praefect", + Name: "node_latency", + Buckets: conf.GRPCLatencyBuckets, + }, []string{"gitaly_storage"}, + ) + + return nodeLatency, prometheus.Register(nodeLatency) +} + // RegisterReplicationJobsInFlight creates and registers a gauge // to track the size of the replication queue func RegisterReplicationJobsInFlight() (Gauge, error) { @@ -64,3 +79,7 @@ type Gauge interface { type Histogram interface { Observe(float64) } + +type HistogramVec interface { + WithLabelValues(lvs ...string) prometheus.Observer +} diff --git a/internal/praefect/nodes/manager.go b/internal/praefect/nodes/manager.go index 78ac5494a..7952233fb 100644 --- a/internal/praefect/nodes/manager.go +++ b/internal/praefect/nodes/manager.go @@ -75,6 +75,7 @@ type Mgr struct { // VirtualStorages failoverEnabled bool log *logrus.Entry + nodeLatencyHist metrics.HistogramVec } // ErrPrimaryNotHealthy indicates the primary of a shard is not in a healthy state and hence @@ -82,7 +83,7 @@ type Mgr struct { var ErrPrimaryNotHealthy = errors.New("primary is not healthy") // NewNodeManager creates a new NodeMgr based on virtual storage configs -func NewManager(log *logrus.Entry, c config.Config, dialOpts ...grpc.DialOption) (*Mgr, error) { +func NewManager(log *logrus.Entry, c config.Config, latencyHistogram metrics.HistogramVec, dialOpts ...grpc.DialOption) (*Mgr, error) { shards := make(map[string]*shard) for _, virtualStorage := range c.VirtualStorages { var secondaries []*nodeStatus @@ -108,7 +109,7 @@ func NewManager(log *logrus.Entry, c config.Config, dialOpts ...grpc.DialOption) if err != nil { return nil, err } - ns := newConnectionStatus(*node, conn, log) + ns := newConnectionStatus(*node, conn, log, latencyHistogram) if node.DefaultPrimary { primary = ns @@ -229,20 +230,22 @@ func (n *Mgr) checkShards() { } } -func newConnectionStatus(node models.Node, cc *grpc.ClientConn, l *logrus.Entry) *nodeStatus { +func newConnectionStatus(node models.Node, cc *grpc.ClientConn, l *logrus.Entry, latencyHist metrics.HistogramVec) *nodeStatus { return &nodeStatus{ - Node: node, - ClientConn: cc, - statuses: make([]healthpb.HealthCheckResponse_ServingStatus, 0), - log: l, + Node: node, + ClientConn: cc, + statuses: make([]healthpb.HealthCheckResponse_ServingStatus, 0), + log: l, + latencyHist: latencyHist, } } type nodeStatus struct { models.Node *grpc.ClientConn - statuses []healthpb.HealthCheckResponse_ServingStatus - log *logrus.Entry + statuses []healthpb.HealthCheckResponse_ServingStatus + log *logrus.Entry + latencyHist metrics.HistogramVec } // GetStorage gets the storage name of a node @@ -284,7 +287,10 @@ func (n *nodeStatus) check() { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() + start := time.Now() resp, err := client.Check(ctx, &healthpb.HealthCheckRequest{Service: ""}) + n.latencyHist.WithLabelValues(n.Storage).Observe(time.Since(start).Seconds()) + if err != nil { n.log.WithError(err).WithField("storage", n.Storage).WithField("address", n.Address).Warn("error when pinging healthcheck") resp = &healthpb.HealthCheckResponse{ diff --git a/internal/praefect/nodes/manager_test.go b/internal/praefect/nodes/manager_test.go index 69a24c10c..8f5e1443b 100644 --- a/internal/praefect/nodes/manager_test.go +++ b/internal/praefect/nodes/manager_test.go @@ -8,6 +8,7 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/praefect/config" "gitlab.com/gitlab-org/gitaly/internal/praefect/models" "gitlab.com/gitlab-org/gitaly/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest" "google.golang.org/grpc" "google.golang.org/grpc/health/grpc_health_v1" ) @@ -24,14 +25,21 @@ func TestNodeStatus(t *testing.T) { require.NoError(t, err) - cs := newConnectionStatus(models.Node{}, cc, testhelper.DiscardTestEntry(t)) + mockHistogramVec := promtest.NewMockHistogramVec() + + storageName := "default" + cs := newConnectionStatus(models.Node{Storage: storageName}, cc, testhelper.DiscardTestEntry(t), mockHistogramVec) require.False(t, cs.isHealthy()) + var expectedLabels [][]string for i := 0; i < healthcheckThreshold; i++ { cs.check() + expectedLabels = append(expectedLabels, []string{storageName}) } require.True(t, cs.isHealthy()) + require.Equal(t, expectedLabels, mockHistogramVec.LabelsCalled()) + require.Len(t, mockHistogramVec.Observer().Observed(), healthcheckThreshold) healthSvr.SetServingStatus("", grpc_health_v1.HealthCheckResponse_NOT_SERVING) @@ -73,10 +81,11 @@ func TestNodeManager(t *testing.T) { FailoverEnabled: false, } - nm, err := NewManager(testhelper.DiscardTestEntry(t), confWithFailover) + mockHistogram := promtest.NewMockHistogramVec() + nm, err := NewManager(testhelper.DiscardTestEntry(t), confWithFailover, mockHistogram) require.NoError(t, err) - nmWithoutFailover, err := NewManager(testhelper.DiscardTestEntry(t), confWithoutFailover) + nmWithoutFailover, err := NewManager(testhelper.DiscardTestEntry(t), confWithoutFailover, mockHistogram) require.NoError(t, err) nm.Start(1*time.Millisecond, 5*time.Second) @@ -115,6 +124,11 @@ func TestNodeManager(t *testing.T) { healthSrv0.SetServingStatus("", grpc_health_v1.HealthCheckResponse_UNKNOWN) nm.checkShards() + labelsCalled := mockHistogram.LabelsCalled() + for _, node := range virtualStorages[0].Nodes { + require.Contains(t, labelsCalled, []string{node.Storage}) + } + // since the primary is unhealthy, we expect checkShards to demote primary to secondary, and promote the healthy // secondary to primary diff --git a/internal/praefect/replicator_test.go b/internal/praefect/replicator_test.go index 495ff1bdc..2dfe83467 100644 --- a/internal/praefect/replicator_test.go +++ b/internal/praefect/replicator_test.go @@ -141,7 +141,7 @@ func TestProcessReplicationJob(t *testing.T) { entry := testhelper.DiscardTestEntry(t) replicator.log = entry - nodeMgr, err := nodes.NewManager(entry, config) + nodeMgr, err := nodes.NewManager(entry, config, promtest.NewMockHistogramVec()) require.NoError(t, err) nodeMgr.Start(1*time.Millisecond, 5*time.Millisecond) @@ -280,7 +280,7 @@ func TestProcessBacklog_FailedJobs(t *testing.T) { require.NoError(t, ds.UpdateReplJobState(ids[0], datastore.JobStateReady)) - nodeMgr, err := nodes.NewManager(entry, config) + nodeMgr, err := nodes.NewManager(entry, config, promtest.NewMockHistogramVec()) require.NoError(t, err) replMgr := NewReplMgr("default", entry, ds, nodeMgr) @@ -410,7 +410,7 @@ func TestProcessBacklog_Success(t *testing.T) { require.NoError(t, ds.UpdateReplJobState(id, datastore.JobStateReady)) } - nodeMgr, err := nodes.NewManager(entry, config) + nodeMgr, err := nodes.NewManager(entry, config, promtest.NewMockHistogramVec()) require.NoError(t, err) replMgr := NewReplMgr("default", entry, ds, nodeMgr) diff --git a/internal/praefect/server_test.go b/internal/praefect/server_test.go index b5d01664b..c70777fb9 100644 --- a/internal/praefect/server_test.go +++ b/internal/praefect/server_test.go @@ -24,6 +24,7 @@ import ( "gitlab.com/gitlab-org/gitaly/internal/praefect/nodes" "gitlab.com/gitlab-org/gitaly/internal/praefect/protoregistry" "gitlab.com/gitlab-org/gitaly/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/internal/testhelper/promtest" "gitlab.com/gitlab-org/gitaly/internal/version" "gitlab.com/gitlab-org/gitaly/proto/go/gitalypb" healthpb "google.golang.org/grpc/health/grpc_health_v1" @@ -132,7 +133,7 @@ func TestGitalyServerInfoBadNode(t *testing.T) { } entry := testhelper.DiscardTestEntry(t) - nodeMgr, err := nodes.NewManager(entry, conf) + nodeMgr, err := nodes.NewManager(entry, conf, promtest.NewMockHistogramVec()) require.NoError(t, err) registry := protoregistry.New() diff --git a/internal/testhelper/promtest/histogram.go b/internal/testhelper/promtest/histogram.go index b35d58e0a..35208128f 100644 --- a/internal/testhelper/promtest/histogram.go +++ b/internal/testhelper/promtest/histogram.go @@ -2,6 +2,8 @@ package promtest import ( "sync" + + "github.com/prometheus/client_golang/prometheus" ) // MockHistogram is a mock histogram that adheres to prometheus.Histogram for use in unit tests @@ -16,3 +18,51 @@ func (m *MockHistogram) Observe(v float64) { defer m.m.Unlock() m.Values = append(m.Values, v) } + +func NewMockHistogramVec() *MockHistogramVec { + return &MockHistogramVec{} +} + +type MockHistogramVec struct { + m sync.RWMutex + labelsCalled [][]string + observer MockObserver +} + +func (m *MockHistogramVec) LabelsCalled() [][]string { + m.m.RLock() + defer m.m.RUnlock() + + return m.labelsCalled +} + +func (m *MockHistogramVec) Observer() *MockObserver { + return &m.observer +} + +func (m *MockHistogramVec) WithLabelValues(lvs ...string) prometheus.Observer { + m.m.Lock() + defer m.m.Unlock() + + m.labelsCalled = append(m.labelsCalled, lvs) + return &m.observer +} + +type MockObserver struct { + m sync.RWMutex + observed []float64 +} + +func (m *MockObserver) Observe(v float64) { + m.m.Lock() + defer m.m.Unlock() + + m.observed = append(m.observed, v) +} + +func (m *MockObserver) Observed() []float64 { + m.m.RLock() + defer m.m.RUnlock() + + return m.observed +} |