diff options
author | Zeger-Jan van de Weg <git@zjvandeweg.nl> | 2021-07-13 15:06:08 +0300 |
---|---|---|
committer | Zeger-Jan van de Weg <git@zjvandeweg.nl> | 2021-07-13 15:06:08 +0300 |
commit | 40511f7a14ded77c826809d054d740a66e1c106f (patch) | |
tree | 8b83a010c70e4402478774d921aae0e02f0434f9 | |
parent | d4ea957f6131538cd78e490a585ea3a455251064 (diff) | |
parent | 12061b1c3cfba4c14cb89200ced7e814aa06b813 (diff) |
Merge branch 'smh-unavailable-repos-metric' into 'master'
Update read-only repository count metric to account for lazy failover
See merge request gitlab-org/gitaly!3548
-rw-r--r-- | cmd/praefect/main.go | 7 | ||||
-rw-r--r-- | internal/praefect/datastore/collector.go | 101 | ||||
-rw-r--r-- | internal/praefect/datastore/collector_test.go | 230 |
3 files changed, 185 insertions, 153 deletions
diff --git a/cmd/praefect/main.go b/cmd/praefect/main.go index 2418b8a90..a906ee21c 100644 --- a/cmd/praefect/main.go +++ b/cmd/praefect/main.go @@ -401,12 +401,7 @@ func run(cfgs []starter.Config, conf config.Config) error { metricsCollectors = append(metricsCollectors, transactionManager, coordinator, repl) if db != nil { prometheus.MustRegister( - datastore.NewRepositoryStoreCollector( - logger, - conf.VirtualStorageNames(), - db, - conf.Failover.ElectionStrategy == config.ElectionStrategyPerRepository, - ), + datastore.NewRepositoryStoreCollector(logger, conf.VirtualStorageNames(), db), ) } prometheus.MustRegister(metricsCollectors...) diff --git a/internal/praefect/datastore/collector.go b/internal/praefect/datastore/collector.go index 098c46dfb..dc9500b47 100644 --- a/internal/praefect/datastore/collector.go +++ b/internal/praefect/datastore/collector.go @@ -10,6 +10,9 @@ import ( "gitlab.com/gitlab-org/gitaly/v14/internal/praefect/datastore/glsql" ) +// This is kept for backwards compatibility as some alerting rules depend on this. +// The unavailable repositories is a more accurate description for the metric and +// is exported below so we can migrate to it. var descReadOnlyRepositories = prometheus.NewDesc( "gitaly_praefect_read_only_repositories", "Number of repositories in read-only mode within a virtual storage.", @@ -17,21 +20,26 @@ var descReadOnlyRepositories = prometheus.NewDesc( nil, ) +var descUnavailableRepositories = prometheus.NewDesc( + "gitaly_praefect_unavailable_repositories", + "Number of repositories that have no healthy, up to date replicas.", + []string{"virtual_storage"}, + nil, +) + // RepositoryStoreCollector collects metrics from the RepositoryStore. type RepositoryStoreCollector struct { - log logrus.FieldLogger - db glsql.Querier - virtualStorages []string - repositoryScoped bool + log logrus.FieldLogger + db glsql.Querier + virtualStorages []string } // NewRepositoryStoreCollector returns a new collector. -func NewRepositoryStoreCollector(log logrus.FieldLogger, virtualStorages []string, db glsql.Querier, repositoryScoped bool) *RepositoryStoreCollector { +func NewRepositoryStoreCollector(log logrus.FieldLogger, virtualStorages []string, db glsql.Querier) *RepositoryStoreCollector { return &RepositoryStoreCollector{ - log: log.WithField("component", "RepositoryStoreCollector"), - db: db, - virtualStorages: virtualStorages, - repositoryScoped: repositoryScoped, + log: log.WithField("component", "RepositoryStoreCollector"), + db: db, + virtualStorages: virtualStorages, } } @@ -40,76 +48,39 @@ func (c *RepositoryStoreCollector) Describe(ch chan<- *prometheus.Desc) { } func (c *RepositoryStoreCollector) Collect(ch chan<- prometheus.Metric) { - readOnlyCounts, err := c.queryMetrics(context.TODO()) + unavailableCounts, err := c.queryMetrics(context.TODO()) if err != nil { c.log.WithError(err).Error("failed collecting read-only repository count metric") return } for _, vs := range c.virtualStorages { - ch <- prometheus.MustNewConstMetric(descReadOnlyRepositories, prometheus.GaugeValue, float64(readOnlyCounts[vs]), vs) + for _, desc := range []*prometheus.Desc{descReadOnlyRepositories, descUnavailableRepositories} { + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(unavailableCounts[vs]), vs) + } } } -// queryMetrics queries the number of read-only repositories from the database. -// A repository is in read-only mode when its primary storage is not on the latest -// generation. -// -// There are two variants on the query: -// -// 1. virtualStorageScopedQuery considers virtual storage scoped primaries. Virtual storage's -// primary is stored in the `shard_primaries` table in the `node_name` column. -// 2. repositoryScopedQuery considers repository specific primaries. Repository scoped -// primaries are stored in the `primary` column in the `repositories` table. -// -// Both queries cross-reference the `repositories` and `storage_repositories` tables -// to see if the primary storage of the repository is on the latest generation. If not, -// it's added to the returned count. -// -// The query operating on virtual storage scoped primaries will be dropped once the migration -// to repository scoped primaries is finished. +// queryMetrics queries the number of unavailable repositories from the database. +// A repository is unavailable when it has no replicas that can act as a primary, indicating +// they are either unhealthy or out of date. func (c *RepositoryStoreCollector) queryMetrics(ctx context.Context) (map[string]int, error) { - const virtualStorageScopedQuery = ` -SELECT repositories.virtual_storage, COUNT(*) + rows, err := c.db.QueryContext(ctx, ` +SELECT virtual_storage, COUNT(*) FROM repositories -LEFT JOIN shard_primaries ON - shard_primaries.shard_name = repositories.virtual_storage AND - shard_primaries.demoted = false -LEFT JOIN storage_repositories ON - repositories.virtual_storage = storage_repositories.virtual_storage AND - repositories.relative_path = storage_repositories.relative_path AND - shard_primaries.node_name = storage_repositories.storage -WHERE - COALESCE(storage_repositories.generation, -1) < repositories.generation AND - repositories.virtual_storage = ANY($1) -GROUP BY repositories.virtual_storage; - ` - - const repositoryScopedQuery = ` -SELECT repositories.virtual_storage, COUNT(*) -FROM repositories -LEFT JOIN storage_repositories ON - repositories.virtual_storage = storage_repositories.virtual_storage AND - repositories.relative_path = storage_repositories.relative_path AND - repositories.primary = storage_repositories.storage -WHERE - COALESCE(storage_repositories.generation, -1) < repositories.generation AND - repositories.virtual_storage = ANY($1) -GROUP BY repositories.virtual_storage -` - - query := virtualStorageScopedQuery - if c.repositoryScoped { - query = repositoryScopedQuery - } - - rows, err := c.db.QueryContext(ctx, query, pq.StringArray(c.virtualStorages)) +WHERE NOT EXISTS ( + SELECT FROM valid_primaries + WHERE valid_primaries.virtual_storage = repositories.virtual_storage + AND valid_primaries.relative_path = repositories.relative_path +) AND repositories.virtual_storage = ANY($1) +GROUP BY virtual_storage + `, pq.StringArray(c.virtualStorages)) if err != nil { return nil, fmt.Errorf("query: %w", err) } defer rows.Close() - vsReadOnly := make(map[string]int) + vsUnavailable := make(map[string]int) for rows.Next() { var vs string var count int @@ -118,8 +89,8 @@ GROUP BY repositories.virtual_storage return nil, fmt.Errorf("scan: %w", err) } - vsReadOnly[vs] = count + vsUnavailable[vs] = count } - return vsReadOnly, rows.Err() + return vsUnavailable, rows.Err() } diff --git a/internal/praefect/datastore/collector_test.go b/internal/praefect/datastore/collector_test.go index 65248605e..f840a6749 100644 --- a/internal/praefect/datastore/collector_test.go +++ b/internal/praefect/datastore/collector_test.go @@ -17,107 +17,173 @@ func TestRepositoryStoreCollector(t *testing.T) { ctx, cancel := testhelper.Context() defer cancel() - db := getDB(t) - rs := NewPostgresRepositoryStore(db, nil) + type replicas map[string]struct { + generation int + assigned bool + } - state := map[string]map[string]map[string]int{ - "some-read-only": { - "read-only": { - "vs-primary": 0, - "repo-primary": 0, - "secondary": 1, - }, - "writable": { - "vs-primary": 1, - "repo-primary": 1, - "secondary": 1, + type repositories []struct { + deleted bool + relativePath string + replicas replicas + } + + for _, tc := range []struct { + desc string + healthyNodes []string + repositories repositories + count int + }{ + { + desc: "no repositories", + }, + { + desc: "deleted repositories are not considered unavailable", + repositories: repositories{ + { + deleted: true, + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 0}, + }, + }, }, - "repo-writable": { - "vs-primary": 0, - "repo-primary": 1, - "secondary": 1, + }, + { + desc: "repositories without any healthy replicas are counted", + repositories: repositories{ + { + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 0}, + }, + }, }, + count: 1, }, - "all-writable": { - "writable": { - "vs-primary": 0, - "repo-primary": 0, + { + desc: "repositories with healthy replicas are not counted", + healthyNodes: []string{"storage-1"}, + repositories: repositories{ + { + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 0}, + }, + }, }, }, - "unconfigured": { - "read-only": { - "secondary": 1, + { + desc: "repositories with only outdated healthy replicas are counted", + healthyNodes: []string{"storage-1"}, + repositories: repositories{ + { + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 0}, + "storage-2": {generation: 1}, + }, + }, }, + count: 1, }, - "no-records": {}, - "no-primary": { - "read-only": { - "secondary": 0, + { + desc: "repositories with unassigned fully up to date healthy replicas are not counted", + healthyNodes: []string{"storage-2"}, + repositories: repositories{ + { + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 1, assigned: true}, + "storage-2": {generation: 1}, + }, + }, }, }, - } - for virtualStorage, relativePaths := range state { - demoted := false - if virtualStorage == "no-primary" { - demoted = true - } - _, err := db.ExecContext(ctx, ` - INSERT INTO shard_primaries (shard_name, node_name, elected_by_praefect, elected_at, demoted) - VALUES ($1, 'vs-primary', 'not-needed', now(), $2) - `, virtualStorage, demoted, - ) - require.NoError(t, err) - - for relativePath, storages := range relativePaths { - if virtualStorage != "no-primary" { - _, err := db.ExecContext(ctx, ` - INSERT INTO repositories (virtual_storage, relative_path, "primary") - VALUES ($1, $2, 'repo-primary') - `, virtualStorage, relativePath, - ) - require.NoError(t, err) - } - - for storage, generation := range storages { - require.NoError(t, rs.SetGeneration(ctx, virtualStorage, relativePath, storage, generation)) - } - } - } - - var virtualStorages []string - for vs := range state { - if vs == "unconfigured" { - continue - } - - virtualStorages = append(virtualStorages, vs) - } - - for _, tc := range []struct { - desc string - repositoryScoped bool - someReadOnlyCount int - }{ { - desc: "repository scoped", - someReadOnlyCount: 1, - repositoryScoped: true, + desc: "repositories with unassigned, outdated replicas is not unavailable", + healthyNodes: []string{"storage-1"}, + repositories: repositories{ + { + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 1, assigned: true}, + "storage-2": {generation: 0}, + }, + }, + }, }, { - desc: "virtual storage scoped", - someReadOnlyCount: 2, + desc: "multiple unavailable repositories are counted correctly", + healthyNodes: []string{"storage-2"}, + repositories: repositories{ + { + relativePath: "repository-1", + replicas: replicas{ + "storage-1": {generation: 1}, + "storage-2": {generation: 0}, + }, + }, + { + relativePath: "repository-2", + replicas: replicas{ + "storage-1": {generation: 1}, + "storage-2": {generation: 0}, + }, + }, + }, + count: 2, }, } { t.Run(tc.desc, func(t *testing.T) { - c := NewRepositoryStoreCollector(logrus.New(), virtualStorages, db, tc.repositoryScoped) + tx, err := getDB(t).Begin() + require.NoError(t, err) + defer tx.Rollback() + + testhelper.SetHealthyNodes(t, ctx, tx, map[string]map[string][]string{ + "praefect-0": {"virtual-storage-1": tc.healthyNodes}, + }) + + for _, repository := range tc.repositories { + if !repository.deleted { + _, err := tx.ExecContext(ctx, ` + INSERT INTO repositories (virtual_storage, relative_path) + VALUES ('virtual-storage-1', $1) + `, repository.relativePath, + ) + require.NoError(t, err) + } + + for storage, replica := range repository.replicas { + if replica.assigned { + _, err := tx.ExecContext(ctx, ` + INSERT INTO repository_assignments (virtual_storage, relative_path, storage) + VALUES ('virtual-storage-1', $1, $2) + `, repository.relativePath, storage, + ) + require.NoError(t, err) + } + + _, err := tx.ExecContext(ctx, ` + INSERT INTO storage_repositories (virtual_storage, relative_path, storage, generation) + VALUES ('virtual-storage-1', $1, $2, $3) + `, repository.relativePath, storage, replica.generation, + ) + require.NoError(t, err) + } + } + + c := NewRepositoryStoreCollector(logrus.New(), []string{"virtual-storage-1", "virtual-storage-2"}, tx) require.NoError(t, testutil.CollectAndCompare(c, strings.NewReader(fmt.Sprintf(` # HELP gitaly_praefect_read_only_repositories Number of repositories in read-only mode within a virtual storage. # TYPE gitaly_praefect_read_only_repositories gauge -gitaly_praefect_read_only_repositories{virtual_storage="all-writable"} 0 -gitaly_praefect_read_only_repositories{virtual_storage="no-records"} 0 -gitaly_praefect_read_only_repositories{virtual_storage="no-primary"} 1 -gitaly_praefect_read_only_repositories{virtual_storage="some-read-only"} %d -`, tc.someReadOnlyCount)))) +gitaly_praefect_read_only_repositories{virtual_storage="virtual-storage-1"} %d +gitaly_praefect_read_only_repositories{virtual_storage="virtual-storage-2"} 0 +# HELP gitaly_praefect_unavailable_repositories Number of repositories that have no healthy, up to date replicas. +# TYPE gitaly_praefect_unavailable_repositories gauge +gitaly_praefect_unavailable_repositories{virtual_storage="virtual-storage-1"} %d +gitaly_praefect_unavailable_repositories{virtual_storage="virtual-storage-2"} 0 + `, tc.count, tc.count)))) }) } } |