diff options
author | Igor Wiedler <iwiedler@gitlab.com> | 2022-04-05 15:23:59 +0300 |
---|---|---|
committer | Igor Wiedler <iwiedler@gitlab.com> | 2022-04-05 18:57:14 +0300 |
commit | a088c9b3e51e22df4e5274e2ae26b112552c01e1 (patch) | |
tree | cdfc4522dffc28e9f46bcf4bc37cdb55ed9885b4 | |
parent | a8ca1e263382d60f0263a90005876163805f44e6 (diff) |
Expose command stats (rusage) metrics via prometheuscommand-stats-metrics
We currently track rusage metrics in logs on a per-RPC basis. This
allows us to get a very fine-grained view into resource attribution.
However, logs often do not lend themselves to corse-grained and long-
term analysis. For this reason it is useful to expose metrics via
prometheus.
By aggregating that data as metrics, we aim to partially close an
observability gap that exists for short-lived processes. The existing
`process-exporter` metrics are severely under-reporting the utilization
of short-lived processes, which gitaly spawns many of.
See also:
- https://gitlab.com/gitlab-com/gl-infra/scalability#1655
This patch introduces a set of `gitaly_command_*` metrics which
provide aggregated resource attribution along the following
dimensions:
- `cmd` - the basename of the command being executed.
- `subcmd` - an optional subcommand, e.g. `archive` for `git archive`
- `grpc_service` - the grpc service caller
- `grpc_method` - the grpc method caller
The newly introduced metrics are:
- `gitaly_command_cpu_seconds_total` Sum of CPU time spent
- `gitaly_command_real_seconds_total` Sum of real time spent
- `gitaly_command_minor_page_faults_total` Sum of minor page faults
- `gitaly_command_major_page_faults_total` Sum of major page faults
- `gitaly_command_signals_received_total` Sum of signals received
- `gitaly_command_context_switches_total` Sum of context switches
This feature is being introduced behind a feature flag. However,
since metrics are sticky, once the metric has been defined, it
will be returned by the process until the next restart.
The cardinality of the metrics should be relatively well-bounded
in any case.
-rw-r--r-- | internal/command/command.go | 93 | ||||
-rw-r--r-- | internal/git/command_factory.go | 2 | ||||
-rw-r--r-- | internal/git2go/executor.go | 5 | ||||
-rw-r--r-- | internal/gitaly/hook/custom.go | 2 | ||||
-rw-r--r-- | internal/gitaly/linguist/linguist.go | 2 | ||||
-rw-r--r-- | internal/metadata/featureflag/ff_command_stats_metrics.go | 4 | ||||
-rw-r--r-- | internal/testhelper/testhelper.go | 3 |
7 files changed, 109 insertions, 2 deletions
diff --git a/internal/command/command.go b/internal/command/command.go index ba964209a..45167aab4 100644 --- a/internal/command/command.go +++ b/internal/command/command.go @@ -7,15 +7,20 @@ import ( "io" "os" "os/exec" + "path" "strings" "sync" "syscall" "time" "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" + grpcmwtags "github.com/grpc-ecosystem/go-grpc-middleware/tags" "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/v14/internal/command/commandcounter" + "gitlab.com/gitlab-org/gitaly/v14/internal/metadata/featureflag" "gitlab.com/gitlab-org/labkit/tracing" ) @@ -33,6 +38,51 @@ func init() { } } +var ( + cpuSecondsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "gitaly_command_cpu_seconds_total", + Help: "Sum of CPU time spent by shelling out", + }, + []string{"grpc_service", "grpc_method", "cmd", "subcmd", "mode"}, + ) + realSecondsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "gitaly_command_real_seconds_total", + Help: "Sum of real time spent by shelling out", + }, + []string{"grpc_service", "grpc_method", "cmd", "subcmd"}, + ) + minorPageFaultsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "gitaly_command_minor_page_faults_total", + Help: "Sum of minor page faults performed while shelling out", + }, + []string{"grpc_service", "grpc_method", "cmd", "subcmd"}, + ) + majorPageFaultsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "gitaly_command_major_page_faults_total", + Help: "Sum of major page faults performed while shelling out", + }, + []string{"grpc_service", "grpc_method", "cmd", "subcmd"}, + ) + signalsReceivedTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "gitaly_command_signals_received_total", + Help: "Sum of signals received while shelling out", + }, + []string{"grpc_service", "grpc_method", "cmd", "subcmd"}, + ) + contextSwitchesTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "gitaly_command_context_switches_total", + Help: "Sum of context switches performed while shelling out", + }, + []string{"grpc_service", "grpc_method", "cmd", "subcmd", "ctxswitchtype"}, + ) +) + // GitEnv contains the ENV variables for git commands var GitEnv = []string{ // Force english locale for consistency on the output messages @@ -100,7 +150,8 @@ type Command struct { waitError error waitOnce sync.Once - span opentracing.Span + metricsSubCmd string + span opentracing.Span cgroupPath string } @@ -151,6 +202,11 @@ func (c *Command) SetCgroupPath(path string) { c.cgroupPath = path } +// SetMetricsSubCmd sets the "subcmd" label used in metrics +func (c *Command) SetMetricsSubCmd(metricsSubCmd string) { + c.metricsSubCmd = metricsSubCmd +} + type contextWithoutDonePanic string // New creates a Command from an exec.Cmd. On success, the Command @@ -392,6 +448,21 @@ func (c *Command) logProcessComplete() { } } + if featureflag.CommandStatsMetrics.IsEnabled(ctx) { + service, method := methodFromContext(ctx) + cmdName := path.Base(c.cmd.Path) + cpuSecondsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "system").Add(systemTime.Seconds()) + cpuSecondsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "user").Add(userTime.Seconds()) + realSecondsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(realTime.Seconds()) + if ok { + minorPageFaultsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(float64(rusage.Minflt)) + majorPageFaultsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(float64(rusage.Majflt)) + signalsReceivedTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(float64(rusage.Nsignals)) + contextSwitchesTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "voluntary").Add(float64(rusage.Nvcsw)) + contextSwitchesTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "nonvoluntary").Add(float64(rusage.Nivcsw)) + } + } + c.span.LogKV( "pid", cmd.ProcessState.Pid(), "exit_code", exitCode, @@ -411,6 +482,26 @@ func (c *Command) logProcessComplete() { c.span.Finish() } +func methodFromContext(ctx context.Context) (service string, method string) { + tags := grpcmwtags.Extract(ctx) + ctxValue := tags.Values()["grpc.request.fullMethod"] + if ctxValue == nil { + return "", "" + } + + if s, ok := ctxValue.(string); ok { + // Expect: "/foo.BarService/Qux" + split := strings.Split(s, "/") + if len(split) != 3 { + return "", "" + } + + return split[1], split[2] + } + + return "", "" +} + // Command arguments will be passed to the exec syscall as // null-terminated C strings. That means the arguments themselves may not // contain a null byte. The go stdlib checks for null bytes but it diff --git a/internal/git/command_factory.go b/internal/git/command_factory.go index 056b8b170..3a5c58bac 100644 --- a/internal/git/command_factory.go +++ b/internal/git/command_factory.go @@ -390,6 +390,8 @@ func (cf *ExecCommandFactory) newCommand(ctx context.Context, repo repository.Gi return nil, err } + command.SetMetricsSubCmd(sc.Subcommand()) + if featureflag.RunCommandsInCGroup.IsEnabled(ctx) { if err := cf.cgroupsManager.AddCommand(command); err != nil { return nil, err diff --git a/internal/git2go/executor.go b/internal/git2go/executor.go index 45ed5e8c1..73dc095e8 100644 --- a/internal/git2go/executor.go +++ b/internal/git2go/executor.go @@ -49,7 +49,7 @@ func NewExecutor(cfg config.Cfg, gitCmdFactory git.CommandFactory, locator stora } } -func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Reader, args ...string) (*bytes.Buffer, error) { +func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Reader, subcmd string, args ...string) (*bytes.Buffer, error) { repoPath, err := b.locator.GetRepoPath(repo) if err != nil { return nil, fmt.Errorf("gitaly-git2go: %w", err) @@ -66,6 +66,7 @@ func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Re "-log-format", b.logFormat, "-log-level", b.logLevel, "-correlation-id", correlation.ExtractFromContext(ctx), + subcmd, }, args...) var stdout bytes.Buffer @@ -74,6 +75,8 @@ func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Re return nil, err } + cmd.SetMetricsSubCmd(subcmd) + if err := cmd.Wait(); err != nil { return nil, err } diff --git a/internal/gitaly/hook/custom.go b/internal/gitaly/hook/custom.go index 791fbeea4..5020f8530 100644 --- a/internal/gitaly/hook/custom.go +++ b/internal/gitaly/hook/custom.go @@ -73,6 +73,8 @@ func (m *GitLabHookManager) newCustomHooksExecutor(repo *gitalypb.Repository, ho return err } + c.SetMetricsSubCmd(hookName) + if err = c.Wait(); err != nil { // Custom hook errors need to be handled specially when we update // refs via updateref.UpdaterWithHooks: their stdout and stderr must diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go index fdd7ee6e7..9c498e0cf 100644 --- a/internal/gitaly/linguist/linguist.go +++ b/internal/gitaly/linguist/linguist.go @@ -128,6 +128,8 @@ func (inst *Instance) startGitLinguist(ctx context.Context, repoPath string, com return nil, fmt.Errorf("creating command: %w", err) } + internalCmd.SetMetricsSubCmd(linguistCommand) + return internalCmd, nil } diff --git a/internal/metadata/featureflag/ff_command_stats_metrics.go b/internal/metadata/featureflag/ff_command_stats_metrics.go new file mode 100644 index 000000000..a11206c52 --- /dev/null +++ b/internal/metadata/featureflag/ff_command_stats_metrics.go @@ -0,0 +1,4 @@ +package featureflag + +// CommandStatsMetrics tracks additional prometheus metrics for each shelled out command +var CommandStatsMetrics = NewFeatureFlag("command_stats_metrics", false) diff --git a/internal/testhelper/testhelper.go b/internal/testhelper/testhelper.go index 2e780398b..f723aaf19 100644 --- a/internal/testhelper/testhelper.go +++ b/internal/testhelper/testhelper.go @@ -174,6 +174,9 @@ func ContextWithoutCancel(opts ...ContextOpt) context.Context { // ConcurrencyQueueMaxWait is in the codepath of every RPC call since it's in the limithandler // middleware. ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.ConcurrencyQueueMaxWait, true) + // CommandStatsMetrics is checked on every shelled out command, which may happen outside of + // RPC context. + ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.CommandStatsMetrics, true) for _, opt := range opts { ctx = opt(ctx) |