Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIgor Wiedler <iwiedler@gitlab.com>2022-04-05 15:23:59 +0300
committerIgor Wiedler <iwiedler@gitlab.com>2022-04-05 18:57:14 +0300
commita088c9b3e51e22df4e5274e2ae26b112552c01e1 (patch)
treecdfc4522dffc28e9f46bcf4bc37cdb55ed9885b4
parenta8ca1e263382d60f0263a90005876163805f44e6 (diff)
Expose command stats (rusage) metrics via prometheuscommand-stats-metrics
We currently track rusage metrics in logs on a per-RPC basis. This allows us to get a very fine-grained view into resource attribution. However, logs often do not lend themselves to corse-grained and long- term analysis. For this reason it is useful to expose metrics via prometheus. By aggregating that data as metrics, we aim to partially close an observability gap that exists for short-lived processes. The existing `process-exporter` metrics are severely under-reporting the utilization of short-lived processes, which gitaly spawns many of. See also: - https://gitlab.com/gitlab-com/gl-infra/scalability#1655 This patch introduces a set of `gitaly_command_*` metrics which provide aggregated resource attribution along the following dimensions: - `cmd` - the basename of the command being executed. - `subcmd` - an optional subcommand, e.g. `archive` for `git archive` - `grpc_service` - the grpc service caller - `grpc_method` - the grpc method caller The newly introduced metrics are: - `gitaly_command_cpu_seconds_total` Sum of CPU time spent - `gitaly_command_real_seconds_total` Sum of real time spent - `gitaly_command_minor_page_faults_total` Sum of minor page faults - `gitaly_command_major_page_faults_total` Sum of major page faults - `gitaly_command_signals_received_total` Sum of signals received - `gitaly_command_context_switches_total` Sum of context switches This feature is being introduced behind a feature flag. However, since metrics are sticky, once the metric has been defined, it will be returned by the process until the next restart. The cardinality of the metrics should be relatively well-bounded in any case.
-rw-r--r--internal/command/command.go93
-rw-r--r--internal/git/command_factory.go2
-rw-r--r--internal/git2go/executor.go5
-rw-r--r--internal/gitaly/hook/custom.go2
-rw-r--r--internal/gitaly/linguist/linguist.go2
-rw-r--r--internal/metadata/featureflag/ff_command_stats_metrics.go4
-rw-r--r--internal/testhelper/testhelper.go3
7 files changed, 109 insertions, 2 deletions
diff --git a/internal/command/command.go b/internal/command/command.go
index ba964209a..45167aab4 100644
--- a/internal/command/command.go
+++ b/internal/command/command.go
@@ -7,15 +7,20 @@ import (
"io"
"os"
"os/exec"
+ "path"
"strings"
"sync"
"syscall"
"time"
"github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus"
+ grpcmwtags "github.com/grpc-ecosystem/go-grpc-middleware/tags"
"github.com/opentracing/opentracing-go"
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/client_golang/prometheus/promauto"
"github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/v14/internal/command/commandcounter"
+ "gitlab.com/gitlab-org/gitaly/v14/internal/metadata/featureflag"
"gitlab.com/gitlab-org/labkit/tracing"
)
@@ -33,6 +38,51 @@ func init() {
}
}
+var (
+ cpuSecondsTotal = promauto.NewCounterVec(
+ prometheus.CounterOpts{
+ Name: "gitaly_command_cpu_seconds_total",
+ Help: "Sum of CPU time spent by shelling out",
+ },
+ []string{"grpc_service", "grpc_method", "cmd", "subcmd", "mode"},
+ )
+ realSecondsTotal = promauto.NewCounterVec(
+ prometheus.CounterOpts{
+ Name: "gitaly_command_real_seconds_total",
+ Help: "Sum of real time spent by shelling out",
+ },
+ []string{"grpc_service", "grpc_method", "cmd", "subcmd"},
+ )
+ minorPageFaultsTotal = promauto.NewCounterVec(
+ prometheus.CounterOpts{
+ Name: "gitaly_command_minor_page_faults_total",
+ Help: "Sum of minor page faults performed while shelling out",
+ },
+ []string{"grpc_service", "grpc_method", "cmd", "subcmd"},
+ )
+ majorPageFaultsTotal = promauto.NewCounterVec(
+ prometheus.CounterOpts{
+ Name: "gitaly_command_major_page_faults_total",
+ Help: "Sum of major page faults performed while shelling out",
+ },
+ []string{"grpc_service", "grpc_method", "cmd", "subcmd"},
+ )
+ signalsReceivedTotal = promauto.NewCounterVec(
+ prometheus.CounterOpts{
+ Name: "gitaly_command_signals_received_total",
+ Help: "Sum of signals received while shelling out",
+ },
+ []string{"grpc_service", "grpc_method", "cmd", "subcmd"},
+ )
+ contextSwitchesTotal = promauto.NewCounterVec(
+ prometheus.CounterOpts{
+ Name: "gitaly_command_context_switches_total",
+ Help: "Sum of context switches performed while shelling out",
+ },
+ []string{"grpc_service", "grpc_method", "cmd", "subcmd", "ctxswitchtype"},
+ )
+)
+
// GitEnv contains the ENV variables for git commands
var GitEnv = []string{
// Force english locale for consistency on the output messages
@@ -100,7 +150,8 @@ type Command struct {
waitError error
waitOnce sync.Once
- span opentracing.Span
+ metricsSubCmd string
+ span opentracing.Span
cgroupPath string
}
@@ -151,6 +202,11 @@ func (c *Command) SetCgroupPath(path string) {
c.cgroupPath = path
}
+// SetMetricsSubCmd sets the "subcmd" label used in metrics
+func (c *Command) SetMetricsSubCmd(metricsSubCmd string) {
+ c.metricsSubCmd = metricsSubCmd
+}
+
type contextWithoutDonePanic string
// New creates a Command from an exec.Cmd. On success, the Command
@@ -392,6 +448,21 @@ func (c *Command) logProcessComplete() {
}
}
+ if featureflag.CommandStatsMetrics.IsEnabled(ctx) {
+ service, method := methodFromContext(ctx)
+ cmdName := path.Base(c.cmd.Path)
+ cpuSecondsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "system").Add(systemTime.Seconds())
+ cpuSecondsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "user").Add(userTime.Seconds())
+ realSecondsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(realTime.Seconds())
+ if ok {
+ minorPageFaultsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(float64(rusage.Minflt))
+ majorPageFaultsTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(float64(rusage.Majflt))
+ signalsReceivedTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd).Add(float64(rusage.Nsignals))
+ contextSwitchesTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "voluntary").Add(float64(rusage.Nvcsw))
+ contextSwitchesTotal.WithLabelValues(service, method, cmdName, c.metricsSubCmd, "nonvoluntary").Add(float64(rusage.Nivcsw))
+ }
+ }
+
c.span.LogKV(
"pid", cmd.ProcessState.Pid(),
"exit_code", exitCode,
@@ -411,6 +482,26 @@ func (c *Command) logProcessComplete() {
c.span.Finish()
}
+func methodFromContext(ctx context.Context) (service string, method string) {
+ tags := grpcmwtags.Extract(ctx)
+ ctxValue := tags.Values()["grpc.request.fullMethod"]
+ if ctxValue == nil {
+ return "", ""
+ }
+
+ if s, ok := ctxValue.(string); ok {
+ // Expect: "/foo.BarService/Qux"
+ split := strings.Split(s, "/")
+ if len(split) != 3 {
+ return "", ""
+ }
+
+ return split[1], split[2]
+ }
+
+ return "", ""
+}
+
// Command arguments will be passed to the exec syscall as
// null-terminated C strings. That means the arguments themselves may not
// contain a null byte. The go stdlib checks for null bytes but it
diff --git a/internal/git/command_factory.go b/internal/git/command_factory.go
index 056b8b170..3a5c58bac 100644
--- a/internal/git/command_factory.go
+++ b/internal/git/command_factory.go
@@ -390,6 +390,8 @@ func (cf *ExecCommandFactory) newCommand(ctx context.Context, repo repository.Gi
return nil, err
}
+ command.SetMetricsSubCmd(sc.Subcommand())
+
if featureflag.RunCommandsInCGroup.IsEnabled(ctx) {
if err := cf.cgroupsManager.AddCommand(command); err != nil {
return nil, err
diff --git a/internal/git2go/executor.go b/internal/git2go/executor.go
index 45ed5e8c1..73dc095e8 100644
--- a/internal/git2go/executor.go
+++ b/internal/git2go/executor.go
@@ -49,7 +49,7 @@ func NewExecutor(cfg config.Cfg, gitCmdFactory git.CommandFactory, locator stora
}
}
-func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Reader, args ...string) (*bytes.Buffer, error) {
+func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Reader, subcmd string, args ...string) (*bytes.Buffer, error) {
repoPath, err := b.locator.GetRepoPath(repo)
if err != nil {
return nil, fmt.Errorf("gitaly-git2go: %w", err)
@@ -66,6 +66,7 @@ func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Re
"-log-format", b.logFormat,
"-log-level", b.logLevel,
"-correlation-id", correlation.ExtractFromContext(ctx),
+ subcmd,
}, args...)
var stdout bytes.Buffer
@@ -74,6 +75,8 @@ func (b *Executor) run(ctx context.Context, repo repository.GitRepo, stdin io.Re
return nil, err
}
+ cmd.SetMetricsSubCmd(subcmd)
+
if err := cmd.Wait(); err != nil {
return nil, err
}
diff --git a/internal/gitaly/hook/custom.go b/internal/gitaly/hook/custom.go
index 791fbeea4..5020f8530 100644
--- a/internal/gitaly/hook/custom.go
+++ b/internal/gitaly/hook/custom.go
@@ -73,6 +73,8 @@ func (m *GitLabHookManager) newCustomHooksExecutor(repo *gitalypb.Repository, ho
return err
}
+ c.SetMetricsSubCmd(hookName)
+
if err = c.Wait(); err != nil {
// Custom hook errors need to be handled specially when we update
// refs via updateref.UpdaterWithHooks: their stdout and stderr must
diff --git a/internal/gitaly/linguist/linguist.go b/internal/gitaly/linguist/linguist.go
index fdd7ee6e7..9c498e0cf 100644
--- a/internal/gitaly/linguist/linguist.go
+++ b/internal/gitaly/linguist/linguist.go
@@ -128,6 +128,8 @@ func (inst *Instance) startGitLinguist(ctx context.Context, repoPath string, com
return nil, fmt.Errorf("creating command: %w", err)
}
+ internalCmd.SetMetricsSubCmd(linguistCommand)
+
return internalCmd, nil
}
diff --git a/internal/metadata/featureflag/ff_command_stats_metrics.go b/internal/metadata/featureflag/ff_command_stats_metrics.go
new file mode 100644
index 000000000..a11206c52
--- /dev/null
+++ b/internal/metadata/featureflag/ff_command_stats_metrics.go
@@ -0,0 +1,4 @@
+package featureflag
+
+// CommandStatsMetrics tracks additional prometheus metrics for each shelled out command
+var CommandStatsMetrics = NewFeatureFlag("command_stats_metrics", false)
diff --git a/internal/testhelper/testhelper.go b/internal/testhelper/testhelper.go
index 2e780398b..f723aaf19 100644
--- a/internal/testhelper/testhelper.go
+++ b/internal/testhelper/testhelper.go
@@ -174,6 +174,9 @@ func ContextWithoutCancel(opts ...ContextOpt) context.Context {
// ConcurrencyQueueMaxWait is in the codepath of every RPC call since it's in the limithandler
// middleware.
ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.ConcurrencyQueueMaxWait, true)
+ // CommandStatsMetrics is checked on every shelled out command, which may happen outside of
+ // RPC context.
+ ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.CommandStatsMetrics, true)
for _, opt := range opts {
ctx = opt(ctx)