diff options
author | John Cai <jcai@gitlab.com> | 2021-12-08 15:45:11 +0300 |
---|---|---|
committer | John Cai <jcai@gitlab.com> | 2021-12-08 16:13:04 +0300 |
commit | 234974414f2e1f5c8855f4e07289a6570caf1c90 (patch) | |
tree | cdc62d025dd4354b50937282e15c651af71a0b2f | |
parent | 3ca16212b97e4c3f6a46b257efefc5dfc199adc3 (diff) |
cgroups: emit cgroups stats to prometheus
In order to gain visibility into how cgroups is performing, turn the
cgroups Manager into a Prometheus metrics collector (it gains Describe
and Collect methods) so its metrics can bubble up through the
CommandFactory.
Changelog: changed
-rw-r--r-- | internal/cgroups/cgroups.go | 3 | ||||
-rw-r--r-- | internal/cgroups/mock_linux_test.go | 43 | ||||
-rw-r--r-- | internal/cgroups/noop.go | 7 | ||||
-rw-r--r-- | internal/cgroups/v1_linux.go | 55 | ||||
-rw-r--r-- | internal/cgroups/v1_linux_test.go | 42 | ||||
-rw-r--r-- | internal/git/command_factory.go | 1 | ||||
-rw-r--r-- | internal/git/command_factory_cgroup_test.go | 4 |
7 files changed, 153 insertions, 2 deletions
diff --git a/internal/cgroups/cgroups.go b/internal/cgroups/cgroups.go index 733f89d74..273746685 100644 --- a/internal/cgroups/cgroups.go +++ b/internal/cgroups/cgroups.go @@ -1,6 +1,7 @@ package cgroups import ( + "github.com/prometheus/client_golang/prometheus" "gitlab.com/gitlab-org/gitaly/v14/internal/command" "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config/cgroups" ) @@ -17,6 +18,8 @@ type Manager interface { // It is expected to be called once at Gitaly shutdown from any // instance of the Manager. Cleanup() error + Describe(ch chan<- *prometheus.Desc) + Collect(ch chan<- prometheus.Metric) } // NewManager returns the appropriate Cgroups manager diff --git a/internal/cgroups/mock_linux_test.go b/internal/cgroups/mock_linux_test.go index 38f1e5ebc..d42c3e827 100644 --- a/internal/cgroups/mock_linux_test.go +++ b/internal/cgroups/mock_linux_test.go @@ -21,6 +21,7 @@ package cgroups import ( "os" "path/filepath" + "strconv" "testing" "github.com/containerd/cgroups" @@ -54,3 +55,45 @@ func newMock(t *testing.T) *mockCgroup { func (m *mockCgroup) hierarchy() ([]cgroups.Subsystem, error) { return m.subsystems, nil } + +func (m *mockCgroup) setupMockCgroupFiles( + t *testing.T, + manager *CGroupV1Manager, + memFailCount int, +) { + for _, s := range m.subsystems { + path := filepath.Join(m.root, string(s.Name()), manager.currentProcessCgroup()) + require.NoError(t, os.MkdirAll(path, 0o644)) + + for _, emptyFile := range []string{ + "cpu.stat", + "memory.stat", + "memory.oom_control", + } { + require.NoError(t, os.WriteFile(filepath.Join(path, emptyFile), []byte(""), 0o644)) + } + + for _, zeroFile := range []string{ + "memory.usage_in_bytes", + "memory.max_usage_in_bytes", + "memory.limit_in_bytes", + "memory.failcnt", + "memory.memsw.failcnt", + "memory.memsw.usage_in_bytes", + "memory.memsw.max_usage_in_bytes", + "memory.memsw.limit_in_bytes", + "memory.kmem.usage_in_bytes", + "memory.kmem.max_usage_in_bytes", + "memory.kmem.failcnt", + 
"memory.kmem.limit_in_bytes", + "memory.kmem.tcp.usage_in_bytes", + "memory.kmem.tcp.max_usage_in_bytes", + "memory.kmem.tcp.failcnt", + "memory.kmem.tcp.limit_in_bytes", + } { + require.NoError(t, os.WriteFile(filepath.Join(path, zeroFile), []byte("0"), 0o644)) + } + + require.NoError(t, os.WriteFile(filepath.Join(path, "memory.failcnt"), []byte(strconv.Itoa(memFailCount)), 0o644)) + } +} diff --git a/internal/cgroups/noop.go b/internal/cgroups/noop.go index caac82e2d..57f552902 100644 --- a/internal/cgroups/noop.go +++ b/internal/cgroups/noop.go @@ -1,6 +1,7 @@ package cgroups import ( + "github.com/prometheus/client_golang/prometheus" "gitlab.com/gitlab-org/gitaly/v14/internal/command" ) @@ -21,3 +22,9 @@ func (cg *NoopManager) AddCommand(cmd *command.Command) error { func (cg *NoopManager) Cleanup() error { return nil } + +// Describe does nothing +func (cg *NoopManager) Describe(ch chan<- *prometheus.Desc) {} + +// Collect does nothing +func (cg *NoopManager) Collect(ch chan<- prometheus.Metric) {} diff --git a/internal/cgroups/v1_linux.go b/internal/cgroups/v1_linux.go index 42a797aa1..477a34249 100644 --- a/internal/cgroups/v1_linux.go +++ b/internal/cgroups/v1_linux.go @@ -8,14 +8,17 @@ import ( "github.com/containerd/cgroups" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/prometheus/client_golang/prometheus" "gitlab.com/gitlab-org/gitaly/v14/internal/command" cgroupscfg "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config/cgroups" ) // CGroupV1Manager is the manager for cgroups v1 type CGroupV1Manager struct { - cfg cgroupscfg.Config - hierarchy func() ([]cgroups.Subsystem, error) + cfg cgroupscfg.Config + hierarchy func() ([]cgroups.Subsystem, error) + paths map[string]interface{} + memoryFailedTotal, cpuUsage *prometheus.GaugeVec } func newV1Manager(cfg cgroupscfg.Config) *CGroupV1Manager { @@ -24,6 +27,21 @@ func newV1Manager(cfg cgroupscfg.Config) *CGroupV1Manager { hierarchy: func() ([]cgroups.Subsystem, error) { return 
defaultSubsystems(cfg.Mountpoint) }, + paths: make(map[string]interface{}), + memoryFailedTotal: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_memory_failed_total", + Help: "Number of memory usage hits limits", + }, + []string{"path"}, + ), + cpuUsage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_cpu_usage", + Help: "CPU Usage of Cgroup", + }, + []string{"path", "type"}, + ), } } @@ -73,9 +91,42 @@ func (cg *CGroupV1Manager) AddCommand(cmd *command.Command) error { return fmt.Errorf("failed adding process to cgroup: %w", err) } + cg.paths[cgroupPath] = struct{}{} + return nil } +// Collect collects metrics from the cgroups controller +func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { + path := cg.currentProcessCgroup() + control, err := cgroups.Load(cg.hierarchy, cgroups.StaticPath(path)) + if err != nil { + return + } + + metrics, err := control.Stat() + if err != nil { + return + } + + memoryMetric := cg.memoryFailedTotal.WithLabelValues(path) + memoryMetric.Set(float64(metrics.Memory.Usage.Failcnt)) + ch <- memoryMetric + + cpuUserMetric := cg.cpuUsage.WithLabelValues(path, "user") + cpuUserMetric.Set(float64(metrics.CPU.Usage.User)) + ch <- cpuUserMetric + + cpuKernelMetric := cg.cpuUsage.WithLabelValues(path, "kernel") + cpuKernelMetric.Set(float64(metrics.CPU.Usage.Kernel)) + ch <- cpuKernelMetric +} + +// Describe describes the cgroup metrics that Collect provides +func (cg *CGroupV1Manager) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(cg, ch) +} + //nolint: revive,stylecheck // This is unintentionally missing documentation. 
func (cg *CGroupV1Manager) Cleanup() error { processCgroupPath := cg.currentProcessCgroup() diff --git a/internal/cgroups/v1_linux_test.go b/internal/cgroups/v1_linux_test.go index 6141fca35..252aa9287 100644 --- a/internal/cgroups/v1_linux_test.go +++ b/internal/cgroups/v1_linux_test.go @@ -1,6 +1,7 @@ package cgroups import ( + "bytes" "context" "fmt" "hash/crc32" @@ -11,6 +12,8 @@ import ( "strings" "testing" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v14/internal/command" "gitlab.com/gitlab-org/gitaly/v14/internal/gitaly/config/cgroups" @@ -65,6 +68,7 @@ func TestAddCommand(t *testing.T) { v1Manager1 := &CGroupV1Manager{ cfg: config, hierarchy: mock.hierarchy, + paths: make(map[string]interface{}), } require.NoError(t, v1Manager1.Setup()) @@ -79,6 +83,7 @@ func TestAddCommand(t *testing.T) { v1Manager2 := &CGroupV1Manager{ cfg: config, hierarchy: mock.hierarchy, + paths: make(map[string]interface{}), } require.NoError(t, v1Manager2.AddCommand(cmd2)) @@ -115,6 +120,43 @@ func TestCleanup(t *testing.T) { } } +func TestMetrics(t *testing.T) { + mock := newMock(t) + + config := defaultCgroupsConfig() + v1Manager1 := newV1Manager(config) + v1Manager1.hierarchy = mock.hierarchy + + require.NoError(t, v1Manager1.Setup()) + + ctx, cancel := testhelper.Context() + defer cancel() + + cmd1 := exec.Command("ls", "-hal", ".") + cmd2, err := command.New(ctx, cmd1, nil, nil, nil) + require.NoError(t, err) + require.NoError(t, cmd2.Wait()) + + require.NoError(t, v1Manager1.AddCommand(cmd2)) + mock.setupMockCgroupFiles(t, v1Manager1, 2) + + cgroupPath := v1Manager1.currentProcessCgroup() + + expected := bytes.NewBufferString(fmt.Sprintf(`# HELP gitaly_cgroup_cpu_usage CPU Usage of Cgroup +# TYPE gitaly_cgroup_cpu_usage gauge +gitaly_cgroup_cpu_usage{path="%s",type="kernel"} 0 +gitaly_cgroup_cpu_usage{path="%s",type="user"} 0 +# HELP 
gitaly_cgroup_memory_failed_total Number of memory usage hits limits +# TYPE gitaly_cgroup_memory_failed_total gauge +gitaly_cgroup_memory_failed_total{path="%s"} 2 +`, cgroupPath, cgroupPath, cgroupPath)) + assert.NoError(t, testutil.CollectAndCompare( + v1Manager1, + expected, + "gitaly_cgroup_memory_failed_total", + "gitaly_cgroup_cpu_usage")) +} + func readCgroupFile(t *testing.T, path string) []byte { t.Helper() diff --git a/internal/git/command_factory.go b/internal/git/command_factory.go index 46c23c008..c3be0eb60 100644 --- a/internal/git/command_factory.go +++ b/internal/git/command_factory.go @@ -74,6 +74,7 @@ func (cf *ExecCommandFactory) Describe(descs chan<- *prometheus.Desc) { // Collect is used to collect Prometheus metrics. func (cf *ExecCommandFactory) Collect(metrics chan<- prometheus.Metric) { cf.invalidCommandsMetric.Collect(metrics) + cf.cgroupsManager.Collect(metrics) } // New creates a new command for the repo repository. diff --git a/internal/git/command_factory_cgroup_test.go b/internal/git/command_factory_cgroup_test.go index d2b5e2e3e..d2b0ab2f9 100644 --- a/internal/git/command_factory_cgroup_test.go +++ b/internal/git/command_factory_cgroup_test.go @@ -5,6 +5,7 @@ import ( "path/filepath" "testing" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v14/internal/command" @@ -31,6 +32,9 @@ func (m *mockCgroupsManager) Cleanup() error { return nil } +func (m *mockCgroupsManager) Collect(ch chan<- prometheus.Metric) {} +func (m *mockCgroupsManager) Describe(ch chan<- *prometheus.Desc) {} + func TestNewCommandAddsToCgroup(t *testing.T) { root := testhelper.TempDir(t) |