diff options
author | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-07-11 09:37:10 +0300 |
---|---|---|
committer | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-07-11 09:37:10 +0300 |
commit | db86f94f1921fb1affa96adaffb45b78de569129 (patch) | |
tree | 9b881b874e8a099c75c9493c48a5b2608616f91a | |
parent | ab5d534057f73d43a8539406f5923e9314e9c982 (diff) | |
parent | e0e0cc45caff5305763f7e5afd6150e8a6f51407 (diff) |
Merge branch 'zh-cgroups-v2' into 'master'
cgroup: Add support for cgroups v2
See merge request https://gitlab.com/gitlab-org/gitaly/-/merge_requests/5547
Merged-by: Quang-Minh Nguyen <qmnguyen@gitlab.com>
Approved-by: karthik nayak <knayak@gitlab.com>
Approved-by: Quang-Minh Nguyen <qmnguyen@gitlab.com>
Reviewed-by: Steve Xuereb <sxuereb@gitlab.com>
Reviewed-by: Quang-Minh Nguyen <qmnguyen@gitlab.com>
Reviewed-by: karthik nayak <knayak@gitlab.com>
Co-authored-by: ZheNing Hu <adlternative@gmail.com>
-rw-r--r-- | NOTICE | 26 | ||||
-rw-r--r-- | go.mod | 1 | ||||
-rw-r--r-- | go.sum | 3 | ||||
-rw-r--r-- | internal/cgroups/cgroups.go | 24 | ||||
-rw-r--r-- | internal/cgroups/cgroups_linux_test.go | 178 | ||||
-rw-r--r-- | internal/cgroups/manager.go | 17 | ||||
-rw-r--r-- | internal/cgroups/manager_linux.go | 177 | ||||
-rw-r--r-- | internal/cgroups/metrics.go | 87 | ||||
-rw-r--r-- | internal/cgroups/mock_linux_test.go | 79 | ||||
-rw-r--r-- | internal/cgroups/v1.go | 12 | ||||
-rw-r--r-- | internal/cgroups/v1_linux.go | 221 | ||||
-rw-r--r-- | internal/cgroups/v1_linux_test.go | 332 | ||||
-rw-r--r-- | internal/cgroups/v2_linux.go | 175 | ||||
-rw-r--r-- | internal/cgroups/v2_linux_test.go | 546 |
14 files changed, 1425 insertions, 453 deletions
@@ -6499,6 +6499,32 @@ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +LICENSE - github.com/cilium/ebpf +MIT License + +Copyright (c) 2017 Nathan Sweet +Copyright (c) 2018, 2019 Cloudflare +Copyright (c) 2019 Authors of Cilium + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ LICENSE - github.com/client9/reopen The MIT License (MIT) @@ -94,6 +94,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cilium/ebpf v0.9.1 // indirect github.com/client9/reopen v1.0.0 // indirect github.com/cloudflare/circl v1.3.3 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect @@ -911,6 +911,8 @@ github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= +github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= +github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/clbanning/mxj v1.8.4/go.mod h1:BVjHeAH+rl9rs6f+QIpeRl0tfu10SXn1pUSa5PVGJng= @@ -1182,6 +1184,7 @@ github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHqu github.com/franela/goblin v0.0.0-20210519012713-85d372ac71e2/go.mod h1:VzmDKDJVZI3aJmnRI9VjAn9nJ8qPPsN1fqzr9dqInIo= github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/frankban/quicktest v1.14.3 h1:FJKSZTDHjyhriyC81FLQ0LY93eSai0ZyR/ZIkd3ZUKE= github.com/frankban/quicktest v1.14.3/go.mod h1:mgiwOwqx65TmIk1wJ6Q7wvnVMocbUorkibMOrVTHZps= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= diff --git a/internal/cgroups/cgroups.go b/internal/cgroups/cgroups.go index 5d44ba70d..0c6927502 100644 --- a/internal/cgroups/cgroups.go +++ b/internal/cgroups/cgroups.go @@ -2,11 +2,9 @@ package cgroups import ( "os/exec" - "path/filepath" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" - "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" ) @@ -44,7 +42,7 @@ type Manager interface { // NewManager returns the appropriate Cgroups manager func NewManager(cfg cgroups.Config, pid int) Manager { if cfg.Repositories.Count > 0 { - return newV1Manager(cfg, pid) + return newCgroupManager(cfg, pid) } return &NoopManager{} @@ -52,23 +50,5 @@ func NewManager(cfg cgroups.Config, pid int) Manager { // PruneOldCgroups prunes old cgroups for both the memory and cpu subsystems func PruneOldCgroups(cfg cgroups.Config, logger log.FieldLogger) { - if cfg.HierarchyRoot == "" { - return - } - - if err := config.PruneOldGitalyProcessDirectories( - logger, - filepath.Join(cfg.Mountpoint, "memory", - cfg.HierarchyRoot), - ); err != nil { - logger.WithError(err).Error("failed to clean up memory cgroups") - } - - if err := config.PruneOldGitalyProcessDirectories( - logger, - filepath.Join(cfg.Mountpoint, "cpu", - cfg.HierarchyRoot), - ); err != nil { - logger.WithError(err).Error("failed to clean up cpu cgroups") - } + pruneOldCgroups(cfg, logger) } diff --git a/internal/cgroups/cgroups_linux_test.go b/internal/cgroups/cgroups_linux_test.go index e52eecb5e..8ed551d2d 100644 --- a/internal/cgroups/cgroups_linux_test.go +++ b/internal/cgroups/cgroups_linux_test.go @@ -1,17 +1,12 @@ +//go:build linux + package cgroups import ( - "fmt" - "io/fs" - "os" - "os/exec" - "path/filepath" "testing" - "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" - "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" ) @@ -20,174 +15,5 @@ func TestMain(m *testing.M) { } func TestNewManager(t *testing.T) { - cfg := cgroups.Config{Repositories: cgroups.Repositories{Count: 10}} - - require.IsType(t, &CGroupV1Manager{}, &CGroupV1Manager{cfg: cfg}) require.IsType(t, &NoopManager{}, NewManager(cgroups.Config{}, 1)) } - -func TestPruneOldCgroups(t *testing.T) { - t.Parallel() - - testCases := []struct { - desc string - cfg cgroups.Config - expectedPruned bool - // setup returns a pid - setup func(*testing.T, cgroups.Config) int - }{ - { - desc: "process belongs to another user", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - pid := 1 - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - return pid - }, - expectedPruned: true, - }, - { - desc: "no hierarchy root", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - pid := 1 - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - return 1 - }, - expectedPruned: false, - }, - { - desc: "pid of finished process", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - cmd := exec.Command("ls") - require.NoError(t, cmd.Run()) - pid := cmd.Process.Pid - - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - memoryRoot := filepath.Join( - cfg.Mountpoint, - "memory", - cfg.HierarchyRoot, - "memory.limit_in_bytes", - ) - require.NoError(t, os.WriteFile(memoryRoot, []byte{}, fs.ModeAppend)) - - return pid - }, - expectedPruned: true, - }, - { - desc: "pid of running process", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - pid := os.Getpid() - - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - return pid - }, - expectedPruned: false, - }, - { - desc: "gitaly-0 directory is deleted", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - cgroupManager := NewManager(cfg, 0) - require.NoError(t, cgroupManager.Setup()) - - return 0 - }, - expectedPruned: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.desc, func(t *testing.T) { - memoryRoot := filepath.Join( - tc.cfg.Mountpoint, - "memory", - tc.cfg.HierarchyRoot, - ) - cpuRoot := filepath.Join( - tc.cfg.Mountpoint, - "cpu", - tc.cfg.HierarchyRoot, - ) - - require.NoError(t, os.MkdirAll(cpuRoot, perm.PublicDir)) - require.NoError(t, os.MkdirAll(memoryRoot, perm.PublicDir)) - - pid := tc.setup(t, tc.cfg) - - logger, hook := test.NewNullLogger() - PruneOldCgroups(tc.cfg, logger) - - // create cgroups directories with a different pid - oldGitalyProcessMemoryDir := filepath.Join( - memoryRoot, - fmt.Sprintf("gitaly-%d", pid), - ) - oldGitalyProcesssCPUDir := filepath.Join( - cpuRoot, - fmt.Sprintf("gitaly-%d", pid), - ) - - if tc.expectedPruned { - require.NoDirExists(t, oldGitalyProcessMemoryDir) - require.NoDirExists(t, oldGitalyProcesssCPUDir) - } else { - require.DirExists(t, oldGitalyProcessMemoryDir) - require.DirExists(t, oldGitalyProcesssCPUDir) - require.Len(t, hook.Entries, 0) - } - }) - } -} diff --git a/internal/cgroups/manager.go b/internal/cgroups/manager.go new file mode 100644 index 000000000..445138394 --- /dev/null +++ b/internal/cgroups/manager.go @@ -0,0 +1,17 @@ +//go:build !linux + +package cgroups + +import ( + log "github.com/sirupsen/logrus" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" + cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" +) + +func newCgroupManager(cfg cgroupscfg.Config, pid int) Manager { + return &NoopManager{} +} + +func pruneOldCgroups(cfg cgroups.Config, logger log.FieldLogger) { + return +} diff --git a/internal/cgroups/manager_linux.go b/internal/cgroups/manager_linux.go new file mode 100644 index 000000000..7b8c4a34d --- /dev/null +++ b/internal/cgroups/manager_linux.go @@ -0,0 +1,177 @@ +//go:build linux + +package cgroups + +import ( + "fmt" + "hash/crc32" + "os/exec" + "strings" + + cgrps "github.com/containerd/cgroups/v3" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" + cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" +) + +// cfs_period_us hardcoded to be 100ms. +const cfsPeriodUs uint64 = 100000 + +type cgroupHandler interface { + setupParent(reposResources *specs.LinuxResources) error + setupRepository(reposResources *specs.LinuxResources) error + addToCgroup(pid int, cgroupPath string) error + collect(ch chan<- prometheus.Metric) + cleanup() error + currentProcessCgroup() string + repoPath(groupID int) string +} + +// CGroupManager is a manager class that implements specific methods related to cgroups +type CGroupManager struct { + cfg cgroupscfg.Config + pid int + + handler cgroupHandler +} + +func newCgroupManager(cfg cgroupscfg.Config, pid int) *CGroupManager { + return newCgroupManagerWithMode(cfg, pid, cgrps.Mode()) +} + +func newCgroupManagerWithMode(cfg cgroupscfg.Config, pid int, mode cgrps.CGMode) *CGroupManager { + var handler cgroupHandler + switch mode { + case cgrps.Legacy, cgrps.Hybrid: + handler = newV1Handler(cfg, pid) + case cgrps.Unified: + handler = newV2Handler(cfg, pid) + log.Warnf("Gitaly now includes experimental support for CgroupV2. Please proceed with caution and use this experimental feature at your own risk") + default: + log.Fatalf("unknown cgroup version") + } + + return &CGroupManager{ + cfg: cfg, + pid: pid, + handler: handler, + } +} + +// Setup parent cgroups and repository sub cgroups +func (cgm *CGroupManager) Setup() error { + if err := cgm.handler.setupParent(cgm.configParentResources()); err != nil { + return err + } + if err := cgm.handler.setupRepository(cgm.configRepositoryResources()); err != nil { + return err + } + return nil +} + +// AddCommand adds a Cmd to a cgroup +func (cgm *CGroupManager) AddCommand(cmd *exec.Cmd, opts ...AddCommandOption) (string, error) { + var cfg addCommandCfg + for _, opt := range opts { + opt(&cfg) + } + + key := cfg.cgroupKey + if key == "" { + key = strings.Join(cmd.Args, "/") + } + + checksum := crc32.ChecksumIEEE( + []byte(key), + ) + + if cmd.Process == nil { + return "", fmt.Errorf("cannot add command that has not yet been started") + } + + groupID := uint(checksum) % cgm.cfg.Repositories.Count + cgroupPath := cgm.handler.repoPath(int(groupID)) + + return cgroupPath, cgm.handler.addToCgroup(cmd.Process.Pid, cgroupPath) +} + +// Cleanup cleans up cgroups created in Setup. +func (cgm *CGroupManager) Cleanup() error { + return cgm.handler.cleanup() +} + +// Describe is used to generate description information for each CGroupManager prometheus metric +func (cgm *CGroupManager) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(cgm, ch) +} + +// Collect is used to collect the current values of all CGroupManager prometheus metrics +func (cgm *CGroupManager) Collect(ch chan<- prometheus.Metric) { + cgm.handler.collect(ch) +} + +func (cgm *CGroupManager) currentProcessCgroup() string { + return cgm.handler.currentProcessCgroup() +} + +func (cgm *CGroupManager) configParentResources() *specs.LinuxResources { + cfsPeriodUs := cfsPeriodUs + var parentResources specs.LinuxResources + // Leave them `nil` so it takes kernel default unless cfg value above `0`. + parentResources.CPU = &specs.LinuxCPU{} + + if cgm.cfg.CPUShares > 0 { + parentResources.CPU.Shares = &cgm.cfg.CPUShares + } + + if cgm.cfg.CPUQuotaUs > 0 { + parentResources.CPU.Quota = &cgm.cfg.CPUQuotaUs + parentResources.CPU.Period = &cfsPeriodUs + } + + if cgm.cfg.MemoryBytes > 0 { + parentResources.Memory = &specs.LinuxMemory{Limit: &cgm.cfg.MemoryBytes} + } + return &parentResources +} + +func (cgm *CGroupManager) configRepositoryResources() *specs.LinuxResources { + cfsPeriodUs := cfsPeriodUs + var reposResources specs.LinuxResources + // Leave them `nil` so it takes kernel default unless cfg value above `0`. + reposResources.CPU = &specs.LinuxCPU{} + + if cgm.cfg.Repositories.CPUShares > 0 { + reposResources.CPU.Shares = &cgm.cfg.Repositories.CPUShares + } + + if cgm.cfg.Repositories.CPUQuotaUs > 0 { + reposResources.CPU.Quota = &cgm.cfg.Repositories.CPUQuotaUs + reposResources.CPU.Period = &cfsPeriodUs + } + + if cgm.cfg.Repositories.MemoryBytes > 0 { + reposResources.Memory = &specs.LinuxMemory{Limit: &cgm.cfg.Repositories.MemoryBytes} + } + return &reposResources +} + +func pruneOldCgroups(cfg cgroupscfg.Config, logger log.FieldLogger) { + pruneOldCgroupsWithMode(cfg, logger, cgrps.Mode()) +} + +func pruneOldCgroupsWithMode(cfg cgroupscfg.Config, logger log.FieldLogger, mode cgrps.CGMode) { + if cfg.HierarchyRoot == "" { + return + } + + switch mode { + case cgrps.Legacy, cgrps.Hybrid: + pruneOldCgroupsV1(cfg, logger) + case cgrps.Unified: + pruneOldCgroupsV2(cfg, logger) + default: + log.Fatalf("unknown cgroup version") + } +} diff --git a/internal/cgroups/metrics.go b/internal/cgroups/metrics.go new file mode 100644 index 000000000..a8ffa618f --- /dev/null +++ b/internal/cgroups/metrics.go @@ -0,0 +1,87 @@ +package cgroups + +import "github.com/prometheus/client_golang/prometheus" + +type cgroupsMetrics struct { + memoryReclaimAttemptsTotal *prometheus.GaugeVec + cpuUsage *prometheus.GaugeVec + cpuCFSPeriods *prometheus.Desc + cpuCFSThrottledPeriods *prometheus.Desc + cpuCFSThrottledTime *prometheus.Desc + procs *prometheus.GaugeVec +} + +func newV1CgroupsMetrics() *cgroupsMetrics { + return &cgroupsMetrics{ + memoryReclaimAttemptsTotal: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_memory_reclaim_attempts_total", + Help: "Number of memory usage hits limits", + }, + []string{"path"}, + ), + cpuUsage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_cpu_usage_total", + Help: "CPU Usage of Cgroup", + }, + []string{"path", "type"}, + ), + cpuCFSPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_periods_total", + "Number of elapsed enforcement period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_periods_total", + "Number of throttled period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledTime: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_seconds_total", + "Total time duration the Cgroup has been throttled", + []string{"path"}, nil, + ), + procs: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_procs_total", + Help: "Total number of procs", + }, + []string{"path", "subsystem"}, + ), + } +} + +func newV2CgroupsMetrics() *cgroupsMetrics { + return &cgroupsMetrics{ + cpuUsage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_cpu_usage_total", + Help: "CPU Usage of Cgroup", + }, + []string{"path", "type"}, + ), + cpuCFSPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_periods_total", + "Number of elapsed enforcement period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_periods_total", + "Number of throttled period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledTime: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_seconds_total", + "Total time duration the Cgroup has been throttled", + []string{"path"}, nil, + ), + procs: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_procs_total", + Help: "Total number of procs", + }, + []string{"path", "subsystem"}, + ), + } +} diff --git a/internal/cgroups/mock_linux_test.go b/internal/cgroups/mock_linux_test.go index 2cf735149..135dca76f 100644 --- a/internal/cgroups/mock_linux_test.go +++ b/internal/cgroups/mock_linux_test.go @@ -1,3 +1,5 @@ +//go:build linux + /* Adapted from https://github.com/containerd/cgroups/blob/f1d9380fd3c028194db9582825512fdf3f39ab2a/mock_test.go @@ -25,8 +27,11 @@ import ( "strconv" "testing" + cgrps "github.com/containerd/cgroups/v3" "github.com/containerd/cgroups/v3/cgroup1" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" + cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" ) @@ -54,13 +59,9 @@ func newMock(t *testing.T) *mockCgroup { } } -func (m *mockCgroup) hierarchy() ([]cgroup1.Subsystem, error) { - return m.subsystems, nil -} - func (m *mockCgroup) setupMockCgroupFiles( t *testing.T, - manager *CGroupV1Manager, + manager *CGroupManager, memFailCount int, ) { for _, s := range m.subsystems { @@ -117,3 +118,71 @@ throttled_time 1000000` } } } + +func (m *mockCgroup) newCgroupManager(cfg cgroupscfg.Config, pid int) *CGroupManager { + return newCgroupManagerWithMode(cfg, pid, cgrps.Legacy) +} + +func (m *mockCgroup) pruneOldCgroups(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + pruneOldCgroupsWithMode(cfg, logger, cgrps.Legacy) +} + +type mockCgroupV2 struct { + root string +} + +func newMockV2(t *testing.T) *mockCgroupV2 { + t.Helper() + + return &mockCgroupV2{ + root: testhelper.TempDir(t), + } +} + +func (m *mockCgroupV2) setupMockCgroupFiles( + t *testing.T, + manager *CGroupManager, +) { + cgroupPath := filepath.Join(m.root, manager.currentProcessCgroup()) + require.NoError(t, os.MkdirAll(cgroupPath, perm.SharedDir)) + + contentByFilename := map[string]string{ + "cgroup.procs": "", + "cgroup.subtree_control": "cpu cpuset memory", + "cgroup.controllers": "cpu cpuset memory", + "cpu.max": "max 100000", + "cpu.weight": "10", + "memory.max": "max", + "cpu.stat": `nr_periods 10 + nr_throttled 20 + throttled_usec 1000000`, + } + + for filename, content := range contentByFilename { + controlFilePath := filepath.Join(m.root, manager.cfg.HierarchyRoot, filename) + require.NoError(t, os.WriteFile(controlFilePath, []byte(content), perm.SharedFile)) + } + + for filename, content := range contentByFilename { + controlFilePath := filepath.Join(cgroupPath, filename) + require.NoError(t, os.WriteFile(controlFilePath, []byte(content), perm.SharedFile)) + } + + for shard := uint(0); shard < manager.cfg.Repositories.Count; shard++ { + shardPath := filepath.Join(cgroupPath, fmt.Sprintf("repos-%d", shard)) + require.NoError(t, os.MkdirAll(shardPath, perm.SharedDir)) + + for filename, content := range contentByFilename { + shardControlFilePath := filepath.Join(shardPath, filename) + require.NoError(t, os.WriteFile(shardControlFilePath, []byte(content), perm.SharedFile)) + } + } +} + +func (m *mockCgroupV2) newCgroupManager(cfg cgroupscfg.Config, pid int) *CGroupManager { + return newCgroupManagerWithMode(cfg, pid, cgrps.Unified) +} + +func (m *mockCgroupV2) pruneOldCgroups(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + pruneOldCgroupsWithMode(cfg, logger, cgrps.Unified) +} diff --git a/internal/cgroups/v1.go b/internal/cgroups/v1.go deleted file mode 100644 index 8935bcdc5..000000000 --- a/internal/cgroups/v1.go +++ /dev/null @@ -1,12 +0,0 @@ -//go:build !linux - -package cgroups - -import ( - "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" -) - -// For systems other than Linux, we return a noop manager if cgroups was enabled. -func newV1Manager(cfg cgroups.Config, pid int) *NoopManager { - return &NoopManager{} -} diff --git a/internal/cgroups/v1_linux.go b/internal/cgroups/v1_linux.go index 09bf23619..22e9ab841 100644 --- a/internal/cgroups/v1_linux.go +++ b/internal/cgroups/v1_linux.go @@ -1,9 +1,9 @@ +//go:build linux + package cgroups import ( "fmt" - "hash/crc32" - "os/exec" "path/filepath" "strings" "time" @@ -11,167 +11,59 @@ import ( "github.com/containerd/cgroups/v3/cgroup1" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" "gitlab.com/gitlab-org/gitaly/v16/internal/log" ) -// cfs_period_us hardcoded to be 100ms. -const cfsPeriodUs uint64 = 100000 +type cgroupV1Handler struct { + cfg cgroupscfg.Config + hierarchy func() ([]cgroup1.Subsystem, error) -// CGroupV1Manager is the manager for cgroups v1 -type CGroupV1Manager struct { - cfg cgroupscfg.Config - hierarchy func() ([]cgroup1.Subsystem, error) - memoryReclaimAttemptsTotal *prometheus.GaugeVec - cpuUsage *prometheus.GaugeVec - cpuCFSPeriods *prometheus.Desc - cpuCFSThrottledPeriods *prometheus.Desc - cpuCFSThrottledTime *prometheus.Desc - procs *prometheus.GaugeVec - pid int + *cgroupsMetrics + pid int } -func newV1Manager(cfg cgroupscfg.Config, pid int) *CGroupV1Manager { - return &CGroupV1Manager{ +func newV1Handler(cfg cgroupscfg.Config, pid int) *cgroupV1Handler { + return &cgroupV1Handler{ cfg: cfg, pid: pid, hierarchy: func() ([]cgroup1.Subsystem, error) { return defaultSubsystems(cfg.Mountpoint) }, - memoryReclaimAttemptsTotal: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gitaly_cgroup_memory_reclaim_attempts_total", - Help: "Number of memory usage hits limits", - }, - []string{"path"}, - ), - cpuUsage: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gitaly_cgroup_cpu_usage_total", - Help: "CPU Usage of Cgroup", - }, - []string{"path", "type"}, - ), - cpuCFSPeriods: prometheus.NewDesc( - "gitaly_cgroup_cpu_cfs_periods_total", - "Number of elapsed enforcement period intervals", - []string{"path"}, nil, - ), - cpuCFSThrottledPeriods: prometheus.NewDesc( - "gitaly_cgroup_cpu_cfs_throttled_periods_total", - "Number of throttled period intervals", - []string{"path"}, nil, - ), - cpuCFSThrottledTime: prometheus.NewDesc( - "gitaly_cgroup_cpu_cfs_throttled_seconds_total", - "Total time duration the Cgroup has been throttled", - []string{"path"}, nil, - ), - procs: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gitaly_cgroup_procs_total", - Help: "Total number of procs", - }, - []string{"path", "subsystem"}, - ), + cgroupsMetrics: newV1CgroupsMetrics(), } } -//nolint:revive // This is unintentionally missing documentation. -func (cg *CGroupV1Manager) Setup() error { - cfsPeriodUs := cfsPeriodUs - - var parentResources specs.LinuxResources - // Leave them `nil` so it takes kernel default unless cfg value above `0`. - parentResources.CPU = &specs.LinuxCPU{} - - if cg.cfg.CPUShares > 0 { - parentResources.CPU.Shares = &cg.cfg.CPUShares - } - - if cg.cfg.CPUQuotaUs > 0 { - parentResources.CPU.Quota = &cg.cfg.CPUQuotaUs - parentResources.CPU.Period = &cfsPeriodUs - } - - if cg.cfg.MemoryBytes > 0 { - parentResources.Memory = &specs.LinuxMemory{Limit: &cg.cfg.MemoryBytes} - } - +func (cvh *cgroupV1Handler) setupParent(parentResources *specs.LinuxResources) error { if _, err := cgroup1.New( - cgroup1.StaticPath(cg.currentProcessCgroup()), - &parentResources, - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.StaticPath(cvh.currentProcessCgroup()), + parentResources, + cgroup1.WithHiearchy(cvh.hierarchy), ); err != nil { return fmt.Errorf("failed creating parent cgroup: %w", err) } + return nil +} - var reposResources specs.LinuxResources - // Leave them `nil` so it takes kernel default unless cfg value above `0`. - reposResources.CPU = &specs.LinuxCPU{} - - if cg.cfg.Repositories.CPUShares > 0 { - reposResources.CPU.Shares = &cg.cfg.Repositories.CPUShares - } - - if cg.cfg.Repositories.CPUQuotaUs > 0 { - reposResources.CPU.Quota = &cg.cfg.Repositories.CPUQuotaUs - reposResources.CPU.Period = &cfsPeriodUs - } - - if cg.cfg.Repositories.MemoryBytes > 0 { - reposResources.Memory = &specs.LinuxMemory{Limit: &cg.cfg.Repositories.MemoryBytes} - } - - for i := 0; i < int(cg.cfg.Repositories.Count); i++ { +func (cvh *cgroupV1Handler) setupRepository(reposResources *specs.LinuxResources) error { + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { if _, err := cgroup1.New( - cgroup1.StaticPath(cg.repoPath(i)), - &reposResources, - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.StaticPath(cvh.repoPath(i)), + reposResources, + cgroup1.WithHiearchy(cvh.hierarchy), ); err != nil { return fmt.Errorf("failed creating repository cgroup: %w", err) } } - return nil } -// AddCommand adds the given command to one of the CGroup's buckets. The bucket used for the command -// is determined by hashing the repository storage and path. No error is returned if the command has already -// exited. -func (cg *CGroupV1Manager) AddCommand( - cmd *exec.Cmd, - opts ...AddCommandOption, -) (string, error) { - var cfg addCommandCfg - for _, opt := range opts { - opt(&cfg) - } - - key := cfg.cgroupKey - if key == "" { - key = strings.Join(cmd.Args, "/") - } - - checksum := crc32.ChecksumIEEE( - []byte(key), - ) - - if cmd.Process == nil { - return "", fmt.Errorf("cannot add command that has not yet been started") - } - - groupID := uint(checksum) % cg.cfg.Repositories.Count - cgroupPath := cg.repoPath(int(groupID)) - - return cgroupPath, cg.addToCgroup(cmd.Process.Pid, cgroupPath) -} - -func (cg *CGroupV1Manager) addToCgroup(pid int, cgroupPath string) error { +func (cvh *cgroupV1Handler) addToCgroup(pid int, cgroupPath string) error { control, err := cgroup1.Load( cgroup1.StaticPath(cgroupPath), - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.WithHiearchy(cvh.hierarchy), ) if err != nil { return fmt.Errorf("failed loading %s cgroup: %w", cgroupPath, err) @@ -189,18 +81,17 @@ func (cg *CGroupV1Manager) addToCgroup(pid int, cgroupPath string) error { return nil } -// Collect collects metrics from the cgroups controller -func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { - if !cg.cfg.MetricsEnabled { +func (cvh *cgroupV1Handler) collect(ch chan<- prometheus.Metric) { + if !cvh.cfg.MetricsEnabled { return } - for i := 0; i < int(cg.cfg.Repositories.Count); i++ { - repoPath := cg.repoPath(i) + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { + repoPath := cvh.repoPath(i) logger := log.Default().WithField("cgroup_path", repoPath) control, err := cgroup1.Load( cgroup1.StaticPath(repoPath), - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.WithHiearchy(cvh.hierarchy), ) if err != nil { logger.WithError(err).Warn("unable to load cgroup controller") @@ -210,41 +101,41 @@ func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { if metrics, err := control.Stat(); err != nil { logger.WithError(err).Warn("unable to get cgroup stats") } else { - memoryMetric := cg.memoryReclaimAttemptsTotal.WithLabelValues(repoPath) + memoryMetric := cvh.memoryReclaimAttemptsTotal.WithLabelValues(repoPath) memoryMetric.Set(float64(metrics.Memory.Usage.Failcnt)) ch <- memoryMetric - cpuUserMetric := cg.cpuUsage.WithLabelValues(repoPath, "user") + cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") cpuUserMetric.Set(float64(metrics.CPU.Usage.User)) ch <- cpuUserMetric ch <- prometheus.MustNewConstMetric( - cg.cpuCFSPeriods, + cvh.cpuCFSPeriods, prometheus.CounterValue, float64(metrics.CPU.Throttling.Periods), repoPath, ) ch <- prometheus.MustNewConstMetric( - cg.cpuCFSThrottledPeriods, + cvh.cpuCFSThrottledPeriods, prometheus.CounterValue, float64(metrics.CPU.Throttling.ThrottledPeriods), repoPath, ) ch <- prometheus.MustNewConstMetric( - cg.cpuCFSThrottledTime, + cvh.cpuCFSThrottledTime, prometheus.CounterValue, float64(metrics.CPU.Throttling.ThrottledTime)/float64(time.Second), repoPath, ) - cpuKernelMetric := cg.cpuUsage.WithLabelValues(repoPath, "kernel") + cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") cpuKernelMetric.Set(float64(metrics.CPU.Usage.Kernel)) ch <- cpuKernelMetric } - if subsystems, err := cg.hierarchy(); err != nil { + if subsystems, err := cvh.hierarchy(); err != nil { logger.WithError(err).Warn("unable to get cgroup hierarchy") } else { for _, subsystem := range subsystems { @@ -256,7 +147,7 @@ func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { continue } - procsMetric := cg.procs.WithLabelValues(repoPath, string(subsystem.Name())) + procsMetric := cvh.procs.WithLabelValues(repoPath, string(subsystem.Name())) procsMetric.Set(float64(len(processes))) ch <- procsMetric } @@ -264,18 +155,12 @@ func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { } } -// Describe describes the cgroup metrics that Collect provides -func (cg *CGroupV1Manager) Describe(ch chan<- *prometheus.Desc) { - prometheus.DescribeByCollect(cg, ch) -} - -//nolint:revive // This is unintentionally missing documentation. -func (cg *CGroupV1Manager) Cleanup() error { - processCgroupPath := cg.currentProcessCgroup() +func (cvh *cgroupV1Handler) cleanup() error { + processCgroupPath := cvh.currentProcessCgroup() control, err := cgroup1.Load( cgroup1.StaticPath(processCgroupPath), - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.WithHiearchy(cvh.hierarchy), ) if err != nil { return fmt.Errorf("failed loading cgroup %s: %w", processCgroupPath, err) @@ -288,12 +173,12 @@ func (cg *CGroupV1Manager) Cleanup() error { return nil } -func (cg *CGroupV1Manager) repoPath(groupID int) string { - return filepath.Join(cg.currentProcessCgroup(), fmt.Sprintf("repos-%d", groupID)) +func (cvh *cgroupV1Handler) repoPath(groupID int) string { + return filepath.Join(cvh.currentProcessCgroup(), fmt.Sprintf("repos-%d", groupID)) } -func (cg *CGroupV1Manager) currentProcessCgroup() string { - return config.GetGitalyProcessTempDir(cg.cfg.HierarchyRoot, cg.pid) +func (cvh *cgroupV1Handler) currentProcessCgroup() string { + return config.GetGitalyProcessTempDir(cvh.cfg.HierarchyRoot, cvh.pid) } func defaultSubsystems(root string) ([]cgroup1.Subsystem, error) { @@ -304,3 +189,21 @@ func defaultSubsystems(root string) ([]cgroup1.Subsystem, error) { return subsystems, nil } + +func pruneOldCgroupsV1(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + if err := config.PruneOldGitalyProcessDirectories( + logger, + filepath.Join(cfg.Mountpoint, "memory", + cfg.HierarchyRoot), + ); err != nil { + logger.WithError(err).Error("failed to clean up memory cgroups") + } + + if err := config.PruneOldGitalyProcessDirectories( + logger, + filepath.Join(cfg.Mountpoint, "cpu", + cfg.HierarchyRoot), + ); err != nil { + logger.WithError(err).Error("failed to clean up cpu cgroups") + } +} diff --git a/internal/cgroups/v1_linux_test.go b/internal/cgroups/v1_linux_test.go index a364d7965..a68ebed4d 100644 --- a/internal/cgroups/v1_linux_test.go +++ b/internal/cgroups/v1_linux_test.go @@ -1,9 +1,11 @@ +//go:build linux + package cgroups import ( - "bytes" "fmt" "hash/crc32" + "io/fs" "os" "os/exec" "path/filepath" @@ -11,7 +13,9 @@ import ( "strings" "testing" + cgrps "github.com/containerd/cgroups/v3" "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" @@ -31,6 +35,15 @@ func defaultCgroupsConfig() cgroups.Config { } } +func TestNewManagerV1(t *testing.T) { + cfg := cgroups.Config{Repositories: cgroups.Repositories{Count: 10}} + + manager := newCgroupManagerWithMode(cfg, 1, cgrps.Legacy) + require.IsType(t, &cgroupV1Handler{}, manager.handler) + manager = newCgroupManagerWithMode(cfg, 1, cgrps.Hybrid) + require.IsType(t, &cgroupV1Handler{}, manager.handler) +} + func TestSetup_ParentCgroups(t *testing.T) { tests := []struct { name string @@ -84,12 +97,9 @@ func TestSetup_ParentCgroups(t *testing.T) { mock := newMock(t) pid := 1 tt.cfg.HierarchyRoot = "gitaly" + tt.cfg.Mountpoint = mock.root - v1Manager := &CGroupV1Manager{ - cfg: tt.cfg, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager := mock.newCgroupManager(tt.cfg, pid) require.NoError(t, v1Manager.Setup()) memoryLimitPath := filepath.Join( @@ -167,12 +177,10 @@ func TestSetup_RepoCgroups(t *testing.T) { cfg := defaultCgroupsConfig() cfg.Repositories = tt.cfg cfg.Repositories.Count = 3 + cfg.HierarchyRoot = "gitaly" + cfg.Mountpoint = mock.root - v1Manager := &CGroupV1Manager{ - cfg: cfg, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager := mock.newCgroupManager(cfg, pid) require.NoError(t, v1Manager.Setup()) @@ -208,24 +216,18 @@ func TestAddCommand(t *testing.T) { config.Repositories.Count = 10 config.Repositories.MemoryBytes = 1024 config.Repositories.CPUShares = 16 + config.HierarchyRoot = "gitaly" + config.Mountpoint = mock.root pid := 1 - v1Manager1 := &CGroupV1Manager{ - cfg: config, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager1 := mock.newCgroupManager(config, pid) require.NoError(t, v1Manager1.Setup()) ctx := testhelper.Context(t) cmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") require.NoError(t, cmd2.Run()) - v1Manager2 := &CGroupV1Manager{ - cfg: config, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager2 := mock.newCgroupManager(config, pid) t.Run("without overridden key", func(t *testing.T) { _, err := v1Manager2.AddCommand(cmd2) @@ -270,11 +272,11 @@ func TestCleanup(t *testing.T) { mock := newMock(t) pid := 1 - v1Manager := &CGroupV1Manager{ - cfg: defaultCgroupsConfig(), - hierarchy: mock.hierarchy, - pid: pid, - } + cfg := defaultCgroupsConfig() + cfg.Mountpoint = mock.root + + v1Manager := mock.newCgroupManager(cfg, pid) + require.NoError(t, v1Manager.Setup()) require.NoError(t, v1Manager.Cleanup()) @@ -288,48 +290,17 @@ func TestCleanup(t *testing.T) { } func TestMetrics(t *testing.T) { - t.Parallel() - - mock := newMock(t) - - config := defaultCgroupsConfig() - config.Repositories.Count = 1 - config.Repositories.MemoryBytes = 1048576 - config.Repositories.CPUShares = 16 - - v1Manager1 := newV1Manager(config, 1) - v1Manager1.hierarchy = mock.hierarchy - - mock.setupMockCgroupFiles(t, v1Manager1, 2) - - require.NoError(t, v1Manager1.Setup()) - - ctx := testhelper.Context(t) - - cmd := exec.CommandContext(ctx, "ls", "-hal", ".") - require.NoError(t, cmd.Start()) - _, err := v1Manager1.AddCommand(cmd) - require.NoError(t, err) - - gitCmd1 := exec.CommandContext(ctx, "ls", "-hal", ".") - require.NoError(t, gitCmd1.Start()) - _, err = v1Manager1.AddCommand(gitCmd1) - require.NoError(t, err) - - gitCmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") - require.NoError(t, gitCmd2.Start()) - _, err = v1Manager1.AddCommand(gitCmd2) - require.NoError(t, err) - defer func() { - require.NoError(t, gitCmd2.Wait()) - }() - - require.NoError(t, cmd.Wait()) - require.NoError(t, gitCmd1.Wait()) - - repoCgroupPath := filepath.Join(v1Manager1.currentProcessCgroup(), "repos-0") - - expected := strings.NewReader(strings.ReplaceAll(`# HELP gitaly_cgroup_cpu_usage_total CPU Usage of Cgroup + tests := []struct { + name string + metricsEnabled bool + pid int + expect string + }{ + { + name: "metrics enabled: true", + metricsEnabled: true, + pid: 1, + expect: `# HELP gitaly_cgroup_cpu_usage_total CPU Usage of Cgroup # TYPE gitaly_cgroup_cpu_usage_total gauge gitaly_cgroup_cpu_usage_total{path="%s",type="kernel"} 0 gitaly_cgroup_cpu_usage_total{path="%s",type="user"} 0 @@ -349,20 +320,223 @@ gitaly_cgroup_cpu_cfs_throttled_periods_total{path="%s"} 20 # HELP gitaly_cgroup_cpu_cfs_throttled_seconds_total Total time duration the Cgroup has been throttled # TYPE gitaly_cgroup_cpu_cfs_throttled_seconds_total counter gitaly_cgroup_cpu_cfs_throttled_seconds_total{path="%s"} 0.001 -`, "%s", repoCgroupPath)) +`, + }, + { + name: "metrics enabled: false", + metricsEnabled: false, + pid: 2, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + mock := newMock(t) + + config := defaultCgroupsConfig() + config.Repositories.Count = 1 + config.Repositories.MemoryBytes = 1048576 + config.Repositories.CPUShares = 16 + config.Mountpoint = mock.root + config.MetricsEnabled = tt.metricsEnabled + + v1Manager1 := mock.newCgroupManager(config, tt.pid) + + mock.setupMockCgroupFiles(t, v1Manager1, 2) + require.NoError(t, v1Manager1.Setup()) + + ctx := testhelper.Context(t) + + cmd := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, cmd.Start()) + _, err := v1Manager1.AddCommand(cmd) + require.NoError(t, err) + + gitCmd1 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd1.Start()) + _, err = v1Manager1.AddCommand(gitCmd1) + require.NoError(t, err) + + gitCmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd2.Start()) + _, err = v1Manager1.AddCommand(gitCmd2) + require.NoError(t, err) + defer func() { + require.NoError(t, gitCmd2.Wait()) + }() + + require.NoError(t, cmd.Wait()) + require.NoError(t, gitCmd1.Wait()) - for _, metricsEnabled := range []bool{true, false} { - t.Run(fmt.Sprintf("metrics enabled: %v", metricsEnabled), func(t *testing.T) { - v1Manager1.cfg.MetricsEnabled = metricsEnabled + repoCgroupPath := filepath.Join(v1Manager1.currentProcessCgroup(), "repos-0") + + expected := strings.NewReader(strings.ReplaceAll(tt.expect, "%s", repoCgroupPath)) + assert.NoError(t, testutil.CollectAndCompare(v1Manager1, expected)) + }) + } +} + +func TestPruneOldCgroups(t *testing.T) { + t.Parallel() + + testCases := []struct { + desc string + cfg cgroups.Config + expectedPruned bool + // setup returns a pid + setup func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int + }{ + { + desc: "process belongs to another user", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + pid := 1 + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: true, + }, + { + desc: "no hierarchy root", + cfg: cgroups.Config{ + HierarchyRoot: "", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + pid := 1 + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + return 1 + }, + expectedPruned: false, + }, + { + desc: "pid of finished process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + cmd := exec.Command("ls") + require.NoError(t, cmd.Run()) + pid := cmd.Process.Pid + + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + + memoryRoot := filepath.Join( + cfg.Mountpoint, + "memory", + cfg.HierarchyRoot, + "memory.limit_in_bytes", + ) + require.NoError(t, os.WriteFile(memoryRoot, []byte{}, fs.ModeAppend)) + + return pid + }, + expectedPruned: true, + }, + { + desc: "pid of running process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + pid := os.Getpid() + + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: false, + }, + { + desc: "gitaly-0 directory is deleted", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + cgroupManager := mock.newCgroupManager(cfg, 0) + require.NoError(t, cgroupManager.Setup()) + + return 0 + }, + expectedPruned: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + mock := newMock(t) + tc.cfg.Mountpoint = mock.root + + memoryRoot := filepath.Join( + tc.cfg.Mountpoint, + "memory", + tc.cfg.HierarchyRoot, + ) + cpuRoot := filepath.Join( + tc.cfg.Mountpoint, + "cpu", + tc.cfg.HierarchyRoot, + ) + + require.NoError(t, os.MkdirAll(cpuRoot, perm.PublicDir)) + require.NoError(t, os.MkdirAll(memoryRoot, perm.PublicDir)) + + pid := tc.setup(t, tc.cfg, mock) + + logger, hook := test.NewNullLogger() + + mock.pruneOldCgroups(tc.cfg, logger) + + // create cgroups directories with a different pid + oldGitalyProcessMemoryDir := filepath.Join( + memoryRoot, + fmt.Sprintf("gitaly-%d", pid), + ) + oldGitalyProcesssCPUDir := filepath.Join( + cpuRoot, + fmt.Sprintf("gitaly-%d", pid), + ) - if metricsEnabled { - assert.NoError(t, testutil.CollectAndCompare( - v1Manager1, - expected)) + if tc.expectedPruned { + require.NoDirExists(t, oldGitalyProcessMemoryDir) + require.NoDirExists(t, oldGitalyProcesssCPUDir) } else { - assert.NoError(t, testutil.CollectAndCompare( - v1Manager1, - bytes.NewBufferString(""))) + require.DirExists(t, oldGitalyProcessMemoryDir) + require.DirExists(t, oldGitalyProcesssCPUDir) + require.Len(t, hook.Entries, 0) } }) } diff --git a/internal/cgroups/v2_linux.go b/internal/cgroups/v2_linux.go new file mode 100644 index 000000000..a2f81f60b --- /dev/null +++ b/internal/cgroups/v2_linux.go @@ -0,0 +1,175 @@ +//go:build linux + +package cgroups + +import ( + "errors" + "fmt" + "io/fs" + "path/filepath" + "strings" + "time" + + "github.com/containerd/cgroups/v3/cgroup2" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" + cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/log" +) + +type cgroupV2Handler struct { + cfg cgroupscfg.Config + + *cgroupsMetrics + pid int +} + +func newV2Handler(cfg cgroupscfg.Config, pid int) *cgroupV2Handler { + return &cgroupV2Handler{ + cfg: cfg, + pid: pid, + cgroupsMetrics: newV2CgroupsMetrics(), + } +} + +func (cvh *cgroupV2Handler) setupParent(parentResources *specs.LinuxResources) error { + if _, err := cgroup2.NewManager(cvh.cfg.Mountpoint, "/"+cvh.currentProcessCgroup(), cgroup2.ToResources(parentResources)); err != nil { + return fmt.Errorf("failed creating parent cgroup: %w", err) + } + + return nil +} + +func (cvh *cgroupV2Handler) setupRepository(reposResources *specs.LinuxResources) error { + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { + if _, err := cgroup2.NewManager( + cvh.cfg.Mountpoint, + "/"+cvh.repoPath(i), + cgroup2.ToResources(reposResources), + ); err != nil { + return fmt.Errorf("failed creating repository cgroup: %w", err) + } + } + return nil +} + +func (cvh *cgroupV2Handler) addToCgroup(pid int, cgroupPath string) error { + control, err := cgroup2.Load("/"+cgroupPath, cgroup2.WithMountpoint(cvh.cfg.Mountpoint)) + if err != nil { + return fmt.Errorf("failed loading %s cgroup: %w", cgroupPath, err) + } + + if err := control.AddProc(uint64(pid)); err != nil { + // Command could finish so quickly before we can add it to a cgroup, so + // we don't consider it an error. + if strings.Contains(err.Error(), "no such process") { + return nil + } + return fmt.Errorf("failed adding process to cgroup: %w", err) + } + + return nil +} + +func (cvh *cgroupV2Handler) collect(ch chan<- prometheus.Metric) { + if !cvh.cfg.MetricsEnabled { + return + } + + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { + repoPath := cvh.repoPath(i) + logger := log.Default().WithField("cgroup_path", repoPath) + control, err := cgroup2.Load("/"+repoPath, cgroup2.WithMountpoint(cvh.cfg.Mountpoint)) + if err != nil { + logger.WithError(err).Warn("unable to load cgroup controller") + return + } + + if metrics, err := control.Stat(); err != nil { + logger.WithError(err).Warn("unable to get cgroup stats") + } else { + cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") + cpuUserMetric.Set(float64(metrics.CPU.UserUsec)) + ch <- cpuUserMetric + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSPeriods, + prometheus.CounterValue, + float64(metrics.CPU.NrPeriods), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledPeriods, + prometheus.CounterValue, + float64(metrics.CPU.NrThrottled), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledTime, + prometheus.CounterValue, + float64(metrics.CPU.ThrottledUsec)/float64(time.Second), + repoPath, + ) + + cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") + cpuKernelMetric.Set(float64(metrics.CPU.SystemUsec)) + ch <- cpuKernelMetric + } + + if subsystems, err := control.Controllers(); err != nil { + logger.WithError(err).Warn("unable to get cgroup hierarchy") + } else { + processes, err := control.Procs(true) + if err != nil { + logger.WithError(err). + Warn("unable to get process list") + continue + } + + for _, subsystem := range subsystems { + procsMetric := cvh.procs.WithLabelValues(repoPath, subsystem) + procsMetric.Set(float64(len(processes))) + ch <- procsMetric + } + } + } +} + +func (cvh *cgroupV2Handler) cleanup() error { + processCgroupPath := cvh.currentProcessCgroup() + + control, err := cgroup2.Load("/"+processCgroupPath, cgroup2.WithMountpoint(cvh.cfg.Mountpoint)) + if err != nil { + return fmt.Errorf("failed loading cgroup %s: %w", processCgroupPath, err) + } + + if err := control.Delete(); err != nil { + return fmt.Errorf("failed cleaning up cgroup %s: %w", processCgroupPath, err) + } + + return nil +} + +func (cvh *cgroupV2Handler) repoPath(groupID int) string { + return filepath.Join(cvh.currentProcessCgroup(), fmt.Sprintf("repos-%d", groupID)) +} + +func (cvh *cgroupV2Handler) currentProcessCgroup() string { + return config.GetGitalyProcessTempDir(cvh.cfg.HierarchyRoot, cvh.pid) +} + +func pruneOldCgroupsV2(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + if err := config.PruneOldGitalyProcessDirectories( + logger, + filepath.Join(cfg.Mountpoint, cfg.HierarchyRoot), + ); err != nil { + var pathError *fs.PathError + if !errors.As(err, &pathError) { + logger.WithError(err).Error("failed to clean up cpu cgroups") + } + } +} diff --git a/internal/cgroups/v2_linux_test.go b/internal/cgroups/v2_linux_test.go new file mode 100644 index 000000000..834a148cd --- /dev/null +++ b/internal/cgroups/v2_linux_test.go @@ -0,0 +1,546 @@ +//go:build linux + +package cgroups + +import ( + "fmt" + "hash/crc32" + "io/fs" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "testing" + + cgrps "github.com/containerd/cgroups/v3" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" +) + +func defaultCgroupsV2Config() cgroups.Config { + return cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 3, + MemoryBytes: 1024000, + CPUShares: 256, + CPUQuotaUs: 2000, + }, + } +} + +func TestNewManagerV2(t *testing.T) { + cfg := cgroups.Config{Repositories: cgroups.Repositories{Count: 10}} + + manager := newCgroupManagerWithMode(cfg, 1, cgrps.Unified) + require.IsType(t, &cgroupV2Handler{}, manager.handler) +} + +func TestSetup_ParentCgroupsV2(t *testing.T) { + tests := []struct { + name string + cfg cgroups.Config + wantMemoryBytes int + wantCPUWeight int + wantCPUMax string + }{ + { + name: "all config specified", + cfg: cgroups.Config{ + MemoryBytes: 102400, + CPUShares: 256, + CPUQuotaUs: 2000, + }, + wantMemoryBytes: 102400, + wantCPUWeight: 256, + wantCPUMax: "2000 100000", + }, + { + name: "only memory limit set", + cfg: cgroups.Config{ + MemoryBytes: 102400, + }, + wantMemoryBytes: 102400, + }, + { + name: "only cpu shares set", + cfg: cgroups.Config{ + CPUShares: 512, + }, + wantCPUWeight: 512, + }, + { + name: "only cpu quota set", + cfg: cgroups.Config{ + CPUQuotaUs: 2000, + }, + wantCPUMax: "2000 100000", + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + mock := newMockV2(t) + + pid := 1 + tt.cfg.HierarchyRoot = "gitaly" + tt.cfg.Mountpoint = mock.root + + v2Manager := mock.newCgroupManager(tt.cfg, pid) + mock.setupMockCgroupFiles(t, v2Manager) + + require.NoError(t, v2Manager.Setup()) + + memoryMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), "memory.max", + ) + requireCgroupWithInt(t, memoryMaxPath, tt.wantMemoryBytes) + + cpuWeightPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), "cpu.weight", + ) + requireCgroupWithInt(t, cpuWeightPath, calculateWantCPUWeight(tt.wantCPUWeight)) + + cpuMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), "cpu.max", + ) + requireCgroupWithString(t, cpuMaxPath, tt.wantCPUMax) + }) + } +} + +func TestSetup_RepoCgroupsV2(t *testing.T) { + tests := []struct { + name string + cfg cgroups.Repositories + wantMemoryBytes int + wantCPUWeight int + wantCPUMax string + }{ + { + name: "all config specified", + cfg: defaultCgroupsV2Config().Repositories, + wantMemoryBytes: 1024000, + wantCPUWeight: 256, + wantCPUMax: "2000 100000", + }, + { + name: "only memory limit set", + cfg: cgroups.Repositories{ + Count: 3, + MemoryBytes: 1024000, + }, + wantMemoryBytes: 1024000, + }, + { + name: "only cpu shares set", + cfg: cgroups.Repositories{ + Count: 3, + CPUShares: 512, + }, + wantCPUWeight: 512, + }, + { + name: "only cpu quota set", + cfg: cgroups.Repositories{ + Count: 3, + CPUQuotaUs: 1000, + }, + wantCPUMax: "1000 100000", + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + mock := newMockV2(t) + + pid := 1 + + cfg := defaultCgroupsV2Config() + cfg.Mountpoint = mock.root + cfg.Repositories = tt.cfg + + v2Manager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, v2Manager) + require.NoError(t, v2Manager.Setup()) + + for i := 0; i < 3; i++ { + memoryMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i), "memory.max", + ) + requireCgroupWithInt(t, memoryMaxPath, tt.wantMemoryBytes) + + cpuWeightPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i), "cpu.weight", + ) + requireCgroupWithInt(t, cpuWeightPath, calculateWantCPUWeight(tt.wantCPUWeight)) + + cpuMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i), "cpu.max", + ) + requireCgroupWithString(t, cpuMaxPath, tt.wantCPUMax) + } + }) + } +} + +func TestAddCommandV2(t *testing.T) { + mock := newMockV2(t) + + config := defaultCgroupsV2Config() + config.Repositories.Count = 10 + config.Repositories.MemoryBytes = 1024 + config.Repositories.CPUShares = 16 + config.Mountpoint = mock.root + + pid := 1 + + v2Manager1 := mock.newCgroupManager(config, pid) + mock.setupMockCgroupFiles(t, v2Manager1) + + require.NoError(t, v2Manager1.Setup()) + ctx := testhelper.Context(t) + + cmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, cmd2.Run()) + + v2Manager2 := mock.newCgroupManager(config, pid) + + t.Run("without overridden key", func(t *testing.T) { + _, err := v2Manager2.AddCommand(cmd2) + require.NoError(t, err) + + checksum := crc32.ChecksumIEEE([]byte(strings.Join(cmd2.Args, "/"))) + groupID := uint(checksum) % config.Repositories.Count + + path := filepath.Join(mock.root, "gitaly", + fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", groupID), "cgroup.procs") + content := readCgroupFile(t, path) + + cmdPid, err := strconv.Atoi(string(content)) + require.NoError(t, err) + + require.Equal(t, cmd2.Process.Pid, cmdPid) + }) + + t.Run("with overridden key", func(t *testing.T) { + _, err := v2Manager2.AddCommand(cmd2, WithCgroupKey("foobar")) + require.NoError(t, err) + + checksum := crc32.ChecksumIEEE([]byte("foobar")) + groupID := uint(checksum) % config.Repositories.Count + + path := filepath.Join(mock.root, "gitaly", + fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", groupID), "cgroup.procs") + content := readCgroupFile(t, path) + + cmdPid, err := strconv.Atoi(string(content)) + require.NoError(t, err) + + require.Equal(t, cmd2.Process.Pid, cmdPid) + }) +} + +func TestCleanupV2(t *testing.T) { + mock := newMockV2(t) + + pid := 1 + cfg := defaultCgroupsV2Config() + cfg.Mountpoint = mock.root + + v2Manager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, v2Manager) + + require.NoError(t, v2Manager.Setup()) + require.NoError(t, v2Manager.Cleanup()) + + for i := 0; i < 3; i++ { + require.NoDirExists(t, filepath.Join(mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i))) + } +} + +func TestMetricsV2(t *testing.T) { + tests := []struct { + name string + metricsEnabled bool + pid int + expect string + }{ + { + name: "metrics enabled: true", + metricsEnabled: true, + pid: 1, + expect: `# HELP gitaly_cgroup_cpu_cfs_periods_total Number of elapsed enforcement period intervals +# TYPE gitaly_cgroup_cpu_cfs_periods_total counter +gitaly_cgroup_cpu_cfs_periods_total{path="%s"} 10 +# HELP gitaly_cgroup_cpu_cfs_throttled_periods_total Number of throttled period intervals +# TYPE gitaly_cgroup_cpu_cfs_throttled_periods_total counter +gitaly_cgroup_cpu_cfs_throttled_periods_total{path="%s"} 20 +# HELP gitaly_cgroup_cpu_cfs_throttled_seconds_total Total time duration the Cgroup has been throttled +# TYPE gitaly_cgroup_cpu_cfs_throttled_seconds_total counter +gitaly_cgroup_cpu_cfs_throttled_seconds_total{path="%s"} 0.001 +# HELP gitaly_cgroup_cpu_usage_total CPU Usage of Cgroup +# TYPE gitaly_cgroup_cpu_usage_total gauge +gitaly_cgroup_cpu_usage_total{path="%s",type="kernel"} 0 +gitaly_cgroup_cpu_usage_total{path="%s",type="user"} 0 +# HELP gitaly_cgroup_procs_total Total number of procs +# TYPE gitaly_cgroup_procs_total gauge +gitaly_cgroup_procs_total{path="%s",subsystem="cpu"} 1 +gitaly_cgroup_procs_total{path="%s",subsystem="cpuset"} 1 +gitaly_cgroup_procs_total{path="%s",subsystem="memory"} 1 +`, + }, + { + name: "metrics enabled: false", + metricsEnabled: false, + pid: 2, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + mock := newMockV2(t) + + config := defaultCgroupsV2Config() + config.Repositories.Count = 1 + config.Repositories.MemoryBytes = 1048576 + config.Repositories.CPUShares = 16 + config.Mountpoint = mock.root + config.MetricsEnabled = tt.metricsEnabled + + v2Manager1 := mock.newCgroupManager(config, tt.pid) + + mock.setupMockCgroupFiles(t, v2Manager1) + require.NoError(t, v2Manager1.Setup()) + + ctx := testhelper.Context(t) + + cmd := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, cmd.Start()) + _, err := v2Manager1.AddCommand(cmd) + require.NoError(t, err) + + gitCmd1 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd1.Start()) + _, err = v2Manager1.AddCommand(gitCmd1) + require.NoError(t, err) + + gitCmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd2.Start()) + _, err = v2Manager1.AddCommand(gitCmd2) + require.NoError(t, err) + defer func() { + require.NoError(t, gitCmd2.Wait()) + }() + + require.NoError(t, cmd.Wait()) + require.NoError(t, gitCmd1.Wait()) + + repoCgroupPath := filepath.Join(v2Manager1.currentProcessCgroup(), "repos-0") + + expected := strings.NewReader(strings.ReplaceAll(tt.expect, "%s", repoCgroupPath)) + + assert.NoError(t, testutil.CollectAndCompare(v2Manager1, expected)) + }) + } +} + +func TestPruneOldCgroupsV2(t *testing.T) { + t.Parallel() + + testCases := []struct { + desc string + cfg cgroups.Config + expectedPruned bool + // setup returns a pid + setup func(*testing.T, cgroups.Config, *mockCgroupV2) int + }{ + { + desc: "process belongs to another user", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + pid := 1 + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: true, + }, + { + desc: "no hierarchy root", + cfg: cgroups.Config{ + HierarchyRoot: "", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + pid := 1 + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + return 1 + }, + expectedPruned: false, + }, + { + desc: "pid of finished process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + cmd := exec.Command("ls") + require.NoError(t, cmd.Run()) + pid := cmd.Process.Pid + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + memoryFile := filepath.Join( + cfg.Mountpoint, + cfg.HierarchyRoot, + "memory.limit_in_bytes", + ) + require.NoError(t, os.WriteFile(memoryFile, []byte{}, fs.ModeAppend)) + + return pid + }, + expectedPruned: true, + }, + { + desc: "pid of running process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + pid := os.Getpid() + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: false, + }, + { + desc: "gitaly-0 directory is deleted", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + cgroupManager := mock.newCgroupManager(cfg, 0) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + return 0 + }, + expectedPruned: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + mock := newMockV2(t) + tc.cfg.Mountpoint = mock.root + + root := filepath.Join( + tc.cfg.Mountpoint, + tc.cfg.HierarchyRoot, + ) + require.NoError(t, os.MkdirAll(root, perm.PublicDir)) + + pid := tc.setup(t, tc.cfg, mock) + + logger, _ := test.NewNullLogger() + mock.pruneOldCgroups(tc.cfg, logger) + + // create cgroups directories with a different pid + oldGitalyProcessDir := filepath.Join( + root, + fmt.Sprintf("gitaly-%d", pid), + ) + + if tc.expectedPruned { + require.NoDirExists(t, oldGitalyProcessDir) + } else { + require.DirExists(t, oldGitalyProcessDir) + } + }) + } +} + +func calculateWantCPUWeight(wantCPUWeight int) int { + if wantCPUWeight == 0 { + return 0 + } + return 1 + ((wantCPUWeight-2)*9999)/262142 +} + +func requireCgroupWithString(t *testing.T, cgroupFile string, want string) { + t.Helper() + + if want == "" { + return + } + require.Equal(t, + string(readCgroupFile(t, cgroupFile)), + want, + ) +} + +func requireCgroupWithInt(t *testing.T, cgroupFile string, want int) { + t.Helper() + + if want <= 0 { + return + } + + require.Equal(t, + string(readCgroupFile(t, cgroupFile)), + strconv.Itoa(want), + ) +} |