diff options
author | Will Chandler <wchandler@gitlab.com> | 2023-10-26 23:48:56 +0300 |
---|---|---|
committer | Will Chandler <wchandler@gitlab.com> | 2023-11-08 06:53:56 +0300 |
commit | 02a39ec23e6f768d3a57d6c4ca119faf9ccb757d (patch) | |
tree | 413f136bf0932c91d69728f3099e63442908e91b | |
parent | e53929774ee1d3d8f910f706564307526997f2d6 (diff) |
cgroups: Update collect to check if cgroup exists
We will shortly start creating cgroups on-demand, rather than up-front
as part of Gitaly's startup process.
Hoist the loop over repository cgroups into the manager's `Collect`
method, passing the path into the version-specific handlers. This allows
us to avoid leaking the `cgroupLock` abstraction outside of the manager.
-rw-r--r-- | internal/cgroups/manager_linux.go | 17 | ||||
-rw-r--r-- | internal/cgroups/v1_linux.go | 117 | ||||
-rw-r--r-- | internal/cgroups/v1_linux_test.go | 4 | ||||
-rw-r--r-- | internal/cgroups/v2_linux.go | 103 | ||||
-rw-r--r-- | internal/cgroups/v2_linux_test.go | 4 |
5 files changed, 126 insertions, 119 deletions
diff --git a/internal/cgroups/manager_linux.go b/internal/cgroups/manager_linux.go index a97a94530..7dc5cae42 100644 --- a/internal/cgroups/manager_linux.go +++ b/internal/cgroups/manager_linux.go @@ -30,7 +30,7 @@ type cgroupHandler interface { setupRepository(status *cgroupStatus, reposResources *specs.LinuxResources) error createCgroup(repoResources *specs.LinuxResources, cgroupPath string) error addToCgroup(pid int, cgroupPath string) error - collect(ch chan<- prometheus.Metric) + collect(repoPath string, ch chan<- prometheus.Metric) cleanup() error currentProcessCgroup() string repoPath(groupID int) string @@ -216,7 +216,20 @@ func (cgm *CGroupManager) Describe(ch chan<- *prometheus.Desc) { // Collect is used to collect the current values of all CGroupManager prometheus metrics func (cgm *CGroupManager) Collect(ch chan<- prometheus.Metric) { - cgm.handler.collect(ch) + if !cgm.cfg.MetricsEnabled { + return + } + + for i := 0; i < int(cgm.cfg.Repositories.Count); i++ { + repoPath := cgm.handler.repoPath(i) + + cgLock := cgm.status.getLock(repoPath) + if !cgLock.isCreated() { + continue + } + + cgm.handler.collect(repoPath, ch) + } } // Stats returns cgroup accounting statistics collected by reading diff --git a/internal/cgroups/v1_linux.go b/internal/cgroups/v1_linux.go index 2b2e2b274..d82668fc4 100644 --- a/internal/cgroups/v1_linux.go +++ b/internal/cgroups/v1_linux.go @@ -102,73 +102,66 @@ func (cvh *cgroupV1Handler) loadCgroup(cgroupPath string) (cgroup1.Cgroup, error return control, nil } -func (cvh *cgroupV1Handler) collect(ch chan<- prometheus.Metric) { - if !cvh.cfg.MetricsEnabled { +func (cvh *cgroupV1Handler) collect(repoPath string, ch chan<- prometheus.Metric) { + logger := cvh.logger.WithField("cgroup_path", repoPath) + control, err := cvh.loadCgroup(repoPath) + if err != nil { + logger.WithError(err).Warn("unable to load cgroup controller") return } - for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { - repoPath := cvh.repoPath(i) - logger := 
cvh.logger.WithField("cgroup_path", repoPath) - control, err := cvh.loadCgroup(repoPath) - if err != nil { - logger.WithError(err).Warn("unable to load cgroup controller") - return - } - - if metrics, err := control.Stat(); err != nil { - logger.WithError(err).Warn("unable to get cgroup stats") - } else { - memoryMetric := cvh.memoryReclaimAttemptsTotal.WithLabelValues(repoPath) - memoryMetric.Set(float64(metrics.Memory.Usage.Failcnt)) - ch <- memoryMetric - - cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") - cpuUserMetric.Set(float64(metrics.CPU.Usage.User)) - ch <- cpuUserMetric - - ch <- prometheus.MustNewConstMetric( - cvh.cpuCFSPeriods, - prometheus.CounterValue, - float64(metrics.CPU.Throttling.Periods), - repoPath, - ) - - ch <- prometheus.MustNewConstMetric( - cvh.cpuCFSThrottledPeriods, - prometheus.CounterValue, - float64(metrics.CPU.Throttling.ThrottledPeriods), - repoPath, - ) - - ch <- prometheus.MustNewConstMetric( - cvh.cpuCFSThrottledTime, - prometheus.CounterValue, - float64(metrics.CPU.Throttling.ThrottledTime)/float64(time.Second), - repoPath, - ) - - cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") - cpuKernelMetric.Set(float64(metrics.CPU.Usage.Kernel)) - ch <- cpuKernelMetric - } + if metrics, err := control.Stat(); err != nil { + logger.WithError(err).Warn("unable to get cgroup stats") + } else { + memoryMetric := cvh.memoryReclaimAttemptsTotal.WithLabelValues(repoPath) + memoryMetric.Set(float64(metrics.Memory.Usage.Failcnt)) + ch <- memoryMetric + + cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") + cpuUserMetric.Set(float64(metrics.CPU.Usage.User)) + ch <- cpuUserMetric + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSPeriods, + prometheus.CounterValue, + float64(metrics.CPU.Throttling.Periods), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledPeriods, + prometheus.CounterValue, + float64(metrics.CPU.Throttling.ThrottledPeriods), + repoPath, + ) + + ch <- 
prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledTime, + prometheus.CounterValue, + float64(metrics.CPU.Throttling.ThrottledTime)/float64(time.Second), + repoPath, + ) + + cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") + cpuKernelMetric.Set(float64(metrics.CPU.Usage.Kernel)) + ch <- cpuKernelMetric + } - if subsystems, err := cvh.hierarchy(); err != nil { - logger.WithError(err).Warn("unable to get cgroup hierarchy") - } else { - for _, subsystem := range subsystems { - processes, err := control.Processes(subsystem.Name(), true) - if err != nil { - logger.WithField("subsystem", subsystem.Name()). - WithError(err). - Warn("unable to get process list") - continue - } - - procsMetric := cvh.procs.WithLabelValues(repoPath, string(subsystem.Name())) - procsMetric.Set(float64(len(processes))) - ch <- procsMetric + if subsystems, err := cvh.hierarchy(); err != nil { + logger.WithError(err).Warn("unable to get cgroup hierarchy") + } else { + for _, subsystem := range subsystems { + processes, err := control.Processes(subsystem.Name(), true) + if err != nil { + logger.WithField("subsystem", subsystem.Name()). + WithError(err). 
+ Warn("unable to get process list") + continue } + + procsMetric := cvh.procs.WithLabelValues(repoPath, string(subsystem.Name())) + procsMetric.Set(float64(len(processes))) + ch <- procsMetric } } } diff --git a/internal/cgroups/v1_linux_test.go b/internal/cgroups/v1_linux_test.go index dcb4358b8..15361000b 100644 --- a/internal/cgroups/v1_linux_test.go +++ b/internal/cgroups/v1_linux_test.go @@ -346,6 +346,10 @@ gitaly_cgroup_cpu_cfs_throttled_seconds_total{path="%s"} 0.001 v1Manager1 := mock.newCgroupManager(config, testhelper.SharedLogger(t), tt.pid) + groupID := calcGroupID(cmdArgs, config.Repositories.Count) + cgLock := v1Manager1.status.getLock(v1Manager1.handler.repoPath(int(groupID))) + cgLock.created.Store(true) + mock.setupMockCgroupFiles(t, v1Manager1, mockCgroupFile{"memory.failcnt", "2"}) require.NoError(t, v1Manager1.Setup()) diff --git a/internal/cgroups/v2_linux.go b/internal/cgroups/v2_linux.go index 407edc90e..ccb1642dc 100644 --- a/internal/cgroups/v2_linux.go +++ b/internal/cgroups/v2_linux.go @@ -103,68 +103,61 @@ func (cvh *cgroupV2Handler) loadCgroup(cgroupPath string) (*cgroup2.Manager, err return control, nil } -func (cvh *cgroupV2Handler) collect(ch chan<- prometheus.Metric) { - if !cvh.cfg.MetricsEnabled { +func (cvh *cgroupV2Handler) collect(repoPath string, ch chan<- prometheus.Metric) { + logger := cvh.logger.WithField("cgroup_path", repoPath) + control, err := cvh.loadCgroup(repoPath) + if err != nil { + logger.WithError(err).Warn("unable to load cgroup controller") return } - for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { - repoPath := cvh.repoPath(i) - logger := cvh.logger.WithField("cgroup_path", repoPath) - control, err := cvh.loadCgroup(repoPath) + if metrics, err := control.Stat(); err != nil { + logger.WithError(err).Warn("unable to get cgroup stats") + } else { + cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") + cpuUserMetric.Set(float64(metrics.CPU.UserUsec)) + ch <- cpuUserMetric + + ch <- 
prometheus.MustNewConstMetric( + cvh.cpuCFSPeriods, + prometheus.CounterValue, + float64(metrics.CPU.NrPeriods), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledPeriods, + prometheus.CounterValue, + float64(metrics.CPU.NrThrottled), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledTime, + prometheus.CounterValue, + float64(metrics.CPU.ThrottledUsec)/float64(time.Second), + repoPath, + ) + + cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") + cpuKernelMetric.Set(float64(metrics.CPU.SystemUsec)) + ch <- cpuKernelMetric + } + + if subsystems, err := control.Controllers(); err != nil { + logger.WithError(err).Warn("unable to get cgroup hierarchy") + } else { + processes, err := control.Procs(true) if err != nil { - logger.WithError(err).Warn("unable to load cgroup controller") + logger.WithError(err). + Warn("unable to get process list") return } - if metrics, err := control.Stat(); err != nil { - logger.WithError(err).Warn("unable to get cgroup stats") - } else { - cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") - cpuUserMetric.Set(float64(metrics.CPU.UserUsec)) - ch <- cpuUserMetric - - ch <- prometheus.MustNewConstMetric( - cvh.cpuCFSPeriods, - prometheus.CounterValue, - float64(metrics.CPU.NrPeriods), - repoPath, - ) - - ch <- prometheus.MustNewConstMetric( - cvh.cpuCFSThrottledPeriods, - prometheus.CounterValue, - float64(metrics.CPU.NrThrottled), - repoPath, - ) - - ch <- prometheus.MustNewConstMetric( - cvh.cpuCFSThrottledTime, - prometheus.CounterValue, - float64(metrics.CPU.ThrottledUsec)/float64(time.Second), - repoPath, - ) - - cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") - cpuKernelMetric.Set(float64(metrics.CPU.SystemUsec)) - ch <- cpuKernelMetric - } - - if subsystems, err := control.Controllers(); err != nil { - logger.WithError(err).Warn("unable to get cgroup hierarchy") - } else { - processes, err := control.Procs(true) - if err != nil { 
- logger.WithError(err). - Warn("unable to get process list") - continue - } - - for _, subsystem := range subsystems { - procsMetric := cvh.procs.WithLabelValues(repoPath, subsystem) - procsMetric.Set(float64(len(processes))) - ch <- procsMetric - } + for _, subsystem := range subsystems { + procsMetric := cvh.procs.WithLabelValues(repoPath, subsystem) + procsMetric.Set(float64(len(processes))) + ch <- procsMetric } } } diff --git a/internal/cgroups/v2_linux_test.go b/internal/cgroups/v2_linux_test.go index a0042083d..08a62428f 100644 --- a/internal/cgroups/v2_linux_test.go +++ b/internal/cgroups/v2_linux_test.go @@ -325,6 +325,10 @@ gitaly_cgroup_procs_total{path="%s",subsystem="memory"} 1 v2Manager1 := mock.newCgroupManager(config, testhelper.SharedLogger(t), tt.pid) + groupID := calcGroupID(cmdArgs, config.Repositories.Count) + cgLock := v2Manager1.status.getLock(v2Manager1.handler.repoPath(int(groupID))) + cgLock.created.Store(true) + mock.setupMockCgroupFiles(t, v2Manager1) require.NoError(t, v2Manager1.Setup()) |