1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
package supervisor
import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
log "github.com/sirupsen/logrus"
"gitlab.com/gitlab-org/gitaly/v14/internal/ps"
)
// rssGauge exports the most recently sampled resident set size, in bytes,
// labelled by supervisor name.
var rssGauge = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "gitaly_supervisor_rss_bytes",
		Help: "Resident set size of supervised processes, in bytes.",
	},
	[]string{"name"},
)

// healthCounter counts health check runs, labelled by supervisor name and by
// the check outcome ("ok" or "bad").
var healthCounter = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "gitaly_supervisor_health_checks_total",
		Help: "Count of Gitaly supervisor health checks",
	},
	[]string{"name", "status"},
)
// monitorProcess describes one process handed to monitorRss for memory
// monitoring.
type monitorProcess struct {
// pid is the process ID whose RSS is sampled.
pid int
// wait becomes ready when monitoring of this process should stop;
// presumably it is closed when the process exits (confirm against the
// sender).
wait <-chan struct{}
}
// monitorRss samples the resident set size of every process received on procs
// and reports it through the rssGauge metric and the events channel.
//
// For each process, a sample is taken immediately and then again on every tick
// of a 15 second ticker until the process's wait channel becomes ready. A
// non-zero sample produces a MemoryHigh event when it exceeds threshold (in
// bytes) and a MemoryLow event otherwise. done is closed when the function
// returns, which happens once procs has been closed and drained.
func monitorRss(procs <-chan monitorProcess, done chan<- struct{}, events chan<- Event, name string, threshold int) {
	log.WithField("supervisor.name", name).WithField("supervisor.rss_threshold", threshold).Info("starting RSS monitor")

	ticker := time.NewTicker(15 * time.Second)
	defer ticker.Stop()
	defer close(done)

	for proc := range procs {
		// Keep sampling this process until its wait channel fires.
		for finished := false; !finished; {
			kilobytes, err := ps.RSS(proc.pid)
			if err != nil {
				// Best effort: log and carry on with whatever value was returned.
				log.WithError(err).Warn("getting RSS")
			}

			// ps.RSS reports kB; the metric and threshold use bytes.
			rssBytes := kilobytes * 1024
			rssGauge.WithLabelValues(name).Set(float64(rssBytes))

			if rssBytes > 0 {
				var ev Event
				ev.Pid = proc.pid
				if rssBytes > threshold {
					ev.Type = MemoryHigh
				} else {
					ev.Type = MemoryLow
				}

				// Drop the event if nobody picks it up within a second, so
				// that stale readings are never delivered.
				select {
				case events <- ev:
				case <-time.After(1 * time.Second):
				}
			}

			select {
			case <-proc.wait:
				finished = true
			case <-ticker.C:
			}
		}
	}
}
// monitorHealth repeatedly runs the health check f and reports each outcome.
//
// Every run increments healthCounter for name with status "ok" or "bad" and
// produces a HealthOK or HealthBad event carrying the error returned by f.
// The event is offered on events for at most one second and then dropped, so
// that stale results are never delivered. Consecutive checks are separated by
// a 15 second pause.
//
// The function returns as soon as shutdown becomes ready, both while offering
// an event and while pausing between checks, so the goroutine does not linger
// or run another health check after shutdown has been requested.
func monitorHealth(f func() error, events chan<- Event, name string, shutdown <-chan struct{}) {
	for {
		e := Event{Error: f()}

		if e.Error != nil {
			e.Type = HealthBad
			healthCounter.WithLabelValues(name, "bad").Inc()
		} else {
			e.Type = HealthOK
			healthCounter.WithLabelValues(name, "ok").Inc()
		}

		select {
		case events <- e:
		case <-time.After(1 * time.Second):
			// Prevent sending stale events
		case <-shutdown:
			return
		}

		// Pause before the next check, but stay responsive to shutdown. A
		// per-iteration timer keeps the original "15s after each check"
		// cadence; it is stopped on early exit so it does not outlive us.
		pause := time.NewTimer(15 * time.Second)
		select {
		case <-pause.C:
		case <-shutdown:
			pause.Stop()
			return
		}
	}
}
|