1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
# frozen_string_literal: true
module Gitlab
module Memory
# A background thread that observes Ruby heap fragmentation and calls
# into a handler when the Ruby heap has been fragmented for an extended
# period of time.
#
# See Gitlab::Metrics::Memory for how heap fragmentation is defined.
#
# To decide whether a given fragmentation level is being exceeded,
# the watchdog regularly polls the GC. Whenever a violation occurs
# a strike is issued; a poll without a violation resets the strike count.
# Once the number of consecutive strikes exceeds the maximum number of
# strikes, a handler is invoked to deal with the situation.
#
# The duration for which a process may be above a given fragmentation
# threshold is computed as `max_strikes * sleep_time_seconds`.
class Watchdog
  # Fallbacks used when neither a keyword argument nor the matching
  # GITLAB_MEMWD_* environment variable supplies a value.
  DEFAULT_SLEEP_TIME_SECONDS = 60
  DEFAULT_HEAP_FRAG_THRESHOLD = 0.5
  DEFAULT_MAX_STRIKES = 5

  # A handler that takes no action. By answering `false` it tells the
  # watchdog that nothing was dealt with, so the watchdog keeps running and
  # will call the handler again if fragmentation stays high.
  #
  # Useful for "dress rehearsals" in production: it shows how often the
  # handler would fire before any real action is wired up.
  class NullHandler
    include Singleton

    # Ignores the reported fragmentation value; always returns `false`.
    def on_high_heap_fragmentation(_value)
      false
    end
  end

  # A handler that sends SIGTERM to a process and reports the situation
  # as handled.
  class TermProcessHandler
    # pid:
    #   The process to signal. Defaults to the current process.
    def initialize(pid = Process.pid)
      @pid = pid
    end

    # Sends TERM to the configured pid; always returns `true`.
    def on_high_heap_fragmentation(_value)
      Process.kill(:TERM, @pid)
      true
    end
  end

  # A handler that goes through Puma's graceful termination handler, which
  # honours a configurable grace period during which a process may remain
  # unresponsive to a SIGTERM.
  class PumaHandler
    # puma_options:
    #   Options passed on to the Puma worker handle. Defaults to the options
    #   of the running Puma CLI configuration.
    def initialize(puma_options = ::Puma.cli_config.options)
      @worker = ::Puma::Cluster::WorkerHandle.new(0, Process.pid, 0, puma_options)
    end

    # Asks the worker handle to terminate; always returns `true`.
    def on_high_heap_fragmentation(_value)
      @worker.term
      true
    end
  end

  # handler:
  #   Object responding to `on_high_heap_fragmentation(value)`. A truthy return
  #   value means the event was handled and the watchdog stops polling.
  # logger:
  #   Receives the 'started', 'stopped' and limit-exceeded log entries.
  # max_heap_fragmentation:
  #   The degree to which the Ruby heap is allowed to be fragmented. Range [0,1].
  # max_strikes:
  #   How many times the process is allowed to be above max_heap_fragmentation
  #   before a handler is invoked.
  # sleep_time_seconds:
  #   Controls how often the watchdog wakes up and polls the GC.
  #
  # Any remaining keyword arguments are forwarded to the superclass initializer.
  def initialize(
    handler: NullHandler.instance,
    logger: Logger.new($stdout),
    max_heap_fragmentation: ENV.fetch('GITLAB_MEMWD_MAX_HEAP_FRAG', DEFAULT_HEAP_FRAG_THRESHOLD).to_f,
    max_strikes: ENV.fetch('GITLAB_MEMWD_MAX_STRIKES', DEFAULT_MAX_STRIKES).to_i,
    sleep_time_seconds: ENV.fetch('GITLAB_MEMWD_SLEEP_TIME_SEC', DEFAULT_SLEEP_TIME_SECONDS).to_i,
    **options)
    super(**options)

    @handler = handler
    @logger = logger

    @max_heap_fragmentation = max_heap_fragmentation
    @max_strikes = max_strikes
    @sleep_time_seconds = sleep_time_seconds

    @strikes = 0
    @alive = true

    init_prometheus_metrics(max_heap_fragmentation)
  end

  attr_reader :strikes, :max_heap_fragmentation, :max_strikes, :sleep_time_seconds

  # Runs the polling loop in the calling thread: sleep, then (if the
  # :gitlab_memory_watchdog ops flag is enabled) check heap fragmentation.
  # The loop ends once `stop` was called or a handler reported the event
  # as handled. Start and end of the loop are logged.
  def call
    log_lifecycle('started')

    while @alive
      sleep(@sleep_time_seconds)
      next unless Feature.enabled?(:gitlab_memory_watchdog, type: :ops)

      monitor_heap_fragmentation
    end

    log_lifecycle('stopped')
  end

  # Makes the polling loop exit the next time it checks its run condition.
  def stop
    @alive = false
  end

  private

  # Writes an info-level entry consisting of the common labels plus `message`.
  def log_lifecycle(message)
    @logger.info(log_labels.merge(message: message))
  end

  # Takes one fragmentation reading and updates the strike count: a reading
  # above the limit adds a strike (and bumps the violations counter), any
  # other reading clears the strikes. When the strikes exceed the allowed
  # maximum the handler is consulted and the strikes start over.
  def monitor_heap_fragmentation
    current_fragmentation = Gitlab::Metrics::Memory.gc_heap_fragmentation
    limit_violated = current_fragmentation > @max_heap_fragmentation

    @strikes = limit_violated ? @strikes + 1 : 0
    @heap_frag_violations.increment if limit_violated

    return unless @strikes > @max_strikes

    # A truthy handler result means the event was dealt with, so the loop may end.
    @alive = !handle_heap_fragmentation_limit_exceeded(current_fragmentation)
    @strikes = 0
  end

  # Logs a warning carrying the offending value, counts the event and passes
  # it to the active handler. Returns whatever the handler returns.
  def handle_heap_fragmentation_limit_exceeded(value)
    warning = log_labels.merge(
      message: 'heap fragmentation limit exceeded',
      memwd_cur_heap_frag: value
    )
    @logger.warn(warning)

    @heap_frag_violations_handled.increment

    handler.on_high_heap_fragmentation(value)
  end

  # The handler that is currently in effect. Unless the
  # :enforce_memory_watchdog ops flag is enabled this is the NullHandler,
  # which keeps the watchdog in "friendly mode": violations still produce
  # logs and Prometheus events, but no action is taken.
  def handler
    Feature.enabled?(:enforce_memory_watchdog, type: :ops) ? @handler : NullHandler.instance
  end

  # Labels attached to every log entry written by the watchdog.
  def log_labels
    {
      pid: Process.pid,
      worker_id: worker_id,
      memwd_handler_class: handler.class.name,
      memwd_sleep_time_s: @sleep_time_seconds,
      memwd_max_heap_frag: @max_heap_fragmentation,
      memwd_max_strikes: @max_strikes,
      memwd_cur_strikes: @strikes,
      memwd_rss_bytes: process_rss_bytes
    }
  end

  # Worker id as reported by the Prometheus pid provider.
  def worker_id
    ::Prometheus::PidProvider.worker_id
  end

  # Resident set size as reported by Gitlab::Metrics::System.
  def process_rss_bytes
    Gitlab::Metrics::System.memory_usage_rss
  end

  # Registers the watchdog's metrics: a gauge that is set to the configured
  # fragmentation limit, and two counters (violations seen, violations
  # handed to a handler).
  def init_prometheus_metrics(max_heap_fragmentation)
    @heap_frag_limit = Gitlab::Metrics.gauge(
      :gitlab_memwd_heap_frag_limit,
      'The configured limit for how fragmented the Ruby heap is allowed to be'
    )
    @heap_frag_limit.set({}, max_heap_fragmentation)

    # Both counters share these labels; the `pid` label carries the worker id.
    counter_labels = { pid: worker_id }

    @heap_frag_violations = Gitlab::Metrics.counter(
      :gitlab_memwd_heap_frag_violations_total,
      'Total number of times heap fragmentation in a Ruby process exceeded its allowed maximum',
      counter_labels
    )
    @heap_frag_violations_handled = Gitlab::Metrics.counter(
      :gitlab_memwd_heap_frag_violations_handled_total,
      'Total number of times heap fragmentation violations in a Ruby process were handled',
      counter_labels
    )
  end
end
end
end
|