diff options
Diffstat (limited to 'spec/lib/gitlab/memory/watchdog_spec.rb')
-rw-r--r-- | spec/lib/gitlab/memory/watchdog_spec.rb | 396 |
1 files changed, 135 insertions, 261 deletions
diff --git a/spec/lib/gitlab/memory/watchdog_spec.rb b/spec/lib/gitlab/memory/watchdog_spec.rb index beb49660022..84e9a577afb 100644 --- a/spec/lib/gitlab/memory/watchdog_spec.rb +++ b/spec/lib/gitlab/memory/watchdog_spec.rb @@ -1,35 +1,35 @@ # frozen_string_literal: true require 'spec_helper' -require_relative '../../../../lib/gitlab/cluster/lifecycle_events' -RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do +RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures do context 'watchdog' do - let(:logger) { instance_double(::Logger) } + let(:configuration) { instance_double(described_class::Configuration) } let(:handler) { instance_double(described_class::NullHandler) } - - let(:heap_frag_limit_gauge) { instance_double(::Prometheus::Client::Gauge) } + let(:logger) { instance_double(::Logger) } + let(:sleep_time_seconds) { 60 } + let(:threshold_violated) { false } let(:violations_counter) { instance_double(::Prometheus::Client::Counter) } let(:violations_handled_counter) { instance_double(::Prometheus::Client::Counter) } - - let(:sleep_time) { 0.1 } - let(:max_heap_fragmentation) { 0.2 } - let(:max_mem_growth) { 2 } - - # Defaults that will not trigger any events. - let(:fragmentation) { 0 } - let(:worker_memory) { 0 } - let(:primary_memory) { 0 } - let(:max_strikes) { 0 } - - # Tests should set this to control the number of loop iterations in `call`. let(:watchdog_iterations) { 1 } + let(:name) { :monitor_name } + let(:payload) { { message: 'dummy_text' } } + let(:max_strikes) { 2 } + let(:monitor_class) do + Struct.new(:threshold_violated, :payload) do + def call + { threshold_violated: threshold_violated, payload: payload } + end + + def self.name + 'MonitorName' + end + end + end subject(:watchdog) do - described_class.new(handler: handler, logger: logger, sleep_time_seconds: sleep_time, - max_strikes: max_strikes, max_mem_growth: max_mem_growth, - max_heap_fragmentation: max_heap_fragmentation).tap do |instance| - # We need to defuse `sleep` and stop the internal loop after N iterations. + described_class.new.tap do |instance| + # We need to defuse `sleep` and stop the internal loop after 1 iteration iterations = 0 allow(instance).to receive(:sleep) do instance.stop if (iterations += 1) > watchdog_iterations @@ -38,9 +38,6 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do end def stub_prometheus_metrics - allow(Gitlab::Metrics).to receive(:gauge) - .with(:gitlab_memwd_heap_frag_limit, anything) - .and_return(heap_frag_limit_gauge) allow(Gitlab::Metrics).to receive(:counter) .with(:gitlab_memwd_violations_total, anything, anything) .and_return(violations_counter) @@ -48,318 +45,195 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do .with(:gitlab_memwd_violations_handled_total, anything, anything) .and_return(violations_handled_counter) - allow(heap_frag_limit_gauge).to receive(:set) allow(violations_counter).to receive(:increment) allow(violations_handled_counter).to receive(:increment) end - before do - stub_prometheus_metrics - - allow(handler).to receive(:call).and_return(true) - - allow(logger).to receive(:warn) - allow(logger).to receive(:info) - - allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(fragmentation) - allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return({ uss: worker_memory }) - allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with( - pid: Gitlab::Cluster::PRIMARY_PID - ).and_return({ uss: primary_memory }) - - allow(::Prometheus::PidProvider).to receive(:worker_id).and_return('worker_1') - end - - context 'when created' do - it 'sets the heap fragmentation limit gauge' do - expect(heap_frag_limit_gauge).to receive(:set).with({}, max_heap_fragmentation) + describe '#initialize' do + it 'initialize new configuration' do + expect(described_class::Configuration).to receive(:new) watchdog end - - context 'when no settings are set in the environment' do - it 'initializes with defaults' do - watchdog = described_class.new(handler: handler, logger: logger) - - expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_MAX_HEAP_FRAG) - expect(watchdog.max_mem_growth).to eq(described_class::DEFAULT_MAX_MEM_GROWTH) - expect(watchdog.max_strikes).to eq(described_class::DEFAULT_MAX_STRIKES) - expect(watchdog.sleep_time_seconds).to eq(described_class::DEFAULT_SLEEP_TIME_SECONDS) - end - end - - context 'when settings are passed through the environment' do - before do - stub_env('GITLAB_MEMWD_MAX_HEAP_FRAG', 1) - stub_env('GITLAB_MEMWD_MAX_STRIKES', 2) - stub_env('GITLAB_MEMWD_SLEEP_TIME_SEC', 3) - stub_env('GITLAB_MEMWD_MAX_MEM_GROWTH', 4) - end - - it 'initializes with these settings' do - watchdog = described_class.new(handler: handler, logger: logger) - - expect(watchdog.max_heap_fragmentation).to eq(1) - expect(watchdog.max_strikes).to eq(2) - expect(watchdog.sleep_time_seconds).to eq(3) - expect(watchdog.max_mem_growth).to eq(4) - end - end end - shared_examples 'has strikes left' do |stat| - context 'when process has not exceeded allowed number of strikes' do - let(:watchdog_iterations) { max_strikes } - - it 'does not signal the handler' do - expect(handler).not_to receive(:call) - - watchdog.call - end - - it 'does not log any events' do - expect(logger).not_to receive(:warn) - - watchdog.call - end - - it 'increments the violations counter' do - expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations) - - watchdog.call + describe '#call' do + before do + stub_prometheus_metrics + allow(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024) + allow(::Prometheus::PidProvider).to receive(:worker_id).and_return('worker_1') + + watchdog.configure do |config| + config.handler = handler + config.logger = logger + config.sleep_time_seconds = sleep_time_seconds + config.monitors.use monitor_class, threshold_violated, payload, max_strikes: max_strikes end - it 'does not increment violations handled counter' do - expect(violations_handled_counter).not_to receive(:increment) - - watchdog.call - end + allow(handler).to receive(:call).and_return(true) + allow(logger).to receive(:info) + allow(logger).to receive(:warn) end - end - shared_examples 'no strikes left' do |stat| - it 'signals the handler and resets strike counter' do - expect(handler).to receive(:call).and_return(true) + it 'logs start message once' do + expect(logger).to receive(:info).once + .with( + pid: Process.pid, + worker_id: 'worker_1', + memwd_handler_class: handler.class.name, + memwd_sleep_time_s: sleep_time_seconds, + memwd_rss_bytes: 1024, + message: 'started') watchdog.call - - expect(watchdog.strikes(stat.to_sym)).to eq(0) end - it 'increments both the violations and violations handled counters' do - expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations) - expect(violations_handled_counter).to receive(:increment).with(reason: stat) + it 'waits for check interval seconds' do + expect(watchdog).to receive(:sleep).with(sleep_time_seconds) watchdog.call end - context 'when enforce_memory_watchdog ops toggle is off' do + context 'when gitlab_memory_watchdog ops toggle is off' do before do - stub_feature_flags(enforce_memory_watchdog: false) + stub_feature_flags(gitlab_memory_watchdog: false) end - it 'always uses the NullHandler' do - expect(handler).not_to receive(:call) - expect(described_class::NullHandler.instance).to receive(:call).and_return(true) - - watchdog.call + it 'does not trigger any monitor' do + expect(configuration).not_to receive(:monitors) end end - context 'when handler result is true' do - it 'considers the event handled and stops itself' do - expect(handler).to receive(:call).once.and_return(true) - expect(logger).to receive(:info).with(hash_including(message: 'stopped')) + context 'when process does not exceed threshold' do + it 'does not increment violations counters' do + expect(violations_counter).not_to receive(:increment) + expect(violations_handled_counter).not_to receive(:increment) watchdog.call end - end - - context 'when handler result is false' do - let(:max_strikes) { 0 } # to make sure the handler fires each iteration - let(:watchdog_iterations) { 3 } - it 'keeps running' do - expect(violations_counter).to receive(:increment).exactly(watchdog_iterations) - expect(violations_handled_counter).to receive(:increment).exactly(watchdog_iterations) - # Return true the third time to terminate the daemon. - expect(handler).to receive(:call).and_return(false, false, true) + it 'does not log violation' do + expect(logger).not_to receive(:warn) watchdog.call end - end - end - - context 'when monitoring memory growth' do - let(:primary_memory) { 2048 } - - context 'when process does not exceed threshold' do - let(:worker_memory) { max_mem_growth * primary_memory - 1 } - it 'does not signal the handler' do + it 'does not execute handler' do expect(handler).not_to receive(:call) watchdog.call end end - context 'when process exceeds threshold permanently' do - let(:worker_memory) { max_mem_growth * primary_memory + 1 } - let(:max_strikes) { 3 } - - it_behaves_like 'has strikes left', 'mem_growth' + context 'when process exceeds threshold' do + let(:threshold_violated) { true } - context 'when process exceeds the allowed number of strikes' do - let(:watchdog_iterations) { max_strikes + 1 } + it 'increments violations counter' do + expect(violations_counter).to receive(:increment).with(reason: name) - it_behaves_like 'no strikes left', 'mem_growth' + watchdog.call + end - it 'only reads reference memory once' do - expect(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss) - .with(pid: Gitlab::Cluster::PRIMARY_PID) - .once + context 'when process does not exceed the allowed number of strikes' do + it 'does not increment handled violations counter' do + expect(violations_handled_counter).not_to receive(:increment) watchdog.call end - it 'logs the event' do - expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024) - expect(logger).to receive(:warn).with({ - message: 'memory limit exceeded', - pid: Process.pid, - worker_id: 'worker_1', - memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble', - memwd_sleep_time_s: sleep_time, - memwd_max_uss_bytes: max_mem_growth * primary_memory, - memwd_ref_uss_bytes: primary_memory, - memwd_uss_bytes: worker_memory, - memwd_rss_bytes: 1024, - memwd_max_strikes: max_strikes, - memwd_cur_strikes: max_strikes + 1 - }) + it 'does not log violation' do + expect(logger).not_to receive(:warn) watchdog.call end - end - end - context 'when process exceeds threshold temporarily' do - let(:worker_memory) { max_mem_growth * primary_memory } - let(:max_strikes) { 1 } - let(:watchdog_iterations) { 4 } + it 'does not execute handler' do + expect(handler).not_to receive(:call) - before do - allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return( - { uss: worker_memory - 0.1 }, - { uss: worker_memory + 0.2 }, - { uss: worker_memory - 0.1 }, - { uss: worker_memory + 0.1 } - ) - allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with( - pid: Gitlab::Cluster::PRIMARY_PID - ).and_return({ uss: primary_memory }) + watchdog.call + end end - it 'does not signal the handler' do - expect(handler).not_to receive(:call) + context 'when monitor exceeds the allowed number of strikes' do + let(:max_strikes) { 0 } - watchdog.call - end - end - end + it 'increments handled violations counter' do + expect(violations_handled_counter).to receive(:increment).with(reason: name) - context 'when monitoring heap fragmentation' do - context 'when process does not exceed threshold' do - let(:fragmentation) { max_heap_fragmentation - 0.1 } - - it 'does not signal the handler' do - expect(handler).not_to receive(:call) - - watchdog.call - end - end - - context 'when process exceeds threshold permanently' do - let(:fragmentation) { max_heap_fragmentation + 0.1 } - let(:max_strikes) { 3 } - - it_behaves_like 'has strikes left', 'heap_frag' + watchdog.call + end - context 'when process exceeds the allowed number of strikes' do - let(:watchdog_iterations) { max_strikes + 1 } + it 'logs violation' do + expect(logger).to receive(:warn) + .with( + pid: Process.pid, + worker_id: 'worker_1', + memwd_handler_class: handler.class.name, + memwd_sleep_time_s: sleep_time_seconds, + memwd_rss_bytes: 1024, + memwd_cur_strikes: 1, + memwd_max_strikes: max_strikes, + message: 'dummy_text') - it_behaves_like 'no strikes left', 'heap_frag' + watchdog.call + end - it 'logs the event' do - expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024) - expect(logger).to receive(:warn).with({ - message: 'heap fragmentation limit exceeded', - pid: Process.pid, - worker_id: 'worker_1', - memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble', - memwd_sleep_time_s: sleep_time, - memwd_max_heap_frag: max_heap_fragmentation, - memwd_cur_heap_frag: fragmentation, - memwd_max_strikes: max_strikes, - memwd_cur_strikes: max_strikes + 1, - memwd_rss_bytes: 1024 - }) + it 'executes handler' do + expect(handler).to receive(:call) watchdog.call end - end - end - context 'when process exceeds threshold temporarily' do - let(:fragmentation) { max_heap_fragmentation } - let(:max_strikes) { 1 } - let(:watchdog_iterations) { 4 } + context 'when enforce_memory_watchdog ops toggle is off' do + before do + stub_feature_flags(enforce_memory_watchdog: false) + end - before do - allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return( - fragmentation - 0.1, - fragmentation + 0.2, - fragmentation - 0.1, - fragmentation + 0.1 - ) - end + it 'always uses the NullHandler' do + expect(handler).not_to receive(:call) + expect(described_class::NullHandler.instance).to receive(:call).and_return(true) - it 'does not signal the handler' do - expect(handler).not_to receive(:call) + watchdog.call + end + end - watchdog.call + context 'when multiple monitors exceeds allowed number of strikes' do + before do + watchdog.configure do |config| + config.handler = handler + config.logger = logger + config.sleep_time_seconds = sleep_time_seconds + config.monitors.use monitor_class, threshold_violated, payload, max_strikes: max_strikes + config.monitors.use monitor_class, threshold_violated, payload, max_strikes: max_strikes + end + end + + it 'only calls the handler once' do + expect(handler).to receive(:call).once.and_return(true) + + watchdog.call + end + end end end - end - - context 'when both memory fragmentation and growth exceed thresholds' do - let(:fragmentation) { max_heap_fragmentation + 0.1 } - let(:primary_memory) { 2048 } - let(:worker_memory) { max_mem_growth * primary_memory + 1 } - let(:watchdog_iterations) { max_strikes + 1 } - it 'only calls the handler once' do - expect(handler).to receive(:call).once.and_return(true) + it 'logs stop message once' do + expect(logger).to receive(:info).once + .with( + pid: Process.pid, + worker_id: 'worker_1', + memwd_handler_class: handler.class.name, + memwd_sleep_time_s: sleep_time_seconds, + memwd_rss_bytes: 1024, + message: 'stopped') watchdog.call end end - context 'when gitlab_memory_watchdog ops toggle is off' do - before do - stub_feature_flags(gitlab_memory_watchdog: false) - end - - it 'does not monitor heap fragmentation' do - expect(Gitlab::Metrics::Memory).not_to receive(:gc_heap_fragmentation) - - watchdog.call - end - - it 'does not monitor memory growth' do - expect(Gitlab::Metrics::System).not_to receive(:memory_usage_uss_pss) - - watchdog.call + describe '#configure' do + it 'yields block' do + expect { |b| watchdog.configure(&b) }.to yield_control end end end |