diff options
Diffstat (limited to 'spec/lib/gitlab/memory/watchdog_spec.rb')
-rw-r--r-- | spec/lib/gitlab/memory/watchdog_spec.rb | 318 |
1 files changed, 216 insertions, 102 deletions
diff --git a/spec/lib/gitlab/memory/watchdog_spec.rb b/spec/lib/gitlab/memory/watchdog_spec.rb index 010f6884df3..beb49660022 100644 --- a/spec/lib/gitlab/memory/watchdog_spec.rb +++ b/spec/lib/gitlab/memory/watchdog_spec.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require 'spec_helper' +require_relative '../../../../lib/gitlab/cluster/lifecycle_events' RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do context 'watchdog' do @@ -8,23 +9,31 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do let(:handler) { instance_double(described_class::NullHandler) } let(:heap_frag_limit_gauge) { instance_double(::Prometheus::Client::Gauge) } - let(:heap_frag_violations_counter) { instance_double(::Prometheus::Client::Counter) } - let(:heap_frag_violations_handled_counter) { instance_double(::Prometheus::Client::Counter) } + let(:violations_counter) { instance_double(::Prometheus::Client::Counter) } + let(:violations_handled_counter) { instance_double(::Prometheus::Client::Counter) } let(:sleep_time) { 0.1 } let(:max_heap_fragmentation) { 0.2 } + let(:max_mem_growth) { 2 } + + # Defaults that will not trigger any events. + let(:fragmentation) { 0 } + let(:worker_memory) { 0 } + let(:primary_memory) { 0 } + let(:max_strikes) { 0 } # Tests should set this to control the number of loop iterations in `call`. let(:watchdog_iterations) { 1 } subject(:watchdog) do described_class.new(handler: handler, logger: logger, sleep_time_seconds: sleep_time, - max_strikes: max_strikes, max_heap_fragmentation: max_heap_fragmentation).tap do |instance| + max_strikes: max_strikes, max_mem_growth: max_mem_growth, + max_heap_fragmentation: max_heap_fragmentation).tap do |instance| # We need to defuse `sleep` and stop the internal loop after N iterations. iterations = 0 - expect(instance).to receive(:sleep) do - instance.stop if (iterations += 1) >= watchdog_iterations - end.at_most(watchdog_iterations) + allow(instance).to receive(:sleep) do + instance.stop if (iterations += 1) > watchdog_iterations + end end end @@ -33,34 +42,35 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do .with(:gitlab_memwd_heap_frag_limit, anything) .and_return(heap_frag_limit_gauge) allow(Gitlab::Metrics).to receive(:counter) - .with(:gitlab_memwd_heap_frag_violations_total, anything, anything) - .and_return(heap_frag_violations_counter) + .with(:gitlab_memwd_violations_total, anything, anything) + .and_return(violations_counter) allow(Gitlab::Metrics).to receive(:counter) - .with(:gitlab_memwd_heap_frag_violations_handled_total, anything, anything) - .and_return(heap_frag_violations_handled_counter) + .with(:gitlab_memwd_violations_handled_total, anything, anything) + .and_return(violations_handled_counter) allow(heap_frag_limit_gauge).to receive(:set) - allow(heap_frag_violations_counter).to receive(:increment) - allow(heap_frag_violations_handled_counter).to receive(:increment) + allow(violations_counter).to receive(:increment) + allow(violations_handled_counter).to receive(:increment) end before do stub_prometheus_metrics - allow(handler).to receive(:on_high_heap_fragmentation).and_return(true) + allow(handler).to receive(:call).and_return(true) allow(logger).to receive(:warn) allow(logger).to receive(:info) allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return(fragmentation) + allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return({ uss: worker_memory }) + allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with( + pid: Gitlab::Cluster::PRIMARY_PID + ).and_return({ uss: primary_memory }) allow(::Prometheus::PidProvider).to receive(:worker_id).and_return('worker_1') end context 'when created' do - let(:fragmentation) { 0 } - let(:max_strikes) { 0 } - it 'sets the heap fragmentation limit gauge' do expect(heap_frag_limit_gauge).to receive(:set).with({}, max_heap_fragmentation) @@ -71,7 +81,8 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do it 'initializes with defaults' do watchdog = described_class.new(handler: handler, logger: logger) - expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_HEAP_FRAG_THRESHOLD) + expect(watchdog.max_heap_fragmentation).to eq(described_class::DEFAULT_MAX_HEAP_FRAG) + expect(watchdog.max_mem_growth).to eq(described_class::DEFAULT_MAX_MEM_GROWTH) expect(watchdog.max_strikes).to eq(described_class::DEFAULT_MAX_STRIKES) expect(watchdog.sleep_time_seconds).to eq(described_class::DEFAULT_SLEEP_TIME_SECONDS) end @@ -82,6 +93,7 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do stub_env('GITLAB_MEMWD_MAX_HEAP_FRAG', 1) stub_env('GITLAB_MEMWD_MAX_STRIKES', 2) stub_env('GITLAB_MEMWD_SLEEP_TIME_SEC', 3) + stub_env('GITLAB_MEMWD_MAX_MEM_GROWTH', 4) end it 'initializes with these settings' do @@ -90,30 +102,17 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do expect(watchdog.max_heap_fragmentation).to eq(1) expect(watchdog.max_strikes).to eq(2) expect(watchdog.sleep_time_seconds).to eq(3) + expect(watchdog.max_mem_growth).to eq(4) end end end - context 'when process does not exceed heap fragmentation threshold' do - let(:fragmentation) { max_heap_fragmentation - 0.1 } - let(:max_strikes) { 0 } # To rule out that we were granting too many strikes. - - it 'does not signal the handler' do - expect(handler).not_to receive(:on_high_heap_fragmentation) - - watchdog.call - end - end - - context 'when process exceeds heap fragmentation threshold permanently' do - let(:fragmentation) { max_heap_fragmentation + 0.1 } - let(:max_strikes) { 3 } - + shared_examples 'has strikes left' do |stat| context 'when process has not exceeded allowed number of strikes' do let(:watchdog_iterations) { max_strikes } it 'does not signal the handler' do - expect(handler).not_to receive(:on_high_heap_fragmentation) + expect(handler).not_to receive(:call) watchdog.call end @@ -125,119 +124,228 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do end it 'increments the violations counter' do - expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations) + expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations) watchdog.call end it 'does not increment violations handled counter' do - expect(heap_frag_violations_handled_counter).not_to receive(:increment) + expect(violations_handled_counter).not_to receive(:increment) watchdog.call end end + end + + shared_examples 'no strikes left' do |stat| + it 'signals the handler and resets strike counter' do + expect(handler).to receive(:call).and_return(true) + + watchdog.call + + expect(watchdog.strikes(stat.to_sym)).to eq(0) + end + + it 'increments both the violations and violations handled counters' do + expect(violations_counter).to receive(:increment).with(reason: stat).exactly(watchdog_iterations) + expect(violations_handled_counter).to receive(:increment).with(reason: stat) + + watchdog.call + end - context 'when process exceeds the allowed number of strikes' do - let(:watchdog_iterations) { max_strikes + 1 } + context 'when enforce_memory_watchdog ops toggle is off' do + before do + stub_feature_flags(enforce_memory_watchdog: false) + end - it 'signals the handler and resets strike counter' do - expect(handler).to receive(:on_high_heap_fragmentation).and_return(true) + it 'always uses the NullHandler' do + expect(handler).not_to receive(:call) + expect(described_class::NullHandler.instance).to receive(:call).and_return(true) watchdog.call + end + end - expect(watchdog.strikes).to eq(0) + context 'when handler result is true' do + it 'considers the event handled and stops itself' do + expect(handler).to receive(:call).once.and_return(true) + expect(logger).to receive(:info).with(hash_including(message: 'stopped')) + + watchdog.call end + end - it 'logs the event' do - expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024) - expect(logger).to receive(:warn).with({ - message: 'heap fragmentation limit exceeded', - pid: Process.pid, - worker_id: 'worker_1', - memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble', - memwd_sleep_time_s: sleep_time, - memwd_max_heap_frag: max_heap_fragmentation, - memwd_cur_heap_frag: fragmentation, - memwd_max_strikes: max_strikes, - memwd_cur_strikes: max_strikes + 1, - memwd_rss_bytes: 1024 - }) + context 'when handler result is false' do + let(:max_strikes) { 0 } # to make sure the handler fires each iteration + let(:watchdog_iterations) { 3 } + + it 'keeps running' do + expect(violations_counter).to receive(:increment).exactly(watchdog_iterations) + expect(violations_handled_counter).to receive(:increment).exactly(watchdog_iterations) + # Return true the third time to terminate the daemon. + expect(handler).to receive(:call).and_return(false, false, true) watchdog.call end + end + end + + context 'when monitoring memory growth' do + let(:primary_memory) { 2048 } - it 'increments both the violations and violations handled counters' do - expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations) - expect(heap_frag_violations_handled_counter).to receive(:increment) + context 'when process does not exceed threshold' do + let(:worker_memory) { max_mem_growth * primary_memory - 1 } + + it 'does not signal the handler' do + expect(handler).not_to receive(:call) watchdog.call end + end - context 'when enforce_memory_watchdog ops toggle is off' do - before do - stub_feature_flags(enforce_memory_watchdog: false) - end + context 'when process exceeds threshold permanently' do + let(:worker_memory) { max_mem_growth * primary_memory + 1 } + let(:max_strikes) { 3 } + + it_behaves_like 'has strikes left', 'mem_growth' + + context 'when process exceeds the allowed number of strikes' do + let(:watchdog_iterations) { max_strikes + 1 } - it 'always uses the NullHandler' do - expect(handler).not_to receive(:on_high_heap_fragmentation) - expect(described_class::NullHandler.instance).to( - receive(:on_high_heap_fragmentation).with(fragmentation).and_return(true) - ) + it_behaves_like 'no strikes left', 'mem_growth' + + it 'only reads reference memory once' do + expect(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss) + .with(pid: Gitlab::Cluster::PRIMARY_PID) + .once watchdog.call end - end - context 'when handler result is true' do - it 'considers the event handled and stops itself' do - expect(handler).to receive(:on_high_heap_fragmentation).once.and_return(true) - expect(logger).to receive(:info).with(hash_including(message: 'stopped')) + it 'logs the event' do + expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024) + expect(logger).to receive(:warn).with({ + message: 'memory limit exceeded', + pid: Process.pid, + worker_id: 'worker_1', + memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble', + memwd_sleep_time_s: sleep_time, + memwd_max_uss_bytes: max_mem_growth * primary_memory, + memwd_ref_uss_bytes: primary_memory, + memwd_uss_bytes: worker_memory, + memwd_rss_bytes: 1024, + memwd_max_strikes: max_strikes, + memwd_cur_strikes: max_strikes + 1 + }) watchdog.call end end + end + + context 'when process exceeds threshold temporarily' do + let(:worker_memory) { max_mem_growth * primary_memory } + let(:max_strikes) { 1 } + let(:watchdog_iterations) { 4 } + + before do + allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).and_return( + { uss: worker_memory - 0.1 }, + { uss: worker_memory + 0.2 }, + { uss: worker_memory - 0.1 }, + { uss: worker_memory + 0.1 } + ) + allow(Gitlab::Metrics::System).to receive(:memory_usage_uss_pss).with( + pid: Gitlab::Cluster::PRIMARY_PID + ).and_return({ uss: primary_memory }) + end + + it 'does not signal the handler' do + expect(handler).not_to receive(:call) + + watchdog.call + end + end + end + + context 'when monitoring heap fragmentation' do + context 'when process does not exceed threshold' do + let(:fragmentation) { max_heap_fragmentation - 0.1 } + + it 'does not signal the handler' do + expect(handler).not_to receive(:call) + + watchdog.call + end + end + + context 'when process exceeds threshold permanently' do + let(:fragmentation) { max_heap_fragmentation + 0.1 } + let(:max_strikes) { 3 } - context 'when handler result is false' do - let(:max_strikes) { 0 } # to make sure the handler fires each iteration - let(:watchdog_iterations) { 3 } + it_behaves_like 'has strikes left', 'heap_frag' - it 'keeps running' do - expect(heap_frag_violations_counter).to receive(:increment).exactly(watchdog_iterations) - expect(heap_frag_violations_handled_counter).to receive(:increment).exactly(watchdog_iterations) - # Return true the third time to terminate the daemon. - expect(handler).to receive(:on_high_heap_fragmentation).and_return(false, false, true) + context 'when process exceeds the allowed number of strikes' do + let(:watchdog_iterations) { max_strikes + 1 } + + it_behaves_like 'no strikes left', 'heap_frag' + + it 'logs the event' do + expect(Gitlab::Metrics::System).to receive(:memory_usage_rss).at_least(:once).and_return(1024) + expect(logger).to receive(:warn).with({ + message: 'heap fragmentation limit exceeded', + pid: Process.pid, + worker_id: 'worker_1', + memwd_handler_class: 'RSpec::Mocks::InstanceVerifyingDouble', + memwd_sleep_time_s: sleep_time, + memwd_max_heap_frag: max_heap_fragmentation, + memwd_cur_heap_frag: fragmentation, + memwd_max_strikes: max_strikes, + memwd_cur_strikes: max_strikes + 1, + memwd_rss_bytes: 1024 + }) watchdog.call end end end - end - context 'when process exceeds heap fragmentation threshold temporarily' do - let(:fragmentation) { max_heap_fragmentation } - let(:max_strikes) { 1 } - let(:watchdog_iterations) { 4 } + context 'when process exceeds threshold temporarily' do + let(:fragmentation) { max_heap_fragmentation } + let(:max_strikes) { 1 } + let(:watchdog_iterations) { 4 } - before do - allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return( - fragmentation - 0.1, - fragmentation + 0.2, - fragmentation - 0.1, - fragmentation + 0.1 - ) + before do + allow(Gitlab::Metrics::Memory).to receive(:gc_heap_fragmentation).and_return( + fragmentation - 0.1, + fragmentation + 0.2, + fragmentation - 0.1, + fragmentation + 0.1 + ) + end + + it 'does not signal the handler' do + expect(handler).not_to receive(:call) + + watchdog.call + end end + end - it 'does not signal the handler' do - expect(handler).not_to receive(:on_high_heap_fragmentation) + context 'when both memory fragmentation and growth exceed thresholds' do + let(:fragmentation) { max_heap_fragmentation + 0.1 } + let(:primary_memory) { 2048 } + let(:worker_memory) { max_mem_growth * primary_memory + 1 } + let(:watchdog_iterations) { max_strikes + 1 } + + it 'only calls the handler once' do + expect(handler).to receive(:call).once.and_return(true) watchdog.call end end context 'when gitlab_memory_watchdog ops toggle is off' do - let(:fragmentation) { 0 } - let(:max_strikes) { 0 } - before do stub_feature_flags(gitlab_memory_watchdog: false) end @@ -247,6 +355,12 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do watchdog.call end + + it 'does not monitor memory growth' do + expect(Gitlab::Metrics::System).not_to receive(:memory_usage_uss_pss) + + watchdog.call + end end end @@ -254,9 +368,9 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do context 'NullHandler' do subject(:handler) { described_class::NullHandler.instance } - describe '#on_high_heap_fragmentation' do + describe '#call' do it 'does nothing' do - expect(handler.on_high_heap_fragmentation(1.0)).to be(false) + expect(handler.call).to be(false) end end end @@ -264,11 +378,11 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do context 'TermProcessHandler' do subject(:handler) { described_class::TermProcessHandler.new(42) } - describe '#on_high_heap_fragmentation' do + describe '#call' do it 'sends SIGTERM to the current process' do expect(Process).to receive(:kill).with(:TERM, 42) - expect(handler.on_high_heap_fragmentation(1.0)).to be(true) + expect(handler.call).to be(true) end end end @@ -286,12 +400,12 @@ RSpec.describe Gitlab::Memory::Watchdog, :aggregate_failures, :prometheus do stub_const('::Puma::Cluster::WorkerHandle', puma_worker_handle_class) end - describe '#on_high_heap_fragmentation' do + describe '#call' do it 'invokes orderly termination via Puma API' do expect(puma_worker_handle_class).to receive(:new).and_return(puma_worker_handle) expect(puma_worker_handle).to receive(:term) - expect(handler.on_high_heap_fragmentation(1.0)).to be(true) + expect(handler.call).to be(true) end end end |